From 5f204ce9e9030765e943f68833eb3320ba76e6e9 Mon Sep 17 00:00:00 2001
From: Feufochmar
Date: Sat, 9 Jun 2018 18:58:46 +0200
Subject: [PATCH] Example list to Markov chain generator.

---
 py-phonagen/generator-list2chain.py |  34 +++++++
 py-phonagen/phonagen.py             | 149 ++++++++++++++++++++++++++--
 2 files changed, 175 insertions(+), 8 deletions(-)
 create mode 100644 py-phonagen/generator-list2chain.py

diff --git a/py-phonagen/generator-list2chain.py b/py-phonagen/generator-list2chain.py
new file mode 100644
index 0000000..5815702
--- /dev/null
+++ b/py-phonagen/generator-list2chain.py
@@ -0,0 +1,34 @@
+#! /usr/bin/env python3
+
+import argparse
+import phonagen
+
+def parseArgs():
+    # Define argument parser
+    parser = argparse.ArgumentParser(description='Convert an example list to a chain generator.')
+    parser.add_argument('file', metavar='listfile', help='list file to convert')
+    parser.add_argument('--id', metavar='id', help='id of the generator', required = True)
+    parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
+    parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which is based the generator', required = True)
+    parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; optional, if provided, examples will be checked for unknown phonemes and phonology will present in the output', default='')
+    parser.add_argument('--order', metavar='order', help='order of the chain; 1 by default', default=1, type=int)
+    parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')
+    # Parse arguments
+    return parser.parse_args()
+
+# Main
+if __name__ == '__main__':
+    args = parseArgs()
+    generator = phonagen.ChainGenerator(id = args.id, description = args.description, phonology = args.phonology, order = args.order)
+    phonology = phonagen.Phonology()
+    # load the phonology from a file
+    if args.phonologyfile != '':
+        phonologyFile = phonagen.PhonagenFile()
+        phonologyFile.load(args.phonologyfile)
+        phonology = phonologyFile.getPhonology(args.phonology)
+    #
+    generator.fromExamples(args.file, phonology)
+    phonagenFile = phonagen.PhonagenFile()
+    phonagenFile.addPhonology(phonology)
+    phonagenFile.addGenerator(generator)
+    phonagenFile.writeTo(args.output)
diff --git a/py-phonagen/phonagen.py b/py-phonagen/phonagen.py
index 1ae0110..6254993 100644
--- a/py-phonagen/phonagen.py
+++ b/py-phonagen/phonagen.py
@@ -3,32 +3,41 @@ import json
 import io
 import sys
 import csv
+import random
 
 class Phonology:
     """Phonology class"""
-    def __init__(self, id = '', description = '', transcriptions = [], mainTranscription = '', entries = []):
+    def __init__(self, id = '', description = '', mainTranscription = ''):
         self.id = id
         self.description = description
-        self.transcriptions = transcriptions
+        self.transcriptions = []
         self.mainTranscription = mainTranscription
-        self.entries = entries
+        self.entries = {} # id -> entry
 
     def isValid(self):
         return self.id != ''
 
+    def has(self, id):
+        return id in self.entries
+
     def toJsonStruct(self):
         """Convert a Phonology to a Json structure"""
         return { 'id': self.id,
                  'description': self.description,
                  'transcriptions': self.transcriptions,
                  'main-transcription': self.mainTranscription,
-                 'entries': self.entries }
+                 'entries': [x for x in self.entries.values()] }
 
     def fromJsonStruct(self, struct):
-        """Create a Phonology from a Json structure"""
-        return Phonology(struct['id'], struct['decription'], struct['transcriptions'], struct['main-transcription'], struct['entries'])
+        """Fill a Phonology from a Json structure"""
+        self.id = struct['id']
+        self.description = struct['description']
+        self.transcriptions = struct['transcriptions']
+        self.mainTranscription = struct['main-transcription']
+        self.entries = {x['id']: x for x in struct['entries']}
 
     def fromCsv(self, file):
+        """Fill a Phonology from a Csv file"""
         with open(file) as csvfile:
             fileReader = csv.reader(csvfile)
             # get csv header
@@ -68,7 +77,118 @@ class Phonology:
                 # if description is not provided, add an empty one
                 if 'description' not in header:
                     entry.update({'description': ''})
-                self.entries.append(entry)
+                self.entries.update({entry['id']: entry})
+
+class Distribution:
+    """Discrete distribution"""
+    def __init__(self):
+        self.items = {}
+
+    def addTo(self, value, occurences = 1):
+        oc = occurences
+        if value in self.items:
+            oc = oc + self.items[value]
+        self.items.update({value: oc})
+
+    def pickFrom(self):
+        return random.choices(list(self.items.keys()), list(self.items.values()))[0] # weights are the occurence counts
+
+    def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
+        return [{itemRef: x, occurencesRef: self.items[x]} for x in self.items]
+
+    def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
+        self.items = {}
+        for item in struct:
+            self.items.update({item[itemRef]: item[occurencesRef]})
+
+class Generator:
+    """Parent class for all generators"""
+    def __init__(self, id = '', description = '', phonology = ''):
+        self.id = id
+        self.description = description
+        self.phonology = phonology
+        self.isTyped = False
+
+    def isValid(self):
+        return (self.id != '') and self.isTyped
+
+    def toJsonStruct(self):
+        return { 'id': self.id,
+                 'description': self.description,
+                 'phonology': self.phonology }
+
+    def fromJsonStruct(self, struct):
+        self.id = struct['id']
+        self.description = struct['description']
+        self.phonology = struct['phonology']
+
+class ChainGenerator(Generator):
+    """Chains-based generator"""
+    def __init__(self, order = 1, **kwargs):
+        super().__init__(**kwargs)
+        self.order = order
+        self.chains = {} # input -> distribution of outputs
+        self.isTyped = True
+
+    def toJsonStruct(self):
+        struct = super().toJsonStruct()
+        struct.update({'type': 'chains',
+                       'order': self.order,
+                       'chains': [{'input': x, 'possible-outputs': self.chains[x].toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')} for x in self.chains]})
+        return struct
+
+    def fromJsonStruct(self, struct):
+        super().fromJsonStruct(struct)
+        self.order = struct['order']
+        for chainStruct in struct['chains']:
+            dist = Distribution()
+            dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
+            self.chains.update({tuple(chainStruct['input']): dist}) # JSON arrays load as lists; chain keys are hashable tuples as in fromExamples
+
+    def fromExamples(self, file, phonology):
+        """Train a chain generator on an example file"""
+        with open(file) as exampleFile:
+            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
+            for row in fileReader:
+                if len(row) != 0:
+                    row.append("") # Add terminator element (empty string)
+                    previous = tuple("" for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
+                    for item in row:
+                        if (item != "") and (phonology.isValid()) and (not phonology.has(item)):
+                            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
+                        if previous in self.chains:
+                            self.chains[previous].addTo(item)
+                        else:
+                            dist = Distribution()
+                            dist.addTo(item)
+                            self.chains.update({previous: dist})
+                        previous = previous[1:] + (item,)
+
+class RuleGenerator(Generator):
+    """Rules-based generator"""
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.rules = {}
+        self.isTyped = True
+
+    def toJsonStruct(self):
+        struct = super().toJsonStruct()
+        # TODO: add rules
+        return struct
+
+    def fromJsonStruct(self, struct):
+        super().fromJsonStruct(struct)
+        # TODO: rules
+
+generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
+def makeGenerator(struct):
+    """Function instantiating a generator from a JSON structure"""
+    if struct.get('type') in generatorTypeToClass: # a missing or unknown 'type' falls back to the base Generator
+        generator = generatorTypeToClass[struct['type']]()
+    else:
+        generator = Generator()
+    generator.fromJsonStruct(struct)
+    return generator
 
 class PhonagenFile:
     """A phonagen file, with phonologies and generators"""
@@ -90,8 +210,21 @@ class PhonagenFile:
     def getGenerator(self, id):
         return self.generators[id]
 
+    def load(self, file):
+        """Load from a JSON file"""
+        with open(file, 'r', encoding='utf-8') as inputFile:
+            jsonStruct = json.load(inputFile)
+            # Load phonologies
+            for struct in jsonStruct['phonologies']:
+                phonology = Phonology()
+                phonology.fromJsonStruct(struct)
+                self.addPhonology(phonology)
+            # Load generators
+            for struct in jsonStruct['generators']:
+                self.addGenerator(makeGenerator(struct))
+
     def writeTo(self, file = ''):
-        """Output a JSON file from lists of phonologies and generators"""
+        """Output to a JSON file (or stdout)"""
         outputStruct = { 'phonologies': [x.toJsonStruct() for x in self.phonologies.values()],
                          'generators': [x.toJsonStruct() for x in self.generators.values()] }
         if file == '':