From a618ce05302cdf40ffb53dfbd79aad353b6a3b52 Mon Sep 17 00:00:00 2001
From: Feufochmar
Date: Sun, 10 Jun 2018 22:55:04 +0200
Subject: [PATCH] Add list2rule generator (generating rules from examples)

---
 py-phonagen/generator-list2rule.py |  31 ++++
 py-phonagen/phonagen.py            | 261 ++++++++++++++++++++++++++++-
 2 files changed, 285 insertions(+), 7 deletions(-)
 create mode 100755 py-phonagen/generator-list2rule.py

diff --git a/py-phonagen/generator-list2rule.py b/py-phonagen/generator-list2rule.py
new file mode 100755
index 0000000..a0c75ec
--- /dev/null
+++ b/py-phonagen/generator-list2rule.py
@@ -0,0 +1,31 @@
+#! /usr/bin/env python3
+
+import argparse
+import phonagen
+
+def parseArgs():
+    # Define argument parser
+    parser = argparse.ArgumentParser(description='Convert an example list to a rule generator.')
+    parser.add_argument('file', metavar='listfile', help='list file to convert')
+    parser.add_argument('--id', metavar='id', help='id of the generator', required = True)
+    parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
+    parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which the generator is based', required = True)
+    parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; mandatory; the phonology will be present in the output', required = True)
+    parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The result is printed to standard output if not given.', default='')
+    # Parse arguments
+    return parser.parse_args()
+
+# Main
+if __name__ == '__main__':
+    args = parseArgs()
+    generator = phonagen.RuleGenerator(id = args.id, description = args.description, phonology = args.phonology)
+    phonology = phonagen.Phonology()
+    phonologyFile = phonagen.PhonagenFile()
+    phonologyFile.load(args.phonologyfile)
+    phonology = phonologyFile.getPhonology(args.phonology)
+    # Train the rule generator on the example list, then write it out together with its phonology
+    generator.fromExamples(args.file, phonology)
+    phonagenFile = phonagen.PhonagenFile()
+    phonagenFile.addPhonology(phonology)
+    phonagenFile.addGenerator(generator)
+    phonagenFile.writeTo(args.output)
diff --git a/py-phonagen/phonagen.py b/py-phonagen/phonagen.py
index 492e43b..4decd52 100644
--- a/py-phonagen/phonagen.py
+++ b/py-phonagen/phonagen.py
@@ -4,6 +4,7 @@ import io
 import sys
 import csv
 import random
+import unicodedata
 
 class Phonology:
     """Phonology class"""
@@ -79,6 +80,72 @@ class Phonology:
                 entry.update({'description': ''})
             self.entries.update({entry['id']: entry})
 
+    def formatWord(self, idList):
+        """Return a table mapping each transcription to the string it gives for the word (a list of phoneme ids)"""
+        result = {x: '' for x in self.transcriptions}
+        for x in idList:
+            phoneme = self.entries[x]
+            for y in result:
+                result[y] += phoneme[y]
+        return result
+
+    def getStress(self):
+        """Return the phoneme id of the stress phoneme"""
+        # search for #stress tag in description
+        found = [x['id'] for x in self.entries.values() if '#stress' in x['description']]
+        if len(found) == 0:
+            # if not tagged, search for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in phoneme transcription
+            found = [x['id'] for x in self.entries.values() if ("'" in x['phoneme']) or ("ˈ" in x['phoneme'])]
+        if len(found) == 0:
+            raise Exception('No stress phoneme in phonology', self.id)
+        return found[0]
+
+    def getSyllableBreak(self):
+        """Return the phoneme id of the syllable break phoneme"""
+        # search for #syllable-break tag in description
+        found = [x['id'] for x in self.entries.values() if '#syllable-break' in x['description']]
+        if len(found) == 0:
+            # if not tagged, search for '.' (full stop, u+002E) in phoneme transcription
+            found = [x['id'] for x in self.entries.values() if '.' in x['phoneme']]
+        if len(found) == 0:
+            raise Exception('No syllable break phoneme in phonology', self.id)
+        return found[0]
+
+    vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
+    def isVowel(phoneme):
+        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)
+
+    consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"
+    def isConsonant(phoneme):
+        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
+
+    def isOnset(self, id):
+        """Check if an id corresponds to an onset, either from the description or, if not available, guessed from the phonemic transcription"""
+        entry = self.entries[id]
+        description = entry['description']
+        result = ('#onset' in description) or ('#consonant' in description)
+        if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#coda' not in description):
+            result = Phonology.isConsonant(entry['phoneme'])
+        return result
+
+    def isNucleus(self, id):
+        """Check if an id corresponds to a nucleus, either from the description or, if not available, guessed from the phonemic transcription"""
+        entry = self.entries[id]
+        description = entry['description']
+        result = ('#nucleus' in description) or ('#vowel' in description)
+        if (not result) and ('#consonant' not in description) and ('#onset' not in description) and ('#coda' not in description):
+            result = Phonology.isVowel(entry['phoneme'])
+        return result
+
+    def isCoda(self, id):
+        """Check if an id corresponds to a coda, either from the description or, if not available, guessed from the phonemic transcription"""
+        entry = self.entries[id]
+        description = entry['description']
+        result = ('#coda' in description) or ('#consonant' in description)
+        if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#onset' not in description):
+            result = Phonology.isConsonant(entry['phoneme'])
+        return result
+
 class Distribution:
     """Discrete distribution"""
     def __init__(self):
@@ -91,7 +158,7 @@
             self.items.update({value: oc})
 
     def pickFrom(self):
-        return random.choices([k for k in self.items.keys()], [v for v in self.items.values])[0]
+        return random.choices([k for k in self.items.keys()], [v for v in self.items.values()])[0]
 
     def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
         return [{itemRef: x, occurencesRef: self.items[x]} for x in self.items]
@@ -101,6 +168,9 @@
         for item in struct:
             self.items.update({item[itemRef]: item[occurencesRef]})
 
+    def isEmpty(self):
+        return len(self.items) == 0
+
 class Generator:
     """Parent class for all generators"""
     def __init__(self, id = '', description = '', phonology = ''):
@@ -122,6 +192,9 @@
         self.description = struct['description']
         self.phonology = struct['phonology']
 
+    def generateWord(self):
+        raise Exception('Word generation not supported on abstract generator')
+
 class ChainGenerator(Generator):
     """Chains-based generator"""
     def __init__(self, order = 1, **kwargs):
@@ -151,10 +224,10 @@
             fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
             for row in fileReader:
                 if len(row) != 0:
-                    row.append("") # Add terminator element (empty string)
-                    previous = tuple("" for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
+                    row.append('') # Add terminator element (empty string)
+                    previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
                     for item in row:
-                        if (item != "") and (phonology.isValid()) and (not phonology.has(item)):
+                        if (item != '') and (phonology.isValid()) and (not phonology.has(item)):
                             raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                         if previous in self.chains:
                             self.chains[previous].addTo(item)
@@ -164,6 +237,17 @@
                             self.chains.update({previous: dist})
                         previous = previous[1:] + (item,)
 
+    def generateWord(self):
+        outputIdList = []
+        nextItem = '.' # any non-empty value, so that the loop runs at least once
+        previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
+        while nextItem != '':
+            nextItem = self.chains[previous].pickFrom()
+            if nextItem != '':
+                outputIdList.append(nextItem)
+            previous = previous[1:] + (nextItem,)
+        return outputIdList
+
 class RuleGenerator(Generator):
     """Rules-based generator"""
    def __init__(self, **kwargs):
@@ -173,12 +257,167 @@ class RuleGenerator(Generator):
 
     def toJsonStruct(self):
         struct = super().toJsonStruct()
-        # TODO: add rules
+        struct.update({'type': 'rules',
+                       'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')} for x in self.rules]})
         return struct
 
     def fromJsonStruct(self, struct):
         super().fromJsonStruct(struct)
-        # TODO: rules
+        for ruleStruct in struct['rules']:
+            dist = Distribution()
+            # The pattern should be converted from a list to a tuple
+            dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences': x['occurences']} for x in ruleStruct['distribution']], itemRef = 'pattern', occurencesRef = 'occurences')
+            self.rules.update({ruleStruct['id']: dist})
+
+    def generatePattern(self, pattern):
+        output = []
+        for x in pattern:
+            if x in self.rules:
+                output.extend(self.generatePattern(self.rules[x].pickFrom()))
+            else:
+                output.append(x)
+        return output
+
+    def generateWord(self):
+        return self.generatePattern(self.rules['word'].pickFrom())
+
+    def processRowFromExample(self, row, stressId, syllableBreakId):
+        # Check the number of stress marks
+        nbStress = row.count(stressId)
+        if nbStress > 1:
+            print("Too many stress marks in " + str(row) + ": skipping the example")
+            return
+        # Build the syllable list
+        syllables = []
+        currentSyllable = []
+        stressedSyllableIdx = -1
+        syllableIdx = 0
+        for x in row:
+            # Append to the current syllable if not a syllable separator
+            if (x != stressId) and (x != syllableBreakId):
+                currentSyllable.append(x)
+            # In case of a syllable separator, only add the syllable to the list if it is not empty
+            elif len(currentSyllable) != 0:
+                syllables.append(currentSyllable)
+                currentSyllable = []
+                syllableIdx = syllableIdx + 1
+            # If the current id is the stress, remember the position of the stressed syllable
+            if (x == stressId):
+                stressedSyllableIdx = syllableIdx
+        # After the loop, add the current syllable to the list if it is not empty
+        if len(currentSyllable) != 0:
+            syllables.append(currentSyllable)
+        # Single syllable case
+        if len(syllables) == 1:
+            if stressedSyllableIdx == 0:
+                self.rules['word'].addTo(tuple([stressId, 'single']))
+            else:
+                self.rules['word'].addTo(tuple(['single']))
+            self.rules['single'].addTo(tuple(syllables[0]))
+        # Other cases
+        else:
+            wordPattern = []
+            for x in range(len(syllables)):
+                rule = ''
+                separator = syllableBreakId
+                if x == 0:
+                    rule = 'initial'
+                elif x == (len(syllables) - 1):
+                    rule = 'final'
+                else:
+                    rule = 'middle'
+                if x == stressedSyllableIdx:
+                    rule = rule + '-stressed'
+                    separator = stressId
+                # Only add the syllable separator to the pattern if this is not the first syllable, or if the syllable is stressed
+                if (separator == stressId) or (x > 0):
+                    wordPattern.append(separator)
+                # Add the rule to the pattern
+                wordPattern.append(rule)
+                # The syllable is added to the corresponding rule
+                self.rules[rule].addTo(tuple(syllables[x]))
+            self.rules['word'].addTo(tuple(wordPattern))
+
+    def splitSyllableRule(self, syllableRule, phonology):
+        """Replace a syllable rule with an onset/nucleus/coda pattern"""
+        newDist = Distribution()
+        oldDist = self.rules[syllableRule]
+        # Add onset/nucleus/coda rules
+        onsetRule = syllableRule + '-onset'
+        nucleusRule = syllableRule + '-nucleus'
+        codaRule = syllableRule + '-coda'
+        self.rules[onsetRule] = Distribution()
+        self.rules[nucleusRule] = Distribution()
+        self.rules[codaRule] = Distribution()
+        # For each pattern, split into onset/nucleus/coda
+        for pattern in oldDist.items:
+            isOnset = True
+            onset = []
+            isNucleus = False
+            nucleus = []
+            isCoda = False
+            coda = []
+            for phoneme in pattern:
+                # Check if there is a change of element
+                if isOnset and (phonology.isNucleus(phoneme)):
+                    isOnset = False
+                    isNucleus = True
+                elif isNucleus and (phonology.isCoda(phoneme)):
+                    isNucleus = False
+                    isCoda = True
+                # Add to the respective list
+                if isOnset:
+                    onset.append(phoneme)
+                elif isNucleus:
+                    nucleus.append(phoneme)
+                else:
+                    coda.append(phoneme)
+            # Add to the specific distributions and determine the pattern in the new distribution
+            occurences = oldDist.items[pattern]
+            distPattern = []
+            if len(onset) != 0:
+                distPattern.append(onsetRule)
+                self.rules[onsetRule].addTo(tuple(onset), occurences)
+            if len(nucleus) != 0:
+                distPattern.append(nucleusRule)
+                self.rules[nucleusRule].addTo(tuple(nucleus), occurences)
+            if len(coda) != 0:
+                distPattern.append(codaRule)
+                self.rules[codaRule].addTo(tuple(coda), occurences)
+            # Add the pattern to the new distribution
+            newDist.addTo(tuple(distPattern), occurences)
+        # Replace the old rule with the new one
+        self.rules[syllableRule] = newDist
+
+    def fromExamples(self, file, phonology):
+        """Train a rule generator on an example file"""
+        stressId = phonology.getStress()
+        syllableBreakId = phonology.getSyllableBreak()
+        # Words are modelled as lists of syllables, with (optionally) one of them being stressed
+        # The syllables are classified as: single (1-syllable words), initial (first syllable), final (last syllable), middle (other syllables)
+        # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
+        #
+        # Add the 'word' rule, and the syllable rules, initialized with an empty distribution
+        self.rules.update({'word': Distribution()})
+        syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
+        for x in syllableRules:
+            self.rules.update({x: Distribution()})
+        # Step 1: open the file and collect word patterns from the examples
+        with open(file) as exampleFile:
+            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
+            for row in fileReader:
+                if len(row) != 0:
+                    # Check the items in row
+                    for item in row:
+                        if (item != '') and (not phonology.has(item)):
+                            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
+                    # Process the row
+                    self.processRowFromExample(row, stressId, syllableBreakId)
+        # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
+        for x in syllableRules:
+            self.splitSyllableRule(x, phonology)
+        # Step 3: remove the empty rules
+        self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
 
 generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
 def makeGenerator(struct):
@@ -231,7 +470,7 @@ class PhonagenFile:
             json.dump(outputStruct, sys.stdout, ensure_ascii=False)
         else:
             with open(file, 'w', encoding='utf-8') as outputFile:
-                json.dump(outputStruct, outputFile, ensure_ascii=False)
+                json.dump(outputStruct, outputFile, ensure_ascii=False, indent=2)
 
     def mergeFrom(self, otherFile):
         """Add all phonologies and generators from the other file into this one."""
@@ -239,3 +478,11 @@ class PhonagenFile:
             self.addPhonology(phonology)
         for generator in otherFile.generators.values():
             self.addGenerator(generator)
+
+    def generateWord(self, generator = ''):
+        gen = generator
+        if gen == '':
+            gen = random.choice([x for x in self.generators])
+        idList = self.generators[gen].generateWord()
+        phonology = self.phonologies[self.generators[gen].phonology]
+        return phonology.formatWord(idList)
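
Usage note (not part of the patch; a minimal sketch with hypothetical file names and ids): assuming phonology.json contains a phonology with id 'fr', and words.txt lists one example word per row as space-separated phoneme ids (using the stress and syllable-break phonemes that getStress()/getSyllableBreak() detect), the new script can be run as:

    ./generator-list2rule.py words.txt --id fr-rules --phonology fr --phonologyfile phonology.json --output fr-rules.json

The generated file can then be used from the library, for example:

    # Sketch only: 'fr-rules.json' and the id 'fr-rules' are placeholders.
    import phonagen

    f = phonagen.PhonagenFile()
    f.load('fr-rules.json')
    # generateWord() expands the trained rules starting from the 'word' rule and
    # returns a table mapping each transcription of the phonology to a generated word.
    print(f.generateWord('fr-rules'))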