diff --git a/py-phonagen/phonagen.py b/py-phonagen/phonagen.py
index 4decd52..7a8ae2b 100644
--- a/py-phonagen/phonagen.py
+++ b/py-phonagen/phonagen.py
@@ -120,7 +120,7 @@ class Phonology:
         return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
 
     def isOnset(self, id):
-        """Check if an id corresponds to an onset, either from description, or if not available, guessed from the phonemic transcription"""
+        """Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
         entry = self.entries[id]
         description = entry['description']
         result = ('#onset' in description) or ('#consonant' in description)
@@ -129,7 +129,7 @@ class Phonology:
         return result
 
     def isNucleus(self, id):
-        """Check if an id corresponds to a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
+        """Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
         entry = self.entries[id]
         description = entry['description']
         result = ('#nucleus' in description) or ('#vowel' in description)
@@ -138,7 +138,7 @@ class Phonology:
         return result
 
     def isCoda(self, id):
-        """Check if an id corresponds to a coda, either from description, or if not available, guessed from the phonemic transcription"""
+        """Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
         entry = self.entries[id]
         description = entry['description']
         result = ('#coda' in description) or ('#consonant' in description)
@@ -146,6 +146,86 @@ class Phonology:
             result = Phonology.isConsonant(entry['phoneme'])
         return result
 
+    def isInSingleSyllables(self, id):
+        """Check if an id corresponds to a phoneme that can be in a single syllable, from description.
+
+        True when the description has '#single', '#initial' or '#final', or when it does not have '#middle'.
+        """
+        description = self.entries[id]['description']
+        if ('#single' in description) or ('#initial' in description) or ('#final' in description):
+            return True
+        return '#middle' not in description
+
+    def isInInitialSyllables(self, id):
+        """Check if an id corresponds to a phoneme that can be in an initial syllable, from description.
+
+        True when the description has '#initial', or when it has none of '#single', '#middle' and '#final'.
+        """
+        description = self.entries[id]['description']
+        if '#initial' in description:
+            return True
+        return not (('#single' in description) or ('#middle' in description) or ('#final' in description))
+
+    def isInMiddleSyllables(self, id):
+        """Check if an id corresponds to a phoneme that can be in a middle syllable, from description.
+
+        True when the description has '#middle', or when it has none of '#single', '#initial' and '#final'.
+        """
+        description = self.entries[id]['description']
+        if '#middle' in description:
+            return True
+        return not (('#single' in description) or ('#initial' in description) or ('#final' in description))
+
+    def isInFinalSyllables(self, id):
+        """Check if an id corresponds to a phoneme that can be in a final syllable, from description.
+
+        True when the description has '#final', or when it has none of '#single', '#initial' and '#middle'.
+        """
+        description = self.entries[id]['description']
+        if '#final' in description:
+            return True
+        return not (('#single' in description) or ('#initial' in description) or ('#middle' in description))
+
+    def isInStressedSyllables(self, id):
+        """Check if an id corresponds to a phoneme that can be in a stressed syllable, from description.
+
+        True unless the description has '#unstressed' without '#stressed'.
+        """
+        description = self.entries[id]['description']
+        return ('#stressed' in description) or ('#unstressed' not in description)
+
+    def isInUnstressedSyllables(self, id):
+        """Check if an id corresponds to a phoneme that can be in an unstressed syllable, from description.
+
+        True unless the description has '#stressed' without '#unstressed'.
+        """
+        description = self.entries[id]['description']
+        return ('#unstressed' in description) or ('#stressed' not in description)
+
+    def getPhonemesFromTags(self, tags):
+        """Return the list of phoneme ids verifying every tag of the tag list.
+
+        The accepted tags are '#onset', '#nucleus', '#coda', '#single', '#initial',
+        '#middle', '#final', '#stressed' and '#unstressed'; any other tag raises a
+        KeyError. The stress and syllable break entries are never returned.
+        """
+        tagToPredicate = {
+            '#onset': Phonology.isOnset,
+            '#nucleus': Phonology.isNucleus,
+            '#coda': Phonology.isCoda,
+            '#single': Phonology.isInSingleSyllables,
+            '#initial': Phonology.isInInitialSyllables,
+            '#middle': Phonology.isInMiddleSyllables,
+            '#final': Phonology.isInFinalSyllables,
+            '#stressed': Phonology.isInStressedSyllables,
+            '#unstressed': Phonology.isInUnstressedSyllables
+        }
+        predicates = [tagToPredicate[t] for t in tags]
+        # The stress and syllable break are markers, not phonemes: skip them
+        markers = (self.getStress(), self.getSyllableBreak())
+        return [id for id in self.entries
+                if (id not in markers) and all(p(self, id) for p in predicates)]
+
 class Distribution:
     """Discrete distribution"""
     def __init__(self):
@@ -389,6 +469,10 @@ class RuleGenerator(Generator):
         # Replace the old rules with the new rules
         self.rules[syllableRule] = newDist
 
+    def cleanRules(self):
+        """Remove the rules whose distribution is empty"""
+        self.rules = {name: dist for name, dist in self.rules.items() if not dist.isEmpty()}
+
     def fromExamples(self, file, phonology):
         """Train a rule generator on an example file"""
         stressId = phonology.getStress()
@@ -417,7 +501,132 @@ class RuleGenerator(Generator):
         for x in syllableRules:
             self.splitSyllableRule(x, phonology)
         # Step 3: remove the empty rules
-        self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
+        self.cleanRules()
+
+    @staticmethod
+    def randomOccurences(mean, range):
+        """Generate a random number in the range [mean-range, mean+range], both bounds included"""
+        # random.randint includes its upper bound: adding 1 to it would exceed the allowed range
+        return random.randint(mean - range, mean + range)
+
+    @staticmethod
+    def isStressPosition(position, numberSyllables, stressPosition):
+        """Check if a given position is the position of the stress.
+
+        The position goes from 1 to numberSyllables included.
+        A positive stressPosition counts from the first syllable (index 1), a negative one from the last syllable (index -1).
+        A stressPosition of zero means no stress: the result is always False.
+        A stressPosition outside of the word is moved to the last syllable (positive) or to the first syllable (negative).
+        """
+        if stressPosition == 0:
+            return False
+        if stressPosition > 0:
+            target = min(stressPosition, numberSyllables)
+        else:
+            target = max(numberSyllables + 1 + stressPosition, 1)
+        return position == target
+
+    def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
+        """
+        Generate a rule-based generator just from a phonology and some parameters.
+        - minNumberSyllables must be strictly positive.
+        - maxNumberSyllables must be greater than or equal to minNumberSyllables
+        - stressPosition indicates on which syllable the stress occurs.
+          Positive index count from the beginning to the end (with the first syllable being at index 1).
+          Negative index count from the end to the beginning (with the last syllable being at index -1)
+          Set this to zero if no stress should be generated.
+        - distributionMean indicates the medium value for the occurrences of a phoneme
+        - distributionRange indicates the maximum absolute difference from distributionMean for the occurrences of phonemes and syllables
+        Raises an Exception, before modifying the generator, if one of the parameters is invalid.
+        """
+        # Check the parameters first, so that a bad call does not wipe the existing rules
+        if minNumberSyllables < 1:
+            raise Exception("Minimum number of syllables must be strictly positive. Given", minNumberSyllables)
+        if maxNumberSyllables < minNumberSyllables:
+            raise Exception("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
+        if maxNumberSyllables < abs(stressPosition):
+            raise Exception("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
+        if distributionMean < 1:
+            raise Exception("Distribution mean must be strictly positive. Given", distributionMean)
+        if distributionMean <= distributionRange:
+            raise Exception("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
+        if distributionRange < 0:
+            raise Exception("Distribution range must be positive or nul. Given", distributionRange)
+        # Reinitialize
+        self.phonology = phonology.id
+        self.rules = {}
+        # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
+        stressId = phonology.getStress()
+        syllableBreakId = phonology.getSyllableBreak()
+        isStressed = stressPosition != 0
+        # Add the 'word' rule, initialized with an empty distribution
+        self.rules['word'] = Distribution()
+        # Add the syllable rules and word patterns
+        syllableRules = []
+        syllableRulesToTags = {}
+        if minNumberSyllables == 1:
+            syllableRules.append('single')
+            syllableRulesToTags['single'] = ['#single']
+            wordPattern = []
+            if isStressed:
+                syllableRulesToTags['single'].append('#stressed')
+                wordPattern.append(stressId)
+            wordPattern.append('single')
+            self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+        if maxNumberSyllables > 1:
+            syllableRules += ['initial', 'middle', 'final']
+            if isStressed:
+                syllableRules += ['initial-stressed', 'middle-stressed', 'final-stressed']
+                syllableRulesToTags.update({'initial-stressed': ['#initial', '#stressed'], 'middle-stressed': ['#middle', '#stressed'], 'final-stressed': ['#final', '#stressed']})
+                syllableRulesToTags.update({'initial': ['#initial', '#unstressed'], 'middle': ['#middle', '#unstressed'], 'final': ['#final', '#unstressed']})
+            else:
+                syllableRulesToTags.update({'initial': ['#initial'], 'middle': ['#middle'], 'final': ['#final']})
+            # Only generate the word lengths between the minimum and the maximum number of syllables
+            for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
+                wordPattern = []
+                for position in range(1, nbSyllables + 1):
+                    stressedHere = RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
+                    # add syllable separator
+                    if stressedHere:
+                        wordPattern.append(stressId)
+                    elif position > 1:
+                        wordPattern.append(syllableBreakId)
+                    # add syllable
+                    if position == 1:
+                        syllableRule = 'initial'
+                    elif position == nbSyllables:
+                        syllableRule = 'final'
+                    else:
+                        syllableRule = 'middle'
+                    if stressedHere:
+                        syllableRule += '-stressed'
+                    wordPattern.append(syllableRule)
+                self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+        # Step 2: Generate the syllable rules
+        phonemeRules = []
+        phonemeRulesToTag = {}
+        for syllable in syllableRules:
+            self.rules[syllable] = Distribution()
+            onset = syllable + '-onset'
+            nucleus = syllable + '-nucleus'
+            coda = syllable + '-coda'
+            phonemeRules += [onset, nucleus, coda]
+            ruleTags = syllableRulesToTags[syllable]
+            phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
+            # For the generated rules, only initial and single syllables may lack an onset
+            patterns = [(onset, nucleus), (onset, nucleus, coda)]
+            if ('#initial' in ruleTags) or ('#single' in ruleTags):
+                patterns = [(nucleus,), (nucleus, coda)] + patterns
+            for pattern in patterns:
+                self.rules[syllable].addTo(pattern, RuleGenerator.randomOccurences(distributionMean, distributionRange))
+        # Step 3: Generate the phoneme distributions for each phoneme rule
+        for rule in phonemeRules:
+            self.rules[rule] = Distribution()
+            for phoneme in phonology.getPhonemesFromTags(phonemeRulesToTag[rule]):
+                self.rules[rule].addTo((phoneme,), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+        # Step 4: Clean the rules
+        self.cleanRules()
+
 
 generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
 def makeGenerator(struct):
@@ -470,7 +679,7 @@ class PhonagenFile:
             json.dump(outputStruct, sys.stdout, ensure_ascii=False)
         else:
             with open(file, 'w', encoding='utf-8') as outputFile:
-                json.dump(outputStruct, outputFile, ensure_ascii=False, indent=2)
+                json.dump(outputStruct, outputFile, ensure_ascii=False)
 
     def mergeFrom(self, otherFile):
         """Add all phonologies and generators from the other file into this one."""