Add an algorithm to build a rule generator from a phonology, without examples.

This commit is contained in:
Feufochmar 2018-06-14 00:19:27 +02:00
parent 4def536673
commit 3e4485e1b9
1 changed files with 205 additions and 5 deletions

View File

@ -120,7 +120,7 @@ class Phonology:
return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
def isOnset(self, id):
"""Check if an id corresponds to an onset, either from description, or if not available, guessed from the phonemic transcription"""
"""Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
entry = self.entries[id]
description = entry['description']
result = ('#onset' in description) or ('#consonant' in description)
@ -129,7 +129,7 @@ class Phonology:
return result
def isNucleus(self, id):
"""Check if an id corresponds to a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
"""Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
entry = self.entries[id]
description = entry['description']
result = ('#nucleus' in description) or ('#vowel' in description)
@ -138,7 +138,7 @@ class Phonology:
return result
def isCoda(self, id):
"""Check if an id corresponds to a coda, either from description, or if not available, guessed from the phonemic transcription"""
"""Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
entry = self.entries[id]
description = entry['description']
result = ('#coda' in description) or ('#consonant' in description)
@ -146,6 +146,77 @@ class Phonology:
result = Phonology.isConsonant(entry['phoneme'])
return result
def isInSingleSyllables(self, id):
    """Tell whether the phoneme `id` may appear in a single-syllable word, judging from its description.

    The phoneme qualifies when its description carries '#single', '#initial' or '#final'.
    Without any of those tags, it still qualifies unless the description carries '#middle'.
    """
    description = self.entries[id]['description']
    if any(tag in description for tag in ('#single', '#initial', '#final')):
        return True
    # No single/initial/final tag: only an explicit '#middle' restriction excludes the phoneme
    return '#middle' not in description
def isInInitialSyllables(self, id):
    """Tell whether the phoneme `id` may appear in the first syllable of a word, judging from its description.

    An explicit '#initial' tag always allows it. Otherwise the phoneme is allowed only when
    the description carries none of the other position tags ('#single', '#middle', '#final').
    """
    description = self.entries[id]['description']
    if '#initial' in description:
        return True
    otherPositions = ('#single', '#middle', '#final')
    return not any(tag in description for tag in otherPositions)
def isInMiddleSyllables(self, id):
    """Tell whether the phoneme `id` may appear in a middle syllable of a word, judging from its description.

    An explicit '#middle' tag always allows it. Otherwise the phoneme is allowed only when
    the description carries none of the other position tags ('#single', '#initial', '#final').
    """
    description = self.entries[id]['description']
    explicitlyMiddle = '#middle' in description
    restrictedElsewhere = ('#single' in description) or ('#initial' in description) or ('#final' in description)
    return explicitlyMiddle or (not restrictedElsewhere)
def isInFinalSyllables(self, id):
    """Tell whether the phoneme `id` may appear in the last syllable of a word, judging from its description.

    An explicit '#final' tag always allows it. Otherwise the phoneme is allowed only when
    the description carries none of the other position tags ('#single', '#initial', '#middle').
    """
    description = self.entries[id]['description']
    if '#final' in description:
        return True
    # Any other position tag restricts the phoneme to that position
    for otherTag in ('#single', '#initial', '#middle'):
        if otherTag in description:
            return False
    return True
def isInStressedSyllables(self, id):
    """Tell whether the phoneme `id` may appear in a stressed syllable, judging from its description.

    The phoneme is rejected only when the description carries '#unstressed' without '#stressed'.
    """
    description = self.entries[id]['description']
    if '#unstressed' in description:
        # Restricted to unstressed syllables, unless '#stressed' is given as well
        return '#stressed' in description
    return True
def isInUnstressedSyllables(self, id):
    """Tell whether the phoneme `id` may appear in an unstressed syllable, judging from its description.

    The phoneme is rejected only when the description carries '#stressed' without '#unstressed'.
    """
    description = self.entries[id]['description']
    if '#stressed' in description:
        # Restricted to stressed syllables, unless '#unstressed' is given as well
        return '#unstressed' in description
    return True
def getPhonemesFromTags(self, tags):
    """Return the list of phoneme ids that satisfy every tag of `tags`.

    - tags is an iterable of tag names among '#onset', '#nucleus', '#coda', '#single',
      '#initial', '#middle', '#final', '#stressed' and '#unstressed'.
      Each tag is checked with the matching predicate of this object (isOnset, isNucleus, ...).
    The ids returned by getStress() and getSyllableBreak() are never part of the result:
    they are markers, not phonemes.
    Ids are returned in the iteration order of self.entries.
    Raises KeyError when a tag is not one of the names listed above.
    """
    tagToPredicate = {
        '#onset': self.isOnset,
        '#nucleus': self.isNucleus,
        '#coda': self.isCoda,
        '#single': self.isInSingleSyllables,
        '#initial': self.isInInitialSyllables,
        '#middle': self.isInMiddleSyllables,
        '#final': self.isInFinalSyllables,
        '#stressed': self.isInStressedSyllables,
        '#unstressed': self.isInUnstressedSyllables
    }
    # Resolve the predicates once instead of once per entry
    predicates = [tagToPredicate[tag] for tag in tags]
    markers = (self.getStress(), self.getSyllableBreak())
    phonemeList = []
    for phonemeId in self.entries:
        # Skip stress and syllable break. The previous code used 'pass' here,
        # which skipped nothing and let the markers reach the result.
        if phonemeId in markers:
            continue
        if all(check(phonemeId) for check in predicates):
            phonemeList.append(phonemeId)
    return phonemeList
class Distribution:
"""Discrete distribution"""
def __init__(self):
@ -389,6 +460,10 @@ class RuleGenerator(Generator):
# Replace the old rules with the new rules
self.rules[syllableRule] = newDist
def cleanRules(self):
    """Drop every rule whose distribution reports itself as empty."""
    keptRules = {}
    for name, distribution in self.rules.items():
        if distribution.isEmpty():
            continue
        keptRules[name] = distribution
    self.rules = keptRules
def fromExamples(self, file, phonology):
"""Train a rule generator on an example file"""
stressId = phonology.getStress()
@ -417,7 +492,132 @@ class RuleGenerator(Generator):
for x in syllableRules:
self.splitSyllableRule(x, phonology)
# Step 3: remove the empty rules
self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
self.cleanRules()
def randomOccurences(mean, range):
    """Generate a random integer in the range [mean-range, mean+range], both bounds included.

    - mean is the central value of the interval
    - range is the maximum absolute difference between the result and mean
    """
    # random.randint already includes its upper bound: adding 1 to it would
    # allow a result that differs from the mean by more than the range.
    return random.randint(mean - range, mean + range)
def isStressPosition(position, numberSyllables, stressPosition):
    """Tell whether `position` is the syllable carrying the stress. Positions go from 1 to numberSyllables included.

    - A positive stressPosition counts from the first syllable; when it points past the end
      of the word, the last syllable carries the stress.
    - A negative stressPosition counts from the last syllable (-1 being the last one); when it
      points before the beginning of the word, the first syllable carries the stress.
    - A stressPosition of zero means no syllable is stressed.
    """
    if stressPosition == 0:
        return False
    if stressPosition > 0:
        # Clamp to the last syllable of the word
        stressedSyllable = min(stressPosition, numberSyllables)
    else:
        # Convert to an index from the beginning, clamped to the first syllable
        stressedSyllable = max(numberSyllables + 1 + stressPosition, 1)
    return position == stressedSyllable
def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
    """
    Generate a rule-based generator just from a phonology and some parameters.
    - phonology is the phonology to use. Its id, stress id, syllable break id and tagged phonemes
      (through getPhonemesFromTags) are read.
    - minNumberSyllables must be strictly positive.
    - maxNumberSyllables must be greater than or equal to minNumberSyllables.
      Words of every length from minNumberSyllables to maxNumberSyllables (included) are generated.
    - stressPosition indicates on which syllable the stress occurs.
      Positive index count from the beginning to the end (with the first syllable being at index 1).
      Negative index count from the end to the beginning (with the last syllable being at index -1)
      Set this to zero if no stress should be generated.
    - distributionMean indicates the medium value for the occurrences of a phoneme
    - distributionRange indicates the maximum absolute difference from distributionMean for the occurrences of phonemes and syllables.
      It must be positive or null, and strictly lower than distributionMean.
    Raises ValueError (a kind of Exception) when a parameter is out of bounds. In that case, the generator is left untouched.
    """
    # Check the parameters first, so that a rejected call does not wipe the existing rules
    if minNumberSyllables < 1:
        raise ValueError("Minimum number of syllables must be strictly positive. Given", minNumberSyllables)
    if maxNumberSyllables < minNumberSyllables:
        raise ValueError("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
    if maxNumberSyllables < abs(stressPosition):
        raise ValueError("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
    if distributionMean < 1:
        raise ValueError("Distribution mean must be strictly positive. Given", distributionMean)
    if distributionRange < 0:
        raise ValueError("Distribution range must be positive or nul. Given", distributionRange)
    # A range equal to the mean would allow zero occurrences, hence the strict comparison
    if distributionMean <= distributionRange:
        raise ValueError("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)

    def occurrences():
        # Number of occurrences to give to the next item added to a distribution
        return RuleGenerator.randomOccurences(distributionMean, distributionRange)

    # Reinitialize
    self.phonology = phonology.id
    self.rules = {}
    # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    isStressed = stressPosition != 0
    # Add the 'word' rule, initialized with an empty distribution
    wordRule = Distribution()
    self.rules['word'] = wordRule
    # Add the syllable rules and word patterns
    syllableRules = []
    syllableRulesToTags = {}
    if minNumberSyllables == 1:
        syllableRules.append('single')
        if isStressed:
            # The only syllable of a word carries the stress
            syllableRulesToTags['single'] = ['#single', '#stressed']
            wordPattern = (stressId, 'single')
        else:
            syllableRulesToTags['single'] = ['#single']
            wordPattern = ('single',)
        wordRule.addTo(wordPattern, occurrences())
    if maxNumberSyllables > 1:
        positions = ('initial', 'middle', 'final')
        syllableRules.extend(positions)
        if isStressed:
            syllableRules.extend(name + '-stressed' for name in positions)
        for name in positions:
            if isStressed:
                # With stress, the plain rules describe the unstressed syllables
                syllableRulesToTags[name] = ['#' + name, '#unstressed']
                syllableRulesToTags[name + '-stressed'] = ['#' + name, '#stressed']
            else:
                syllableRulesToTags[name] = ['#' + name]
        # One word pattern per word length. Start from the requested minimum:
        # shorter words must not be generated.
        for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
            wordPattern = []
            for position in range(1, nbSyllables + 1):
                stressedHere = RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
                # add syllable separator: the stress mark replaces the syllable break
                if stressedHere:
                    wordPattern.append(stressId)
                elif position > 1:
                    wordPattern.append(syllableBreakId)
                # add syllable
                if position == 1:
                    syllable = 'initial'
                elif position == nbSyllables:
                    syllable = 'final'
                else:
                    syllable = 'middle'
                if stressedHere:
                    syllable = syllable + '-stressed'
                wordPattern.append(syllable)
            wordRule.addTo(tuple(wordPattern), occurrences())
    # Step 2: Generate the syllable rules
    phonemeRulesToTag = {}
    for syllable in syllableRules:
        syllableRule = Distribution()
        self.rules[syllable] = syllableRule
        onset = syllable + '-onset'
        nucleus = syllable + '-nucleus'
        coda = syllable + '-coda'
        ruleTags = syllableRulesToTags[syllable]
        phonemeRulesToTag[onset] = ruleTags + ['#onset']
        phonemeRulesToTag[nucleus] = ruleTags + ['#nucleus']
        phonemeRulesToTag[coda] = ruleTags + ['#coda']
        # Fill the syllable rules
        # For the generated rules, initial and single syllables may not have onset
        if ('#initial' in ruleTags) or ('#single' in ruleTags):
            syllableRule.addTo((nucleus,), occurrences())
            syllableRule.addTo((nucleus, coda), occurrences())
        syllableRule.addTo((onset, nucleus), occurrences())
        syllableRule.addTo((onset, nucleus, coda), occurrences())
    # Step 3: Generate the phoneme distributions for each phoneme rule
    for rule, tags in phonemeRulesToTag.items():
        phonemeRule = Distribution()
        self.rules[rule] = phonemeRule
        for phoneme in phonology.getPhonemesFromTags(tags):
            phonemeRule.addTo((phoneme,), occurrences())
    # Step 4: Clean the rules
    self.cleanRules()
# Lookup table from a generator type name ('chains' or 'rules') to the generator class.
# NOTE(review): presumably read by makeGenerator to pick the class to instantiate — confirm.
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
def makeGenerator(struct):
@ -470,7 +670,7 @@ class PhonagenFile:
json.dump(outputStruct, sys.stdout, ensure_ascii=False)
else:
with open(file, 'w', encoding='utf-8') as outputFile:
json.dump(outputStruct, outputFile, ensure_ascii=False, indent=2)
json.dump(outputStruct, outputFile, ensure_ascii=False)
def mergeFrom(self, otherFile):
"""Add all phonologies and generators from the other file into this one."""