Add an algorithm to build a rule generator from a phonology, without examples.
This commit is contained in:
parent
4def536673
commit
3e4485e1b9
|
@ -120,7 +120,7 @@ class Phonology:
|
||||||
return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
|
return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
|
||||||
|
|
||||||
def isOnset(self, id):
|
def isOnset(self, id):
|
||||||
"""Check if an id corresponds to an onset, either from description, or if not available, guessed from the phonemic transcription"""
|
"""Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
|
||||||
entry = self.entries[id]
|
entry = self.entries[id]
|
||||||
description = entry['description']
|
description = entry['description']
|
||||||
result = ('#onset' in description) or ('#consonant' in description)
|
result = ('#onset' in description) or ('#consonant' in description)
|
||||||
|
@ -129,7 +129,7 @@ class Phonology:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def isNucleus(self, id):
|
def isNucleus(self, id):
|
||||||
"""Check if an id corresponds to a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
|
"""Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
|
||||||
entry = self.entries[id]
|
entry = self.entries[id]
|
||||||
description = entry['description']
|
description = entry['description']
|
||||||
result = ('#nucleus' in description) or ('#vowel' in description)
|
result = ('#nucleus' in description) or ('#vowel' in description)
|
||||||
|
@ -138,7 +138,7 @@ class Phonology:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def isCoda(self, id):
|
def isCoda(self, id):
|
||||||
"""Check if an id corresponds to a coda, either from description, or if not available, guessed from the phonemic transcription"""
|
"""Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
|
||||||
entry = self.entries[id]
|
entry = self.entries[id]
|
||||||
description = entry['description']
|
description = entry['description']
|
||||||
result = ('#coda' in description) or ('#consonant' in description)
|
result = ('#coda' in description) or ('#consonant' in description)
|
||||||
|
@ -146,6 +146,77 @@ class Phonology:
|
||||||
result = Phonology.isConsonant(entry['phoneme'])
|
result = Phonology.isConsonant(entry['phoneme'])
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def isInSingleSyllables(self, id):
    """Tell whether the phoneme with this id may appear in a single-syllable word, based on its description tags"""
    description = self.entries[id]['description']
    # Any of these positional tags explicitly allows the phoneme in a one-syllable word
    if any(tag in description for tag in ('#single', '#initial', '#final')):
        return True
    # Otherwise the phoneme is only excluded when it is tagged as middle-only
    return '#middle' not in description
|
||||||
|
|
||||||
|
def isInInitialSyllables(self, id):
    """Tell whether the phoneme with this id may appear in the first syllable of a word, based on its description tags"""
    description = self.entries[id]['description']
    if '#initial' in description:
        return True
    # Without any positional tag the phoneme is unrestricted; any other positional tag excludes it
    otherPositions = ('#single', '#middle', '#final')
    return not any(tag in description for tag in otherPositions)
|
||||||
|
|
||||||
|
def isInMiddleSyllables(self, id):
    """Tell whether the phoneme with this id may appear in a middle syllable of a word, based on its description tags"""
    description = self.entries[id]['description']
    if '#middle' in description:
        return True
    # Without any positional tag the phoneme is unrestricted; any other positional tag excludes it
    otherPositions = ('#single', '#initial', '#final')
    return not any(tag in description for tag in otherPositions)
|
||||||
|
|
||||||
|
def isInFinalSyllables(self, id):
    """Tell whether the phoneme with this id may appear in the last syllable of a word, based on its description tags"""
    description = self.entries[id]['description']
    if '#final' in description:
        return True
    # Without any positional tag the phoneme is unrestricted; any other positional tag excludes it
    otherPositions = ('#single', '#initial', '#middle')
    return not any(tag in description for tag in otherPositions)
|
||||||
|
|
||||||
|
def isInStressedSyllables(self, id):
    """Tell whether the phoneme with this id may appear in a stressed syllable, based on its description tags"""
    description = self.entries[id]['description']
    # An '#unstressed' tag restricts the phoneme unless '#stressed' is given as well
    if '#unstressed' in description:
        return '#stressed' in description
    return True
|
||||||
|
|
||||||
|
def isInUnstressedSyllables(self, id):
    """Tell whether the phoneme with this id may appear in an unstressed syllable, based on its description tags"""
    description = self.entries[id]['description']
    # A '#stressed' tag restricts the phoneme unless '#unstressed' is given as well
    if '#stressed' in description:
        return '#unstressed' in description
    return True
|
||||||
|
|
||||||
|
def getPhonemesFromTags(self, tags):
    """
    Return the list of phoneme ids that satisfy every tag of the tag list.

    - tags is a list of tags among '#onset', '#nucleus', '#coda', '#single', '#initial',
      '#middle', '#final', '#stressed' and '#unstressed'. Any other tag raises a KeyError.
    The stress and syllable break entries are never returned.
    """
    tagToPredicate = {
        '#onset': Phonology.isOnset,
        '#nucleus': Phonology.isNucleus,
        '#coda': Phonology.isCoda,
        '#single': Phonology.isInSingleSyllables,
        '#initial': Phonology.isInInitialSyllables,
        '#middle': Phonology.isInMiddleSyllables,
        '#final': Phonology.isInFinalSyllables,
        '#stressed': Phonology.isInStressedSyllables,
        '#unstressed': Phonology.isInUnstressedSyllables
    }
    # Resolve the predicates once: the tag list does not change while iterating over the entries
    predicates = [tagToPredicate[t] for t in tags]
    # The separators are entries of the phonology but not phonemes
    separators = (self.getStress(), self.getSyllableBreak())
    phonemeList = []
    for id in self.entries:
        # skip stress and syllable break ('continue' is required here: 'pass' would fall through and test them like phonemes)
        if id in separators:
            continue
        if all(predicate(self, id) for predicate in predicates):
            phonemeList.append(id)
    return phonemeList
|
||||||
|
|
||||||
class Distribution:
|
class Distribution:
|
||||||
"""Discrete distribution"""
|
"""Discrete distribution"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -389,6 +460,10 @@ class RuleGenerator(Generator):
|
||||||
# Replace the old rules with the new rules
|
# Replace the old rules with the new rules
|
||||||
self.rules[syllableRule] = newDist
|
self.rules[syllableRule] = newDist
|
||||||
|
|
||||||
|
def cleanRules(self):
    """Drop every rule whose distribution is empty"""
    kept = {}
    for name, distribution in self.rules.items():
        if distribution.isEmpty():
            continue
        kept[name] = distribution
    self.rules = kept
|
||||||
|
|
||||||
def fromExamples(self, file, phonology):
|
def fromExamples(self, file, phonology):
|
||||||
"""Train a rule generator on an example file"""
|
"""Train a rule generator on an example file"""
|
||||||
stressId = phonology.getStress()
|
stressId = phonology.getStress()
|
||||||
|
@ -417,7 +492,132 @@ class RuleGenerator(Generator):
|
||||||
for x in syllableRules:
|
for x in syllableRules:
|
||||||
self.splitSyllableRule(x, phonology)
|
self.splitSyllableRule(x, phonology)
|
||||||
# Step 3: remove the empty rules
|
# Step 3: remove the empty rules
|
||||||
self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
|
self.cleanRules()
|
||||||
|
|
||||||
|
def randomOccurences(mean, range):
    """
    Generate a random number of occurrences in the range [mean-range, mean+range], both bounds included.

    This function takes no 'self': call it through the class, as RuleGenerator.randomOccurences(mean, range).
    The 'range' parameter hides the builtin of the same name inside this function only.
    """
    # random.randint already includes its upper bound, so adding 1 would exceed the requested range
    return random.randint(mean - range, mean + range)
|
||||||
|
|
||||||
|
def isStressPosition(position, numberSyllables, stressPosition):
    """
    Tell whether a syllable position carries the stress. The position goes from 1 to numberSyllables included.
    A positive stressPosition counts from the first syllable, a negative one from the last syllable.
    A stressPosition of zero never matches.
    """
    fitsInWord = abs(stressPosition) <= numberSyllables
    if (stressPosition > 0) and fitsInWord:
        return position == stressPosition
    if (stressPosition < 0) and fitsInWord:
        # Convert the index counted from the end into an index counted from the beginning
        return position == (numberSyllables + 1 + stressPosition)
    # The word is too short for a positive index: the stress falls back on the last syllable
    if (position == numberSyllables) and (stressPosition > numberSyllables):
        return True
    # The word is too short for a negative index: the stress falls back on the first syllable
    return (position == 1) and (stressPosition < 0) and (not fitsInWord)
|
||||||
|
|
||||||
|
def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
    """
    Generate a rule-based generator just from a phonology and some parameters.
    - minNumberSyllables must be strictly positive.
    - maxNumberSyllables must be greater than or equal to minNumberSyllables
    - stressPosition indicates on which syllable the stress occurs.
      Positive index count from the beginning to the end (with the first syllable being at index 1).
      Negative index count from the end to the beginning (with the last syllable being at index -1)
      Set this to zero if no stress should be generated.
    - distributionMean indicates the medium value for the occurrences of a phoneme
    - distributionRange indicates the maximum absolute difference from distributionMean for the occurrences of phonemes and syllables
    Raise a ValueError if a parameter is out of bounds. The previous rules are discarded even in that case.
    """
    # Reinitialize
    self.phonology = phonology.id
    self.rules = {}
    # Check the parameters
    if minNumberSyllables < 1:
        raise ValueError("Minimum number of syllables must be strictly positive. Given", minNumberSyllables)
    if maxNumberSyllables < minNumberSyllables:
        raise ValueError("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
    if maxNumberSyllables < abs(stressPosition):
        raise ValueError("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
    if distributionMean < 1:
        raise ValueError("Distribution mean must be strictly positive. Given", distributionMean)
    if distributionRange < 0:
        raise ValueError("Distribution range must be positive or nul. Given", distributionRange)
    # The mean must be strictly higher, as the message states: with mean == range a rule could get zero occurrences
    if distributionMean <= distributionRange:
        raise ValueError("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)

    # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    isStressed = stressPosition != 0
    # Add the 'word' rule, initialized with an empty distribution
    self.rules['word'] = Distribution()
    # Add the syllable rules and word patterns
    syllableRules = []
    syllableRulesToTags = {}
    if minNumberSyllables == 1:
        syllableRules.append('single')
        singleTags = ['#single']
        wordPattern = []
        if isStressed:
            # The only syllable of a word necessarily carries the stress
            singleTags.append('#stressed')
            wordPattern.append(stressId)
        syllableRulesToTags['single'] = singleTags
        wordPattern.append('single')
        self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    if maxNumberSyllables > 1:
        positionNames = ('initial', 'middle', 'final')
        syllableRules.extend(positionNames)
        for name in positionNames:
            # When a stress is generated, the plain rules describe the unstressed syllables
            syllableRulesToTags[name] = ['#' + name, '#unstressed'] if isStressed else ['#' + name]
        if isStressed:
            for name in positionNames:
                syllableRules.append(name + '-stressed')
                syllableRulesToTags[name + '-stressed'] = ['#' + name, '#stressed']
        # One word pattern per number of syllables, starting at the requested minimum (and at least 2 here,
        # since the one-syllable word is handled by the 'single' rule)
        for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
            wordPattern = []
            for position in range(1, nbSyllables + 1):
                stressedHere = RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
                # add syllable separator: the stress mark replaces the syllable break
                if stressedHere:
                    wordPattern.append(stressId)
                elif position > 1:
                    wordPattern.append(syllableBreakId)
                # add syllable
                if position == 1:
                    name = 'initial'
                elif position == nbSyllables:
                    name = 'final'
                else:
                    name = 'middle'
                wordPattern.append(name + '-stressed' if stressedHere else name)
            self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))

    # Step 2: Generate the syllable rules
    # Add the rules in the distributions
    phonemeRules = []
    phonemeRulesToTag = {}
    for syllable in syllableRules:
        self.rules[syllable] = Distribution()
        onset = syllable + '-onset'
        nucleus = syllable + '-nucleus'
        coda = syllable + '-coda'
        phonemeRules.extend([onset, nucleus, coda])
        ruleTags = syllableRulesToTags[syllable]
        phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
        # Fill the syllable rules
        # For the generated rules, initial and single syllables may not have onset
        if ('#initial' in ruleTags) or ('#single' in ruleTags):
            self.rules[syllable].addTo((nucleus,), RuleGenerator.randomOccurences(distributionMean, distributionRange))
            self.rules[syllable].addTo((nucleus, coda), RuleGenerator.randomOccurences(distributionMean, distributionRange))
        self.rules[syllable].addTo((onset, nucleus), RuleGenerator.randomOccurences(distributionMean, distributionRange))
        self.rules[syllable].addTo((onset, nucleus, coda), RuleGenerator.randomOccurences(distributionMean, distributionRange))

    # Step 3: Generate the phoneme distributions for each phoneme rule
    for rule in phonemeRules:
        self.rules[rule] = Distribution()
        for phoneme in phonology.getPhonemesFromTags(phonemeRulesToTag[rule]):
            self.rules[rule].addTo((phoneme,), RuleGenerator.randomOccurences(distributionMean, distributionRange))

    # Step 4: Clean the rules
    self.cleanRules()
|
||||||
|
|
||||||
|
|
||||||
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
|
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
|
||||||
def makeGenerator(struct):
|
def makeGenerator(struct):
|
||||||
|
@ -470,7 +670,7 @@ class PhonagenFile:
|
||||||
json.dump(outputStruct, sys.stdout, ensure_ascii=False)
|
json.dump(outputStruct, sys.stdout, ensure_ascii=False)
|
||||||
else:
|
else:
|
||||||
with open(file, 'w', encoding='utf-8') as outputFile:
|
with open(file, 'w', encoding='utf-8') as outputFile:
|
||||||
json.dump(outputStruct, outputFile, ensure_ascii=False, indent=2)
|
json.dump(outputStruct, outputFile, ensure_ascii=False)
|
||||||
|
|
||||||
def mergeFrom(self, otherFile):
|
def mergeFrom(self, otherFile):
|
||||||
"""Add all phonologies and generators from the other file into this one."""
|
"""Add all phonologies and generators from the other file into this one."""
|
||||||
|
|
Loading…
Reference in New Issue