Add an algorithm to build a rule generator from a phonology, without examples.
This commit is contained in:
parent
4def536673
commit
3e4485e1b9
|
@ -120,7 +120,7 @@ class Phonology:
|
|||
return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
|
||||
|
||||
def isOnset(self, id):
|
||||
"""Check if an id corresponds to an onset, either from description, or if not available, guessed from the phonemic transcription"""
|
||||
"""Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
|
||||
entry = self.entries[id]
|
||||
description = entry['description']
|
||||
result = ('#onset' in description) or ('#consonant' in description)
|
||||
|
@ -129,7 +129,7 @@ class Phonology:
|
|||
return result
|
||||
|
||||
def isNucleus(self, id):
|
||||
"""Check if an id corresponds to a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
|
||||
"""Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
|
||||
entry = self.entries[id]
|
||||
description = entry['description']
|
||||
result = ('#nucleus' in description) or ('#vowel' in description)
|
||||
|
@ -138,7 +138,7 @@ class Phonology:
|
|||
return result
|
||||
|
||||
def isCoda(self, id):
|
||||
"""Check if an id corresponds to a coda, either from description, or if not available, guessed from the phonemic transcription"""
|
||||
"""Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
|
||||
entry = self.entries[id]
|
||||
description = entry['description']
|
||||
result = ('#coda' in description) or ('#consonant' in description)
|
||||
|
@ -146,6 +146,77 @@ class Phonology:
|
|||
result = Phonology.isConsonant(entry['phoneme'])
|
||||
return result
|
||||
|
||||
def isInSingleSyllables(self, id):
    """Tell whether the phoneme `id` may appear in a single-syllable word.

    The decision is taken from the tags found in the entry's description:
    a phoneme is accepted when it carries '#single', '#initial' or '#final',
    and also when it carries no '#middle' tag at all.
    """
    tags = self.entries[id]['description']
    # Only a description restricted to '#middle' can exclude the phoneme
    if '#middle' not in tags:
        return True
    return ('#single' in tags) or ('#initial' in tags) or ('#final' in tags)
|
||||
|
||||
def isInInitialSyllables(self, id):
    """Tell whether the phoneme `id` may appear in the first syllable of a word.

    Based on the description tags only: '#initial' accepts the phoneme; when
    it is absent, the phoneme is still accepted as long as none of the other
    position tags ('#single', '#middle', '#final') is present.
    """
    tags = self.entries[id]['description']
    if '#initial' in tags:
        return True
    # Any other position tag, without '#initial', rules the phoneme out
    otherPositions = ('#single', '#middle', '#final')
    return not any(tag in tags for tag in otherPositions)
|
||||
|
||||
def isInMiddleSyllables(self, id):
    """Tell whether the phoneme `id` may appear in a middle syllable of a word.

    Based on the description tags only: '#middle' accepts the phoneme; when
    it is absent, the phoneme is still accepted as long as none of the other
    position tags ('#single', '#initial', '#final') is present.
    """
    tags = self.entries[id]['description']
    if '#middle' in tags:
        return True
    # Any other position tag, without '#middle', rules the phoneme out
    otherPositions = ('#single', '#initial', '#final')
    return not any(tag in tags for tag in otherPositions)
|
||||
|
||||
def isInFinalSyllables(self, id):
    """Tell whether the phoneme `id` may appear in the last syllable of a word.

    Based on the description tags only: '#final' accepts the phoneme; when
    it is absent, the phoneme is still accepted as long as none of the other
    position tags ('#single', '#initial', '#middle') is present.
    """
    tags = self.entries[id]['description']
    if '#final' in tags:
        return True
    # Any other position tag, without '#final', rules the phoneme out
    otherPositions = ('#single', '#initial', '#middle')
    return not any(tag in tags for tag in otherPositions)
|
||||
|
||||
def isInStressedSyllables(self, id):
    """Tell whether the phoneme `id` may appear in a stressed syllable.

    Based on the description tags only: the phoneme is rejected only when
    it is tagged '#unstressed' without also being tagged '#stressed'.
    """
    tags = self.entries[id]['description']
    if '#stressed' in tags:
        return True
    return '#unstressed' not in tags
|
||||
|
||||
def isInUnstressedSyllables(self, id):
    """Tell whether the phoneme `id` may appear in an unstressed syllable.

    Based on the description tags only: the phoneme is rejected only when
    it is tagged '#stressed' without also being tagged '#unstressed'.
    """
    tags = self.entries[id]['description']
    if '#unstressed' in tags:
        return True
    return '#stressed' not in tags
|
||||
|
||||
def getPhonemesFromTags(self, tags):
    """Return the list of phoneme ids that satisfy every tag of `tags`.

    Supported tags are '#onset', '#nucleus', '#coda', '#single', '#initial',
    '#middle', '#final', '#stressed' and '#unstressed'; each one is checked
    with the matching predicate of this object. The ids are returned in the
    iteration order of self.entries. The stress and syllable break entries
    are never returned.

    Raises KeyError if a tag is not one of the supported tags.
    """
    tagToPredicate = {
        '#onset': self.isOnset,
        '#nucleus': self.isNucleus,
        '#coda': self.isCoda,
        '#single': self.isInSingleSyllables,
        '#initial': self.isInInitialSyllables,
        '#middle': self.isInMiddleSyllables,
        '#final': self.isInFinalSyllables,
        '#stressed': self.isInStressedSyllables,
        '#unstressed': self.isInUnstressedSyllables
    }
    # Resolve the predicates once, so that an unknown tag fails immediately
    predicates = [tagToPredicate[tag] for tag in tags]
    # The stress and the syllable break are stored as entries but must be skipped.
    # (The previous implementation used `pass` here, which skipped nothing.)
    stressId = self.getStress()
    syllableBreakId = self.getSyllableBreak()
    phonemeList = []
    for phonemeId in self.entries:
        if (phonemeId == stressId) or (phonemeId == syllableBreakId):
            continue
        if all(check(phonemeId) for check in predicates):
            phonemeList.append(phonemeId)
    return phonemeList
|
||||
|
||||
class Distribution:
|
||||
"""Discrete distribution"""
|
||||
def __init__(self):
|
||||
|
@ -389,6 +460,10 @@ class RuleGenerator(Generator):
|
|||
# Replace the old rules with the new rules
|
||||
self.rules[syllableRule] = newDist
|
||||
|
||||
def cleanRules(self):
    """Drop every rule whose distribution reports itself as empty."""
    keptRules = {}
    for name, distribution in self.rules.items():
        if distribution.isEmpty():
            continue
        keptRules[name] = distribution
    self.rules = keptRules
|
||||
|
||||
def fromExamples(self, file, phonology):
|
||||
"""Train a rule generator on an example file"""
|
||||
stressId = phonology.getStress()
|
||||
|
@ -417,7 +492,132 @@ class RuleGenerator(Generator):
|
|||
for x in syllableRules:
|
||||
self.splitSyllableRule(x, phonology)
|
||||
# Step 3: remove the empty rules
|
||||
self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
|
||||
self.cleanRules()
|
||||
|
||||
def randomOccurences(mean, range):
    """Generate a random integer in the closed interval [mean-range, mean+range].

    random.randint includes both of its bounds, so the upper bound is
    mean+range: the result never differs from `mean` by more than `range`.
    (The previous upper bound, mean+range+1, could exceed that distance.)

    This function takes no `self`; in this file it is called through the
    class, as RuleGenerator.randomOccurences(mean, range).
    """
    return random.randint(mean - range, mean + range)
|
||||
|
||||
def isStressPosition(position, numberSyllables, stressPosition):
    """Check if `position` is the syllable that carries the stress.

    `position` goes from 1 to numberSyllables included. A positive
    stressPosition counts from the first syllable, a negative one from the
    last syllable (-1 being the last). A stressPosition that falls outside
    of the word is clamped: past the end it designates the last syllable,
    before the beginning it designates the first one. A stressPosition of
    zero never matches.
    """
    if stressPosition == 0:
        return False
    if stressPosition > 0:
        # Counting from the start; too far means the last syllable
        target = min(stressPosition, numberSyllables)
    else:
        # Counting from the end; too far means the first syllable
        target = max(numberSyllables + 1 + stressPosition, 1)
    return position == target
|
||||
|
||||
def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
    """
    Generate a rule-based generator just from a phonology and some parameters.
    - minNumberSyllables must be strictly positive.
    - maxNumberSyllables must be greater than or equal to minNumberSyllables
    - stressPosition indicates on which syllable the stress occurs.
      Positive index count from the beginning to the end (with the first syllable being at index 1).
      Negative index count from the end to the beginning (with the last syllable being at index -1)
      Set this to zero if no stress should be generated.
    - distributionMean indicates the medium value for the occurrences of a phoneme
    - distributionRange indicates the maximum absolute difference from distributionMean for the occurrences of phonemes and syllables

    The generated rules are:
    - 'word': one pattern per number of syllables between minNumberSyllables and maxNumberSyllables
    - one rule per kind of syllable ('single', 'initial', 'middle', 'final', and their '-stressed' variants when a stress is generated)
    - three phoneme rules per syllable rule ('-onset', '-nucleus', '-coda'), filled from phonology.getPhonemesFromTags

    Raise an Exception if a parameter is invalid. The parameters are checked
    before anything is modified, so the current rules survive a rejected call.
    """
    # Check the parameters
    if minNumberSyllables < 1:
        raise Exception("Minimum number of syllables must be strictly positive. Given", minNumberSyllables)
    if maxNumberSyllables < minNumberSyllables:
        raise Exception("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
    if maxNumberSyllables < abs(stressPosition):
        raise Exception("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
    if distributionMean < 1:
        raise Exception("Distribution mean must be strictly positive. Given", distributionMean)
    if distributionRange < 0:
        raise Exception("Distribution range must be positive or nul. Given", distributionRange)
    # Strict comparison, as the message states: an equal range would allow a number of occurrences of zero
    if distributionMean <= distributionRange:
        raise Exception("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
    # Reinitialize
    self.phonology = phonology.id
    self.rules = {}
    # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    isStressed = stressPosition != 0
    # Add the 'word' rule, initialized with an empty distribution
    self.rules['word'] = Distribution()
    # Add the syllable rules and word patterns
    syllableRules = []
    syllableRulesToTags = {}
    if minNumberSyllables == 1:
        syllableRules.append('single')
        syllableRulesToTags['single'] = ['#single']
        wordPattern = []
        if isStressed:
            # The only syllable of a word necessarily carries the stress
            syllableRulesToTags['single'].append('#stressed')
            wordPattern.append(stressId)
        wordPattern.append('single')
        self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    if maxNumberSyllables > 1:
        positionNames = ['initial', 'middle', 'final']
        # When a stress is generated, the plain syllable rules are the unstressed ones
        for name in positionNames:
            syllableRules.append(name)
            syllableRulesToTags[name] = ['#' + name, '#unstressed'] if isStressed else ['#' + name]
        if isStressed:
            for name in positionNames:
                syllableRules.append(name + '-stressed')
                syllableRulesToTags[name + '-stressed'] = ['#' + name, '#stressed']
        # One word pattern per number of syllables, never below minNumberSyllables
        for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
            wordPattern = []
            for position in range(1, nbSyllables + 1):
                isStressPosition = RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
                # add syllable separator: the stress mark takes the place of the syllable break
                if isStressPosition:
                    wordPattern.append(stressId)
                elif position > 1:
                    wordPattern.append(syllableBreakId)
                # add syllable
                if position == 1:
                    syllableName = 'initial'
                elif position == nbSyllables:
                    syllableName = 'final'
                else:
                    syllableName = 'middle'
                if isStressPosition:
                    syllableName = syllableName + '-stressed'
                wordPattern.append(syllableName)
            self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 2: Generate the syllable rules
    phonemeRules = []
    phonemeRulesToTag = {}
    for syllable in syllableRules:
        self.rules[syllable] = Distribution()
        onset = syllable + '-onset'
        nucleus = syllable + '-nucleus'
        coda = syllable + '-coda'
        phonemeRules.extend([onset, nucleus, coda])
        ruleTags = syllableRulesToTags[syllable]
        phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
        # Fill the syllable rules
        # For the generated rules, initial and single syllables may not have onset
        if ('#initial' in ruleTags) or ('#single' in ruleTags):
            self.rules[syllable].addTo((nucleus,), RuleGenerator.randomOccurences(distributionMean, distributionRange))
            self.rules[syllable].addTo((nucleus, coda), RuleGenerator.randomOccurences(distributionMean, distributionRange))
        self.rules[syllable].addTo((onset, nucleus), RuleGenerator.randomOccurences(distributionMean, distributionRange))
        self.rules[syllable].addTo((onset, nucleus, coda), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 3: Generate the phoneme distributions for each phoneme rule
    for rule in phonemeRules:
        self.rules[rule] = Distribution()
        for phoneme in phonology.getPhonemesFromTags(phonemeRulesToTag[rule]):
            self.rules[rule].addTo((phoneme,), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 4: Clean the rules
    self.cleanRules()
|
||||
|
||||
|
||||
# Lookup table from a generator type name to the class implementing that kind of generator
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
|
||||
def makeGenerator(struct):
|
||||
|
@ -470,7 +670,7 @@ class PhonagenFile:
|
|||
json.dump(outputStruct, sys.stdout, ensure_ascii=False)
|
||||
else:
|
||||
with open(file, 'w', encoding='utf-8') as outputFile:
|
||||
json.dump(outputStruct, outputFile, ensure_ascii=False, indent=2)
|
||||
json.dump(outputStruct, outputFile, ensure_ascii=False)
|
||||
|
||||
def mergeFrom(self, otherFile):
|
||||
"""Add all phonologies and generators from the other file into this one."""
|
||||
|
|
Loading…
Reference in New Issue