# Source file: phonagen/py-phonagen/phonagen.py
#
# Note: this file contains Unicode (IPA) characters inside string literals that
# can be confused with look-alike ASCII characters (for example the primary
# stress mark "ˈ" versus the apostrophe "'"). Take care not to replace them
# when editing.

"""Common functions and classes for phonagen tools"""
import json
import io
import sys
import csv
import random
import unicodedata
class Phonology:
    """A set of phoneme entries together with their transcriptions.

    Attributes:
        id: identifier of the phonology; an empty string marks it as not valid.
        description: free-text description.
        transcriptions: list of transcription names; each entry holds one
            string per transcription.
        mainTranscription: name of the main transcription.
        entries: dict mapping an entry id to the entry, a dict holding 'id',
            'description' and one value per transcription.

    The predicates of this class read '#...' tags from the entry description
    (for example '#onset', '#vowel', '#stressed') and, when no relevant tag is
    present, fall back on the 'phoneme' transcription.
    """

    # Base characters used to guess vowels/consonants when an entry is untagged
    vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
    consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"

    def __init__(self, id='', description='', mainTranscription=''):
        self.id = id
        self.description = description
        self.transcriptions = []
        self.mainTranscription = mainTranscription
        self.entries = {}  # id -> entry

    def isValid(self):
        """Return True when the phonology has a non-empty id"""
        return self.id != ''

    def has(self, id):
        """Return True when id is the id of an entry of this phonology"""
        return id in self.entries

    def toJsonStruct(self):
        """Convert a Phonology to a Json structure"""
        return {'id': self.id,
                'description': self.description,
                'transcriptions': self.transcriptions,
                'main-transcription': self.mainTranscription,
                'entries': list(self.entries.values())}

    def fromJsonStruct(self, struct):
        """Fill a Phonology from a Json structure"""
        self.id = struct['id']
        self.description = struct['description']
        self.transcriptions = struct['transcriptions']
        self.mainTranscription = struct['main-transcription']
        self.entries = {x['id']: x for x in struct['entries']}

    def fromCsv(self, file):
        """Fill a Phonology from a Csv file.

        The first row is the header. Every column which is not 'id' or
        'description' is a transcription; a 'phoneme' column and at least one
        other transcription are required. When the main transcription (resp.
        the id) was left empty, the first transcription other than 'phoneme'
        (resp. the main transcription) is used.
        Rows whose phoneme and main transcription are both empty are skipped.
        Missing cells are set to ''; cells beyond the header are ignored.
        Raises an Exception when the header is not usable or when the main
        transcription is not one of the transcriptions of the file.
        """
        # newline='' is what the csv module expects; utf-8 matches the JSON I/O of this module
        with open(file, encoding='utf-8', newline='') as csvfile:
            fileReader = csv.reader(csvfile)
            # get csv header
            header = next(fileReader)
            # get the transcriptions (header items not id or description)
            self.transcriptions = [x for x in header if x not in ['id', 'description']]
            # Check: self.transcriptions should contain 'phoneme'
            if 'phoneme' not in self.transcriptions:
                raise Exception('phoneme column not found in ', file)
            # Check: self.transcriptions should have at least two items
            if len(self.transcriptions) < 2:
                raise Exception('No transcription found outside phoneme in file ', file, 'Did you named it id or description ?')
            # get the first header item which is not one of those: id, description, phoneme
            guessedMainTranscription = next(x for x in header if x not in ['id', 'description', 'phoneme'])
            # If main-transcription was not given, use the guess as main-transcription
            if self.mainTranscription == '':
                self.mainTranscription = guessedMainTranscription
            # Check: self.mainTranscription should be in self.transcriptions
            if self.mainTranscription not in self.transcriptions:
                raise Exception('main-transcription', self.mainTranscription, 'not in list of transcriptions')
            # If id was not given, use the mainTranscription as the id
            if self.id == '':
                self.id = self.mainTranscription
            hasId = 'id' in header
            hasDescription = 'description' in header
            # parse entries
            for row in fileReader:
                # All absent elements are set to ''
                entry = {name: (row[i] if i < len(row) else '') for i, name in enumerate(header)}
                # if both phoneme and main-transcription are empty, skip the row
                if (entry['phoneme'] == '') and (entry[self.mainTranscription] == ''):
                    continue
                # if id is not provided, generate it
                if not hasId:
                    entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
                # if description is not provided, add an empty one
                if not hasDescription:
                    entry['description'] = ''
                self.entries[entry['id']] = entry

    def formatWord(self, idList):
        """Return a table of transcription -> string corresponding to the same word"""
        result = {x: "" for x in self.transcriptions}
        for x in idList:
            phoneme = self.entries[x]
            for y in result:
                result[y] = result[y] + phoneme[y]
        return result

    def isStress(self, id):
        """Check if an id corresponds to a stress marker: tagged #stress (but not #stressed), or "'" or "ˈ" in the phoneme"""
        entry = self.entries[id]
        description = entry['description']
        phoneme = entry['phoneme']
        # '#stress' is a substring of '#stressed', hence the explicit exclusion
        tagged = ('#stress' in description) and ('#stressed' not in description)
        return tagged or ("'" in phoneme) or ("ˈ" in phoneme)

    def getStress(self):
        """Return the phoneme id of the stress phoneme (the first one found). Raise an Exception if there is none."""
        # search for #stress tag in description
        found = [x['id'] for x in self.entries.values() if ('#stress' in x['description']) and ('#stressed' not in x['description'])]
        if len(found) == 0:
            # if not tagged, search for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in phoneme transcription
            found = [x['id'] for x in self.entries.values() if ("'" in x['phoneme']) or ("ˈ" in x['phoneme'])]
        if len(found) == 0:
            raise Exception('No stress phoneme in phonology', self.id)
        return found[0]

    def isSyllableBreak(self, id):
        """Check if an id corresponds to a syllable break: tagged #syllable-break, or '.' in the phoneme"""
        entry = self.entries[id]
        description = entry['description']
        phoneme = entry['phoneme']
        return ('#syllable-break' in description) or ("." in phoneme)

    def getSyllableBreak(self):
        """Return the phoneme id of the syllable break phoneme (the first one found). Raise an Exception if there is none."""
        # search for #syllable-break tag in description
        found = [x['id'] for x in self.entries.values() if '#syllable-break' in x['description']]
        if len(found) == 0:
            # if not tagged, search for '.' (full stop, u+002E) in phoneme transcription
            found = [x['id'] for x in self.entries.values() if '.' in x['phoneme']]
        if len(found) == 0:
            raise Exception('No syllable break phoneme in phonology', self.id)
        return found[0]

    @staticmethod
    def isVowel(phoneme):
        """Check if the first character of the NFD-normalized phoneme is in Phonology.vowels"""
        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)

    @staticmethod
    def isConsonant(phoneme):
        """Check if the first character of the NFD-normalized phoneme is in Phonology.consonants"""
        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)

    def isOnset(self, id):
        """Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
        entry = self.entries[id]
        description = entry['description']
        result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#onset' in description) or ('#consonant' in description))
        # Only guess when no syllable-part tag is present at all
        if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#coda' not in description):
            result = Phonology.isConsonant(entry['phoneme'])
        return result

    def isNucleus(self, id):
        """Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
        entry = self.entries[id]
        description = entry['description']
        result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#nucleus' in description) or ('#vowel' in description))
        # Only guess when no syllable-part tag is present at all
        if (not result) and ('#consonant' not in description) and ('#onset' not in description) and ('#coda' not in description):
            result = Phonology.isVowel(entry['phoneme'])
        return result

    def isCoda(self, id):
        """Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
        entry = self.entries[id]
        description = entry['description']
        result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#coda' in description) or ('#consonant' in description))
        # Only guess when no syllable-part tag is present at all
        if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#onset' not in description):
            result = Phonology.isConsonant(entry['phoneme'])
        return result

    def isInSingleSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a single syllable, from description"""
        description = self.entries[id]['description']
        # Allowed unless the only position tag present is #middle
        tagged = ('#single' in description) or ('#initial' in description) or ('#final' in description)
        return tagged or ('#middle' not in description)

    def isInInitialSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in an initial syllable, from description"""
        description = self.entries[id]['description']
        # Allowed when tagged #initial, or when no position tag is present at all
        return ('#initial' in description) or not any(t in description for t in ('#single', '#middle', '#final'))

    def isInMiddleSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a middle syllable, from description"""
        description = self.entries[id]['description']
        # Allowed when tagged #middle, or when no position tag is present at all
        return ('#middle' in description) or not any(t in description for t in ('#single', '#initial', '#final'))

    def isInFinalSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a final syllable, from description"""
        description = self.entries[id]['description']
        # Allowed when tagged #final, or when no position tag is present at all
        return ('#final' in description) or not any(t in description for t in ('#single', '#initial', '#middle'))

    def isInStressedSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a stressed syllable, from description"""
        description = self.entries[id]['description']
        return ('#stressed' in description) or ('#unstressed' not in description)

    def isInUnstressedSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in an unstressed syllable, from description"""
        description = self.entries[id]['description']
        return ('#unstressed' in description) or ('#stressed' not in description)

    def getPhonemesFromTags(self, tags):
        """Return a list of phoneme id verifying the tag list.

        An entry is kept when every predicate associated with the tags holds.
        The stress and syllable break phonemes are never returned.
        Raises KeyError on an unknown tag, and an Exception when the phonology
        has entries but no stress or no syllable break phoneme.
        """
        tagToPredicate = {
            '#onset': Phonology.isOnset,
            '#nucleus': Phonology.isNucleus,
            '#coda': Phonology.isCoda,
            '#single': Phonology.isInSingleSyllables,
            '#initial': Phonology.isInInitialSyllables,
            '#middle': Phonology.isInMiddleSyllables,
            '#final': Phonology.isInFinalSyllables,
            '#stressed': Phonology.isInStressedSyllables,
            '#unstressed': Phonology.isInUnstressedSyllables
        }
        if not self.entries:
            return []
        # Looked up once: these ids are the same for every entry
        excluded = (self.getStress(), self.getSyllableBreak())
        predicates = [tagToPredicate[t] for t in tags]
        phonemeList = []
        for id in self.entries:
            # skip stress and syllable break
            if id in excluded:
                continue
            if all(predicate(self, id) for predicate in predicates):
                phonemeList.append(id)
        return phonemeList

    def hasStressedVowels(self):
        """Check if at least one entry can be a nucleus and can be in a stressed syllable"""
        return any(self.isNucleus(id) and self.isInStressedSyllables(id) for id in self.entries)
class Distribution:
    """Discrete distribution: a table of value -> number of occurences"""

    def __init__(self):
        self.items = {}  # value -> occurences

    def addTo(self, value, occurences=1):
        """Add occurences to the count of value (the value is created if absent)"""
        if value in self.items:
            self.items[value] = occurences + self.items[value]
        else:
            self.items[value] = occurences

    def pickFrom(self):
        """Pick a value at random, using the occurences as weights"""
        values = list(self.items.keys())
        weights = list(self.items.values())
        picked = random.choices(values, weights)
        return picked[0]

    def toJsonStruct(self, itemRef='value', occurencesRef='occurences'):
        """Return a list with one {itemRef: value, occurencesRef: occurences} dict per value"""
        return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]

    def fromJsonStruct(self, struct, itemRef='value', occurencesRef='occurences'):
        """Replace the content with the items of a list as produced by toJsonStruct"""
        self.items = {}
        for item in struct:
            self.items[item[itemRef]] = item[occurencesRef]

    def isEmpty(self):
        """Return True when the distribution holds no value"""
        return not len(self.items)
class Generator:
    """Parent class for all generators.

    Holds the fields shared by every generator (id, description, id of the
    phonology). isTyped is False here; a generator is valid only when it has
    a non-empty id and isTyped is set.
    """

    def __init__(self, id='', description='', phonology=''):
        self.isTyped = False
        self.id = id
        self.description = description
        self.phonology = phonology

    def isValid(self):
        """Return whether the generator has an id and a concrete type"""
        if self.id == '':
            return False
        return self.isTyped

    def toJsonStruct(self):
        """Return the shared fields as a Json structure"""
        struct = {}
        struct['id'] = self.id
        struct['description'] = self.description
        struct['phonology'] = self.phonology
        return struct

    def fromJsonStruct(self, struct):
        """Read the shared fields from a Json structure"""
        for field in ('id', 'description', 'phonology'):
            setattr(self, field, struct[field])

    def generateWord(self):
        """Not available on the parent class: always raises an Exception"""
        raise Exception('Word generation not supported on abstract generator')
class ChainGenerator(Generator):
    """Chains-based generator.

    chains maps a tuple of the `order` previous items to the Distribution of
    the items that may follow. The empty string is used both as padding
    before the first item of a word and as the end-of-word terminator.
    """

    def __init__(self, order=1, **kwargs):
        super().__init__(**kwargs)
        self.order = order
        self.chains = {}  # input -> distribution of outputs
        self.isTyped = True

    def toJsonStruct(self):
        """Convert the generator to a Json structure (type 'chains')"""
        struct = super().toJsonStruct()
        struct.update({'type': 'chains',
                       'order': self.order,
                       'chains': [{'input': x, 'possible-outputs': self.chains[x].toJsonStruct(itemRef='value', occurencesRef='occurences')} for x in self.chains]})
        return struct

    def fromJsonStruct(self, struct):
        """Fill the generator from a Json structure"""
        super().fromJsonStruct(struct)
        self.order = struct['order']
        for chainStruct in struct['chains']:
            dist = Distribution()
            dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef='value', occurencesRef='occurences')
            # Json lists are not hashable: the input is stored as a tuple
            self.chains.update({tuple(chainStruct['input']): dist})

    def fromExamples(self, file, phonology):
        """Train a chain generator on an example file.

        Each line of the file is one word, written as space-separated ids.
        Empty cells (for example produced by a trailing space) and lines
        without any id are ignored. When the phonology is valid, an Exception
        is raised for any id it does not contain.
        """
        # newline='' is what the csv module expects; utf-8 matches the JSON I/O of this module
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                # An empty cell must not be recorded: '' is the terminator
                items = [item for item in row if item != '']
                if not items:
                    continue
                if phonology.isValid():
                    for item in items:
                        if not phonology.has(item):
                            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                items.append('')  # Add terminator element (empty string)
                previous = ('',) * self.order  # Initial sequence (empty strings, length = self.order)
                for item in items:
                    if previous not in self.chains:
                        self.chains[previous] = Distribution()
                    self.chains[previous].addTo(item)
                    previous = previous[1:] + (item,)

    def generateWord(self):
        """Generate a word as a list of ids, by following the chains until the terminator is picked"""
        outputIdList = []
        previous = ('',) * self.order  # Initial sequence (empty strings, length = self.order)
        while True:
            nextItem = self.chains[previous].pickFrom()
            if nextItem == '':
                return outputIdList
            outputIdList.append(nextItem)
            previous = previous[1:] + (nextItem,)
class RuleGenerator(Generator):
    """Rules-based generator.

    rules maps a rule id to a Distribution of patterns. A pattern is a tuple
    whose items are either rule ids, which are expanded recursively, or any
    other value (phoneme ids), which is output as is. Word generation starts
    from the 'word' rule.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.rules = {}  # rule id -> distribution of patterns
        self.isTyped = True

    def toJsonStruct(self):
        """Convert the generator to a Json structure (type 'rules')"""
        struct = super().toJsonStruct()
        struct.update({'type': 'rules',
                       'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef='pattern', occurencesRef='occurences')} for x in self.rules]})
        return struct

    def fromJsonStruct(self, struct):
        """Fill the generator from a Json structure"""
        super().fromJsonStruct(struct)
        for ruleStruct in struct['rules']:
            dist = Distribution()
            # The pattern should be converted from a list to a tuple
            dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences': x['occurences']} for x in ruleStruct['distribution']], itemRef='pattern', occurencesRef='occurences')
            self.rules.update({ruleStruct['id']: dist})

    def generatePattern(self, pattern):
        """Expand a pattern into a list of ids: rule ids are replaced by a pattern picked from the rule, recursively"""
        output = []
        for x in pattern:
            if x in self.rules:
                output.extend(self.generatePattern(self.rules[x].pickFrom()))
            else:
                output.append(x)
        return output

    def generateWord(self):
        """Generate a word as a list of ids, starting from the 'word' rule"""
        return self.generatePattern(self.rules['word'].pickFrom())

    def processRowFromExample(self, row, stressId, syllableBreakId):
        """Add one example word (a list of ids) to the 'word' rule and to the syllable rules.

        The word is split into syllables on the stress and syllable break ids.
        Examples with more than one stress, or without any syllable, are
        skipped with a message. The 'word' rule and the syllable rules must
        already exist in self.rules.
        """
        # Check the number of stress
        nbStress = row.count(stressId)
        if nbStress > 1:
            print("Too much stress in " + str(row) + ": skip the example")
            return
        # Build the syllable list
        syllables = []
        currentSyllable = []
        stressedSyllableIdx = -1
        syllableIdx = 0
        for x in row:
            # Append to the current syllable if not a syllable separator
            if (x != stressId) and (x != syllableBreakId):
                currentSyllable.append(x)
            # In case of syllable separator, only add the syllable to the list if it is not empty
            elif len(currentSyllable) != 0:
                syllables.append(currentSyllable)
                currentSyllable = []
                syllableIdx = syllableIdx + 1
            # If current id is stress, remember the position of the stressed syllable
            if x == stressId:
                stressedSyllableIdx = syllableIdx
        # After the loop, the current syllable should be non-empty, add it to the list of syllables
        if len(currentSyllable) != 0:
            syllables.append(currentSyllable)
        # A row made only of separators would add an empty word pattern
        if len(syllables) == 0:
            print("No syllable in " + str(row) + ": skip the example")
            return
        # Single syllable case
        if len(syllables) == 1:
            if stressedSyllableIdx == 0:
                self.rules['word'].addTo(tuple([stressId, 'single']))
            else:
                self.rules['word'].addTo(tuple(['single']))
            self.rules['single'].addTo(tuple(syllables[0]))
            return
        # Other cases
        wordPattern = []
        lastIdx = len(syllables) - 1
        for x, syllable in enumerate(syllables):
            separator = syllableBreakId
            if x == 0:
                rule = 'initial'
            elif x == lastIdx:
                rule = 'final'
            else:
                rule = 'middle'
            if x == stressedSyllableIdx:
                rule = rule + '-stressed'
                separator = stressId
            # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
            if (separator == stressId) or (x > 0):
                wordPattern.append(separator)
            # Add the rule to the pattern
            wordPattern.append(rule)
            # The syllable is added to the corresponding rule
            self.rules[rule].addTo(tuple(syllable))
        self.rules['word'].addTo(tuple(wordPattern))

    def splitSyllableRule(self, syllableRule, phonology):
        """Replace syllable rules with onset/nucleus/coda pattern.

        Every pattern of the rule is cut in up to three parts: the onset runs
        until the first phoneme that can be a nucleus, the nucleus runs until
        the first following phoneme that can be a coda, the coda is the rest.
        The parts go to the '<rule>-onset', '<rule>-nucleus' and '<rule>-coda'
        rules, and the rule itself then only refers to those rules.
        """
        newDist = Distribution()
        oldDist = self.rules[syllableRule]
        # Add onset/nucleus/coda rules
        partRules = (syllableRule + '-onset', syllableRule + '-nucleus', syllableRule + '-coda')
        for partRule in partRules:
            self.rules[partRule] = Distribution()
        # For each pattern, split into onset/nucleus/coda
        for pattern, occurences in oldDist.items.items():
            parts = ([], [], [])  # onset, nucleus, coda
            stage = 0  # index in parts of the part being filled
            for phoneme in pattern:
                # Check if there is a change of element
                if (stage == 0) and phonology.isNucleus(phoneme):
                    stage = 1
                elif (stage == 1) and phonology.isCoda(phoneme):
                    stage = 2
                # Add to the respective list
                parts[stage].append(phoneme)
            # Add to the specific distributions and determine the pattern in new distribution
            distPattern = []
            for partRule, part in zip(partRules, parts):
                if len(part) != 0:
                    distPattern.append(partRule)
                    self.rules[partRule].addTo(tuple(part), occurences)
            # Add patterns to distributions
            newDist.addTo(tuple(distPattern), occurences)
        # Replace the old rules with the new rules
        self.rules[syllableRule] = newDist

    def cleanRules(self):
        """Remove the empty rules"""
        self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}

    def fromExamples(self, file, phonology):
        """Train a rule generator on an example file.

        Each line of the file is one word, written as space-separated ids.
        Empty cells (for example produced by a trailing space) and lines
        without any id are ignored. An Exception is raised for any id which
        is not in the phonology.
        """
        stressId = phonology.getStress()
        syllableBreakId = phonology.getSyllableBreak()
        # Words are modelled as lists of syllables, with one of those being stressed (optionally)
        # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
        # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
        #
        # Add the 'word' rule, and syllable rules, initialized with an empty distribution
        self.rules.update({'word': Distribution()})
        syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
        for x in syllableRules:
            self.rules.update({x: Distribution()})
        # Step 1: open the file and find how words look like
        # newline='' is what the csv module expects; utf-8 matches the JSON I/O of this module
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                # An empty cell is not a phoneme id: drop it
                items = [item for item in row if item != '']
                if not items:
                    continue
                # Check the items in row
                for item in items:
                    if not phonology.has(item):
                        raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                # Process the row
                self.processRowFromExample(items, stressId, syllableBreakId)
        # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
        for x in syllableRules:
            self.splitSyllableRule(x, phonology)
        # Step 3: remove the empty rules
        self.cleanRules()

    @staticmethod
    def randomOccurences(mean, range):
        """Generate a random integer in [mean-range, mean+range], both bounds included"""
        # random.randint already includes its upper bound
        return random.randint(mean - range, mean + range)

    @staticmethod
    def isStressPosition(position, numberSyllables, stressPosition):
        """Check if a given position is the position of the stress. The position goes from 1 to numberSyllables included.

        A positive stressPosition counts from the first syllable, a negative
        one from the last. When it falls outside of the word, the stress is
        on the last (positive) or first (negative) syllable. Zero never matches.
        """
        isPosition = False
        if (stressPosition > 0) and (stressPosition <= numberSyllables):
            isPosition = position == stressPosition
        elif (stressPosition < 0) and (abs(stressPosition) <= numberSyllables):
            isPosition = position == (numberSyllables + 1 + stressPosition)
        elif (position == numberSyllables) and (stressPosition > numberSyllables):
            isPosition = True
        elif (position == 1) and (stressPosition < 0) and (abs(stressPosition) > numberSyllables):
            isPosition = True
        return isPosition

    def fromPhonology(self, phonology, minNumberSyllables=1, maxNumberSyllables=4, stressPosition=-2, distributionMean=20, distributionRange=5):
        """
        Generate a rule-based generator just from a phonology and some parameters.
        - minNumberSyllables must be strictly positive.
        - maxNumberSyllables must be greater than minNumberSyllables
        - stressPosition indicates on which syllable the stress occurs.
          Positive index count from the beginning to the end (with the first syllable being at index 1).
          Negative index count from the end to the beginning (with the last syllable being at index -1)
          Set this to zero if no stress should be generated.
        - distributionMean indicates the medium value for the occurences of a phoneme
        - distributionRange indicates the maximum absolute difference from distributionMean for the occurences of phonemes and syllables
        Single-syllable words are generated when minNumberSyllables is 1; longer words
        have from max(2, minNumberSyllables) to maxNumberSyllables syllables.
        The previous rules are discarded. An Exception is raised on invalid parameters.
        """
        # Reinitialize
        self.phonology = phonology.id
        self.rules = {}
        # Check the parameters
        if maxNumberSyllables < minNumberSyllables:
            raise Exception("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
        if maxNumberSyllables < abs(stressPosition):
            raise Exception("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
        if distributionMean < 1:
            raise Exception("Distribution mean must be strictly positive. Given", distributionMean)
        # Strict check: with mean == range an occurence count of zero could be drawn
        if distributionMean <= distributionRange:
            raise Exception("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
        if distributionRange < 0:
            raise Exception("Distribution range must be positive or nul. Given", distributionRange)

        def occurences():
            return RuleGenerator.randomOccurences(distributionMean, distributionRange)

        # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
        stressId = phonology.getStress()
        syllableBreakId = phonology.getSyllableBreak()
        isStressed = (stressPosition != 0) and phonology.hasStressedVowels()
        # Add the 'word' rule, initialized with an empty distribution
        self.rules.update({'word': Distribution()})
        # Add the syllable rules and word patterns
        syllableRules = []
        syllableRulesToTags = {}
        if minNumberSyllables == 1:
            syllableRules.append('single')
            syllableRulesToTags.update({'single': ['#single']})
            wordPattern = []
            if isStressed:
                syllableRulesToTags['single'].append('#stressed')
                wordPattern.append(stressId)
            wordPattern.append('single')
            self.rules['word'].addTo(tuple(wordPattern), occurences())
        if maxNumberSyllables > 1:
            syllableRules = syllableRules + ['initial', 'middle', 'final']
            syllableRulesToTags.update({'initial': ['#initial'], 'middle': ['#middle'], 'final': ['#final']})
            if isStressed:
                syllableRules = syllableRules + ['initial-stressed', 'middle-stressed', 'final-stressed']
                syllableRulesToTags.update({'initial-stressed': ['#initial', '#stressed'], 'middle-stressed': ['#middle', '#stressed'], 'final-stressed': ['#final', '#stressed']})
                syllableRulesToTags.update({'initial': ['#initial', '#unstressed'], 'middle': ['#middle', '#unstressed'], 'final': ['#final', '#unstressed']})
            # One word pattern per number of syllables, honouring the requested minimum
            for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
                wordPattern = []
                for position in range(1, nbSyllables + 1):
                    stressedHere = isStressed and RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
                    # add syllable separator
                    if stressedHere:
                        wordPattern.append(stressId)
                    elif position > 1:
                        wordPattern.append(syllableBreakId)
                    # add syllable
                    if position == 1:
                        rule = 'initial'
                    elif position == nbSyllables:
                        rule = 'final'
                    else:
                        rule = 'middle'
                    if stressedHere:
                        rule = rule + '-stressed'
                    wordPattern.append(rule)
                self.rules['word'].addTo(tuple(wordPattern), occurences())
        # Step 2: Generate the syllable rules
        # Add the rules in the distributions
        phonemeRules = []
        phonemeRulesToTag = {}
        for syllable in syllableRules:
            self.rules.update({syllable: Distribution()})
            onset = syllable + '-onset'
            nucleus = syllable + '-nucleus'
            coda = syllable + '-coda'
            phonemeRules = phonemeRules + [onset, nucleus, coda]
            ruleTags = syllableRulesToTags[syllable]
            phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
            # Fill the syllable rules
            # For the generated rules, initial and single syllables may not have onset
            if ('#initial' in ruleTags) or ('#single' in ruleTags):
                self.rules[syllable].addTo(tuple([nucleus]), occurences())
                self.rules[syllable].addTo(tuple([nucleus, coda]), occurences())
            self.rules[syllable].addTo(tuple([onset, nucleus]), occurences())
            self.rules[syllable].addTo(tuple([onset, nucleus, coda]), occurences())
        # Step 3: Generate the phoneme distributions for each phoneme rule
        for rule in phonemeRules:
            self.rules.update({rule: Distribution()})
            tags = phonemeRulesToTag[rule]
            phonemeList = phonology.getPhonemesFromTags(tags)
            for phoneme in phonemeList:
                self.rules[rule].addTo(tuple([phoneme]), occurences())
        # Step 4: Clean the rules
        self.cleanRules()
# Generator class to instantiate for each value of the 'type' field
generatorTypeToClass = {
    'chains': ChainGenerator,
    'rules': RuleGenerator,
}


def makeGenerator(struct):
    """Function instanciating a generator from a JSON structure.

    The class is chosen from struct['type']; an unknown type gives a plain
    Generator. The instance is then filled with fromJsonStruct.
    """
    generatorClass = generatorTypeToClass.get(struct['type'], Generator)
    generator = generatorClass()
    generator.fromJsonStruct(struct)
    return generator
class PhonagenFile:
    """A phonagen file, with phonologies and generators, both stored by id"""

    def __init__(self):
        self.phonologies = {}  # id -> phonology
        self.generators = {}  # id -> generator

    def addPhonology(self, phonology):
        """Store a phonology under its id; a phonology which is not valid is ignored"""
        if not phonology.isValid():
            return
        self.phonologies[phonology.id] = phonology

    def addGenerator(self, generator):
        """Store a generator under its id; a generator which is not valid is ignored"""
        if not generator.isValid():
            return
        self.generators[generator.id] = generator

    def getPhonology(self, id):
        """Return the phonology stored under id (KeyError if absent)"""
        return self.phonologies[id]

    def getGenerator(self, id):
        """Return the generator stored under id (KeyError if absent)"""
        return self.generators[id]

    def load(self, file):
        """Load from a JSON file"""
        with open(file, 'r', encoding='utf-8') as inputFile:
            jsonStruct = json.load(inputFile)
        # Load phonologies
        for phonologyStruct in jsonStruct['phonologies']:
            phonology = Phonology()
            phonology.fromJsonStruct(phonologyStruct)
            self.addPhonology(phonology)
        # Load generators
        for generatorStruct in jsonStruct['generators']:
            self.addGenerator(makeGenerator(generatorStruct))

    def writeTo(self, file=''):
        """Output to a JSON file (or stdout when file is the empty string)"""
        outputStruct = {}
        outputStruct['phonologies'] = [x.toJsonStruct() for x in self.phonologies.values()]
        outputStruct['generators'] = [x.toJsonStruct() for x in self.generators.values()]
        if file != '':
            with open(file, 'w', encoding='utf-8') as outputFile:
                json.dump(outputStruct, outputFile, ensure_ascii=False)
        else:
            json.dump(outputStruct, sys.stdout, ensure_ascii=False)

    def mergeFrom(self, otherFile):
        """Add all phonologies and generators from the other file into this one."""
        for phonology in otherFile.phonologies.values():
            self.addPhonology(phonology)
        for generator in otherFile.generators.values():
            self.addGenerator(generator)

    def generateWord(self, generator=''):
        """Generate a word with the generator of the given id, or with a generator picked at random when the id is empty.

        Return the word formatted by the phonology of the generator.
        """
        gen = generator if generator != '' else random.choice(list(self.generators))
        chosen = self.generators[gen]
        idList = chosen.generateWord()
        phonology = self.phonologies[self.generators[gen].phonology]
        return phonology.formatWord(idList)