710 lines
30 KiB
Python
710 lines
30 KiB
Python
"""Common functions and classes for phonagen tools"""
|
||
import json
|
||
import io
|
||
import sys
|
||
import csv
|
||
import random
|
||
import unicodedata
|
||
|
||
class Phonology:
    """A phonology: a table of phoneme entries written in several transcriptions.

    Attributes:
        id: identifier of the phonology; an empty id marks it as invalid.
        description: free-form description.
        transcriptions: names of the transcription columns (includes 'phoneme'
            when filled from a csv file).
        mainTranscription: name of the default transcription.
        entries: mapping of entry id -> entry dict. The methods below read the
            'id', 'description' and 'phoneme' keys plus one key per transcription.

    Entry descriptions may carry tags such as '#onset', '#vowel' or '#stressed'.
    Tags are detected by substring search in the description.
    """

    # Base letters used to guess the nature of an untagged phoneme. Only the
    # first code point of the NFD-normalized phonemic transcription is looked
    # up, so precomposed diacritics are ignored.
    vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
    consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"

    def __init__(self, id = '', description = '', mainTranscription = ''):
        self.id = id
        self.description = description
        self.transcriptions = []
        self.mainTranscription = mainTranscription
        self.entries = {} # id -> entry

    def isValid(self):
        """Return True when the phonology has a non-empty id."""
        return self.id != ''

    def has(self, id):
        """Return True when an entry with this id exists."""
        return id in self.entries

    def toJsonStruct(self):
        """Convert a Phonology to a Json structure"""
        return { 'id': self.id,
                 'description': self.description,
                 'transcriptions': self.transcriptions,
                 'main-transcription': self.mainTranscription,
                 'entries': list(self.entries.values()) }

    def fromJsonStruct(self, struct):
        """Fill a Phonology from a Json structure"""
        self.id = struct['id']
        self.description = struct['description']
        self.transcriptions = struct['transcriptions']
        self.mainTranscription = struct['main-transcription']
        self.entries = {x['id']: x for x in struct['entries']}

    def fromCsv(self, file):
        """Fill a Phonology from a Csv file.

        The first row is the header. Every column other than 'id' and
        'description' is a transcription; a 'phoneme' column and at least one
        other transcription are required. When no main transcription (resp. id)
        was given to the constructor, the first non-phoneme transcription
        (resp. the main transcription) is used. Rows where both the phoneme and
        the main transcription are empty are skipped. Entries are added to the
        existing ones.

        Raises:
            Exception: if the header is missing or does not satisfy the
                constraints above, or if the main transcription is not one of
                the transcription columns.
        """
        # The file holds IPA characters: decode it as UTF-8 regardless of the
        # locale. newline='' is what the csv module expects from its input.
        with open(file, encoding='utf-8', newline='') as csvfile:
            fileReader = csv.reader(csvfile)
            # get csv header
            header = next(fileReader, None)
            if header is None:
                raise Exception('No header found in ', file)
            # get the transcriptions (header items not id or description)
            self.transcriptions = [x for x in header if x not in ('id', 'description')]
            # Check: self.transcriptions should contain 'phoneme'
            if 'phoneme' not in self.transcriptions:
                raise Exception('phoneme column not found in ', file)
            # Check: self.transcriptions should have at least two items
            if len(self.transcriptions) < 2:
                raise Exception('No transcription found outside phoneme in file ', file, 'Did you named it id or description ?')
            # If main-transcription was not given, guess it: first transcription which is not phoneme
            if self.mainTranscription == '':
                self.mainTranscription = next((x for x in self.transcriptions if x != 'phoneme'), '')
            # Check: self.mainTranscription should be in self.transcriptions
            if self.mainTranscription not in self.transcriptions:
                raise Exception('main-transcription', self.mainTranscription, 'not in list of transcriptions')
            # If id was not given, use the mainTranscription as the id
            if self.id == '':
                self.id = self.mainTranscription
            # parse entries
            for row in fileReader:
                # All absent elements are set to ''; cells beyond the header are ignored
                entry = {column: '' for column in header}
                entry.update(zip(header, row))
                # if both phoneme and main-transcription are empty, skip the row
                if (entry['phoneme'] == '') and (entry[self.mainTranscription] == ''):
                    continue
                # if id is not provided, generate it
                if 'id' not in header:
                    entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
                # if description is not provided, add an empty one
                if 'description' not in header:
                    entry['description'] = ''
                self.entries[entry['id']] = entry

    def formatWord(self, idList):
        """Return a table of transcription -> string corresponding to the same word"""
        phonemes = [self.entries[x] for x in idList]
        return {t: "".join(p[t] for p in phonemes) for t in self.transcriptions}

    def isStress(self, id):
        """Return True when the entry is a stress marker.

        An entry is a stress marker when it is tagged '#stress' (and not
        '#stressed'), or when its phonemic transcription contains "'" or "ˈ".
        """
        entry = self.entries[id]
        description = entry['description']
        phoneme = entry['phoneme']
        return (('#stress' in description) and ('#stressed' not in description)) or ("'" in phoneme) or ("ˈ" in phoneme)

    def getStress(self):
        """Return the phoneme id of the stress phoneme.

        Raises:
            Exception: if no entry is a stress marker.
        """
        # search for #stress tag in description
        found = [x['id'] for x in self.entries.values() if ('#stress' in x['description']) and ('#stressed' not in x['description'])]
        if len(found) == 0:
            # if not tagged, search for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in phoneme transcription
            found = [x['id'] for x in self.entries.values() if ("'" in x['phoneme']) or ("ˈ" in x['phoneme'])]
        if len(found) == 0:
            raise Exception('No stress phoneme in phonology', self.id)
        return found[0]

    def isSyllableBreak(self, id):
        """Return True when the entry is tagged '#syllable-break' or its phonemic transcription contains '.'."""
        entry = self.entries[id]
        return ('#syllable-break' in entry['description']) or ("." in entry['phoneme'])

    def getSyllableBreak(self):
        """Return the phoneme id of the syllable break phoneme.

        Raises:
            Exception: if no entry is a syllable break.
        """
        # search for #syllable-break tag in description
        found = [x['id'] for x in self.entries.values() if '#syllable-break' in x['description']]
        if len(found) == 0:
            # if not tagged, search for '.' (full stop, u+002E) in phoneme transcription
            found = [x['id'] for x in self.entries.values() if '.' in x['phoneme']]
        if len(found) == 0:
            raise Exception('No syllable break phoneme in phonology', self.id)
        return found[0]

    @staticmethod
    def isVowel(phoneme):
        """Return True when the first NFD code point of the transcription is a known vowel letter."""
        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)

    @staticmethod
    def isConsonant(phoneme):
        """Return True when the first NFD code point of the transcription is a known consonant letter."""
        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)

    def _hasAnyTag(self, id, tags):
        """Return True when the description of the entry contains at least one of the tags."""
        description = self.entries[id]['description']
        return any(tag in description for tag in tags)

    def _fitsSlot(self, id, ownTags, otherTags, guess):
        """Shared logic of isOnset / isNucleus / isCoda.

        An entry fits when it is neither a separator nor a stress marker and
        carries one of ownTags. Otherwise, if it carries none of otherTags,
        the answer is guessed from its phonemic transcription.
        """
        if (not self.isSyllableBreak(id)) and (not self.isStress(id)) and self._hasAnyTag(id, ownTags):
            return True
        if not self._hasAnyTag(id, otherTags):
            return guess(self.entries[id]['phoneme'])
        return False

    def isOnset(self, id):
        """Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
        return self._fitsSlot(id, ['#onset', '#consonant'], ['#vowel', '#nucleus', '#coda'], Phonology.isConsonant)

    def isNucleus(self, id):
        """Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
        return self._fitsSlot(id, ['#nucleus', '#vowel'], ['#consonant', '#onset', '#coda'], Phonology.isVowel)

    def isCoda(self, id):
        """Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
        return self._fitsSlot(id, ['#coda', '#consonant'], ['#vowel', '#nucleus', '#onset'], Phonology.isConsonant)

    def isInSingleSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a single syllable, from description.

        True when tagged '#single', '#initial' or '#final', or when not tagged '#middle'.
        """
        return self._hasAnyTag(id, ['#single', '#initial', '#final']) or not self._hasAnyTag(id, ['#middle'])

    def isInInitialSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in an initial syllable, from description.

        True when tagged '#initial', or when it carries no position tag at all.
        """
        return self._hasAnyTag(id, ['#initial']) or not self._hasAnyTag(id, ['#single', '#middle', '#final'])

    def isInMiddleSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a middle syllable, from description.

        True when tagged '#middle', or when it carries no position tag at all.
        """
        return self._hasAnyTag(id, ['#middle']) or not self._hasAnyTag(id, ['#single', '#initial', '#final'])

    def isInFinalSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a final syllable, from description.

        True when tagged '#final', or when it carries no position tag at all.
        """
        return self._hasAnyTag(id, ['#final']) or not self._hasAnyTag(id, ['#single', '#initial', '#middle'])

    def isInStressedSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a stressed syllable, from description"""
        description = self.entries[id]['description']
        return ('#stressed' in description) or ('#unstressed' not in description)

    def isInUnstressedSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in an unstressed syllable, from description"""
        description = self.entries[id]['description']
        return ('#unstressed' in description) or ('#stressed' not in description)

    def getPhonemesFromTags(self, tags):
        """Return the list of phoneme ids satisfying every tag of the tag list.

        The stress and syllable break entries (as returned by getStress and
        getSyllableBreak) are never part of the result.

        Raises:
            KeyError: if a tag is not one of the supported tags.
            Exception: if the phonology has entries but no stress or no
                syllable break entry.
        """
        tagToPredicate = {
            '#onset': Phonology.isOnset,
            '#nucleus': Phonology.isNucleus,
            '#coda': Phonology.isCoda,
            '#single': Phonology.isInSingleSyllables,
            '#initial': Phonology.isInInitialSyllables,
            '#middle': Phonology.isInMiddleSyllables,
            '#final': Phonology.isInFinalSyllables,
            '#stressed': Phonology.isInStressedSyllables,
            '#unstressed': Phonology.isInUnstressedSyllables
        }
        # Nothing to look up (and nothing to raise about) in an empty phonology
        if not self.entries:
            return []
        # The separators are the same for every entry: look them up once
        separators = (self.getStress(), self.getSyllableBreak())
        phonemeList = []
        for id in self.entries:
            # skip stress and syllable break
            if id in separators:
                continue
            if all(tagToPredicate[t](self, id) for t in tags):
                phonemeList.append(id)
        return phonemeList

    def hasStressedVowels(self):
        """Return True when at least one nucleus phoneme can appear in a stressed syllable.

        The result is False when there is no nucleus phoneme, or when every
        nucleus phoneme is restricted to unstressed syllables.
        """
        return any(self.isNucleus(id) and self.isInStressedSyllables(id) for id in self.entries)
|
||
|
||
class Distribution:
    """Discrete distribution: a table of value -> number of occurences."""

    def __init__(self):
        self.items = {}

    def addTo(self, value, occurences = 1):
        """Add occurences to the count of value, registering the value if needed."""
        if value in self.items:
            self.items[value] = occurences + self.items[value]
        else:
            self.items[value] = occurences

    def pickFrom(self):
        """Pick one value at random, weighted by its number of occurences."""
        values = list(self.items.keys())
        weights = list(self.items.values())
        picked = random.choices(values, weights)
        return picked[0]

    def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
        """Return a list with one {itemRef: value, occurencesRef: count} dict per value."""
        return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]

    def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
        """Replace the content with the one read from a list produced by toJsonStruct."""
        self.items = {}
        for record in struct:
            self.items[record[itemRef]] = record[occurencesRef]

    def isEmpty(self):
        """Return True when no value is registered."""
        return len(self.items) == 0
|
||
|
||
class Generator:
    """Parent class for all generators.

    Holds the fields shared by every generator: its id, its description and
    the id of the phonology it refers to. Subclasses set isTyped to True.
    """

    def __init__(self, id = '', description = '', phonology = ''):
        self.isTyped = False
        self.phonology = phonology
        self.description = description
        self.id = id

    def isValid(self):
        """A generator is valid when it has a non-empty id and a concrete type."""
        if self.id == '':
            return False
        return self.isTyped

    def toJsonStruct(self):
        """Return the shared fields as a Json structure."""
        struct = {}
        struct['id'] = self.id
        struct['description'] = self.description
        struct['phonology'] = self.phonology
        return struct

    def fromJsonStruct(self, struct):
        """Read the shared fields from a Json structure."""
        self.id, self.description, self.phonology = struct['id'], struct['description'], struct['phonology']

    def generateWord(self):
        """Not available on the parent class: always raises."""
        raise Exception('Word generation not supported on abstract generator')
|
||
|
||
class ChainGenerator(Generator):
    """Chains-based generator.

    A word is generated item by item: the last `order` items (padded with
    empty strings at the beginning of a word) select a distribution, from
    which the next item is picked. An empty string marks the end of the word.
    """

    def __init__(self, order = 1, **kwargs):
        super().__init__(**kwargs)
        self.order = order
        self.chains = {} # input -> distribution of outputs
        self.isTyped = True

    def _initialState(self):
        """Return the state at the beginning of a word: `order` empty strings."""
        return tuple('' for i in range(self.order))

    def toJsonStruct(self):
        """Return the generator as a Json structure of type 'chains'."""
        struct = super().toJsonStruct()
        struct.update({'type': 'chains',
                       'order': self.order,
                       'chains': [{'input': x, 'possible-outputs': self.chains[x].toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')} for x in self.chains]})
        return struct

    def fromJsonStruct(self, struct):
        """Replace the content of the generator with the one of a Json structure."""
        super().fromJsonStruct(struct)
        self.order = struct['order']
        # Start from a clean table, so that loading twice does not mix chains
        self.chains = {}
        for chainStruct in struct['chains']:
            dist = Distribution()
            dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
            # Json has no tuple: the input comes back as a list, which is not hashable
            self.chains[tuple(chainStruct['input'])] = dist

    def fromExamples(self, file, phonology):
        """Train a chain generator on an example file.

        Each line of the file is one word, written as phoneme ids separated by
        spaces. When the phonology is valid, every id must exist in it.

        Raises:
            Exception: if an item is not an id of the (valid) phonology.
        """
        checkIds = phonology.isValid()
        # Ids may contain IPA characters: read as UTF-8 regardless of the locale
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                # Trailing spaces produce empty items. They must not be trained
                # on: the empty string is the word terminator, and would be
                # learnt as a possible first item (i.e. as an empty word).
                items = [item for item in row if item != '']
                if len(items) == 0:
                    continue
                for item in items:
                    if checkIds and (not phonology.has(item)):
                        raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                items.append('') # Add terminator element (empty string)
                previous = self._initialState()
                for item in items:
                    if previous not in self.chains:
                        self.chains[previous] = Distribution()
                    self.chains[previous].addTo(item)
                    # Slide the window of the last items
                    previous = previous[1:] + (item,)

    def generateWord(self):
        """Generate a word, returned as a list of phoneme ids.

        Raises:
            KeyError: if the current state has no known continuation (e.g.
                the generator has not been trained).
        """
        outputIdList = []
        previous = self._initialState()
        while True:
            nextItem = self.chains[previous].pickFrom()
            if nextItem == '':
                # Terminator picked: the word is complete
                return outputIdList
            outputIdList.append(nextItem)
            previous = previous[1:] + (nextItem,)
|
||
|
||
class RuleGenerator(Generator):
    """Rules-based generator.

    `rules` maps a rule id to a Distribution of patterns. A pattern is a tuple
    whose items are either rule ids (expanded recursively) or phoneme ids
    (emitted as they are). Generation starts from the 'word' rule.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.rules = {} # rule id -> distribution of patterns
        self.isTyped = True

    def toJsonStruct(self):
        """Return the generator as a Json structure of type 'rules'."""
        struct = super().toJsonStruct()
        struct.update({'type': 'rules',
                       'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')} for x in self.rules]})
        return struct

    def fromJsonStruct(self, struct):
        """Replace the content of the generator with the one of a Json structure."""
        super().fromJsonStruct(struct)
        # Start from a clean table, so that loading twice does not mix rules
        self.rules = {}
        for ruleStruct in struct['rules']:
            dist = Distribution()
            # The pattern should be converted from a list to a tuple
            dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences': x['occurences']} for x in ruleStruct['distribution']], itemRef = 'pattern', occurencesRef = 'occurences')
            self.rules[ruleStruct['id']] = dist

    def generatePattern(self, pattern):
        """Expand a pattern into a list of phoneme ids.

        Items which are rule ids are replaced by the expansion of a pattern
        picked from that rule; other items are kept as they are.
        """
        output = []
        for x in pattern:
            if x in self.rules:
                output.extend(self.generatePattern(self.rules[x].pickFrom()))
            else:
                output.append(x)
        return output

    def generateWord(self):
        """Generate a word (a list of phoneme ids) from the 'word' rule."""
        return self.generatePattern(self.rules['word'].pickFrom())

    def processRowFromExample(self, row, stressId, syllableBreakId):
        """Record one example word in the 'word' rule and in the syllable rules.

        The row is split into syllables on the stress and syllable break ids.
        A stress id marks the syllable which follows it as stressed. The 'word'
        rule and the syllable rules must already exist in self.rules.
        Examples with more than one stress, or without any syllable, are skipped.
        """
        # Check the number of stress
        if row.count(stressId) > 1:
            print("Too much stress in " + str(row) + ": skip the example")
            return
        # Build the syllable list
        syllables = []
        currentSyllable = []
        stressedSyllableIdx = -1
        for x in row:
            if (x != stressId) and (x != syllableBreakId):
                # Not a separator: part of the current syllable
                currentSyllable.append(x)
            elif len(currentSyllable) != 0:
                # Separator: close the current syllable, unless it is empty
                syllables.append(currentSyllable)
                currentSyllable = []
            # The stress applies to the next syllable, whose index is the number of closed syllables
            if x == stressId:
                stressedSyllableIdx = len(syllables)
        # The last syllable is not followed by a separator
        if len(currentSyllable) != 0:
            syllables.append(currentSyllable)
        # A row made only of separators holds no word: do not record an empty pattern
        if len(syllables) == 0:
            return
        # Single syllable case
        if len(syllables) == 1:
            if stressedSyllableIdx == 0:
                self.rules['word'].addTo((stressId, 'single'))
            else:
                self.rules['word'].addTo(('single',))
            self.rules['single'].addTo(tuple(syllables[0]))
            return
        # Other cases
        wordPattern = []
        lastIdx = len(syllables) - 1
        for idx, syllable in enumerate(syllables):
            if idx == 0:
                rule = 'initial'
            elif idx == lastIdx:
                rule = 'final'
            else:
                rule = 'middle'
            separator = syllableBreakId
            if idx == stressedSyllableIdx:
                rule = rule + '-stressed'
                separator = stressId
            # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
            if (separator == stressId) or (idx > 0):
                wordPattern.append(separator)
            # Add the rule to the pattern
            wordPattern.append(rule)
            # The syllable is added to the corresponding rule
            self.rules[rule].addTo(tuple(syllable))
        self.rules['word'].addTo(tuple(wordPattern))

    def splitSyllableRule(self, syllableRule, phonology):
        """Replace syllable rules with onset/nucleus/coda pattern.

        Each pattern of the rule is cut into an onset (everything before the
        first nucleus phoneme), a nucleus (up to the first following coda
        phoneme) and a coda (the rest). The parts are recorded in the
        '<rule>-onset', '<rule>-nucleus' and '<rule>-coda' rules, and the
        pattern itself is replaced by the list of its non-empty parts.
        """
        newDist = Distribution()
        oldDist = self.rules[syllableRule]
        # Add onset/nucleus/coda rules
        onsetRule = syllableRule + '-onset'
        nucleusRule = syllableRule + '-nucleus'
        codaRule = syllableRule + '-coda'
        partRules = (onsetRule, nucleusRule, codaRule)
        for rule in partRules:
            self.rules[rule] = Distribution()
        # For each pattern, split into onset/nucleus/coda
        for pattern, occurences in oldDist.items.items():
            parts = {rule: [] for rule in partRules}
            current = onsetRule
            for phoneme in pattern:
                # Check if there is a change of element
                if (current == onsetRule) and phonology.isNucleus(phoneme):
                    current = nucleusRule
                elif (current == nucleusRule) and phonology.isCoda(phoneme):
                    current = codaRule
                # Add to the respective list
                parts[current].append(phoneme)
            # Add to the specific distributions and determine the pattern in new distribution
            distPattern = []
            for rule in partRules:
                if len(parts[rule]) != 0:
                    distPattern.append(rule)
                    self.rules[rule].addTo(tuple(parts[rule]), occurences)
            newDist.addTo(tuple(distPattern), occurences)
        # Replace the old rules with the new rules
        self.rules[syllableRule] = newDist

    def cleanRules(self):
        """Remove the empty rules"""
        self.rules = {x: dist for x, dist in self.rules.items() if not dist.isEmpty()}

    def fromExamples(self, file, phonology):
        """Train a rule generator on an example file.

        Each line of the file is one word, written as phoneme ids separated by
        spaces, with the stress and syllable break ids of the phonology used
        as syllable separators.

        Raises:
            Exception: if an item is not an id of the phonology, or if the
                phonology has no stress or syllable break phoneme.
        """
        stressId = phonology.getStress()
        syllableBreakId = phonology.getSyllableBreak()
        # Words are modelled as lists of syllables, with one of those being stressed (optionally)
        # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
        # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
        #
        # Add the 'word' rule, and syllable rules, initialized with an empty distribution
        self.rules['word'] = Distribution()
        syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
        for x in syllableRules:
            self.rules[x] = Distribution()
        # Step 1: open the file and find how words look like
        # Ids may contain IPA characters: read as UTF-8 regardless of the locale
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                # Trailing spaces produce empty items, which are not phonemes: drop them
                items = [item for item in row if item != '']
                if len(items) == 0:
                    continue
                # Check the items in row
                for item in items:
                    if not phonology.has(item):
                        raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                # Process the row
                self.processRowFromExample(items, stressId, syllableBreakId)
        # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
        for x in syllableRules:
            self.splitSyllableRule(x, phonology)
        # Step 3: remove the empty rules
        self.cleanRules()

    @staticmethod
    def randomOccurences(mean, range):
        """Generate a random integer in [mean-range, mean+range], both bounds included."""
        # random.randint includes its upper bound
        return random.randint(mean - range, mean + range)

    @staticmethod
    def isStressPosition(position, numberSyllables, stressPosition):
        """Check if a given position is the position of the stress. The position goes from 1 to numberSyllables included.

        A positive stressPosition counts from the first syllable, a negative
        one from the last syllable. When the word is too short for the
        requested position, the stress falls on the closest end of the word.
        A stressPosition of zero never matches.
        """
        if stressPosition > 0:
            # Clamp to the last syllable of short words
            return position == min(stressPosition, numberSyllables)
        if stressPosition < 0:
            # Clamp to the first syllable of short words
            return position == max(numberSyllables + 1 + stressPosition, 1)
        return False

    def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
        """
        Generate a rule-based generator just from a phonology and some parameters.
        - minNumberSyllables must be strictly positive.
        - maxNumberSyllables must be greater than or equal to minNumberSyllables
        - stressPosition indicates on which syllable the stress occurs.
          Positive index count from the beginning to the end (with the first syllable being at index 1).
          Negative index count from the end to the beginning (with the last syllable being at index -1)
          Set this to zero if no stress should be generated.
        - distributionMean indicates the medium value for the occurences of a phoneme
        - distributionRange indicates the maximum absolute difference from distributionMean for the occurences of phonemes and syllables.
          It must be strictly lower than distributionMean, so that no item gets zero occurences.

        Raises an Exception when a parameter is out of bounds, or when the
        phonology has no stress or syllable break phoneme.
        """
        # Reinitialize
        self.phonology = phonology.id
        self.rules = {}
        # Check the parameters
        if minNumberSyllables < 1:
            raise Exception("Minimum number of syllables must be strictly positive. Given", minNumberSyllables)
        if maxNumberSyllables < minNumberSyllables:
            raise Exception("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
        if maxNumberSyllables < abs(stressPosition):
            raise Exception("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
        if distributionMean < 1:
            raise Exception("Distribution mean must be strictly positive. Given", distributionMean)
        if distributionMean <= distributionRange:
            raise Exception("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
        if distributionRange < 0:
            raise Exception("Distribution range must be positive or nul. Given", distributionRange)

        def occurences():
            return RuleGenerator.randomOccurences(distributionMean, distributionRange)

        # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
        stressId = phonology.getStress()
        syllableBreakId = phonology.getSyllableBreak()
        isStressed = (stressPosition != 0) and phonology.hasStressedVowels()
        # Add the 'word' rule, initialized with an empty distribution
        self.rules['word'] = Distribution()
        # Add the syllable rules and word patterns
        syllableRules = []
        syllableRulesToTags = {}
        if minNumberSyllables == 1:
            syllableRules.append('single')
            syllableRulesToTags['single'] = ['#single']
            wordPattern = []
            if isStressed:
                syllableRulesToTags['single'].append('#stressed')
                wordPattern.append(stressId)
            wordPattern.append('single')
            self.rules['word'].addTo(tuple(wordPattern), occurences())
        if maxNumberSyllables > 1:
            positions = ['initial', 'middle', 'final']
            syllableRules = syllableRules + positions
            for name in positions:
                syllableRulesToTags[name] = ['#' + name]
            if isStressed:
                # Each position gets a stressed variant; the plain one becomes unstressed
                syllableRules = syllableRules + [name + '-stressed' for name in positions]
                for name in positions:
                    syllableRulesToTags[name + '-stressed'] = ['#' + name, '#stressed']
                    syllableRulesToTags[name].append('#unstressed')
            # One word pattern per number of syllables, within the requested bounds
            for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
                wordPattern = []
                for position in range(1, nbSyllables + 1):
                    stressedHere = isStressed and RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
                    # add syllable separator
                    if stressedHere:
                        wordPattern.append(stressId)
                    elif position > 1:
                        wordPattern.append(syllableBreakId)
                    # add syllable
                    if position == 1:
                        rule = 'initial'
                    elif position == nbSyllables:
                        rule = 'final'
                    else:
                        rule = 'middle'
                    if stressedHere:
                        rule = rule + '-stressed'
                    wordPattern.append(rule)
                self.rules['word'].addTo(tuple(wordPattern), occurences())
        # Step 2: Generate the syllable rules
        # Add the rules in the distributions
        phonemeRules = []
        phonemeRulesToTag = {}
        for syllable in syllableRules:
            self.rules[syllable] = Distribution()
            onset = syllable + '-onset'
            nucleus = syllable + '-nucleus'
            coda = syllable + '-coda'
            phonemeRules = phonemeRules + [onset, nucleus, coda]
            ruleTags = syllableRulesToTags[syllable]
            phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
            # Fill the syllable rules
            # For the generated rules, initial and single syllables may not have onset
            if ('#initial' in ruleTags) or ('#single' in ruleTags):
                self.rules[syllable].addTo((nucleus,), occurences())
                self.rules[syllable].addTo((nucleus, coda), occurences())
            self.rules[syllable].addTo((onset, nucleus), occurences())
            self.rules[syllable].addTo((onset, nucleus, coda), occurences())
        # Step 3: Generate the phoneme distributions for each phoneme rule
        for rule in phonemeRules:
            self.rules[rule] = Distribution()
            for phoneme in phonology.getPhonemesFromTags(phonemeRulesToTag[rule]):
                self.rules[rule].addTo((phoneme,), occurences())
        # Step 4: Clean the rules
        self.cleanRules()
|
||
|
||
|
||
# Generator class to instantiate for each value of the 'type' field of a Json structure
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }

def makeGenerator(struct):
    """Build a generator from a JSON structure.

    The class is selected from the 'type' field; an unknown type gives a
    plain Generator. The new object is then filled with fromJsonStruct.
    """
    generatorClass = generatorTypeToClass.get(struct['type'], Generator)
    generator = generatorClass()
    generator.fromJsonStruct(struct)
    return generator
|
||
|
||
class PhonagenFile:
    """A phonagen file: a collection of phonologies and generators, indexed by id."""

    def __init__(self):
        self.generators = {}
        self.phonologies = {}

    def addPhonology(self, phonology):
        """Register a phonology under its id; invalid phonologies are ignored."""
        if not phonology.isValid():
            return
        self.phonologies[phonology.id] = phonology

    def addGenerator(self, generator):
        """Register a generator under its id; invalid generators are ignored."""
        if not generator.isValid():
            return
        self.generators[generator.id] = generator

    def getPhonology(self, id):
        """Return the phonology registered under this id."""
        return self.phonologies[id]

    def getGenerator(self, id):
        """Return the generator registered under this id."""
        return self.generators[id]

    def load(self, file):
        """Load from a JSON file"""
        with open(file, 'r', encoding='utf-8') as inputFile:
            content = json.load(inputFile)
        # Load phonologies
        for phonologyStruct in content['phonologies']:
            loaded = Phonology()
            loaded.fromJsonStruct(phonologyStruct)
            self.addPhonology(loaded)
        # Load generators
        for generatorStruct in content['generators']:
            self.addGenerator(makeGenerator(generatorStruct))

    def writeTo(self, file = ''):
        """Output to a JSON file (or stdout)"""
        phonologyStructs = [p.toJsonStruct() for p in self.phonologies.values()]
        generatorStructs = [g.toJsonStruct() for g in self.generators.values()]
        outputStruct = { 'phonologies': phonologyStructs, 'generators': generatorStructs }
        # An empty file name means the standard output
        if file == '':
            json.dump(outputStruct, sys.stdout, ensure_ascii=False)
            return
        with open(file, 'w', encoding='utf-8') as outputFile:
            json.dump(outputStruct, outputFile, ensure_ascii=False)

    def mergeFrom(self, otherFile):
        """Add all phonologies and generators from the other file into this one."""
        for other in otherFile.phonologies.values():
            self.addPhonology(other)
        for other in otherFile.generators.values():
            self.addGenerator(other)

    def generateWord(self, generator = ''):
        """Generate a word and return it as a table of transcription -> string.

        generator is the id of the generator to use; when it is empty, a
        generator is picked at random among the registered ones.
        """
        chosenId = generator if generator != '' else random.choice(list(self.generators))
        chosen = self.generators[chosenId]
        idList = chosen.generateWord()
        return self.phonologies[chosen.phonology].formatWord(idList)
|