phonagen/py-phonagen/phonagen.py

489 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Common functions and classes for phonagen tools"""
import json
import io
import sys
import csv
import random
import unicodedata
class Phonology:
    """Phonology class

    A phonology is a set of entries, each a dict holding an 'id', a
    'description' and one string per transcription (among them 'phoneme').
    Entries are stored in ``self.entries`` keyed by their id.
    """

    # Base characters used to guess whether a phoneme is a vowel or a consonant
    # when the entry description carries no explicit tag.
    vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
    consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"

    def __init__(self, id = '', description = '', mainTranscription = ''):
        self.id = id
        self.description = description
        self.transcriptions = []
        self.mainTranscription = mainTranscription
        self.entries = {} # id -> entry

    def isValid(self):
        """A phonology is valid as soon as it has a non-empty id"""
        return self.id != ''

    def has(self, id):
        """Check if an entry with the given id exists"""
        return id in self.entries

    def toJsonStruct(self):
        """Convert a Phonology to a Json structure"""
        return { 'id': self.id,
                 'description': self.description,
                 'transcriptions': self.transcriptions,
                 'main-transcription': self.mainTranscription,
                 'entries': list(self.entries.values()) }

    def fromJsonStruct(self, struct):
        """Fill a Phonology from a Json structure"""
        self.id = struct['id']
        self.description = struct['description']
        self.transcriptions = struct['transcriptions']
        self.mainTranscription = struct['main-transcription']
        self.entries = {x['id']: x for x in struct['entries']}

    def fromCsv(self, file):
        """Fill a Phonology from a Csv file

        The first row is the header. Every column other than 'id' and
        'description' is a transcription; a 'phoneme' column and at least one
        other transcription are required. Raises Exception when the header does
        not satisfy those constraints or when the requested main transcription
        is not one of the columns.
        """
        # Transcriptions are IPA text: read as UTF-8 (like PhonagenFile.load),
        # and let the csv module handle line endings itself (newline='')
        with open(file, encoding='utf-8', newline='') as csvfile:
            fileReader = csv.reader(csvfile)
            # get csv header
            header = next(fileReader, None)
            if header is None:
                raise Exception('No header found in file ', file)
            # get the transcriptions (header items not id or description)
            self.transcriptions = [x for x in header if x not in ['id', 'description']]
            # Check: self.transcriptions should contain 'phoneme'
            if 'phoneme' not in self.transcriptions:
                raise Exception('phoneme column not found in ', file)
            # Check: self.transcriptions should have at least two items
            if len(self.transcriptions) < 2:
                raise Exception('No transcription found outside phoneme in file ', file, 'Did you named it id or description ?')
            # get the first header item which is not one of those: id, description, phoneme
            guessedMainTranscription = next(x for x in header if x not in ['id', 'description', 'phoneme'])
            # If main-transcription was not given on the command line, use the guess as main-transcription
            if self.mainTranscription == '':
                self.mainTranscription = guessedMainTranscription
            # Check: self.mainTranscription should be in self.transcriptions
            if self.mainTranscription not in self.transcriptions:
                raise Exception('main-transcription', self.mainTranscription, 'not in list of transcriptions')
            # If id was not given on the command line, use the mainTranscription as the id
            if self.id == '':
                self.id = self.mainTranscription
            hasId = 'id' in header
            hasDescription = 'description' in header
            # parse entries
            for row in fileReader:
                # All absent elements are set to ''; cells beyond the header width are ignored
                entry = {column: (row[i] if i < len(row) else '') for i, column in enumerate(header)}
                # if both phoneme and main-transcription are empty, skip the row
                if (entry['phoneme'] == '') and (entry[self.mainTranscription] == ''):
                    continue
                # if id is not provided, generate it
                if not hasId:
                    entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
                # if description is not provided, add an empty one
                if not hasDescription:
                    entry['description'] = ''
                self.entries[entry['id']] = entry

    def formatWord(self, idList):
        """Return a table of transcription -> string corresponding to the same word"""
        # For each transcription, concatenate what every entry of the word contributes
        return {name: ''.join(self.entries[x][name] for x in idList) for name in self.transcriptions}

    def getStress(self):
        """Return the phoneme id of the stress phoneme

        Raises Exception if no entry qualifies.
        """
        # search for #stress tag in description
        found = [x['id'] for x in self.entries.values() if '#stress' in x['description']]
        if len(found) == 0:
            # if not tagged, search for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in phoneme transcription
            found = [x['id'] for x in self.entries.values() if ("'" in x['phoneme']) or ("ˈ" in x['phoneme'])]
        if len(found) == 0:
            raise Exception('No stress phoneme in phonology', self.id)
        return found[0]

    def getSyllableBreak(self):
        """Return the phoneme id of the syllable break phoneme

        Raises Exception if no entry qualifies.
        """
        # search for #syllable-break tag in description
        found = [x['id'] for x in self.entries.values() if '#syllable-break' in x['description']]
        if len(found) == 0:
            # if not tagged, search for '.' (full stop, u+002E) in phoneme transcription
            found = [x['id'] for x in self.entries.values() if '.' in x['phoneme']]
        if len(found) == 0:
            raise Exception('No syllable break phoneme in phonology', self.id)
        return found[0]

    @staticmethod
    def isVowel(phoneme):
        """Check if the first character of the phoneme, once decomposed (NFD), is listed in Phonology.vowels"""
        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)

    @staticmethod
    def isConsonant(phoneme):
        """Check if the first character of the phoneme, once decomposed (NFD), is listed in Phonology.consonants"""
        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)

    def isOnset(self, id):
        """Check if an id corresponds to an onset, either from description, or if not available, guessed from the phonemic transcription"""
        entry = self.entries[id]
        description = entry['description']
        result = ('#onset' in description) or ('#consonant' in description)
        # Only guess from the transcription when the description carries no positional tag at all
        if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#coda' not in description):
            result = Phonology.isConsonant(entry['phoneme'])
        return result

    def isNucleus(self, id):
        """Check if an id corresponds to a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
        entry = self.entries[id]
        description = entry['description']
        result = ('#nucleus' in description) or ('#vowel' in description)
        # Only guess from the transcription when the description carries no positional tag at all
        if (not result) and ('#consonant' not in description) and ('#onset' not in description) and ('#coda' not in description):
            result = Phonology.isVowel(entry['phoneme'])
        return result

    def isCoda(self, id):
        """Check if an id corresponds to a coda, either from description, or if not available, guessed from the phonemic transcription"""
        entry = self.entries[id]
        description = entry['description']
        result = ('#coda' in description) or ('#consonant' in description)
        # Only guess from the transcription when the description carries no positional tag at all
        if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#onset' not in description):
            result = Phonology.isConsonant(entry['phoneme'])
        return result
class Distribution:
    """Discrete distribution: each value is stored with its number of occurences"""

    def __init__(self):
        self.items = {}  # value -> occurences

    def addTo(self, value, occurences = 1):
        """Record `occurences` more occurences of `value`"""
        if value in self.items:
            total = occurences + self.items[value]
        else:
            total = occurences
        self.items[value] = total

    def pickFrom(self):
        """Draw one value at random, weighted by the number of occurences"""
        candidates = list(self.items.keys())
        weights = list(self.items.values())
        drawn = random.choices(candidates, weights)
        return drawn[0]

    def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
        """Return a list of dicts, one per value, using the given key names"""
        return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]

    def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
        """Replace the content with the records of a list built like the output of toJsonStruct"""
        self.items = fresh = {}
        for record in struct:
            fresh[record[itemRef]] = record[occurencesRef]

    def isEmpty(self):
        """Check if no value was recorded"""
        return not len(self.items)
class Generator:
    """Parent class for all generators

    Carries the fields shared by every generator (id, description, id of the
    phonology it refers to). Subclasses set isTyped to True.
    """

    def __init__(self, id = '', description = '', phonology = ''):
        self.id, self.description, self.phonology = id, description, phonology
        # The abstract generator has no type; it is never considered valid
        self.isTyped = False

    def isValid(self):
        """A generator is valid when it has a non-empty id and a concrete type"""
        if self.id == '':
            return False
        return self.isTyped

    def toJsonStruct(self):
        """Return the common fields as a Json structure"""
        struct = {}
        struct['id'] = self.id
        struct['description'] = self.description
        struct['phonology'] = self.phonology
        return struct

    def fromJsonStruct(self, struct):
        """Fill the common fields from a Json structure"""
        for field in ('id', 'description', 'phonology'):
            setattr(self, field, struct[field])

    def generateWord(self):
        """Not available on the parent class: always raises Exception"""
        raise Exception('Word generation not supported on abstract generator')
class ChainGenerator(Generator):
    """Chains-based generator

    ``self.chains`` maps a tuple of the `order` previous ids to the
    Distribution of the ids seen right after it. The empty string is used both
    as padding before the first id and as the end-of-word terminator.
    """

    def __init__(self, order = 1, **kwargs):
        super().__init__(**kwargs)
        self.order = order
        self.chains = {} # input -> distribution of outputs
        self.isTyped = True

    def toJsonStruct(self):
        """Convert the generator to a Json structure"""
        struct = super().toJsonStruct()
        struct.update({'type': 'chains',
                       'order': self.order,
                       'chains': [{'input': x, 'possible-outputs': self.chains[x].toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')} for x in self.chains]})
        return struct

    def fromJsonStruct(self, struct):
        """Fill the generator from a Json structure"""
        super().fromJsonStruct(struct)
        self.order = struct['order']
        for chainStruct in struct['chains']:
            dist = Distribution()
            dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
            # Json only has lists: convert the input back to a tuple so it can be used as a key
            self.chains[tuple(chainStruct['input'])] = dist

    def fromExamples(self, file, phonology):
        """Train a chain generator on an example file

        Each line of the file is one word, given as space-separated ids. When
        `phonology` is valid, every id must exist in it, otherwise Exception is
        raised.
        """
        # Only validate ids against a usable phonology
        checkIds = phonology.isValid()
        # Read as UTF-8 (like PhonagenFile.load), and let the csv module handle line endings itself
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                # Drop the empty items produced by trailing or repeated spaces: the
                # empty string is reserved for the terminator and must not appear mid-word
                items = [x for x in row if x != '']
                if len(items) == 0:
                    continue
                if checkIds:
                    for item in items:
                        if not phonology.has(item):
                            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                items.append('') # Add terminator element (empty string)
                previous = ('',) * self.order # Initial sequence (empty strings, length = self.order)
                for item in items:
                    if previous not in self.chains:
                        self.chains[previous] = Distribution()
                    self.chains[previous].addTo(item)
                    # Slide the window; written this way it keeps length self.order even for order 0
                    previous = (previous + (item,))[1:]

    def generateWord(self):
        """Return a list of ids drawn by walking the chains until the terminator is picked"""
        outputIdList = []
        previous = ('',) * self.order # Initial sequence (empty strings, length = self.order)
        while True:
            nextItem = self.chains[previous].pickFrom()
            if nextItem == '':
                # Terminator: the word is complete
                return outputIdList
            outputIdList.append(nextItem)
            previous = (previous + (nextItem,))[1:]
class RuleGenerator(Generator):
    """Rules-based generator

    ``self.rules`` maps a rule id to a Distribution of patterns. A pattern is a
    tuple whose items are either rule ids (expanded recursively) or anything
    else (emitted as is). Generation starts from the 'word' rule.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.rules = {}
        self.isTyped = True

    def toJsonStruct(self):
        """Convert the generator to a Json structure"""
        struct = super().toJsonStruct()
        struct.update({'type': 'rules',
                       'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')} for x in self.rules]})
        return struct

    def fromJsonStruct(self, struct):
        """Fill the generator from a Json structure"""
        super().fromJsonStruct(struct)
        for ruleStruct in struct['rules']:
            dist = Distribution()
            # The pattern should be converted from a list to a tuple
            dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences':x['occurences']} for x in ruleStruct['distribution']], itemRef = 'pattern', occurencesRef = 'occurences')
            self.rules[ruleStruct['id']] = dist

    def generatePattern(self, pattern):
        """Expand a pattern into a flat list: rule ids are replaced by the expansion of a pattern picked from that rule"""
        output = []
        for x in pattern:
            if x in self.rules:
                output.extend(self.generatePattern(self.rules[x].pickFrom()))
            else:
                output.append(x)
        return output

    def generateWord(self):
        """Return a list of ids generated from the 'word' rule"""
        return self.generatePattern(self.rules['word'].pickFrom())

    def processRowFromExample(self, row, stressId, syllableBreakId):
        """Record one example word (a list of ids) into the 'word' rule and the syllable rules

        The 'word' rule and the syllable rules must already exist in
        self.rules. Rows with more than one stress, or without any syllable,
        are skipped.
        """
        # Check the number of stress
        nbStress = row.count(stressId)
        if nbStress > 1:
            print("Too much stress in " + str(row) + ": skip the example")
            return
        # Build the syllable list
        syllables = []
        currentSyllable = []
        stressedSyllableIdx = -1
        syllableIdx = 0
        for x in row:
            # Append to the current syllable if not a syllable separator
            if (x != stressId) and (x != syllableBreakId):
                currentSyllable.append(x)
            # In case of syllable separator, only add the syllable to the list if it is not empty
            elif len(currentSyllable) != 0:
                syllables.append(currentSyllable)
                currentSyllable = []
                syllableIdx = syllableIdx + 1
            # If current id is stress, remember the position of the stressed syllable
            # (the stress marks the syllable which starts right after it)
            if (x == stressId):
                stressedSyllableIdx = syllableIdx
        # After the loop, the current syllable should be non-empty, add it to the list of syllables
        if len(currentSyllable) != 0:
            syllables.append(currentSyllable)
        # A row made only of separators holds no word: recording it would add an empty word pattern
        if len(syllables) == 0:
            return
        # Single syllable case
        if len(syllables) == 1:
            if stressedSyllableIdx == 0:
                self.rules['word'].addTo(tuple([stressId, 'single']))
            else:
                self.rules['word'].addTo(tuple(['single']))
            self.rules['single'].addTo(tuple(syllables[0]))
            return
        # Other cases
        lastIdx = len(syllables) - 1
        wordPattern = []
        for idx, syllable in enumerate(syllables):
            separator = syllableBreakId
            if idx == 0:
                rule = 'initial'
            elif idx == lastIdx:
                rule = 'final'
            else:
                rule = 'middle'
            if idx == stressedSyllableIdx:
                rule = rule + '-stressed'
                separator = stressId
            # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
            if (separator == stressId) or (idx > 0):
                wordPattern.append(separator)
            # Add the rule to the pattern
            wordPattern.append(rule)
            # The syllable is added to the corresponding rule
            self.rules[rule].addTo(tuple(syllable))
        self.rules['word'].addTo(tuple(wordPattern))

    def splitSyllableRule(self, syllableRule, phonology):
        """Replace syllable rules with onset/nucleus/coda pattern

        Three rules (<rule>-onset, <rule>-nucleus, <rule>-coda) are created and
        every pattern of the syllable rule is replaced by the sequence of those
        rules it uses.
        """
        newDist = Distribution()
        oldDist = self.rules[syllableRule]
        # Add onset/nucleus/coda rules
        onsetRule = syllableRule + '-onset'
        nucleusRule = syllableRule +'-nucleus'
        codaRule = syllableRule + '-coda'
        self.rules[onsetRule] = Distribution()
        self.rules[nucleusRule] = Distribution()
        self.rules[codaRule] = Distribution()
        # For each pattern, split into onset/nucleus/coda
        for pattern in oldDist.items:
            # Phonemes go to the onset until a nucleus is met, then to the
            # nucleus until a coda is met, then to the coda
            stage = 'onset'
            parts = {'onset': [], 'nucleus': [], 'coda': []}
            for phoneme in pattern:
                # Check is there is a change of element
                if (stage == 'onset') and phonology.isNucleus(phoneme):
                    stage = 'nucleus'
                elif (stage == 'nucleus') and phonology.isCoda(phoneme):
                    stage = 'coda'
                # Add to the respective list
                parts[stage].append(phoneme)
            # Add to the specific distributions and determine the pattern in new distribution
            occurences = oldDist.items[pattern]
            distPattern = []
            for partRule, part in ((onsetRule, parts['onset']), (nucleusRule, parts['nucleus']), (codaRule, parts['coda'])):
                if len(part) != 0:
                    distPattern.append(partRule)
                    self.rules[partRule].addTo(tuple(part), occurences)
            # Add patterns to distributions
            newDist.addTo(tuple(distPattern), occurences)
        # Replace the old rules with the new rules
        self.rules[syllableRule] = newDist

    def fromExamples(self, file, phonology):
        """Train a rule generator on an example file

        Each line of the file is one word, given as space-separated ids which
        must all exist in `phonology`, otherwise Exception is raised.
        """
        stressId = phonology.getStress()
        syllableBreakId = phonology.getSyllableBreak()
        # Words are modelled as lists of syllables, with one of those being stressed (optionally)
        # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
        # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
        #
        # Add the 'word' rule, and syllable rules, initialized with an empty distribution
        self.rules['word'] = Distribution()
        syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
        for x in syllableRules:
            self.rules[x] = Distribution()
        # Step 1: open the file and find how words look like
        # Read as UTF-8 (like PhonagenFile.load), and let the csv module handle line endings itself
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                # Drop the empty items produced by trailing or repeated spaces:
                # they are not phonology ids and would break the syllable split
                items = [x for x in row if x != '']
                if len(items) == 0:
                    continue
                # Check the items in row
                for item in items:
                    if not phonology.has(item):
                        raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                # Process the row
                self.processRowFromExample(items, stressId, syllableBreakId)
        # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
        for x in syllableRules:
            self.splitSyllableRule(x, phonology)
        # Step 3: remove the empty rules
        self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
# Generator class to instanciate for each value of the 'type' field
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }


def makeGenerator(struct):
    """Build a generator from a JSON structure

    The class is chosen from struct['type']; an unknown type gives a plain
    Generator. The instance is then filled with fromJsonStruct.
    """
    generatorClass = generatorTypeToClass.get(struct['type'], Generator)
    generator = generatorClass()
    generator.fromJsonStruct(struct)
    return generator
class PhonagenFile:
    """A phonagen file, with phonologies and generators"""

    def __init__(self):
        self.phonologies = {}  # id -> phonology
        self.generators = {}   # id -> generator

    def addPhonology(self, phonology):
        """Register a phonology under its id; a phonology which is not valid is ignored"""
        if (phonology.isValid()):
            self.phonologies[phonology.id] = phonology

    def addGenerator(self, generator):
        """Register a generator under its id; a generator which is not valid is ignored"""
        if (generator.isValid()):
            self.generators[generator.id] = generator

    def getPhonology(self, id):
        """Return the phonology with the given id (KeyError if absent)"""
        return self.phonologies[id]

    def getGenerator(self, id):
        """Return the generator with the given id (KeyError if absent)"""
        return self.generators[id]

    def load(self, file):
        """Load from a JSON file"""
        with open(file, 'r', encoding='utf-8') as inputFile:
            jsonStruct = json.load(inputFile)
        # Load phonologies
        for struct in jsonStruct['phonologies']:
            phonology = Phonology()
            phonology.fromJsonStruct(struct)
            self.addPhonology(phonology)
        # Load generators
        for struct in jsonStruct['generators']:
            self.addGenerator(makeGenerator(struct))

    def writeTo(self, file = ''):
        """Output to a JSON file (or stdout)"""
        outputStruct = { 'phonologies': [x.toJsonStruct() for x in self.phonologies.values()],
                         'generators': [x.toJsonStruct() for x in self.generators.values()] }
        if file == '':
            json.dump(outputStruct, sys.stdout, ensure_ascii=False)
        else:
            with open(file, 'w', encoding='utf-8') as outputFile:
                json.dump(outputStruct, outputFile, ensure_ascii=False, indent=2)

    def mergeFrom(self, otherFile):
        """Add all phonologies and generators from the other file into this one."""
        for phonology in otherFile.phonologies.values():
            self.addPhonology(phonology)
        for generator in otherFile.generators.values():
            self.addGenerator(generator)

    def generateWord(self, generator = ''):
        """Generate a word and return its table of transcription -> string

        `generator` is the id of the generator to use; when left empty, one of
        the registered generators is picked at random.
        """
        gen = generator
        if gen == '':
            gen = random.choice(list(self.generators))
        chosen = self.generators[gen]
        idList = chosen.generateWord()
        # The generator only stores the id of its phonology: look the phonology up to format the word
        phonology = self.phonologies[chosen.phonology]
        return phonology.formatWord(idList)