Add list2rule generator (generating rules from examples)

This commit is contained in:
Feufochmar 2018-06-10 22:55:04 +02:00
parent 6ec27d4429
commit a618ce0530
2 changed files with 285 additions and 7 deletions

View File

@ -0,0 +1,31 @@
#! /usr/bin/env python3
import argparse
import phonagen
def parseArgs():
    """Parse the command line of the list-to-rule converter.

    Reads the arguments from sys.argv and returns the argparse namespace with
    the attributes: file, id, description, phonology, phonologyfile, output.
    --id, --phonology and --phonologyfile are required; --description and
    --output default to an empty string.
    """
    # Define argument parser
    parser = argparse.ArgumentParser(description='Convert an example list to a rule generator.')
    parser.add_argument('file', metavar='listfile', help='list file to convert')
    parser.add_argument('--id', metavar='id', help='id of the generator', required = True)
    # The description is given to the generator, not to the phonology
    parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
    parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which is based the generator', required = True)
    parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; mandatory; phonology will be present in the output', required = True)
    parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')
    # Parse arguments
    return parser.parse_args()
# Main
if __name__ == '__main__':
    args = parseArgs()
    generator = phonagen.RuleGenerator(id = args.id, description = args.description, phonology = args.phonology)
    # Load the phonology the generator is based on from the given phonology file
    phonologyFile = phonagen.PhonagenFile()
    phonologyFile.load(args.phonologyfile)
    phonology = phonologyFile.getPhonology(args.phonology)
    # Train the generator on the example list
    generator.fromExamples(args.file, phonology)
    # Output a file containing both the phonology and the generator
    phonagenFile = phonagen.PhonagenFile()
    phonagenFile.addPhonology(phonology)
    phonagenFile.addGenerator(generator)
    phonagenFile.writeTo(args.output)

View File

@ -4,6 +4,7 @@ import io
import sys
import csv
import random
import unicodedata
class Phonology:
"""Phonology class"""
@ -79,6 +80,72 @@ class Phonology:
entry.update({'description': ''})
self.entries.update({entry['id']: entry})
def formatWord(self, idList):
    """Return a table of transcription -> string corresponding to the same word.

    idList is a sequence of phoneme ids (keys of self.entries). For each
    transcription in self.transcriptions, the result maps the transcription
    name to the concatenation of that transcription of each phoneme, in order.
    An empty idList gives an empty string for every transcription.
    Raises KeyError if an id is not in the phonology.
    """
    result = {}
    for transcription in self.transcriptions:
        # Strings are immutable: the word is built by joining, not by appending
        result[transcription] = ''.join(self.entries[x][transcription] for x in idList)
    return result
def getStress(self):
    """Return the id of the phoneme used as stress mark.

    Entries whose description contains the #stress tag are preferred. Without
    any tagged entry, the first entry whose phonemic transcription contains
    "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) is used.
    Raises an Exception if the phonology has no such phoneme.
    """
    entries = self.entries.values()
    # First choice: explicit tag in the description
    tagged = [entry['id'] for entry in entries if '#stress' in entry['description']]
    if len(tagged) != 0:
        return tagged[0]
    # Second choice: guess from the phonemic transcription
    guessed = [entry['id'] for entry in entries if ("'" in entry['phoneme']) or ("ˈ" in entry['phoneme'])]
    if len(guessed) != 0:
        return guessed[0]
    raise Exception('No stress phoneme in phonology', self.id)
def getSyllableBreak(self):
    """Return the id of the phoneme used as syllable break.

    Entries whose description contains the #syllable-break tag are preferred.
    Without any tagged entry, the first entry whose phonemic transcription
    contains '.' (full stop, u+002E) is used.
    Raises an Exception if the phonology has no such phoneme.
    """
    entries = self.entries.values()
    # First choice: explicit tag in the description
    tagged = [entry['id'] for entry in entries if '#syllable-break' in entry['description']]
    if len(tagged) != 0:
        return tagged[0]
    # Second choice: guess from the phonemic transcription
    guessed = [entry['id'] for entry in entries if '.' in entry['phoneme']]
    if len(guessed) != 0:
        return guessed[0]
    raise Exception('No syllable break phoneme in phonology', self.id)
# Base characters considered as vowels when guessing from a phonemic transcription
vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
@staticmethod
def isVowel(phoneme):
    """Check if a phonemic transcription starts with a vowel character.

    The transcription is decomposed (NFD) so that only the base character of
    the first symbol is compared to the vowel list. An empty transcription is
    not a vowel.
    Declared static (no self parameter) so that it can be called from the
    class as well as from an instance.
    """
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)
# Base characters considered as consonants when guessing from a phonemic transcription
consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"
@staticmethod
def isConsonant(phoneme):
    """Check if a phonemic transcription starts with a consonant character.

    The transcription is decomposed (NFD) so that only the base character of
    the first symbol is compared to the consonant list. An empty transcription
    is not a consonant.
    Declared static (no self parameter) so that it can be called from the
    class as well as from an instance.
    """
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
def isOnset(self, id):
    """Tell whether the phoneme of the given id can be used as an onset.

    The #onset and #consonant tags of the description mark an onset. The
    #vowel, #nucleus and #coda tags exclude it. Without any of those tags,
    the answer is guessed from the phonemic transcription (consonant check).
    """
    description = self.entries[id]['description']
    if ('#onset' in description) or ('#consonant' in description):
        return True
    # Tagged as something else: do not guess
    if any(tag in description for tag in ('#vowel', '#nucleus', '#coda')):
        return False
    return Phonology.isConsonant(self.entries[id]['phoneme'])
def isNucleus(self, id):
    """Tell whether the phoneme of the given id can be used as a nucleus.

    The #nucleus and #vowel tags of the description mark a nucleus. The
    #consonant, #onset and #coda tags exclude it. Without any of those tags,
    the answer is guessed from the phonemic transcription (vowel check).
    """
    description = self.entries[id]['description']
    if ('#nucleus' in description) or ('#vowel' in description):
        return True
    # Tagged as something else: do not guess
    if any(tag in description for tag in ('#consonant', '#onset', '#coda')):
        return False
    return Phonology.isVowel(self.entries[id]['phoneme'])
def isCoda(self, id):
    """Tell whether the phoneme of the given id can be used as a coda.

    The #coda and #consonant tags of the description mark a coda. The
    #vowel, #nucleus and #onset tags exclude it. Without any of those tags,
    the answer is guessed from the phonemic transcription (consonant check).
    """
    description = self.entries[id]['description']
    if ('#coda' in description) or ('#consonant' in description):
        return True
    # Tagged as something else: do not guess
    if any(tag in description for tag in ('#vowel', '#nucleus', '#onset')):
        return False
    return Phonology.isConsonant(self.entries[id]['phoneme'])
class Distribution:
"""Discrete distribution"""
def __init__(self):
@ -91,7 +158,7 @@ class Distribution:
self.items.update({value: oc})
def pickFrom(self):
    """Pick an item at random, each item being weighted by its number of occurences.

    The keys of self.items are the candidates and the associated values are
    used as weights. Raises IndexError (from random.choices) if the
    distribution is empty.
    """
    # values() must be called: the bound method itself is not iterable
    return random.choices(list(self.items.keys()), list(self.items.values()))[0]
def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
    """Return the distribution as a list with one dictionary per item.

    In each dictionary, the item is stored under the key itemRef and its
    number of occurences under the key occurencesRef.
    """
    struct = []
    for item, occurences in self.items.items():
        struct.append({itemRef: item, occurencesRef: occurences})
    return struct
@ -101,6 +168,9 @@ class Distribution:
for item in struct:
self.items.update({item[itemRef]: item[occurencesRef]})
def isEmpty(self):
    """Tell whether the distribution contains no item."""
    return not self.items
class Generator:
"""Parent class for all generators"""
def __init__(self, id = '', description = '', phonology = ''):
@ -122,6 +192,9 @@ class Generator:
self.description = struct['description']
self.phonology = struct['phonology']
def generateWord(self):
    """Generate a word; must be overridden by the concrete generators.

    Raises NotImplementedError (a subclass of Exception) when called on the
    abstract generator.
    """
    raise NotImplementedError('Word generation not supported on abstract generator')
class ChainGenerator(Generator):
"""Chains-based generator"""
def __init__(self, order = 1, **kwargs):
@ -151,10 +224,10 @@ class ChainGenerator(Generator):
fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
for row in fileReader:
if len(row) != 0:
row.append("") # Add terminator element (empty string)
previous = tuple("" for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
row.append('') # Add terminator element (empty string)
previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
for item in row:
if (item != "") and (phonology.isValid()) and (not phonology.has(item)):
if (item != '') and (phonology.isValid()) and (not phonology.has(item)):
raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
if previous in self.chains:
self.chains[previous].addTo(item)
@ -164,6 +237,17 @@ class ChainGenerator(Generator):
self.chains.update({previous: dist})
previous = previous[1:] + (item,)
def generateWord(self):
    """Generate a word as a list of phoneme ids by walking the chains.

    Starting from the initial sequence (self.order empty strings), the next
    item is picked from the distribution associated to the current sequence
    until the terminator (empty string) is picked. The terminator is not part
    of the returned list.
    """
    word = []
    context = ('',) * self.order # Initial sequence
    while True:
        picked = self.chains[context].pickFrom()
        if picked == '':
            # Terminator: the word is complete
            break
        word.append(picked)
        # Slide the sequence: drop the oldest item, add the new one
        context = context[1:] + (picked,)
    return word
class RuleGenerator(Generator):
"""Rules-based generator"""
def __init__(self, **kwargs):
@ -173,12 +257,167 @@ class RuleGenerator(Generator):
def toJsonStruct(self):
    """Return the structure of the parent class completed with the type and the rules.

    The type is 'rules'. Each rule is output as a dictionary with its id and
    its distribution, whose items are stored under 'pattern' and 'occurences'.
    """
    serialized = super().toJsonStruct()
    rules = []
    for ruleId in self.rules:
        distribution = self.rules[ruleId].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')
        rules.append({'id': ruleId, 'distribution': distribution})
    serialized['type'] = 'rules'
    serialized['rules'] = rules
    return serialized
def fromJsonStruct(self, struct):
    """Load the generator from a structure: the parent class fields, then the rules.

    Each element of struct['rules'] gives the id of a rule and its
    distribution as a list of pattern / occurences dictionaries.
    """
    super().fromJsonStruct(struct)
    for ruleStruct in struct['rules']:
        loaded = Distribution()
        # Patterns are used as dictionary keys: they are converted from lists to tuples
        converted = []
        for item in ruleStruct['distribution']:
            converted.append({'pattern': tuple(item['pattern']), 'occurences': item['occurences']})
        loaded.fromJsonStruct(converted, itemRef = 'pattern', occurencesRef = 'occurences')
        self.rules.update({ruleStruct['id']: loaded})
def generatePattern(self, pattern):
    """Expand a pattern into a list of ids.

    Each element of the pattern that is the id of a rule is replaced,
    recursively, by the expansion of a pattern picked from the distribution
    of that rule. The other elements are kept as they are.
    """
    output = []
    for x in pattern:
        if x in self.rules:
            # The expansion is a list: its elements are added one by one
            output.extend(self.generatePattern(self.rules[x].pickFrom()))
        else:
            output.append(x)
    return output
def generateWord(self):
    """Generate a word by expanding a pattern picked from the 'word' rule."""
    wordPattern = self.rules['word'].pickFrom()
    return self.generatePattern(wordPattern)
def processRowFromExample(self, row, stressId, syllableBreakId):
    """Add one example word to the 'word' rule and to the syllable rules.

    row is the list of ids of the word, in which stressId and syllableBreakId
    separate the syllables; a stress id marks the syllable following it as
    stressed. The rules 'word', 'single', 'initial', 'final', 'middle' and
    their '-stressed' variants must already be present in self.rules.
    Examples with more than one stress are reported on standard output and
    skipped. Empty ids are ignored, and an example without any syllable is
    skipped.
    """
    # Check the number of stress
    if row.count(stressId) > 1:
        print("Too much stress in " + str(row) + ": skip the example")
        return
    # Build the syllable list
    syllables = []
    currentSyllable = []
    stressedSyllableIdx = -1
    for x in row:
        if (x == stressId) or (x == syllableBreakId):
            # In case of syllable separator, only add the syllable to the list if it is not empty
            if len(currentSyllable) != 0:
                syllables.append(currentSyllable)
                currentSyllable = []
            # If current id is stress, the stressed syllable is the next one to be added
            if x == stressId:
                stressedSyllableIdx = len(syllables)
        elif x != '':
            # Empty ids (e.g. empty fields of the example file) are not phonemes
            currentSyllable.append(x)
    # After the loop, the current syllable should be non-empty, add it to the list of syllables
    if len(currentSyllable) != 0:
        syllables.append(currentSyllable)
    # No syllable: nothing to learn, and an empty word pattern must not be registered
    if len(syllables) == 0:
        return
    # Single syllable case
    if len(syllables) == 1:
        if stressedSyllableIdx == 0:
            self.rules['word'].addTo((stressId, 'single'))
        else:
            self.rules['word'].addTo(('single',))
        self.rules['single'].addTo(tuple(syllables[0]))
        return
    # Other cases
    lastIdx = len(syllables) - 1
    wordPattern = []
    for idx, syllable in enumerate(syllables):
        if idx == 0:
            rule = 'initial'
        elif idx == lastIdx:
            rule = 'final'
        else:
            rule = 'middle'
        separator = syllableBreakId
        if idx == stressedSyllableIdx:
            rule = rule + '-stressed'
            separator = stressId
        # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
        if (separator == stressId) or (idx > 0):
            wordPattern.append(separator)
        # Add the rule to the pattern
        wordPattern.append(rule)
        # The syllable is added to the corresponding rule
        self.rules[rule].addTo(tuple(syllable))
    self.rules['word'].addTo(tuple(wordPattern))
def splitSyllableRule(self, syllableRule, phonology):
    """Replace the patterns of a syllable rule with onset/nucleus/coda patterns.

    Three rules named after the syllable rule with the '-onset', '-nucleus'
    and '-coda' suffixes are created. Each pattern of the syllable rule is cut
    in up to three parts: the onset runs until the first phoneme accepted by
    phonology.isNucleus, the nucleus runs until the first phoneme accepted by
    phonology.isCoda, the coda is the remainder. The non-empty parts are added
    to the corresponding rules, and the syllable rule receives the list of
    the rules used instead of the pattern. The occurences are kept.
    """
    parts = ('onset', 'nucleus', 'coda')
    replacement = Distribution()
    source = self.rules[syllableRule]
    # Add onset/nucleus/coda rules
    ruleNames = {part: syllableRule + '-' + part for part in parts}
    for part in parts:
        self.rules[ruleNames[part]] = Distribution()
    # For each pattern, split into onset/nucleus/coda
    for pattern, occurences in source.items.items():
        split = {part: [] for part in parts}
        current = 'onset'
        for phoneme in pattern:
            # Check if there is a change of element
            if (current == 'onset') and phonology.isNucleus(phoneme):
                current = 'nucleus'
            elif (current == 'nucleus') and phonology.isCoda(phoneme):
                current = 'coda'
            # Add to the respective list
            split[current].append(phoneme)
        # Add to the specific distributions and determine the pattern in new distribution
        layout = []
        for part in parts:
            if len(split[part]) != 0:
                layout.append(ruleNames[part])
                self.rules[ruleNames[part]].addTo(tuple(split[part]), occurences)
        # Add patterns to distributions
        replacement.addTo(tuple(layout), occurences)
    # Replace the old rules with the new rules
    self.rules[syllableRule] = replacement
def fromExamples(self, file, phonology):
    """Train a rule generator on an example file.

    file is the path of a text file read as UTF-8, with one example word per
    line, given as phoneme ids separated by spaces. phonology provides the
    stress and syllable break ids, and is used to validate the ids and to
    split the syllables into onset/nucleus/coda.
    Raises an Exception if an id of the file is not in the phonology.
    """
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    # Words are modelled as lists of syllables, with one of those being stressed (optionally)
    # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
    # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
    #
    # Add the 'word' rule, and syllable rules, initialized with an empty distribution
    self.rules.update({'word': Distribution()})
    syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
    for x in syllableRules:
        self.rules.update({x: Distribution()})
    # Step 1: open the file and find how words look like
    # The encoding is explicit as ids may contain non-ASCII characters; newline='' is what the csv module expects
    with open(file, encoding='utf-8', newline='') as exampleFile:
        fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
        for row in fileReader:
            # Empty fields (e.g. from trailing spaces) are not ids: drop them before processing
            ids = [item for item in row if item != '']
            if len(ids) != 0:
                # Check the items in row
                for item in ids:
                    if not phonology.has(item):
                        raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                # Process the row
                self.processRowFromExample(ids, stressId, syllableBreakId)
    # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
    for x in syllableRules:
        self.splitSyllableRule(x, phonology)
    # Step 3: remove the empty rules
    self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
# Generator classes indexed by type name ('rules' is the 'type' value written by RuleGenerator.toJsonStruct)
# NOTE(review): presumably used by makeGenerator to select the class to instantiate - confirm
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
def makeGenerator(struct):
@ -231,7 +470,7 @@ class PhonagenFile:
json.dump(outputStruct, sys.stdout, ensure_ascii=False)
else:
with open(file, 'w', encoding='utf-8') as outputFile:
json.dump(outputStruct, outputFile, ensure_ascii=False)
json.dump(outputStruct, outputFile, ensure_ascii=False, indent=2)
def mergeFrom(self, otherFile):
"""Add all phonologies and generators from the other file into this one."""
@ -239,3 +478,11 @@ class PhonagenFile:
self.addPhonology(phonology)
for generator in otherFile.generators.values():
self.addGenerator(generator)
def generateWord(self, generator = ''):
    """Generate a word and return its transcriptions.

    generator is the id of the generator to use; if empty, a generator of the
    file is chosen at random. The word is formatted by the phonology the
    generator refers to (see Phonology.formatWord).
    Raises KeyError if the generator or its phonology is not in the file, and
    IndexError if no generator is given and the file has none.
    """
    gen = generator
    if gen == '':
        gen = random.choice(list(self.generators))
    selected = self.generators[gen]
    idList = selected.generateWord()
    phonology = self.phonologies[selected.phonology]
    return phonology.formatWord(idList)