Add list2rule generator (generating rules from examples)
This commit is contained in:
parent
6ec27d4429
commit
a618ce0530
|
@ -0,0 +1,31 @@
|
|||
#! /usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import phonagen
|
||||
|
||||
def parseArgs(argv=None):
    """Parse the command-line arguments of the list-to-rule converter.

    Args:
        argv: optional list of argument strings; when None, argparse falls
            back to the process command line (sys.argv[1:]).

    Returns:
        The argparse namespace with the attributes file, id, description,
        phonology, phonologyfile and output.
    """
    # Define argument parser
    parser = argparse.ArgumentParser(description='Convert an example list to a rule generator.')
    parser.add_argument('file', metavar='listfile', help='list file to convert')
    parser.add_argument('--id', metavar='id', help='id of the generator', required=True)
    # The description is attached to the generator, not to the phonology
    parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
    parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which is based the generator', required=True)
    parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; mandatory; phonology will be present in the output', required=True)
    parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')
    # Parse arguments
    return parser.parse_args(argv)
|
||||
|
||||
# Main
|
||||
if __name__ == '__main__':
    args = parseArgs()
    generator = phonagen.RuleGenerator(id=args.id, description=args.description, phonology=args.phonology)
    # Load the phonology the generator is based on from its own file
    phonologyFile = phonagen.PhonagenFile()
    phonologyFile.load(args.phonologyfile)
    phonology = phonologyFile.getPhonology(args.phonology)
    # Build the rules from the example list
    generator.fromExamples(args.file, phonology)
    # The output holds both the phonology and the freshly built generator
    phonagenFile = phonagen.PhonagenFile()
    phonagenFile.addPhonology(phonology)
    phonagenFile.addGenerator(generator)
    phonagenFile.writeTo(args.output)
|
|
@ -4,6 +4,7 @@ import io
|
|||
import sys
|
||||
import csv
|
||||
import random
|
||||
import unicodedata
|
||||
|
||||
class Phonology:
|
||||
"""Phonology class"""
|
||||
|
@ -79,6 +80,72 @@ class Phonology:
|
|||
entry.update({'description': ''})
|
||||
self.entries.update({entry['id']: entry})
|
||||
|
||||
def formatWord(self, idList):
    """Return a table of transcription -> string corresponding to the same word.

    Args:
        idList: iterable of phoneme ids, each one a key of self.entries.

    Returns:
        A dict with one key per element of self.transcriptions; each value is
        the concatenation, in word order, of that field of every phoneme entry.

    Raises:
        KeyError: if an id is not in self.entries, or an entry lacks one of
            the transcriptions.
    """
    # Resolve the ids once so that idList may be any iterable
    phonemes = [self.entries[x] for x in idList]
    # Strings are immutable: build each transcription with join
    return {y: "".join(phoneme[y] for phoneme in phonemes) for y in self.transcriptions}
|
||||
|
||||
def getStress(self):
    """Return the phoneme id of the stress phoneme.

    Entries whose description carries the #stress tag are preferred; without
    any tagged entry, the phoneme transcriptions are searched for a stress mark.
    The first match is returned; an Exception is raised when nothing matches.
    """
    allEntries = list(self.entries.values())
    candidates = [entry['id'] for entry in allEntries if '#stress' in entry['description']]
    if not candidates:
        # Not tagged: look for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8)
        stressMarks = ("'", "ˈ")
        candidates = [entry['id'] for entry in allEntries
                      if any(mark in entry['phoneme'] for mark in stressMarks)]
    if not candidates:
        raise Exception('No stress phoneme in phonology', self.id)
    return candidates[0]
|
||||
|
||||
def getSyllableBreak(self):
    """Return the phoneme id of the syllable break phoneme.

    Entries whose description carries the #syllable-break tag are preferred;
    without any tagged entry, the phoneme transcriptions are searched for a
    full stop. The first match is returned; an Exception is raised when
    nothing matches.
    """
    allEntries = list(self.entries.values())
    candidates = [entry['id'] for entry in allEntries if '#syllable-break' in entry['description']]
    if not candidates:
        # Not tagged: look for '.' (full stop, u+002E) in the phoneme transcription
        candidates = [entry['id'] for entry in allEntries if '.' in entry['phoneme']]
    if not candidates:
        raise Exception('No syllable break phoneme in phonology', self.id)
    return candidates[0]
|
||||
|
||||
# Base letters recognised as vowels when guessing from a phonemic transcription
vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"

def isVowel(phoneme):
    """Tell whether a phonemic transcription starts with a vowel letter.

    The transcription is NFD-normalized first, so only its first code point
    after decomposition is compared against the vowel table. An empty
    transcription is never a vowel.
    """
    if len(phoneme) == 0:
        return False
    firstCodePoint = unicodedata.normalize('NFD', phoneme)[0]
    return firstCodePoint in Phonology.vowels
|
||||
|
||||
# Base letters recognised as consonants when guessing from a phonemic transcription
consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"

def isConsonant(phoneme):
    """Tell whether a phonemic transcription starts with a consonant letter.

    The transcription is NFD-normalized first, so only its first code point
    after decomposition is compared against the consonant table. An empty
    transcription is never a consonant.
    """
    if len(phoneme) == 0:
        return False
    firstCodePoint = unicodedata.normalize('NFD', phoneme)[0]
    return firstCodePoint in Phonology.consonants
|
||||
|
||||
def isOnset(self, id):
    """Check if an id corresponds to an onset.

    The description tags decide first: #onset or #consonant means yes, while
    #vowel, #nucleus or #coda means no. Only an entry without any of those
    tags is guessed from its phonemic transcription.
    """
    entry = self.entries[id]
    tags = entry['description']
    if ('#onset' in tags) or ('#consonant' in tags):
        return True
    if any(other in tags for other in ('#vowel', '#nucleus', '#coda')):
        return False
    # No usable tag: fall back to the transcription
    return Phonology.isConsonant(entry['phoneme'])
|
||||
|
||||
def isNucleus(self, id):
    """Check if an id corresponds to a nucleus.

    The description tags decide first: #nucleus or #vowel means yes, while
    #consonant, #onset or #coda means no. Only an entry without any of those
    tags is guessed from its phonemic transcription.
    """
    entry = self.entries[id]
    tags = entry['description']
    if ('#nucleus' in tags) or ('#vowel' in tags):
        return True
    if any(other in tags for other in ('#consonant', '#onset', '#coda')):
        return False
    # No usable tag: fall back to the transcription
    return Phonology.isVowel(entry['phoneme'])
|
||||
|
||||
def isCoda(self, id):
    """Check if an id corresponds to a coda.

    The description tags decide first: #coda or #consonant means yes, while
    #vowel, #nucleus or #onset means no. Only an entry without any of those
    tags is guessed from its phonemic transcription.
    """
    entry = self.entries[id]
    tags = entry['description']
    if ('#coda' in tags) or ('#consonant' in tags):
        return True
    if any(other in tags for other in ('#vowel', '#nucleus', '#onset')):
        return False
    # No usable tag: fall back to the transcription
    return Phonology.isConsonant(entry['phoneme'])
|
||||
|
||||
class Distribution:
|
||||
"""Discrete distribution"""
|
||||
def __init__(self):
|
||||
|
@ -91,7 +158,7 @@ class Distribution:
|
|||
self.items.update({value: oc})
|
||||
|
||||
def pickFrom(self):
    """Pick one item at random, weighted by its number of occurrences.

    Returns:
        One key of self.items, drawn with random.choices using the matching
        values of self.items as weights.

    Raises:
        IndexError: if the distribution is empty (nothing to choose from).
    """
    # values() must be called: the bare method object is not iterable
    return random.choices(list(self.items.keys()), list(self.items.values()))[0]
|
||||
|
||||
def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
    """Export the distribution as a list of small dicts, one per item.

    Each dict stores the item under the key itemRef and its number of
    occurrences under the key occurencesRef.
    """
    struct = []
    for item, occurences in self.items.items():
        struct.append({itemRef: item, occurencesRef: occurences})
    return struct
|
||||
|
@ -101,6 +168,9 @@ class Distribution:
|
|||
for item in struct:
|
||||
self.items.update({item[itemRef]: item[occurencesRef]})
|
||||
|
||||
def isEmpty(self):
    """Tell whether the distribution holds no item at all."""
    itemCount = len(self.items)
    return itemCount == 0
|
||||
|
||||
class Generator:
|
||||
"""Parent class for all generators"""
|
||||
def __init__(self, id = '', description = '', phonology = ''):
|
||||
|
@ -122,6 +192,9 @@ class Generator:
|
|||
self.description = struct['description']
|
||||
self.phonology = struct['phonology']
|
||||
|
||||
def generateWord(self):
    """Placeholder on the parent class: always raises an Exception."""
    message = 'Word generation not supported on abstract generator'
    raise Exception(message)
|
||||
|
||||
class ChainGenerator(Generator):
|
||||
"""Chains-based generator"""
|
||||
def __init__(self, order = 1, **kwargs):
|
||||
|
@ -151,10 +224,10 @@ class ChainGenerator(Generator):
|
|||
fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
|
||||
for row in fileReader:
|
||||
if len(row) != 0:
|
||||
row.append("") # Add terminator element (empty string)
|
||||
previous = tuple("" for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
|
||||
row.append('') # Add terminator element (empty string)
|
||||
previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
|
||||
for item in row:
|
||||
if (item != "") and (phonology.isValid()) and (not phonology.has(item)):
|
||||
if (item != '') and (phonology.isValid()) and (not phonology.has(item)):
|
||||
raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
|
||||
if previous in self.chains:
|
||||
self.chains[previous].addTo(item)
|
||||
|
@ -164,6 +237,17 @@ class ChainGenerator(Generator):
|
|||
self.chains.update({previous: dist})
|
||||
previous = previous[1:] + (item,)
|
||||
|
||||
def generateWord(self):
    """Generate a word as a list of ids by walking the chains.

    The walk starts from the initial context (self.order empty strings) and
    repeatedly picks the next item from the distribution of the current
    context, until the terminator element (empty string) is picked.
    """
    word = []
    context = ('',) * self.order  # Initial sequence
    while True:
        picked = self.chains[context].pickFrom()
        if picked == '':
            # Terminator: the word is complete
            break
        word.append(picked)
        # Slide the context window by one item
        context = context[1:] + (picked,)
    return word
|
||||
|
||||
class RuleGenerator(Generator):
|
||||
"""Rules-based generator"""
|
||||
def __init__(self, **kwargs):
|
||||
|
@ -173,12 +257,167 @@ class RuleGenerator(Generator):
|
|||
|
||||
def toJsonStruct(self):
    """Export the generator: the parent structure plus its type and rules.

    Every rule is exported as its id together with its distribution, whose
    items are stored under 'pattern' and 'occurences'.
    """
    struct = super().toJsonStruct()
    rulesStruct = []
    for ruleId in self.rules:
        distStruct = self.rules[ruleId].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')
        rulesStruct.append({'id': ruleId, 'distribution': distStruct})
    struct.update({'type': 'rules', 'rules': rulesStruct})
    return struct
|
||||
|
||||
def fromJsonStruct(self, struct):
    """Load the generator from a structure: parent fields, then the rules.

    Each element of struct['rules'] becomes one Distribution stored in
    self.rules under the rule id.
    """
    super().fromJsonStruct(struct)
    for ruleStruct in struct['rules']:
        dist = Distribution()
        # The pattern should be converted from a list to a tuple
        items = []
        for x in ruleStruct['distribution']:
            items.append({'pattern': tuple(x['pattern']), 'occurences': x['occurences']})
        dist.fromJsonStruct(items, itemRef = 'pattern', occurencesRef = 'occurences')
        self.rules[ruleStruct['id']] = dist
|
||||
|
||||
def generatePattern(self, pattern):
    """Expand a pattern into a flat list of ids.

    Args:
        pattern: iterable whose elements are either rule ids (keys of
            self.rules) or plain ids.

    Returns:
        A list where every rule id has been replaced, recursively, by the
        expansion of a pattern picked from that rule's distribution; every
        other element is kept as is.
    """
    output = []
    for x in pattern:
        if x in self.rules:
            # The expansion is itself a list: splice it in (list.extend)
            output.extend(self.generatePattern(self.rules[x].pickFrom()))
        else:
            output.append(x)
    return output
|
||||
|
||||
def generateWord(self):
    """Generate a word: pick a pattern from the 'word' rule and expand it."""
    wordDistribution = self.rules['word']
    startPattern = wordDistribution.pickFrom()
    return self.generatePattern(startPattern)
|
||||
|
||||
def processRowFromExample(self, row, stressId, syllableBreakId):
    """Record one example word into the 'word' rule and the syllable rules.

    Args:
        row: list of ids making up the example; empty strings are ignored.
        stressId: id of the stress phoneme (it also separates syllables).
        syllableBreakId: id of the syllable break phoneme.

    The row is split into syllables on both separators. A single-syllable
    word feeds the 'single' rule; otherwise each syllable feeds 'initial',
    'middle' or 'final' (with a '-stressed' suffix for the syllable that
    follows the stress). The resulting word pattern is added to 'word'.
    A row with more than one stress, or without any syllable, is skipped.
    """
    # Check the number of stress
    if row.count(stressId) > 1:
        print("Too much stress in " + str(row) + ": skip the example")
        return
    # Build the syllable list
    syllables = []
    currentSyllable = []
    stressedSyllableIdx = -1
    for x in row:
        if x == '':
            # Empty fields are not phoneme ids
            continue
        if (x != stressId) and (x != syllableBreakId):
            currentSyllable.append(x)
            continue
        # Separator: close the current syllable, unless it is empty
        if currentSyllable:
            syllables.append(currentSyllable)
            currentSyllable = []
        # The stress applies to the syllable that comes next
        if x == stressId:
            stressedSyllableIdx = len(syllables)
    # The last syllable has no closing separator
    if currentSyllable:
        syllables.append(currentSyllable)
    # Nothing but separators or empty fields: do not record an empty word
    if not syllables:
        return
    # Single syllable case
    if len(syllables) == 1:
        if stressedSyllableIdx == 0:
            self.rules['word'].addTo((stressId, 'single'))
        else:
            self.rules['word'].addTo(('single',))
        self.rules['single'].addTo(tuple(syllables[0]))
        return
    # Other cases
    lastIdx = len(syllables) - 1
    wordPattern = []
    for idx, syllable in enumerate(syllables):
        if idx == 0:
            rule = 'initial'
        elif idx == lastIdx:
            rule = 'final'
        else:
            rule = 'middle'
        separator = syllableBreakId
        if idx == stressedSyllableIdx:
            rule = rule + '-stressed'
            separator = stressId
        # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
        if (separator == stressId) or (idx > 0):
            wordPattern.append(separator)
        wordPattern.append(rule)
        # The syllable is added to the corresponding rule
        self.rules[rule].addTo(tuple(syllable))
    self.rules['word'].addTo(tuple(wordPattern))
|
||||
|
||||
def splitSyllableRule(self, syllableRule, phonology):
    """Replace syllable rules with onset/nucleus/coda pattern.

    Three rules named after syllableRule ('-onset', '-nucleus', '-coda') are
    created. Each pattern of the syllable rule is cut into those three parts
    and the syllable rule itself is replaced by a distribution over the
    sequences of part rules, keeping the numbers of occurrences.
    """
    newDist = Distribution()
    oldDist = self.rules[syllableRule]
    # One rule per syllable part, in syllable order
    partRules = (syllableRule + '-onset', syllableRule + '-nucleus', syllableRule + '-coda')
    for partRule in partRules:
        self.rules[partRule] = Distribution()
    for pattern, occurences in oldDist.items.items():
        # parts[0] is the onset, parts[1] the nucleus, parts[2] the coda
        parts = ([], [], [])
        stage = 0
        for phoneme in pattern:
            # Check if there is a change of element; the coda is final
            if stage == 0:
                if phonology.isNucleus(phoneme):
                    stage = 1
            elif stage == 1:
                if phonology.isCoda(phoneme):
                    stage = 2
            parts[stage].append(phoneme)
        # Only the non-empty parts appear in the new pattern
        distPattern = []
        for partRule, part in zip(partRules, parts):
            if len(part) != 0:
                distPattern.append(partRule)
                self.rules[partRule].addTo(tuple(part), occurences)
        newDist.addTo(tuple(distPattern), occurences)
    # Replace the old rules with the new rules
    self.rules[syllableRule] = newDist
|
||||
|
||||
def fromExamples(self, file, phonology):
    """Train a rule generator on an example file.

    Args:
        file: path of the example list, one word per line, ids separated by
            spaces. It is read as UTF-8.
        phonology: phonology providing the stress and syllable break ids and
            used to validate the ids and to split the syllables.

    Raises:
        Exception: if an id of the file is not in the phonology, or if the
            phonology has no stress or syllable break phoneme.
    """
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    # Words are modelled as lists of syllables, with one of those being stressed (optionally)
    # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
    # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
    #
    # Add the 'word' rule, and syllable rules, initialized with an empty distribution
    self.rules['word'] = Distribution()
    syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
    for x in syllableRules:
        self.rules[x] = Distribution()
    # Step 1: open the file and find how words look like
    # Explicit UTF-8 (ids may be IPA); newline='' as the csv module requires
    with open(file, encoding='utf-8', newline='') as exampleFile:
        fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
        for row in fileReader:
            # Empty fields (e.g. from a trailing space) are not ids
            ids = [item for item in row if item != '']
            if not ids:
                continue
            # Check the items in row
            for item in ids:
                if not phonology.has(item):
                    raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
            # Process the row
            self.processRowFromExample(ids, stressId, syllableBreakId)
    # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
    for x in syllableRules:
        self.splitSyllableRule(x, phonology)
    # Step 3: remove the empty rules
    self.rules = {name: dist for name, dist in self.rules.items() if not dist.isEmpty()}
|
||||
|
||||
# Generator class for each value of a generator's 'type' field
generatorTypeToClass = {
    'chains': ChainGenerator,
    'rules': RuleGenerator,
}
|
||||
def makeGenerator(struct):
|
||||
|
@ -231,7 +470,7 @@ class PhonagenFile:
|
|||
json.dump(outputStruct, sys.stdout, ensure_ascii=False)
|
||||
else:
|
||||
with open(file, 'w', encoding='utf-8') as outputFile:
|
||||
json.dump(outputStruct, outputFile, ensure_ascii=False)
|
||||
json.dump(outputStruct, outputFile, ensure_ascii=False, indent=2)
|
||||
|
||||
def mergeFrom(self, otherFile):
|
||||
"""Add all phonologies and generators from the other file into this one."""
|
||||
|
@ -239,3 +478,11 @@ class PhonagenFile:
|
|||
self.addPhonology(phonology)
|
||||
for generator in otherFile.generators.values():
|
||||
self.addGenerator(generator)
|
||||
|
||||
def generateWord(self, generator = ''):
    """Generate a word and format it with the phonology of its generator.

    Args:
        generator: id of the generator to use (a key of self.generators);
            when empty, one generator is picked at random.

    Returns:
        The result of formatWord, called on the generator's phonology with
        the generated list of ids.

    Raises:
        KeyError: if the generator id, or the phonology it refers to, is
            not in this file.
    """
    gen = generator
    if gen == '':
        gen = random.choice(list(self.generators))
    chosen = self.generators[gen]
    idList = chosen.generateWord()
    phonology = self.phonologies[chosen.phonology]
    return phonology.formatWord(idList)
|
||||
|
|
Loading…
Reference in New Issue