Add list2rule generator (generating rules from examples)
This commit is contained in:
parent
6ec27d4429
commit
a618ce0530
|
@ -0,0 +1,31 @@
|
||||||
|
#! /usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import phonagen
|
||||||
|
|
||||||
|
def parseArgs():
    """Parse the command line of the list-to-rule-generator converter.

    Returns:
        The argparse namespace with the attributes file, id, description,
        phonology, phonologyfile and output. description and output default
        to the empty string; id, phonology and phonologyfile are required.
    """
    # Define argument parser
    parser = argparse.ArgumentParser(description='Convert an example list to a rule generator.')
    parser.add_argument('file', metavar='listfile', help='list file to convert')
    parser.add_argument('--id', metavar='id', help='id of the generator', required=True)
    # The description is the one of the generated generator, not of the phonology
    parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
    parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which the generator is based', required=True)
    parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; mandatory; the phonology will be present in the output', required=True)
    parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')
    # Parse arguments
    return parser.parse_args()
|
||||||
|
|
||||||
|
# Main
|
||||||
|
if __name__ == '__main__':
    args = parseArgs()
    generator = phonagen.RuleGenerator(id=args.id, description=args.description, phonology=args.phonology)
    # Load the phonology the generator is based on from the phonology file
    # (no placeholder Phonology is created first: it was overwritten unused)
    phonologyFile = phonagen.PhonagenFile()
    phonologyFile.load(args.phonologyfile)
    phonology = phonologyFile.getPhonology(args.phonology)
    # Train the generator on the example list
    generator.fromExamples(args.file, phonology)
    # The output file holds both the phonology and the new generator
    phonagenFile = phonagen.PhonagenFile()
    phonagenFile.addPhonology(phonology)
    phonagenFile.addGenerator(generator)
    phonagenFile.writeTo(args.output)
|
|
@ -4,6 +4,7 @@ import io
|
||||||
import sys
|
import sys
|
||||||
import csv
|
import csv
|
||||||
import random
|
import random
|
||||||
|
import unicodedata
|
||||||
|
|
||||||
class Phonology:
|
class Phonology:
|
||||||
"""Phonology class"""
|
"""Phonology class"""
|
||||||
|
@ -79,6 +80,72 @@ class Phonology:
|
||||||
entry.update({'description': ''})
|
entry.update({'description': ''})
|
||||||
self.entries.update({entry['id']: entry})
|
self.entries.update({entry['id']: entry})
|
||||||
|
|
||||||
|
def formatWord(self, idList):
    """Return a table of transcription -> string corresponding to the same word.

    Args:
        idList: iterable of phoneme ids, each a key of self.entries.

    Returns:
        A dict with one key per item of self.transcriptions; each value is
        the concatenation, in idList order, of that transcription of every
        phoneme. All values are empty strings when idList is empty.

    Raises:
        KeyError: if an id is not in self.entries, or an entry lacks one of
            the transcriptions.
    """
    # Resolve the ids once so that idList may be any iterable
    phonemes = [self.entries[x] for x in idList]
    # Strings have no append(): build each transcription with join instead
    return {y: ''.join(phoneme[y] for phoneme in phonemes) for y in self.transcriptions}
|
||||||
|
|
||||||
|
def getStress(self):
    """Return the id of the phoneme used as stress marker.

    Entries whose description contains the '#stress' tag are preferred. When
    there is none, entries whose phoneme transcription contains "'"
    (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) are used. The first
    match in self.entries order is returned.

    Raises:
        Exception: if no entry matches either criterion.
    """
    entries = list(self.entries.values())
    tagged = [entry['id'] for entry in entries if '#stress' in entry['description']]
    if len(tagged) != 0:
        return tagged[0]
    # No tagged entry: fall back on the phoneme transcription
    marked = [entry['id'] for entry in entries if ("'" in entry['phoneme']) or ("ˈ" in entry['phoneme'])]
    if len(marked) != 0:
        return marked[0]
    raise Exception('No stress phoneme in phonology', self.id)
|
||||||
|
|
||||||
|
def getSyllableBreak(self):
    """Return the id of the phoneme used as syllable break.

    Entries whose description contains the '#syllable-break' tag are
    preferred. When there is none, entries whose phoneme transcription
    contains '.' (full stop, u+002E) are used. The first match in
    self.entries order is returned.

    Raises:
        Exception: if no entry matches either criterion.
    """
    entries = list(self.entries.values())
    tagged = [entry['id'] for entry in entries if '#syllable-break' in entry['description']]
    if len(tagged) != 0:
        return tagged[0]
    # No tagged entry: fall back on the phoneme transcription
    dotted = [entry['id'] for entry in entries if '.' in entry['phoneme']]
    if len(dotted) != 0:
        return dotted[0]
    raise Exception('No syllable break phoneme in phonology', self.id)
|
||||||
|
|
||||||
|
# Base characters regarded as vowels when guessing from a phonemic transcription
vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"

@staticmethod
def isVowel(phoneme):
    """Tell whether a phonemic transcription starts with a vowel character.

    The transcription is NFD-normalized first, so that a precomposed
    character is compared through its base character.
    Declared static so that it works both as Phonology.isVowel(x) and on an
    instance.

    Args:
        phoneme: the phonemic transcription (a string).

    Returns:
        False for an empty string; otherwise whether the first character of
        the normalized string is in Phonology.vowels.
    """
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)
|
||||||
|
|
||||||
|
# Base characters regarded as consonants when guessing from a phonemic transcription
consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"

@staticmethod
def isConsonant(phoneme):
    """Tell whether a phonemic transcription starts with a consonant character.

    The transcription is NFD-normalized first, so that a precomposed
    character is compared through its base character.
    Declared static so that it works both as Phonology.isConsonant(x) and on
    an instance.

    Args:
        phoneme: the phonemic transcription (a string).

    Returns:
        False for an empty string; otherwise whether the first character of
        the normalized string is in Phonology.consonants.
    """
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
|
||||||
|
|
||||||
|
def isOnset(self, id):
    """Tell whether a phoneme id can be used as a syllable onset.

    The entry description decides when it is tagged: '#onset' or
    '#consonant' make it an onset; otherwise '#vowel', '#nucleus' or '#coda'
    rule it out. Without any of these tags, the answer is guessed from the
    phonemic transcription with Phonology.isConsonant.

    Raises:
        KeyError: if id is not in self.entries.
    """
    entry = self.entries[id]
    tags = entry['description']
    if ('#onset' in tags) or ('#consonant' in tags):
        return True
    # Explicitly tagged as something else: do not guess
    if ('#vowel' in tags) or ('#nucleus' in tags) or ('#coda' in tags):
        return False
    return Phonology.isConsonant(entry['phoneme'])
|
||||||
|
|
||||||
|
def isNucleus(self, id):
    """Tell whether a phoneme id can be used as a syllable nucleus.

    The entry description decides when it is tagged: '#nucleus' or '#vowel'
    make it a nucleus; otherwise '#consonant', '#onset' or '#coda' rule it
    out. Without any of these tags, the answer is guessed from the phonemic
    transcription with Phonology.isVowel.

    Raises:
        KeyError: if id is not in self.entries.
    """
    entry = self.entries[id]
    tags = entry['description']
    if ('#nucleus' in tags) or ('#vowel' in tags):
        return True
    # Explicitly tagged as something else: do not guess
    if ('#consonant' in tags) or ('#onset' in tags) or ('#coda' in tags):
        return False
    return Phonology.isVowel(entry['phoneme'])
|
||||||
|
|
||||||
|
def isCoda(self, id):
    """Tell whether a phoneme id can be used as a syllable coda.

    The entry description decides when it is tagged: '#coda' or '#consonant'
    make it a coda; otherwise '#vowel', '#nucleus' or '#onset' rule it out.
    Without any of these tags, the answer is guessed from the phonemic
    transcription with Phonology.isConsonant.

    Raises:
        KeyError: if id is not in self.entries.
    """
    entry = self.entries[id]
    tags = entry['description']
    if ('#coda' in tags) or ('#consonant' in tags):
        return True
    # Explicitly tagged as something else: do not guess
    if ('#vowel' in tags) or ('#nucleus' in tags) or ('#onset' in tags):
        return False
    return Phonology.isConsonant(entry['phoneme'])
|
||||||
|
|
||||||
class Distribution:
|
class Distribution:
|
||||||
"""Discrete distribution"""
|
"""Discrete distribution"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
|
@ -91,7 +158,7 @@ class Distribution:
|
||||||
self.items.update({value: oc})
|
self.items.update({value: oc})
|
||||||
|
|
||||||
def pickFrom(self):
    """Draw one value at random, each value being weighted by its number of occurrences in self.items."""
    values = list(self.items.keys())
    weights = list(self.items.values())
    return random.choices(values, weights)[0]
|
||||||
|
|
||||||
def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
    """Return the distribution as a list of dicts, one per value.

    Each dict maps itemRef to the value and occurencesRef to its number of
    occurrences, in self.items order.
    """
    struct = []
    for value, occurences in self.items.items():
        struct.append({itemRef: value, occurencesRef: occurences})
    return struct
|
||||||
|
@ -101,6 +168,9 @@ class Distribution:
|
||||||
for item in struct:
|
for item in struct:
|
||||||
self.items.update({item[itemRef]: item[occurencesRef]})
|
self.items.update({item[itemRef]: item[occurencesRef]})
|
||||||
|
|
||||||
|
def isEmpty(self):
    """Tell whether the distribution holds no value at all."""
    return not self.items
|
||||||
|
|
||||||
class Generator:
|
class Generator:
|
||||||
"""Parent class for all generators"""
|
"""Parent class for all generators"""
|
||||||
def __init__(self, id = '', description = '', phonology = ''):
|
def __init__(self, id = '', description = '', phonology = ''):
|
||||||
|
@ -122,6 +192,9 @@ class Generator:
|
||||||
self.description = struct['description']
|
self.description = struct['description']
|
||||||
self.phonology = struct['phonology']
|
self.phonology = struct['phonology']
|
||||||
|
|
||||||
|
def generateWord(self):
    """Generate a word; to be overridden by the concrete generators.

    Raises:
        NotImplementedError: always. It is a subclass of Exception, so code
            catching Exception still catches it.
    """
    raise NotImplementedError('Word generation not supported on abstract generator')
|
||||||
|
|
||||||
class ChainGenerator(Generator):
|
class ChainGenerator(Generator):
|
||||||
"""Chains-based generator"""
|
"""Chains-based generator"""
|
||||||
def __init__(self, order = 1, **kwargs):
|
def __init__(self, order = 1, **kwargs):
|
||||||
|
@ -151,10 +224,10 @@ class ChainGenerator(Generator):
|
||||||
fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
|
fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
|
||||||
for row in fileReader:
|
for row in fileReader:
|
||||||
if len(row) != 0:
|
if len(row) != 0:
|
||||||
row.append("") # Add terminator element (empty string)
|
row.append('') # Add terminator element (empty string)
|
||||||
previous = tuple("" for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
|
previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
|
||||||
for item in row:
|
for item in row:
|
||||||
if (item != "") and (phonology.isValid()) and (not phonology.has(item)):
|
if (item != '') and (phonology.isValid()) and (not phonology.has(item)):
|
||||||
raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
|
raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
|
||||||
if previous in self.chains:
|
if previous in self.chains:
|
||||||
self.chains[previous].addTo(item)
|
self.chains[previous].addTo(item)
|
||||||
|
@ -164,6 +237,17 @@ class ChainGenerator(Generator):
|
||||||
self.chains.update({previous: dist})
|
self.chains.update({previous: dist})
|
||||||
previous = previous[1:] + (item,)
|
previous = previous[1:] + (item,)
|
||||||
|
|
||||||
|
def generateWord(self):
    """Generate a word by walking the chains.

    Starting from a context of self.order empty strings, the next id is
    drawn from the distribution self.chains[context] and the context is
    shifted by one, until the terminator (empty string) is drawn.

    Returns:
        The list of drawn ids, without the terminator.

    Raises:
        KeyError: if a context has no entry in self.chains.
    """
    word = []
    context = ('',) * self.order  # Initial sequence: self.order empty strings
    while True:
        drawn = self.chains[context].pickFrom()
        if drawn == '':
            return word
        word.append(drawn)
        context = context[1:] + (drawn,)
|
||||||
|
|
||||||
class RuleGenerator(Generator):
|
class RuleGenerator(Generator):
|
||||||
"""Rules-based generator"""
|
"""Rules-based generator"""
|
||||||
def __init__(self, **kwargs):
|
def __init__(self, **kwargs):
|
||||||
|
@ -173,12 +257,167 @@ class RuleGenerator(Generator):
|
||||||
|
|
||||||
def toJsonStruct(self):
    """Return the generator as a JSON-serializable structure.

    The structure of the parent class is completed with the type 'rules' and
    the list of rules, each given by its id and its distribution of patterns.
    """
    struct = super().toJsonStruct()
    rules = []
    for ruleId, dist in self.rules.items():
        rules.append({'id': ruleId, 'distribution': dist.toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')})
    struct.update({'type': 'rules', 'rules': rules})
    return struct
|
||||||
|
|
||||||
def fromJsonStruct(self, struct):
    """Load the generator from a JSON structure.

    After the fields handled by the parent class, each item of
    struct['rules'] becomes a Distribution stored in self.rules under the
    rule id.
    """
    super().fromJsonStruct(struct)
    for ruleStruct in struct['rules']:
        dist = Distribution()
        # JSON gives patterns as lists: convert them to tuples before loading
        entries = []
        for item in ruleStruct['distribution']:
            entries.append({'pattern': tuple(item['pattern']), 'occurences': item['occurences']})
        dist.fromJsonStruct(entries, itemRef = 'pattern', occurencesRef = 'occurences')
        self.rules[ruleStruct['id']] = dist
|
||||||
|
|
||||||
|
def generatePattern(self, pattern):
    """Expand a pattern into a flat list of ids.

    Each item of the pattern that is the id of a rule in self.rules is
    replaced, recursively, by the expansion of a pattern drawn from that
    rule's distribution; any other item is kept as is.

    Args:
        pattern: iterable of items (rule ids or other ids).

    Returns:
        The list of items left once every rule id has been expanded.
    """
    output = []
    for x in pattern:
        if x in self.rules:
            # A rule expands to several items: extend, not append
            output.extend(self.generatePattern(self.rules[x].pickFrom()))
        else:
            output.append(x)
    return output
|
||||||
|
|
||||||
|
def generateWord(self):
    """Generate a word: draw a pattern from the 'word' rule and expand it with generatePattern."""
    wordPattern = self.rules['word'].pickFrom()
    return self.generatePattern(wordPattern)
|
||||||
|
|
||||||
|
def processRowFromExample(self, row, stressId, syllableBreakId):
    """Register one example word in the 'word' rule and in the syllable rules.

    The row is cut into syllables at each stress or syllable break id. A
    single-syllable word adds the pattern ('single',), or
    (stressId, 'single') when the stress precedes it, to the 'word' rule and
    the syllable to the 'single' rule. Otherwise each syllable is added to
    the 'initial', 'middle' or 'final' rule ('-stressed' variant for the
    stressed one), and the sequence of rule names and separators is added to
    the 'word' rule.

    Rows with more than one stress are reported on standard output and
    skipped. Empty items are ignored, and a row without any syllable adds
    nothing.

    Args:
        row: list of phoneme ids making the word.
        stressId: id of the stress phoneme.
        syllableBreakId: id of the syllable break phoneme.
    """
    # Trailing or repeated delimiters leave empty fields in the row: they are not phoneme ids
    ids = [x for x in row if x != '']
    # Check the number of stress
    if ids.count(stressId) > 1:
        print("Too much stress in " + str(row) + ": skip the example")
        return
    # Build the syllable list
    syllables = []
    currentSyllable = []
    stressedSyllableIdx = -1
    for x in ids:
        # Append to the current syllable if not a syllable separator
        if (x != stressId) and (x != syllableBreakId):
            currentSyllable.append(x)
            continue
        # In case of syllable separator, only add the syllable to the list if it is not empty
        if len(currentSyllable) != 0:
            syllables.append(currentSyllable)
            currentSyllable = []
        # If current id is stress, the stressed syllable is the next one to be added
        if x == stressId:
            stressedSyllableIdx = len(syllables)
    # After the loop, the current syllable should be non-empty, add it to the list of syllables
    if len(currentSyllable) != 0:
        syllables.append(currentSyllable)
    # Nothing but separators: there is no word to learn from
    if len(syllables) == 0:
        return
    # Single syllable case
    if len(syllables) == 1:
        if stressedSyllableIdx == 0:
            self.rules['word'].addTo(tuple([stressId, 'single']))
        else:
            self.rules['word'].addTo(tuple(['single']))
        self.rules['single'].addTo(tuple(syllables[0]))
        return
    # Other cases
    wordPattern = []
    lastIdx = len(syllables) - 1
    for idx, syllable in enumerate(syllables):
        separator = syllableBreakId
        if idx == 0:
            rule = 'initial'
        elif idx == lastIdx:
            rule = 'final'
        else:
            rule = 'middle'
        if idx == stressedSyllableIdx:
            rule = rule + '-stressed'
            separator = stressId
        # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
        if (separator == stressId) or (idx > 0):
            wordPattern.append(separator)
        # Add the rule to the pattern
        wordPattern.append(rule)
        # The syllable is added to the corresponding rule
        self.rules[rule].addTo(tuple(syllable))
    self.rules['word'].addTo(tuple(wordPattern))
|
||||||
|
|
||||||
|
def splitSyllableRule(self, syllableRule, phonology):
    """Replace the patterns of a syllable rule with onset/nucleus/coda patterns.

    Three rules named after syllableRule with the suffixes '-onset',
    '-nucleus' and '-coda' are created. Each pattern of the syllable rule is
    read from left to right: phonemes go to the onset until one is a nucleus
    for the phonology, then to the nucleus until one is a coda, then to the
    coda. The non-empty parts are added to their rules with the occurrences
    of the pattern, and the syllable rule is replaced by a distribution over
    the sequences of part rule names.
    """
    newDist = Distribution()
    oldDist = self.rules[syllableRule]
    # Add onset/nucleus/coda rules
    partRules = [syllableRule + '-onset', syllableRule + '-nucleus', syllableRule + '-coda']
    for partRule in partRules:
        self.rules[partRule] = Distribution()
    for pattern, occurences in oldDist.items.items():
        # parts[0] is the onset, parts[1] the nucleus, parts[2] the coda
        parts = ([], [], [])
        stage = 0
        for phoneme in pattern:
            # Move to the next part when the phoneme belongs to it
            if (stage == 0) and phonology.isNucleus(phoneme):
                stage = 1
            elif (stage == 1) and phonology.isCoda(phoneme):
                stage = 2
            parts[stage].append(phoneme)
        # Only the non-empty parts appear in the new pattern
        distPattern = []
        for partRule, part in zip(partRules, parts):
            if len(part) != 0:
                distPattern.append(partRule)
                self.rules[partRule].addTo(tuple(part), occurences)
        newDist.addTo(tuple(distPattern), occurences)
    # Replace the old rules with the new rules
    self.rules[syllableRule] = newDist
|
||||||
|
|
||||||
|
def fromExamples(self, file, phonology):
    """Train a rule generator on an example file.

    Args:
        file: path of the example file, read as UTF-8: one word per line,
            phoneme ids separated by spaces.
        phonology: the phonology the ids belong to; it gives the stress and
            syllable break ids and validates every id.

    Raises:
        Exception: if an id of the file is not in the phonology (also
            whatever getStress/getSyllableBreak raise).
    """
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    # Words are modelled as lists of syllables, with one of those being stressed (optionally)
    # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
    # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
    #
    # Add the 'word' rule, and syllable rules, initialized with an empty distribution
    self.rules.update({'word': Distribution()})
    syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
    for x in syllableRules:
        self.rules.update({x: Distribution()})
    # Step 1: open the file and find how words look like
    # Explicit encoding (ids may be non-ASCII) and newline='' as the csv module asks for
    with open(file, encoding='utf-8', newline='') as exampleFile:
        fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
        for row in fileReader:
            # A trailing delimiter yields an empty field, which is not a phoneme id
            row = [item for item in row if item != '']
            if len(row) != 0:
                # Check the items in row
                for item in row:
                    if not phonology.has(item):
                        raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                # Process the row
                self.processRowFromExample(row, stressId, syllableBreakId)
    # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
    for x in syllableRules:
        self.splitSyllableRule(x, phonology)
    # Step 3: remove the empty rules
    self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
|
||||||
|
|
||||||
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
|
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
|
||||||
def makeGenerator(struct):
|
def makeGenerator(struct):
|
||||||
|
@ -231,7 +470,7 @@ class PhonagenFile:
|
||||||
json.dump(outputStruct, sys.stdout, ensure_ascii=False)
|
json.dump(outputStruct, sys.stdout, ensure_ascii=False)
|
||||||
else:
|
else:
|
||||||
with open(file, 'w', encoding='utf-8') as outputFile:
|
with open(file, 'w', encoding='utf-8') as outputFile:
|
||||||
json.dump(outputStruct, outputFile, ensure_ascii=False)
|
json.dump(outputStruct, outputFile, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
def mergeFrom(self, otherFile):
|
def mergeFrom(self, otherFile):
|
||||||
"""Add all phonologies and generators from the other file into this one."""
|
"""Add all phonologies and generators from the other file into this one."""
|
||||||
|
@ -239,3 +478,11 @@ class PhonagenFile:
|
||||||
self.addPhonology(phonology)
|
self.addPhonology(phonology)
|
||||||
for generator in otherFile.generators.values():
|
for generator in otherFile.generators.values():
|
||||||
self.addGenerator(generator)
|
self.addGenerator(generator)
|
||||||
|
|
||||||
|
def generateWord(self, generator = ''):
    """Generate a word and format it in the transcriptions of its phonology.

    Args:
        generator: id of the generator to use; when empty, one is picked at
            random among self.generators.

    Returns:
        What formatWord of the generator's phonology returns for the
        generated id list.

    Raises:
        KeyError: if the generator id, or the phonology it refers to, is
            unknown.
        IndexError: if no generator id is given and there is no generator.
    """
    gen = generator
    if gen == '':
        gen = random.choice(list(self.generators))
    selected = self.generators[gen]
    idList = selected.generateWord()
    phonology = self.phonologies[selected.phonology]
    return phonology.formatWord(idList)
|
||||||
|
|
Loading…
Reference in New Issue