Add list2rule generator (generating rules from examples)

2018-06-10 22:55:04 +02:00 · 2018-06-10 22:55:04 +02:00 · a618ce0530
parent 6ec27d4429
commit a618ce0530
2 changed files with 285 additions and 7 deletions
--- a/py-phonagen/generator-list2rule.py
+++ b/py-phonagen/generator-list2rule.py
@ -0,0 +1,31 @@
 #! /usr/bin/env python3
 import argparse
 import phonagen
 def parseArgs():
  """Parse the command line of the list-to-rule converter.

  Returns the argparse namespace holding: file, id, description, phonology, phonologyfile and output.
  """
  parser = argparse.ArgumentParser(description='Convert an example list to a rule generator.')
  # Positional argument: the example list
  parser.add_argument('file', metavar='listfile', help='list file to convert')
  # Identification of the generator to build (the description is handed to the generator, not to the phonology)
  parser.add_argument('--id', metavar='id', help='id of the generator', required = True)
  parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
  # Phonology used by the examples
  parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which the generator is based', required = True)
  parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; mandatory; the phonology will be present in the output', required = True)
  # Destination; the empty default is passed through unchanged to the writer
  parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')
  return parser.parse_args()
 # Main
 if __name__ == '__main__':
  args = parseArgs()
  generator = phonagen.RuleGenerator(id = args.id, description = args.description, phonology = args.phonology)
  # Fetch the phonology the examples are written in from its own file
  # (no placeholder Phonology is created first: it was overwritten before being used)
  phonologyFile = phonagen.PhonagenFile()
  phonologyFile.load(args.phonologyfile)
  phonology = phonologyFile.getPhonology(args.phonology)
  # Train the generator on the example list
  generator.fromExamples(args.file, phonology)
  # Write a file holding both the phonology and the trained generator
  phonagenFile = phonagen.PhonagenFile()
  phonagenFile.addPhonology(phonology)
  phonagenFile.addGenerator(generator)
  phonagenFile.writeTo(args.output)
--- a/py-phonagen/phonagen.py
+++ b/py-phonagen/phonagen.py
@ -4,6 +4,7 @@ import io
 import sys
 import csv
 import random
 import unicodedata
 class Phonology:
  """Phonology class"""
@ -79,6 +80,72 @@ class Phonology:
            entry.update({'description': ''})
          self.entries.update({entry['id']: entry})
  def formatWord(self, idList):
    """Return a table of transcription -> string corresponding to the same word

    idList is a sequence of phoneme ids; every id must be a key of self.entries (KeyError otherwise).
    The result has one key per transcription in self.transcriptions; an empty idList gives empty strings.
    """
    # Resolve the ids once, then concatenate each transcription column
    # (strings are immutable, so they are joined rather than appended to)
    phonemes = [self.entries[x] for x in idList]
    return {transcription: ''.join(phoneme[transcription] for phoneme in phonemes) for transcription in self.transcriptions}
  def getStress(self):
    """Return the phoneme id of the stress phoneme

    Entries tagged with #stress in their description win; otherwise the phonemic transcription is inspected.
    Raises an Exception when no entry qualifies.
    """
    entries = list(self.entries.values())
    # First choice: an explicit #stress tag
    tagged = [entry['id'] for entry in entries if '#stress' in entry['description']]
    if tagged:
      return tagged[0]
    # Fallback: "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in the phoneme transcription
    guessed = [entry['id'] for entry in entries if ("'" in entry['phoneme']) or ("ˈ" in entry['phoneme'])]
    if not guessed:
      raise Exception('No stress phoneme in phonology', self.id)
    return guessed[0]
  def getSyllableBreak(self):
    """Return the phoneme id of the syllable break phoneme

    Entries tagged with #syllable-break in their description win; otherwise the phonemic transcription is inspected.
    Raises an Exception when no entry qualifies.
    """
    entries = list(self.entries.values())
    # First choice: an explicit #syllable-break tag
    tagged = [entry['id'] for entry in entries if '#syllable-break' in entry['description']]
    if tagged:
      return tagged[0]
    # Fallback: '.' (full stop, u+002E) in the phoneme transcription
    guessed = [entry['id'] for entry in entries if '.' in entry['phoneme']]
    if not guessed:
      raise Exception('No syllable break phoneme in phonology', self.id)
    return guessed[0]
  # Base characters recognised as vowels when guessing from a phonemic transcription
  vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
  @staticmethod
  def isVowel(phoneme):
    """Tell whether a phonemic transcription starts with a vowel character

    The transcription is decomposed (NFD) first, so that the first character of a precomposed letter is its base letter.
    An empty transcription is never a vowel.
    Declared static: it takes no instance, and can be called on the class as well as on an instance.
    """
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)
  # Base characters recognised as consonants when guessing from a phonemic transcription
  consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"
  @staticmethod
  def isConsonant(phoneme):
    """Tell whether a phonemic transcription starts with a consonant character

    The transcription is decomposed (NFD) first, so that the first character of a precomposed letter is its base letter.
    An empty transcription is never a consonant.
    Declared static: it takes no instance, and can be called on the class as well as on an instance.
    """
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
  def isOnset(self, id):
    """Tell whether an id can be used as a syllable onset

    The description tags decide when present: #onset or #consonant accept, #vowel, #nucleus or #coda reject.
    Without any of those tags, the answer is guessed from the phonemic transcription (consonant check).
    """
    tags = self.entries[id]['description']
    if ('#onset' in tags) or ('#consonant' in tags):
      return True
    # An explicit tag for another role rules the guess out
    if any(tag in tags for tag in ('#vowel', '#nucleus', '#coda')):
      return False
    return Phonology.isConsonant(self.entries[id]['phoneme'])
  def isNucleus(self, id):
    """Tell whether an id can be used as a syllable nucleus

    The description tags decide when present: #nucleus or #vowel accept, #consonant, #onset or #coda reject.
    Without any of those tags, the answer is guessed from the phonemic transcription (vowel check).
    """
    tags = self.entries[id]['description']
    if ('#nucleus' in tags) or ('#vowel' in tags):
      return True
    # An explicit tag for another role rules the guess out
    if any(tag in tags for tag in ('#consonant', '#onset', '#coda')):
      return False
    return Phonology.isVowel(self.entries[id]['phoneme'])
  def isCoda(self, id):
    """Tell whether an id can be used as a syllable coda

    The description tags decide when present: #coda or #consonant accept, #vowel, #nucleus or #onset reject.
    Without any of those tags, the answer is guessed from the phonemic transcription (consonant check).
    """
    tags = self.entries[id]['description']
    if ('#coda' in tags) or ('#consonant' in tags):
      return True
    # An explicit tag for another role rules the guess out
    if any(tag in tags for tag in ('#vowel', '#nucleus', '#onset')):
      return False
    return Phonology.isConsonant(self.entries[id]['phoneme'])
 class Distribution:
  """Discrete distribution"""
  def __init__(self):
@ -91,7 +158,7 @@ class Distribution:
    self.items.update({value: oc})
  def pickFrom(self):
    # Draw one item at random, each item being weighted by its number of occurrences
    # (random.choices returns a list, hence the [0])
-    return random.choices([k for k in self.items.keys()], [v for v in self.items.values])[0]
+    return random.choices([k for k in self.items.keys()], [v for v in self.items.values()])[0]
  def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
    """Return the distribution as a list of dictionaries, one per item

    itemRef is the key under which the item is stored, occurencesRef the key for its number of occurrences.
    """
    struct = []
    for item, occurences in self.items.items():
      struct.append({itemRef: item, occurencesRef: occurences})
    return struct
@ -101,6 +168,9 @@ class Distribution:
    for item in struct:
      self.items.update({item[itemRef]: item[occurencesRef]})
  def isEmpty(self):
    """Tell whether the distribution holds no item at all"""
    return not self.items
 class Generator:
  """Parent class for all generators"""
  def __init__(self, id = '', description = '', phonology = ''):
@ -122,6 +192,9 @@ class Generator:
    self.description = struct['description']
    self.phonology = struct['phonology']
  def generateWord(self):
    """Generate a word as a list of phoneme ids; must be overridden by concrete generators

    Raises NotImplementedError (a subclass of Exception, so handlers catching Exception still apply).
    """
    raise NotImplementedError('Word generation not supported on abstract generator')
 class ChainGenerator(Generator):
  """Chains-based generator"""
  def __init__(self, order = 1, **kwargs):
@ -151,10 +224,10 @@ class ChainGenerator(Generator):
      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
      for row in fileReader:
        if len(row) != 0:
-          row.append("") # Add terminator element (empty string)
+          row.append('') # Add terminator element (empty string)
-          previous = tuple("" for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
+          previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
          for item in row:
-            if (item != "") and (phonology.isValid()) and (not phonology.has(item)):
+            if (item != '') and (phonology.isValid()) and (not phonology.has(item)):
              raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
            if previous in self.chains:
              self.chains[previous].addTo(item)
@ -164,6 +237,17 @@ class ChainGenerator(Generator):
              self.chains.update({previous: dist})
            previous = previous[1:] + (item,)
  def generateWord(self):
    """Generate a word by walking the chains; return the list of picked ids

    The walk starts from a context of self.order empty strings and stops as soon as the empty string (terminator) is picked.
    """
    word = []
    context = ('',) * self.order # Initial sequence (self.order empty strings)
    while True:
      item = self.chains[context].pickFrom()
      if item == '':
        return word
      word.append(item)
      # Slide the context: drop the oldest id, add the new one
      context = context[1:] + (item,)
 class RuleGenerator(Generator):
  """Rules-based generator"""
  def __init__(self, **kwargs):
@ -173,12 +257,167 @@ class RuleGenerator(Generator):
  def toJsonStruct(self):
    # Extend the parent structure with the generator type ('rules') and, for each rule,
    # its id and its distribution serialised with the keys 'pattern' and 'occurences'
    struct = super().toJsonStruct()
-    # TODO: add rules
+    struct.update({'type': 'rules',
                   'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')} for x in self.rules]})
    return struct
  def fromJsonStruct(self, struct):
    # Restore the parent fields, then rebuild one Distribution per entry of struct['rules'],
    # stored in self.rules under the rule id
    super().fromJsonStruct(struct)
-    # TODO: rules
+    for ruleStruct in struct['rules']:
      dist = Distribution()
      # The pattern should be converted from a list to a tuple
      dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences':x['occurences']} for x in ruleStruct['distribution']], itemRef = 'pattern', occurencesRef = 'occurences')
      self.rules.update({ruleStruct['id']: dist})
  def generatePattern(self, pattern):
    """Expand a pattern into a flat list of ids

    Each element of the pattern that is the id of a rule is replaced, recursively, by the expansion of a pattern
    picked from that rule; every other element is kept as is.
    """
    output = []
    for x in pattern:
      if x in self.rules:
        # The expansion is itself a list: splice its elements in, do not nest it
        output.extend(self.generatePattern(self.rules[x].pickFrom()))
      else:
        output.append(x)
    return output
  def generateWord(self):
    """Generate a word: pick a pattern from the 'word' rule and expand it into a list of ids"""
    wordPattern = self.rules['word'].pickFrom()
    return self.generatePattern(wordPattern)
  def processRowFromExample(self, row, stressId, syllableBreakId):
    """Register one example word in the 'word' rule and in the syllable rules

    row is the list of ids of the word; stressId and syllableBreakId are the ids of the two syllable separators.
    The 'word' rule and the seven syllable rules must already exist in self.rules.
    Rows with more than one stress, or without any syllable, are reported on standard output and skipped.
    """
    # Check the number of stress
    if row.count(stressId) > 1:
      print("Too much stress in " + str(row) + ": skip the example")
      return
    # Build the syllable list
    syllables = []
    currentSyllable = []
    stressedSyllableIdx = -1
    for x in row:
      if (x != stressId) and (x != syllableBreakId):
        # Not a separator: belongs to the current syllable
        currentSyllable.append(x)
      elif currentSyllable:
        # Separator: close the current syllable, unless it is empty
        syllables.append(currentSyllable)
        currentSyllable = []
      if x == stressId:
        # The stress mark precedes its syllable, which will be stored at the next free index
        stressedSyllableIdx = len(syllables)
    # After the loop, the current syllable should be non-empty, add it to the list of syllables
    if currentSyllable:
      syllables.append(currentSyllable)
    # A row made of separators only holds no word: an empty word pattern must not be registered
    if not syllables:
      print("No syllable in " + str(row) + ": skip the example")
      return
    # Single syllable case
    if len(syllables) == 1:
      if stressedSyllableIdx == 0:
        self.rules['word'].addTo((stressId, 'single'))
      else:
        self.rules['word'].addTo(('single',))
      self.rules['single'].addTo(tuple(syllables[0]))
      return
    # Other cases
    wordPattern = []
    lastIdx = len(syllables) - 1
    for idx, syllable in enumerate(syllables):
      if idx == 0:
        rule = 'initial'
      elif idx == lastIdx:
        rule = 'final'
      else:
        rule = 'middle'
      separator = syllableBreakId
      if idx == stressedSyllableIdx:
        rule = rule + '-stressed'
        separator = stressId
      # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
      if (separator == stressId) or (idx > 0):
        wordPattern.append(separator)
      # Add the rule to the pattern
      wordPattern.append(rule)
      # The syllable is added to the corresponding rule
      self.rules[rule].addTo(tuple(syllable))
    self.rules['word'].addTo(tuple(wordPattern))
  def splitSyllableRule(self, syllableRule, phonology):
    """Replace the patterns of a syllable rule with onset/nucleus/coda patterns

    Three rules are created (syllableRule + '-onset', '-nucleus' and '-coda'); each pattern of the syllable rule
    is cut into those three parts, the parts are added to the new rules, and the syllable rule itself is replaced
    by a distribution over the sequences of new rule ids. Occurrences are carried over.
    phonology must provide isNucleus(id) and isCoda(id).
    """
    newDist = Distribution()
    oldDist = self.rules[syllableRule]
    # Add onset/nucleus/coda rules
    onsetRule = syllableRule + '-onset'
    nucleusRule = syllableRule +'-nucleus'
    codaRule = syllableRule + '-coda'
    self.rules[onsetRule] = Distribution()
    self.rules[nucleusRule] = Distribution()
    self.rules[codaRule] = Distribution()
    # For each pattern, split into onset/nucleus/coda
    for pattern in oldDist.items:
      # State of the scan: exactly one of the three flags is set, starting with the onset
      isOnset = True
      onset = []
      isNucleus = False
      nucleus = []
      isCoda = False
      coda = []
      for phoneme in pattern:
        # Check if there is a change of element
        # (onset -> nucleus on the first nucleus phoneme, nucleus -> coda on the first coda phoneme; never backwards)
        if isOnset and (phonology.isNucleus(phoneme)):
          isOnset = False
          isNucleus = True
        elif isNucleus and (phonology.isCoda(phoneme)):
          isNucleus = False
          isCoda = True
        # Add to the respective list
        if isOnset:
          onset.append(phoneme)
        elif isNucleus:
          nucleus.append(phoneme)
        else:
          coda.append(phoneme)
      # Add to the specific distributions and determine the pattern in new distribution
      # (an empty part is left out of both its rule and the new pattern)
      occurences = oldDist.items[pattern]
      distPattern = []
      if len(onset) != 0:
        distPattern.append(onsetRule)
        self.rules[onsetRule].addTo(tuple(onset), occurences)
      if len(nucleus) != 0:
        distPattern.append(nucleusRule)
        self.rules[nucleusRule].addTo(tuple(nucleus), occurences)
      if len(coda) != 0:
        distPattern.append(codaRule)
        self.rules[codaRule].addTo(tuple(coda), occurences)
      # Add patterns to distributions
      newDist.addTo(tuple(distPattern), occurences)
    # Replace the old rules with the new rules
    self.rules[syllableRule] = newDist
  def fromExamples(self, file, phonology):
    """Train a rule generator on an example file

    file is the path of a text file holding one word per line, as space-separated phoneme ids.
    phonology provides the stress and syllable break ids, the id check and the onset/nucleus/coda classification.
    Raises an Exception when a row holds an id unknown to the phonology.
    """
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    # Words are modelled as lists of syllables, with one of those being stressed (optionally)
    # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
    # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
    #
    # Add the 'word' rule, and syllable rules, initialized with an empty distribution
    self.rules.update({'word': Distribution()})
    syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
    for x in syllableRules:
      self.rules.update({x: Distribution()})
    # Step 1: open the file and find how words look like
    with open(file) as exampleFile:
      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
      for row in fileReader:
        # The reader yields '' for an empty field (e.g. after a trailing space): it is not a phoneme id
        # and must not end up inside a syllable, where the phonology lookup would fail on it
        ids = [item for item in row if item != '']
        if len(ids) == 0:
          continue
        # Check the items in row
        for item in ids:
          if not phonology.has(item):
            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
        # Process the row
        self.processRowFromExample(ids, stressId, syllableBreakId)
    # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
    for x in syllableRules:
      self.splitSyllableRule(x, phonology)
    # Step 3: remove the empty rules
    self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
 # Generator class to use for each generator type name
 # (presumably looked up by makeGenerator from the 'type' field of a generator structure; confirm against its body)
 generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
 def makeGenerator(struct):
@ -231,7 +470,7 @@ class PhonagenFile:
      json.dump(outputStruct, sys.stdout, ensure_ascii=False)
    else:
      with open(file, 'w', encoding='utf-8') as outputFile:
-        json.dump(outputStruct, outputFile, ensure_ascii=False)
+        json.dump(outputStruct, outputFile, ensure_ascii=False,  indent=2)
  def mergeFrom(self, otherFile):
    """Add all phonologies and generators from the other file into this one."""
@ -239,3 +478,11 @@ class PhonagenFile:
      self.addPhonology(phonology)
    for generator in otherFile.generators.values():
      self.addGenerator(generator)
  def generateWord(self, generator = ''):
    """Generate a word and return it as a table of transcription -> string

    generator is the id of the generator to use; when empty, a generator of the file is picked at random.
    The word is formatted by the phonology the selected generator refers to.
    """
    gen = generator
    if gen == '':
      gen = random.choice([x for x in self.generators])
    selected = self.generators[gen]
    idList = selected.generateWord()
    # The generator only stores the id of its phonology: resolve it in this file
    phonology = self.phonologies[selected.phonology]
    return phonology.formatWord(idList)