Add list2rule generator (generating rules from examples)

2018-06-10 22:55:04 +02:00 · 2018-06-10 22:55:04 +02:00 · a618ce0530
parent 6ec27d4429
commit a618ce0530
2 changed files with 285 additions and 7 deletions
--- a/py-phonagen/generator-list2rule.py
+++ b/py-phonagen/generator-list2rule.py
@ -0,0 +1,31 @@
+#! /usr/bin/env python3
+
+import argparse
+import phonagen
+
+def parseArgs():
+  """Parse the command line of the list2rule tool.
+
+  Returns the argparse namespace with the attributes file, id, description,
+  phonology, phonologyfile and output (description and output default to '').
+  """
+  # Define argument parser
+  parser = argparse.ArgumentParser(description='Convert an example list to a rule generator.')
+  parser.add_argument('file', metavar='listfile', help='list file to convert')
+  parser.add_argument('--id', metavar='id', help='id of the generator', required = True)
+  # The description is the one given to the generator, not to the phonology
+  parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
+  parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which the generator is based', required = True)
+  parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; mandatory; the phonology will be present in the output', required = True)
+  parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')
+  # Parse arguments
+  return parser.parse_args()
+
+# Main: train a rule generator on the example list and write it, together with
+# its phonology, to the output
+if __name__ == '__main__':
+  args = parseArgs()
+  generator = phonagen.RuleGenerator(id = args.id, description = args.description, phonology = args.phonology)
+  # Load the phonology on which the generator is based from its own file
+  phonologyFile = phonagen.PhonagenFile()
+  phonologyFile.load(args.phonologyfile)
+  phonology = phonologyFile.getPhonology(args.phonology)
+  # Build the rules from the examples
+  generator.fromExamples(args.file, phonology)
+  # The output contains both the phonology and the generator
+  phonagenFile = phonagen.PhonagenFile()
+  phonagenFile.addPhonology(phonology)
+  phonagenFile.addGenerator(generator)
+  phonagenFile.writeTo(args.output)
--- a/py-phonagen/phonagen.py
+++ b/py-phonagen/phonagen.py
@ -4,6 +4,7 @@ import io
 import sys
 import csv
 import random
+import unicodedata

 class Phonology:
  """Phonology class"""
@ -79,6 +80,72 @@ class Phonology:
            entry.update({'description': ''})
          self.entries.update({entry['id']: entry})

+  def formatWord(self, idList):
+    """Return a table of transcription -> string corresponding to the same word
+
+    idList is a sequence of phoneme ids of this phonology. For each transcription,
+    the transcriptions of the phonemes are concatenated in the order of idList.
+    Raises KeyError if an id is not an entry of the phonology.
+    """
+    ids = list(idList)
+    # Strings are immutable (they have no append): build each one with a join
+    return {t: "".join(self.entries[x][t] for x in ids) for t in self.transcriptions}
+
+  def getStress(self):
+    """Return the id of the phoneme used as stress mark.
+
+    Entries whose description contains the #stress tag are considered first.
+    When there is none, entries whose 'phoneme' transcription contains "'"
+    (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) are used instead.
+    The first match is returned; an Exception is raised when nothing matches.
+    """
+    allEntries = list(self.entries.values())
+    matches = [e['id'] for e in allEntries if '#stress' in e['description']]
+    if not matches:
+      # Not tagged: fall back on the phonemic transcription
+      matches = [e['id'] for e in allEntries if any(mark in e['phoneme'] for mark in ("'", "ˈ"))]
+    if not matches:
+      raise Exception('No stress phoneme in phonology', self.id)
+    return matches[0]
+
+  def getSyllableBreak(self):
+    """Return the id of the phoneme used as syllable break.
+
+    Entries whose description contains the #syllable-break tag are considered first.
+    When there is none, entries whose 'phoneme' transcription contains '.'
+    (full stop, u+002E) are used instead.
+    The first match is returned; an Exception is raised when nothing matches.
+    """
+    allEntries = list(self.entries.values())
+    matches = [e['id'] for e in allEntries if '#syllable-break' in e['description']]
+    if not matches:
+      # Not tagged: fall back on the phonemic transcription
+      matches = [e['id'] for e in allEntries if '.' in e['phoneme']]
+    if not matches:
+      raise Exception('No syllable break phoneme in phonology', self.id)
+    return matches[0]
+
+  # Letters accepted by isVowel as the base character of a vowel
+  vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
+  @staticmethod
+  def isVowel(phoneme):
+    """Tell whether a phonemic transcription starts with a vowel letter.
+
+    The transcription is decomposed (NFD) so that the first character is the
+    base letter without its combining diacritics. Returns False for an empty string.
+    Declared static (it takes no instance), so it can be called on the class or on an instance.
+    """
+    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)
+
+  # Letters accepted by isConsonant as the base character of a consonant
+  consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"
+  @staticmethod
+  def isConsonant(phoneme):
+    """Tell whether a phonemic transcription starts with a consonant letter.
+
+    The transcription is decomposed (NFD) so that the first character is the
+    base letter without its combining diacritics. Returns False for an empty string.
+    Declared static (it takes no instance), so it can be called on the class or on an instance.
+    """
+    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
+
+  def isOnset(self, id):
+    """Tell whether the phoneme with the given id can be a syllable onset.
+
+    The #onset and #consonant tags of the description decide first. If the
+    description carries none of them but has a #vowel, #nucleus or #coda tag,
+    the answer is False. Without any of these tags, the answer is guessed from
+    the 'phoneme' transcription with isConsonant.
+    """
+    entry = self.entries[id]
+    tags = entry['description']
+    if ('#onset' in tags) or ('#consonant' in tags):
+      return True
+    if any(tag in tags for tag in ('#vowel', '#nucleus', '#coda')):
+      return False
+    # No usable tag: guess from the transcription
+    return Phonology.isConsonant(entry['phoneme'])
+
+  def isNucleus(self, id):
+    """Tell whether the phoneme with the given id can be a syllable nucleus.
+
+    The #nucleus and #vowel tags of the description decide first. If the
+    description carries none of them but has a #consonant, #onset or #coda tag,
+    the answer is False. Without any of these tags, the answer is guessed from
+    the 'phoneme' transcription with isVowel.
+    """
+    entry = self.entries[id]
+    tags = entry['description']
+    if ('#nucleus' in tags) or ('#vowel' in tags):
+      return True
+    if any(tag in tags for tag in ('#consonant', '#onset', '#coda')):
+      return False
+    # No usable tag: guess from the transcription
+    return Phonology.isVowel(entry['phoneme'])
+
+  def isCoda(self, id):
+    """Tell whether the phoneme with the given id can be a syllable coda.
+
+    The #coda and #consonant tags of the description decide first. If the
+    description carries none of them but has a #vowel, #nucleus or #onset tag,
+    the answer is False. Without any of these tags, the answer is guessed from
+    the 'phoneme' transcription with isConsonant.
+    """
+    entry = self.entries[id]
+    tags = entry['description']
+    if ('#coda' in tags) or ('#consonant' in tags):
+      return True
+    if any(tag in tags for tag in ('#vowel', '#nucleus', '#onset')):
+      return False
+    # No usable tag: guess from the transcription
+    return Phonology.isConsonant(entry['phoneme'])
+
 class Distribution:
  """Discrete distribution"""
  def __init__(self):
@ -91,7 +158,7 @@ class Distribution:
    self.items.update({value: oc})

  def pickFrom(self):
-    return random.choices([k for k in self.items.keys()], [v for v in self.items.values])[0]
+    return random.choices([k for k in self.items.keys()], [v for v in self.items.values()])[0]

  def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
    return [{itemRef: x, occurencesRef: self.items[x]} for x in self.items]
@ -101,6 +168,9 @@ class Distribution:
    for item in struct:
      self.items.update({item[itemRef]: item[occurencesRef]})

+  def isEmpty(self):
+    """Return True when the distribution holds no item."""
+    return not self.items
+
 class Generator:
  """Parent class for all generators"""
  def __init__(self, id = '', description = '', phonology = ''):
@ -122,6 +192,9 @@ class Generator:
    self.description = struct['description']
    self.phonology = struct['phonology']

+  def generateWord(self):
+    """Generate a word; must be overridden by the concrete generators.
+
+    Raises NotImplementedError (a subclass of Exception) on the parent class.
+    """
+    raise NotImplementedError('Word generation not supported on abstract generator')
+
 class ChainGenerator(Generator):
  """Chains-based generator"""
  def __init__(self, order = 1, **kwargs):
@ -151,10 +224,10 @@ class ChainGenerator(Generator):
      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
      for row in fileReader:
        if len(row) != 0:
-          row.append("") # Add terminator element (empty string)
-          previous = tuple("" for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
+          row.append('') # Add terminator element (empty string)
+          previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
          for item in row:
-            if (item != "") and (phonology.isValid()) and (not phonology.has(item)):
+            if (item != '') and (phonology.isValid()) and (not phonology.has(item)):
              raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
            if previous in self.chains:
              self.chains[previous].addTo(item)
@ -164,6 +237,17 @@ class ChainGenerator(Generator):
              self.chains.update({previous: dist})
            previous = previous[1:] + (item,)

+  def generateWord(self):
+    """Generate a word as a list of ids by walking the chains.
+
+    Starting from a context of self.order empty strings, the next item is picked
+    from the distribution associated with the current context, until the
+    terminator (empty string) is picked. The terminator is not part of the result.
+    """
+    word = []
+    context = tuple('' for _ in range(self.order)) # Initial sequence (empty strings, length = self.order)
+    while True:
+      item = self.chains[context].pickFrom()
+      if item == '':
+        break
+      word.append(item)
+      # Slide the context: drop the oldest item, add the new one
+      context = context[1:] + (item,)
+    return word
+
 class RuleGenerator(Generator):
  """Rules-based generator"""
  def __init__(self, **kwargs):
@ -173,12 +257,167 @@ class RuleGenerator(Generator):

  def toJsonStruct(self):
    struct = super().toJsonStruct()
-    # TODO: add rules
+    struct.update({'type': 'rules',
+                   'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')} for x in self.rules]})
    return struct

  def fromJsonStruct(self, struct):
    super().fromJsonStruct(struct)
-    # TODO: rules
+    for ruleStruct in struct['rules']:
+      dist = Distribution()
+      # The pattern should be converted from a list to a tuple
+      dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences':x['occurences']} for x in ruleStruct['distribution']], itemRef = 'pattern', occurencesRef = 'occurences')
+      self.rules.update({ruleStruct['id']: dist})
+
+  def generatePattern(self, pattern):
+    """Expand a pattern into a flat list of ids.
+
+    Each element of the pattern that is the id of a rule is replaced, recursively,
+    by the expansion of a pattern picked from this rule; the other elements
+    are kept as they are.
+    """
+    output = []
+    for x in pattern:
+      if x in self.rules:
+        # The expansion of a rule is a list: extend, to keep the output flat
+        output.extend(self.generatePattern(self.rules[x].pickFrom()))
+      else:
+        output.append(x)
+    return output
+
+  def generateWord(self):
+    """Generate a word as a list of ids, by expanding a pattern picked from the 'word' rule."""
+    wordPattern = self.rules['word'].pickFrom()
+    return self.generatePattern(wordPattern)
+
+  def processRowFromExample(self, row, stressId, syllableBreakId):
+    """Add one example word to the 'word' rule and to the syllable rules.
+
+    row is the list of ids of the example; stressId and syllableBreakId are the ids
+    that separate syllables (a stress mark applies to the syllable that follows it).
+    The 'word' rule and the syllable rules (single, initial, final, middle and their
+    '-stressed' variants) must already exist in self.rules.
+    Examples with more than one stress mark are reported on standard output and skipped.
+    Empty items are ignored, and a row without any syllable adds nothing.
+    """
+    # Ignore empty items (e.g. produced by a trailing delimiter in the list file)
+    row = [x for x in row if x != '']
+    # Check the number of stress
+    if row.count(stressId) > 1:
+      print("Too much stress in " + str(row) + ": skip the example")
+      return
+    # Build the syllable list
+    syllables = []
+    currentSyllable = []
+    stressedSyllableIdx = -1
+    for x in row:
+      # Append to the current syllable if not a syllable separator
+      if (x != stressId) and (x != syllableBreakId):
+        currentSyllable.append(x)
+        continue
+      # In case of syllable separator, only add the syllable to the list if it is not empty
+      if currentSyllable:
+        syllables.append(currentSyllable)
+        currentSyllable = []
+      # If current id is stress, remember the position of the stressed syllable (the next one)
+      if x == stressId:
+        stressedSyllableIdx = len(syllables)
+    # After the loop, the current syllable should be non-empty, add it to the list of syllables
+    if currentSyllable:
+      syllables.append(currentSyllable)
+    # Only separators in the row: there is no word to learn from
+    if not syllables:
+      return
+    # Single syllable case
+    if len(syllables) == 1:
+      if stressedSyllableIdx == 0:
+        self.rules['word'].addTo(tuple([stressId, 'single']))
+      else:
+        self.rules['word'].addTo(tuple(['single']))
+      self.rules['single'].addTo(tuple(syllables[0]))
+      return
+    # Other cases
+    wordPattern = []
+    lastIdx = len(syllables) - 1
+    for idx, syllable in enumerate(syllables):
+      if idx == 0:
+        rule = 'initial'
+      elif idx == lastIdx:
+        rule = 'final'
+      else:
+        rule = 'middle'
+      separator = syllableBreakId
+      if idx == stressedSyllableIdx:
+        rule = rule + '-stressed'
+        separator = stressId
+      # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
+      if (separator == stressId) or (idx > 0):
+        wordPattern.append(separator)
+      # Add the rule to the pattern
+      wordPattern.append(rule)
+      # The syllable is added to the corresponding rule
+      self.rules[rule].addTo(tuple(syllable))
+    self.rules['word'].addTo(tuple(wordPattern))
+
+  def splitSyllableRule(self, syllableRule, phonology):
+    """Replace the patterns of a syllable rule with onset/nucleus/coda patterns.
+
+    Each pattern of self.rules[syllableRule] is cut into up to three consecutive parts:
+    the onset (everything before the first phoneme for which phonology.isNucleus is true),
+    the nucleus (up to the first following phoneme for which phonology.isCoda is true)
+    and the coda (the rest). Non-empty parts are added to the '<rule>-onset',
+    '<rule>-nucleus' and '<rule>-coda' rules, and the syllable rule is replaced by a
+    distribution over the sequences of sub-rule ids. The occurrence count of the
+    original pattern is passed along each time.
+    """
+    splitDist = Distribution()
+    sourceDist = self.rules[syllableRule]
+    # Ids of the sub-rules, in syllable order; each one starts with an empty distribution
+    partRules = (syllableRule + '-onset', syllableRule +'-nucleus', syllableRule + '-coda')
+    for name in partRules:
+      self.rules[name] = Distribution()
+    # For each pattern, split into onset/nucleus/coda
+    for pattern, occurences in sourceDist.items.items():
+      stage = 0 # index in parts: 0 = onset, 1 = nucleus, 2 = coda
+      parts = ([], [], [])
+      for phoneme in pattern:
+        # Check if there is a change of element (only forward, one step at a time)
+        if stage == 0:
+          if phonology.isNucleus(phoneme):
+            stage = 1
+        elif stage == 1:
+          if phonology.isCoda(phoneme):
+            stage = 2
+        parts[stage].append(phoneme)
+      # Add to the specific distributions and determine the pattern in new distribution
+      subPattern = []
+      for name, part in zip(partRules, parts):
+        if part:
+          subPattern.append(name)
+          self.rules[name].addTo(tuple(part), occurences)
+      splitDist.addTo(tuple(subPattern), occurences)
+    # Replace the old rules with the new rules
+    self.rules[syllableRule] = splitDist
+
+  def fromExamples(self, file, phonology):
+    """Train a rule generator on an example file
+
+    file is the path of a list file read with a space delimiter, one example per row;
+    phonology provides the stress and syllable break ids and is used to check the ids.
+    Raises an Exception when a non-empty item of a row is not an id of the phonology.
+    """
+    stressId = phonology.getStress()
+    syllableBreakId = phonology.getSyllableBreak()
+    # Words are modelled as lists of syllables, with one of those being stressed (optionally)
+    # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
+    # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
+    #
+    # The 'word' rule and the syllable rules all start with an empty distribution
+    syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
+    self.rules['word'] = Distribution()
+    for ruleId in syllableRules:
+      self.rules[ruleId] = Distribution()
+    # Step 1: open the file and find how words look like
+    with open(file) as exampleFile:
+      for row in csv.reader(exampleFile, delimiter=' ', skipinitialspace=True):
+        if not row:
+          continue
+        # Check the items in row
+        for item in row:
+          if item == '':
+            continue
+          if not phonology.has(item):
+            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
+        # Process the row
+        self.processRowFromExample(row, stressId, syllableBreakId)
+    # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
+    for ruleId in syllableRules:
+      self.splitSyllableRule(ruleId, phonology)
+    # Step 3: remove the empty rules
+    self.rules = {ruleId: dist for ruleId, dist in self.rules.items() if not dist.isEmpty()}

 generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
 def makeGenerator(struct):
@ -231,7 +470,7 @@ class PhonagenFile:
      json.dump(outputStruct, sys.stdout, ensure_ascii=False)
    else:
      with open(file, 'w', encoding='utf-8') as outputFile:
-        json.dump(outputStruct, outputFile, ensure_ascii=False)
+        json.dump(outputStruct, outputFile, ensure_ascii=False,  indent=2)

  def mergeFrom(self, otherFile):
    """Add all phonologies and generators from the other file into this one."""
@ -239,3 +478,11 @@ class PhonagenFile:
      self.addPhonology(phonology)
    for generator in otherFile.generators.values():
      self.addGenerator(generator)
+
+  def generateWord(self, generator = ''):
+    """Generate a word and return it as a table of transcription -> string.
+
+    generator is the id of the generator to use; when empty, a generator of the
+    file is chosen at random. The word is formatted with the phonology
+    referenced by the chosen generator.
+    """
+    gen = generator
+    if gen == '':
+      gen = random.choice(list(self.generators))
+    chosen = self.generators[gen]
+    idList = chosen.generateWord()
+    phonology = self.phonologies[chosen.phonology]
+    return phonology.formatWord(idList)