Example list to Markov chain generator.

2018-06-09 18:58:46 +02:00 · 2018-06-09 18:58:46 +02:00 · 5f204ce9e9
parent bc5f677aa6
commit 5f204ce9e9
2 changed files with 175 additions and 8 deletions
--- a/py-phonagen/generator-list2chain.py
+++ b/py-phonagen/generator-list2chain.py
@ -0,0 +1,34 @@
+#! /usr/bin/env python3
+
+import argparse
+import phonagen
+
+def parseArgs():
+  """Parse the command-line arguments of the list-to-chain converter.
+
+  Returns the argparse namespace with the attributes file, id, description,
+  phonology, phonologyfile, order and output.
+  """
+  # Define argument parser
+  parser = argparse.ArgumentParser(description='Convert an example list to a chain generator.')
+  parser.add_argument('file', metavar='listfile', help='list file to convert')
+  parser.add_argument('--id', metavar='id', help='id of the generator', required = True)
+  # The description is handed to the generator, not to the phonology
+  parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
+  parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which the generator is based', required = True)
+  parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; optional, if provided, examples will be checked for unknown phonemes and the phonology will be present in the output', default='')
+  parser.add_argument('--order', metavar='order', help='order of the chain; 1 by default', default=1, type=int)
+  parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')
+  # Parse arguments
+  return parser.parse_args()
+
+# Script entry point
+if __name__ == '__main__':
+  args = parseArgs()
+  # Start from an empty (hence invalid) phonology; it is replaced when a phonology file is given
+  phonology = phonagen.Phonology()
+  if args.phonologyfile != '':
+    source = phonagen.PhonagenFile()
+    source.load(args.phonologyfile)
+    phonology = source.getPhonology(args.phonology)
+  # Train the chain generator on the example list
+  generator = phonagen.ChainGenerator(id = args.id, description = args.description, phonology = args.phonology, order = args.order)
+  generator.fromExamples(args.file, phonology)
+  # Bundle the phonology and the generator, then write the result
+  result = phonagen.PhonagenFile()
+  result.addPhonology(phonology)
+  result.addGenerator(generator)
+  result.writeTo(args.output)
--- a/py-phonagen/phonagen.py
+++ b/py-phonagen/phonagen.py
@ -3,32 +3,41 @@ import json
 import io
 import sys
 import csv
+import random

 class Phonology:
  """Phonology class"""
-  def __init__(self, id = '', description = '', transcriptions = [], mainTranscription = '', entries = []):
+  def __init__(self, id = '', description = '', mainTranscription = ''):
    self.id = id
    self.description = description
-    self.transcriptions = transcriptions
+    self.transcriptions = []
    self.mainTranscription = mainTranscription
-    self.entries = entries
+    self.entries = {} # id -> entry

  def isValid(self):
    return self.id != ''

+  def has(self, id):
+    """Tell whether the phonology contains an entry with the given id"""
+    return id in self.entries
+
  def toJsonStruct(self):
    """Convert a Phonology to a Json structure"""
    return { 'id': self.id,
             'description': self.description,
             'transcriptions': self.transcriptions,
             'main-transcription': self.mainTranscription,
-             'entries': self.entries }
+             'entries': [x for x in self.entries.values()] }

  def fromJsonStruct(self, struct):
-    """Create a Phonology from a Json structure"""
-    return Phonology(struct['id'], struct['decription'], struct['transcriptions'], struct['main-transcription'], struct['entries'])
+    """Fill a Phonology from a Json structure"""
+    self.id = struct['id']
+    self.description = struct['description']
+    self.transcriptions = struct['transcriptions']
+    self.mainTranscription = struct['main-transcription']
+    self.entries = {x['id']: x for x in struct['entries']}

  def fromCsv(self, file):
+    """Fill a Phonology from a Csv file"""
    with open(file) as csvfile:
      fileReader = csv.reader(csvfile)
      # get csv header
@ -68,7 +77,118 @@ class Phonology:
          # if description is not provided, add an empty one
          if 'description' not in header:
            entry.update({'description': ''})
-          self.entries.append(entry)
+          self.entries.update({entry['id']: entry})
+
+class Distribution:
+  """Discrete distribution: maps each value to its number of occurences"""
+  def __init__(self):
+    self.items = {} # value -> occurences
+
+  def addTo(self, value, occurences = 1):
+    """Record `occurences` more occurences of `value`"""
+    self.items[value] = self.items.get(value, 0) + occurences
+
+  def pickFrom(self):
+    """Pick a random value, weighted by its number of occurences"""
+    # values() has to be called: the bound method itself is not iterable
+    return random.choices(list(self.items.keys()), weights = list(self.items.values()))[0]
+
+  def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
+    """Convert to a list of {itemRef: value, occurencesRef: occurences} structures"""
+    return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]
+
+  def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
+    """Replace the content with the items of a list shaped like the output of toJsonStruct"""
+    self.items = {item[itemRef]: item[occurencesRef] for item in struct}
+
+class Generator:
+  """Base class shared by every kind of generator"""
+  def __init__(self, id = '', description = '', phonology = ''):
+    self.id, self.description, self.phonology = id, description, phonology
+    self.isTyped = False # the base class has no concrete type
+
+  def isValid(self):
+    """A generator is valid when it has a non-empty id and a concrete type"""
+    if self.id == '':
+      return False
+    return self.isTyped
+
+  def toJsonStruct(self):
+    """Convert the common generator fields to a Json structure"""
+    return {key: getattr(self, key) for key in ('id', 'description', 'phonology')}
+
+  def fromJsonStruct(self, struct):
+    """Fill the common generator fields from a Json structure"""
+    for key in ('id', 'description', 'phonology'):
+      setattr(self, key, struct[key])
+
+class ChainGenerator(Generator):
+  """Chains-based generator"""
+  def __init__(self, order = 1, **kwargs):
+    super().__init__(**kwargs)
+    self.order = order
+    self.chains = {} # input (tuple of previous items) -> distribution of outputs
+    self.isTyped = True
+
+  def toJsonStruct(self):
+    """Convert a ChainGenerator to a Json structure"""
+    struct = super().toJsonStruct()
+    struct.update({'type': 'chains',
+                   'order': self.order,
+                   'chains': [{'input': x, 'possible-outputs': dist.toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')} for x, dist in self.chains.items()]})
+    return struct
+
+  def fromJsonStruct(self, struct):
+    """Fill a ChainGenerator from a Json structure"""
+    super().fromJsonStruct(struct)
+    self.order = struct['order']
+    for chainStruct in struct['chains']:
+      dist = Distribution()
+      dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
+      # JSON arrays are decoded as lists, which are not hashable: use a tuple as key, like fromExamples does
+      self.chains[tuple(chainStruct['input'])] = dist
+
+  def fromExamples(self, file, phonology):
+    """Train a chain generator on an example file
+
+    Each non-empty row of the file is one example: a space-separated list of items.
+    If the phonology is valid, every item has to be one of its ids.
+    Raises ValueError if an item is unknown to a valid phonology.
+    """
+    checkItems = phonology.isValid() # an invalid (empty) phonology disables the check
+    with open(file) as exampleFile:
+      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
+      for row in fileReader:
+        # The empty string is reserved as terminator: drop empty fields (e.g. from trailing spaces)
+        items = [item for item in row if item != ""]
+        if len(items) == 0:
+          continue
+        if checkItems:
+          for item in items:
+            if not phonology.has(item):
+              raise ValueError('In row {}: {} is not an id in phonology {}'.format(items, item, phonology.id))
+        items.append("") # Add terminator element (empty string)
+        previous = tuple("" for i in range(self.order)) # Initial sequence (self.order empty strings)
+        for item in items:
+          if previous not in self.chains:
+            self.chains[previous] = Distribution()
+          self.chains[previous].addTo(item)
+          # Slide the window: forget the oldest item, remember the new one
+          previous = previous[1:] + (item,)
+
+class RuleGenerator(Generator):
+  """Rules-based generator"""
+  def __init__(self, **kwargs):
+    super().__init__(**kwargs)
+    self.rules = {}
+    self.isTyped = True
+
+  def toJsonStruct(self):
+    """Convert a RuleGenerator to a Json structure"""
+    struct = super().toJsonStruct()
+    # The type is what makeGenerator uses to pick the class when loading
+    struct.update({'type': 'rules'})
+    # TODO: add rules
+    return struct
+
+  def fromJsonStruct(self, struct):
+    """Fill a RuleGenerator from a Json structure"""
+    super().fromJsonStruct(struct)
+    # TODO: rules
+
+generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
+def makeGenerator(struct):
+  """Function instantiating a generator from a JSON structure
+
+  The 'type' field selects the class; a missing or unknown type yields a plain
+  Generator, which reports itself as invalid.
+  """
+  generatorClass = generatorTypeToClass.get(struct.get('type'), Generator)
+  generator = generatorClass()
+  generator.fromJsonStruct(struct)
+  return generator

 class PhonagenFile:
  """A phonagen file, with phonologies and generators"""
@ -90,8 +210,21 @@ class PhonagenFile:
  def getGenerator(self, id):
    return self.generators[id]

+  def load(self, file):
+    """Load phonologies and generators from a JSON file"""
+    with open(file, 'r', encoding='utf-8') as inputFile:
+      content = json.load(inputFile)
+      # Phonologies first, then generators
+      for phonologyStruct in content['phonologies']:
+        loaded = Phonology()
+        loaded.fromJsonStruct(phonologyStruct)
+        self.addPhonology(loaded)
+      for generatorStruct in content['generators']:
+        self.addGenerator(makeGenerator(generatorStruct))
+
  def writeTo(self, file = ''):
-    """Output a JSON file from lists of phonologies and generators"""
+    """Output to a JSON file (or stdout)"""
    outputStruct = { 'phonologies': [x.toJsonStruct() for x in self.phonologies.values()],
                     'generators': [x.toJsonStruct() for x in self.generators.values()] }
    if file == '':