Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

2018-06-09 03:02:45 +02:00 · 2018-06-09 03:02:45 +02:00 · bc5f677aa6
parent be47b5526c
commit bc5f677aa6
2 changed files with 125 additions and 0 deletions
--- a/py-phonagen/phonagen.py
+++ b/py-phonagen/phonagen.py
@ -0,0 +1,101 @@
+"""Common functions and classes for phonagen tools"""
+import json
+import io
+import sys
+import csv
+
+class Phonology:
+  """A phonology: phoneme entries together with their transcriptions"""
+  def __init__(self, id = '', description = '', transcriptions = None, mainTranscription = '', entries = None):
+    """Create a phonology.
+
+    transcriptions and entries default to a fresh empty list for each
+    instance; a list passed by the caller is stored as is (not copied).
+    """
+    self.id = id
+    self.description = description
+    # None stands for "not given": a [] default would be shared by every instance
+    self.transcriptions = [] if transcriptions is None else transcriptions
+    self.mainTranscription = mainTranscription
+    self.entries = [] if entries is None else entries
+
+  def isValid(self):
+    """Tell whether the phonology has a non-empty id"""
+    return self.id != ''
+
+  def toJsonStruct(self):
+    """Convert a Phonology to a Json structure"""
+    return { 'id': self.id,
+             'description': self.description,
+             'transcriptions': self.transcriptions,
+             'main-transcription': self.mainTranscription,
+             'entries': self.entries }
+
+  def fromJsonStruct(self, struct):
+    """Create a Phonology from a Json structure
+
+    struct uses the keys written by toJsonStruct. A new Phonology is
+    returned; self is left untouched.
+    """
+    return Phonology(struct['id'], struct['description'], struct['transcriptions'], struct['main-transcription'], struct['entries'])
+
+  def fromCsv(self, file):
+    """Fill this phonology from a csv file.
+
+    The first row is the header: every column not named 'id' or
+    'description' is a transcription. One of them must be 'phoneme' and at
+    least one other transcription must exist. When mainTranscription is
+    empty, the first transcription other than 'phoneme' is used; when id is
+    empty, the main transcription is used as id. Each following row is
+    appended to self.entries as a dict keyed by the header names, except
+    rows where both the phoneme and the main transcription are empty.
+
+    Raises Exception when the file has no header row, when the header has
+    no 'phoneme' column or no other transcription, or when the main
+    transcription is not one of the transcriptions.
+    """
+    # newline='' is what the csv module asks for; utf-8 matches what writeTo emits
+    with open(file, newline='', encoding='utf-8') as csvfile:
+      fileReader = csv.reader(csvfile)
+      # get csv header
+      header = next(fileReader, None)
+      if header is None:
+        raise Exception('No header found in file {}'.format(file))
+      # get the transcriptions (header items not id or description)
+      self.transcriptions = [x for x in header if x not in ['id', 'description']]
+      # Check: self.transcriptions should contain 'phoneme'
+      if 'phoneme' not in self.transcriptions:
+        raise Exception('phoneme column not found in {}'.format(file))
+      # Check: there should be at least one transcription besides 'phoneme'
+      otherTranscriptions = [x for x in self.transcriptions if x != 'phoneme']
+      if not otherTranscriptions:
+        raise Exception('No transcription found outside phoneme in file {}. Did you name it id or description?'.format(file))
+      # If main-transcription was not given, use the first transcription which is not phoneme
+      if self.mainTranscription == '':
+        self.mainTranscription = otherTranscriptions[0]
+      # Check: self.mainTranscription should be in self.transcriptions
+      if self.mainTranscription not in self.transcriptions:
+        raise Exception('main-transcription {} not in list of transcriptions'.format(self.mainTranscription))
+      # If id was not given, use the mainTranscription as the id
+      if self.id == '':
+        self.id = self.mainTranscription
+      # The header does not change while reading the rows
+      generateId = 'id' not in header
+      addDescription = 'description' not in header
+      # parse entries
+      for row in fileReader:
+        # All absent elements are set to ''
+        entry = {name: '' for name in header}
+        # zip stops at the shorter sequence: cells beyond the header are ignored
+        entry.update(zip(header, row))
+        # if both phoneme and main-transcription are empty, skip the row
+        if entry['phoneme'] == '' and entry[self.mainTranscription] == '':
+          continue
+        # if id is not provided, generate it
+        if generateId:
+          entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
+        # if description is not provided, add an empty one
+        if addDescription:
+          entry['description'] = ''
+        self.entries.append(entry)
+
+class PhonagenFile:
+  """Holds the phonologies and generators of a phonagen file, indexed by id"""
+  def __init__(self):
+    self.phonologies = {}
+    self.generators = {}
+
+  def addPhonology(self, phonology):
+    """Store a phonology under its id; an invalid phonology is ignored"""
+    if not phonology.isValid():
+      return
+    self.phonologies[phonology.id] = phonology
+
+  def addGenerator(self, generator):
+    """Store a generator under its id; an invalid generator is ignored"""
+    if not generator.isValid():
+      return
+    self.generators[generator.id] = generator
+
+  def getPhonology(self, id):
+    """Return the phonology stored under id (KeyError if there is none)"""
+    phonology = self.phonologies[id]
+    return phonology
+
+  def getGenerator(self, id):
+    """Return the generator stored under id (KeyError if there is none)"""
+    generator = self.generators[id]
+    return generator
+
+  def writeTo(self, file = ''):
+    """Write the phonologies and generators as JSON
+
+    The JSON goes to the file named by file, encoded as utf-8, or to
+    standard output when file is the empty string.
+    """
+    phonologyStructs = [phonology.toJsonStruct() for phonology in self.phonologies.values()]
+    generatorStructs = [generator.toJsonStruct() for generator in self.generators.values()]
+    document = { 'phonologies': phonologyStructs,
+                 'generators': generatorStructs }
+    if file != '':
+      with open(file, 'w', encoding='utf-8') as target:
+        json.dump(document, target, ensure_ascii=False)
+      return
+    json.dump(document, sys.stdout, ensure_ascii=False)
--- a/py-phonagen/phonology-csv2json.py
+++ b/py-phonagen/phonology-csv2json.py
@ -0,0 +1,24 @@
+#! /usr/bin/env python3
+
+import argparse
+import phonagen
+
+def parseArgs():
+  """Parse the command line and return the resulting argparse namespace"""
+  parser = argparse.ArgumentParser(description='Convert a phonology from csv to json.')
+  # The only positional argument: the csv file to read
+  parser.add_argument('file', metavar='csvfile', help='csv file to convert')
+  # Optional arguments as (flag, metavar, help); each one defaults to ''
+  optionalArguments = [
+    ('--id', 'id', 'id of the phonology; guessed from the csv header if not provided'),
+    ('--description', 'description', 'description of the phonology; empty if not provided'),
+    ('--main', 'main-transcription', 'main transcription of the phonology; must correspond to an element of the csv header (outside id and description); guessed from the csv header if not provided.'),
+    ('--output', 'output-file', 'Output file for the phonology. The file is printed to standard output if not given.'),
+  ]
+  for flag, metavar, helpText in optionalArguments:
+    parser.add_argument(flag, metavar=metavar, help=helpText, default='')
+  return parser.parse_args()
+
+# Main: convert the csv file named on the command line into a phonagen json file
+if __name__ == '__main__':
+  options = parseArgs()
+  # Command line values left empty are filled in by fromCsv from the csv header
+  converted = phonagen.Phonology(id = options.id, description = options.description, mainTranscription = options.main)
+  converted.fromCsv(options.file)
+  # Wrap the phonology in a phonagen file and write it out
+  outputFile = phonagen.PhonagenFile()
+  outputFile.addPhonology(converted)
+  outputFile.writeTo(options.output)