# phonagen/py-phonagen/phonagen.py

"""Common functions and classes for phonagen tools"""
import json
import io
import sys
import csv

class Phonology:
  """A phonology: a list of phoneme entries plus the transcriptions used to write them.

  Attributes:
    id: identifier of the phonology; an empty string marks it as invalid.
    description: free-text description.
    transcriptions: names of the transcription columns (includes 'phoneme'
      when loaded through fromCsv).
    mainTranscription: name of the transcription used by default.
    entries: list of dicts, one per phoneme.
  """
  def __init__(self, id = '', description = '', transcriptions = None, mainTranscription = '', entries = None):
    """Build a Phonology.

    `transcriptions` and `entries` default to a fresh empty list for every
    instance (a literal `[]` default would be shared by all instances and
    fromCsv would then append every file's entries to the same list).
    """
    self.id = id
    self.description = description
    self.transcriptions = transcriptions if transcriptions is not None else []
    self.mainTranscription = mainTranscription
    self.entries = entries if entries is not None else []

  def isValid(self):
    """Return True when the phonology has a non-empty id."""
    return self.id != ''

  def toJsonStruct(self):
    """Convert a Phonology to a Json structure (a dict ready for json.dump)"""
    return { 'id': self.id,
             'description': self.description,
             'transcriptions': self.transcriptions,
             'main-transcription': self.mainTranscription,
             'entries': self.entries }

  def fromJsonStruct(self, struct):
    """Create a Phonology from a Json structure

    `struct` uses the same keys as the dict produced by toJsonStruct.
    A new Phonology is returned; `self` is left untouched.
    Raises KeyError if one of the expected keys is missing.
    """
    return Phonology(struct['id'],
                     struct['description'],
                     struct['transcriptions'],
                     struct['main-transcription'],
                     struct['entries'])

  def fromCsv(self, file):
    """Fill this Phonology from the CSV file at path `file`.

    The first row is the header. Columns other than 'id' and 'description'
    are transcriptions; a 'phoneme' column and at least one other
    transcription are required. If mainTranscription is empty it is set to
    the first transcription that is not 'phoneme'; if id is empty it is set
    to mainTranscription. Each following row becomes a dict appended to
    self.entries, except rows whose 'phoneme' and main transcription are
    both empty. Missing trailing cells are read as '', cells beyond the
    header width are ignored.

    Raises:
      Exception: if the header is missing or has no 'phoneme' column, if no
        other transcription column exists, or if mainTranscription is not
        one of the transcription columns.
    """
    # newline='' is what the csv module requires for correct quoted-newline
    # handling; utf-8 is explicit because phoneme symbols are usually
    # non-ASCII and must not depend on the platform default encoding.
    with open(file, newline='', encoding='utf-8') as csvfile:
      fileReader = csv.reader(csvfile)
      # get csv header (an empty file is treated as an empty header and is
      # rejected by the 'phoneme' check below)
      header = next(fileReader, [])
      # get the transcriptions (header items not id or description)
      self.transcriptions = [x for x in header if x not in ('id', 'description')]
      # Check: self.transcriptions should contain 'phoneme'
      if 'phoneme' not in self.transcriptions:
        raise Exception('phoneme column not found in ', file)
      # Check: there must be at least one transcription besides 'phoneme'
      otherTranscriptions = [x for x in self.transcriptions if x != 'phoneme']
      if not otherTranscriptions:
        raise Exception('No transcription found outside phoneme in file ', file, 'Did you named it id or description ?')
      # If main-transcription was not set beforehand, guess it: first header
      # item which is not one of id, description, phoneme
      if self.mainTranscription == '':
        self.mainTranscription = otherTranscriptions[0]
      # Check: self.mainTranscription should be in self.transcriptions
      if self.mainTranscription not in self.transcriptions:
        raise Exception('main-transcription', self.mainTranscription, 'not in list of transcriptions')
      # If id was not set beforehand, use the mainTranscription as the id
      if self.id == '':
        self.id = self.mainTranscription
      # These do not change from row to row
      hasId = 'id' in header
      hasDescription = 'description' in header
      # parse entries
      for row in fileReader:
        # All absent elements are set to ''; zip drops cells past the header
        padded = row + [''] * (len(header) - len(row))
        entry = dict(zip(header, padded))
        # if both phoneme and main-transcription are empty, skip the row
        if entry['phoneme'] == '' and entry[self.mainTranscription] == '':
          continue
        # if id is not provided, generate it
        if not hasId:
          entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
        # if description is not provided, add an empty one
        if not hasDescription:
          entry['description'] = ''
        self.entries.append(entry)

class PhonagenFile:
  """Container for the phonologies and generators of one phonagen file.

  Both collections are dicts keyed by the object's `id` attribute.
  """
  def __init__(self):
    self.phonologies = dict()
    self.generators = dict()

  def addPhonology(self, phonology):
    """Register `phonology` under its id; invalid ones are silently ignored."""
    if not phonology.isValid():
      return
    self.phonologies[phonology.id] = phonology

  def addGenerator(self, generator):
    """Register `generator` under its id; invalid ones are silently ignored."""
    if not generator.isValid():
      return
    self.generators[generator.id] = generator

  def getPhonology(self, id):
    """Return the phonology registered under `id` (KeyError if unknown)."""
    return self.phonologies[id]

  def getGenerator(self, id):
    """Return the generator registered under `id` (KeyError if unknown)."""
    return self.generators[id]

  def writeTo(self, file = ''):
    """Serialize all phonologies and generators as one JSON document.

    The document is written to the path `file` (utf-8), or to standard
    output when `file` is the empty string.
    """
    phonologyStructs = [item.toJsonStruct() for item in self.phonologies.values()]
    generatorStructs = [item.toJsonStruct() for item in self.generators.values()]
    document = {}
    document['phonologies'] = phonologyStructs
    document['generators'] = generatorStructs
    if file != '':
      with open(file, 'w', encoding='utf-8') as target:
        json.dump(document, target, ensure_ascii=False)
      return
    json.dump(document, sys.stdout, ensure_ascii=False)