# phonagen/py-phonagen/phonagen.py

"""Common functions and classes for phonagen tools"""
import json
import io
import sys
import csv
import random
import unicodedata

class Phonology:
  """A phonology: a set of phoneme entries written in several transcriptions.

  Attributes:
    id: identifier of the phonology; an empty id makes the phonology invalid.
    description: free-text description.
    transcriptions: list of transcription names (fromCsv requires 'phoneme' to be one of them).
    mainTranscription: name of the main transcription.
    entries: dict of entry id -> entry; an entry is a dict with the keys
      'id', 'description' and one key per transcription.
  """
  def __init__(self, id = '', description = '', mainTranscription = ''):
    self.id = id
    self.description = description
    self.transcriptions = []
    self.mainTranscription = mainTranscription
    self.entries = {} # id -> entry

  def isValid(self):
    """Return True if the phonology has a non-empty id"""
    return self.id != ''

  def has(self, id):
    """Return True if id is the id of one of the entries"""
    return id in self.entries

  def toJsonStruct(self):
    """Convert a Phonology to a Json structure"""
    return { 'id': self.id,
             'description': self.description,
             'transcriptions': self.transcriptions,
             'main-transcription': self.mainTranscription,
             'entries': list(self.entries.values()) }

  def fromJsonStruct(self, struct):
    """Fill a Phonology from a Json structure (raises KeyError on a missing field)"""
    self.id = struct['id']
    self.description = struct['description']
    self.transcriptions = struct['transcriptions']
    self.mainTranscription = struct['main-transcription']
    self.entries = {x['id']: x for x in struct['entries']}

  def fromCsv(self, file):
    """Fill a Phonology from a Csv file

    The first row is the header. Columns other than 'id' and 'description' are
    transcriptions; a 'phoneme' column and at least one other transcription are
    required. If mainTranscription (resp. id) was left empty, it is guessed from
    the first transcription other than 'phoneme' (resp. from mainTranscription).
    Rows whose phoneme and main transcription are both empty are skipped.
    Parsed entries are added to the entries already present.

    Raises Exception if the file is empty or if the header is not usable.
    """
    # utf-8 to match PhonagenFile.load (the content is IPA); newline='' as required by the csv module
    with open(file, encoding='utf-8', newline='') as csvfile:
      fileReader = csv.reader(csvfile)
      # get csv header
      header = next(fileReader, None)
      if header is None:
        raise Exception('No header found in empty file {}'.format(file))
      # get the transcriptions (header items not id or description)
      self.transcriptions = [x for x in header if x not in ['id', 'description']]
      # Check: self.transcriptions should contain 'phoneme'
      if 'phoneme' not in self.transcriptions:
        raise Exception('phoneme column not found in {}'.format(file))
      # Check: self.transcriptions should have at least two items
      if len(self.transcriptions) < 2:
        raise Exception('No transcription found outside phoneme in file {}. Did you name it id or description?'.format(file))
      # If main-transcription was not given by the caller, use the first
      # transcription which is not phoneme (it exists thanks to the check above)
      if self.mainTranscription == '':
        self.mainTranscription = next(x for x in self.transcriptions if x != 'phoneme')
      # Check: self.mainTranscription should be in self.transcriptions
      if self.mainTranscription not in self.transcriptions:
        raise Exception('main-transcription {} not in list of transcriptions'.format(self.mainTranscription))
      # If id was not given by the caller, use the mainTranscription as the id
      if self.id == '':
        self.id = self.mainTranscription
      hasId = 'id' in header
      hasDescription = 'description' in header
      # parse entries
      for row in fileReader:
        # zip ignores the cells in excess of the header
        entry = dict(zip(header, row))
        # All absent elements are set to ''
        for name in header[len(row):]:
          entry[name] = ''
        # if both phoneme and main-transcription are empty, skip the row
        if (entry['phoneme'] == '') and (entry[self.mainTranscription] == ''):
          continue
        # if id is not provided, generate it
        if not hasId:
          entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
        # if description is not provided, add an empty one
        if not hasDescription:
          entry['description'] = ''
        self.entries[entry['id']] = entry

  def formatWord(self, idList):
    """Return a table of transcription -> string corresponding to the same word

    idList is a sequence of entry ids; for each transcription, the strings of
    the entries are concatenated in order. Raises KeyError on an unknown id.
    """
    phonemes = [self.entries[x] for x in idList]
    return {t: ''.join(phoneme[t] for phoneme in phonemes) for t in self.transcriptions}

  def getStress(self):
    """Return the phoneme id of the stress phoneme (raises Exception if there is none)"""
    # search for #stress tag in description
    found = [x['id'] for x in self.entries.values() if '#stress' in x['description']]
    if len(found) == 0:
      # if not tagged, search for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in phoneme transcription
      found = [x['id'] for x in self.entries.values() if ("'" in x['phoneme']) or ("ˈ" in x['phoneme'])]
    if len(found) == 0:
      raise Exception('No stress phoneme in phonology {}'.format(self.id))
    return found[0]

  def getSyllableBreak(self):
    """Return the phoneme id of the syllable break phoneme (raises Exception if there is none)"""
    # search for #syllable-break tag in description
    found = [x['id'] for x in self.entries.values() if '#syllable-break' in x['description']]
    if len(found) == 0:
      # if not tagged, search for '.' (full stop, u+002E) in phoneme transcription
      found = [x['id'] for x in self.entries.values() if '.' in x['phoneme']]
    if len(found) == 0:
      raise Exception('No syllable break phoneme in phonology {}'.format(self.id))
    return found[0]

  vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
  @staticmethod
  def isVowel(phoneme):
    """Return True if the first character of the NFD-normalized phoneme is one of Phonology.vowels"""
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)

  consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"
  @staticmethod
  def isConsonant(phoneme):
    """Return True if the first character of the NFD-normalized phoneme is one of Phonology.consonants"""
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)

  def isOnset(self, id):
    """Check if an id corresponds to an onset, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = ('#onset' in description) or ('#consonant' in description)
    # Only guess from the transcription when the description carries no role tag at all
    if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#coda' not in description):
      result = Phonology.isConsonant(entry['phoneme'])
    return result

  def isNucleus(self, id):
    """Check if an id corresponds to a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = ('#nucleus' in description) or ('#vowel' in description)
    # Only guess from the transcription when the description carries no role tag at all
    if (not result) and ('#consonant' not in description) and ('#onset' not in description) and ('#coda' not in description):
      result = Phonology.isVowel(entry['phoneme'])
    return result

  def isCoda(self, id):
    """Check if an id corresponds to a coda, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = ('#coda' in description) or ('#consonant' in description)
    # Only guess from the transcription when the description carries no role tag at all
    if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#onset' not in description):
      result = Phonology.isConsonant(entry['phoneme'])
    return result

class Distribution:
  """Discrete distribution: a table of value -> number of occurences"""
  def __init__(self):
    self.items = dict()

  def addTo(self, value, occurences = 1):
    """Record occurences more occurences of value (the value is added if it is new)"""
    if value in self.items:
      self.items[value] = occurences + self.items[value]
    else:
      self.items[value] = occurences

  def pickFrom(self):
    """Pick one value at random, each value being weighted by its number of occurences"""
    candidates = list(self.items.keys())
    weights = list(self.items.values())
    picked = random.choices(candidates, weights)
    return picked[0]

  def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
    """Convert to a list of dicts, one per value, using itemRef and occurencesRef as keys"""
    return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]

  def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
    """Replace the content with the one of a list of dicts as built by toJsonStruct"""
    self.items = {}
    for item in struct:
      value = item[itemRef]
      count = item[occurencesRef]
      self.items[value] = count

  def isEmpty(self):
    """Return True if no value was recorded"""
    return not len(self.items)

class Generator:
  """Base class shared by all the generators: holds the id, the description and the phonology id"""
  def __init__(self, id = '', description = '', phonology = ''):
    self.id, self.description, self.phonology = id, description, phonology
    # Set to True by the subclasses implementing an actual generator type
    self.isTyped = False

  def isValid(self):
    """A generator is valid when its id is not empty and it has a type"""
    if self.id == '':
      return False
    return self.isTyped

  def toJsonStruct(self):
    """Convert the common fields to a Json structure"""
    struct = dict()
    struct['id'] = self.id
    struct['description'] = self.description
    struct['phonology'] = self.phonology
    return struct

  def fromJsonStruct(self, struct):
    """Fill the common fields from a Json structure"""
    for field in ('id', 'description', 'phonology'):
      setattr(self, field, struct[field])

  def generateWord(self):
    """Not available on the base class: always raises"""
    raise Exception('Word generation not supported on abstract generator')

class ChainGenerator(Generator):
  """Chains-based generator

  self.chains maps an input (tuple of the self.order previous phoneme ids,
  padded with '' at the start of a word) to the Distribution of the ids seen
  right after it. The empty string '' as an output marks the end of the word.
  """
  def __init__(self, order = 1, **kwargs):
    super().__init__(**kwargs)
    self.order = order
    self.chains = {} # input -> distribution of outputs
    self.isTyped = True

  def toJsonStruct(self):
    """Convert the generator to a Json structure"""
    struct = super().toJsonStruct()
    struct.update({'type': 'chains',
                   'order': self.order,
                   'chains': [{'input': x, 'possible-outputs': self.chains[x].toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')} for x in self.chains]})
    return struct

  def fromJsonStruct(self, struct):
    """Fill the generator from a Json structure; the chains are added to the ones already present"""
    super().fromJsonStruct(struct)
    self.order = struct['order']
    for chainStruct in struct['chains']:
      dist = Distribution()
      dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
      # Json lists are not hashable: the input is converted to a tuple
      self.chains[tuple(chainStruct['input'])] = dist

  def fromExamples(self, file, phonology):
    """Train a chain generator on an example file

    The file holds one word per line, as phoneme ids separated by spaces.
    When the phonology is valid, each id is checked against it and an
    Exception is raised on the first unknown id.
    """
    checkIds = phonology.isValid()
    # utf-8 to match PhonagenFile.load (ids may be IPA); newline='' as required by the csv module
    with open(file, encoding='utf-8', newline='') as exampleFile:
      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
      for row in fileReader:
        # Drop the empty fields (e.g. produced by a trailing space): '' is
        # reserved for the terminator and would otherwise end the word early
        items = [item for item in row if item != '']
        if len(items) == 0:
          continue
        if checkIds:
          for item in items:
            if not phonology.has(item):
              raise Exception('In row {}: {} is not an id in phonology {}'.format(row, item, phonology.id))
        items.append('') # Add terminator element (empty string)
        previous = ('',) * self.order # Initial sequence (empty strings, length = self.order)
        for item in items:
          if previous not in self.chains:
            self.chains[previous] = Distribution()
          self.chains[previous].addTo(item)
          # Slide the window; slicing after the concatenation keeps the length equal to self.order, even for order 0
          previous = (previous + (item,))[1:]

  def generateWord(self):
    """Generate a word as a list of phoneme ids, by following the chains until the terminator is picked

    Raises KeyError if the generator has no chain for the current input
    (e.g. it was not trained).
    """
    outputIdList = []
    previous = ('',) * self.order # Initial sequence (empty strings, length = self.order)
    while True:
      nextItem = self.chains[previous].pickFrom()
      if nextItem == '':
        return outputIdList
      outputIdList.append(nextItem)
      previous = (previous + (nextItem,))[1:]

class RuleGenerator(Generator):
  """Rules-based generator

  self.rules maps a rule id to a Distribution of patterns. A pattern is a
  tuple of items; an item which is a rule id is replaced by a pattern picked
  from that rule, any other item is output as-is. Words are generated from the
  'word' rule.
  """
  def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.rules = {} # rule id -> distribution of patterns
    self.isTyped = True

  def toJsonStruct(self):
    """Convert the generator to a Json structure"""
    struct = super().toJsonStruct()
    struct.update({'type': 'rules',
                   'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')} for x in self.rules]})
    return struct

  def fromJsonStruct(self, struct):
    """Fill the generator from a Json structure; the rules are added to the ones already present"""
    super().fromJsonStruct(struct)
    for ruleStruct in struct['rules']:
      dist = Distribution()
      # The pattern should be converted from a list to a tuple
      dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences':x['occurences']} for x in ruleStruct['distribution']], itemRef = 'pattern', occurencesRef = 'occurences')
      self.rules[ruleStruct['id']] = dist

  def generatePattern(self, pattern):
    """Expand a pattern into a list of items, recursively replacing each rule id with a pattern picked from that rule"""
    output = []
    for x in pattern:
      if x in self.rules:
        output.extend(self.generatePattern(self.rules[x].pickFrom()))
      else:
        output.append(x)
    return output

  def generateWord(self):
    """Generate a word as a list of ids, starting from the 'word' rule (KeyError if there is no such rule)"""
    return self.generatePattern(self.rules['word'].pickFrom())

  def processRowFromExample(self, row, stressId, syllableBreakId):
    """Add one example word to the 'word' rule and to the syllable rules

    row is a list of phoneme ids in which stressId and syllableBreakId separate
    the syllables, stressId marking the syllable that follows it. The 'word'
    rule and the syllable rules must already exist in self.rules. Examples
    with more than one stress, or without any syllable, are skipped.
    """
    # Check the number of stress
    nbStress = row.count(stressId)
    if nbStress > 1:
      print("Too much stress in " + str(row) + ": skip the example")
      return
    # Build the syllable list
    syllables = []
    currentSyllable = []
    stressedSyllableIdx = -1
    syllableIdx = 0
    for x in row:
      # Append to the current syllable if not a syllable separator
      if (x != stressId) and (x != syllableBreakId):
        currentSyllable.append(x)
      # In case of syllable separator, only add the syllable to the list if it is not empty
      elif len(currentSyllable) != 0:
        syllables.append(currentSyllable)
        currentSyllable = []
        syllableIdx = syllableIdx + 1
      # If current id is stress, remember the position of the stressed syllable
      # (syllableIdx is now the index of the syllable about to start)
      if (x == stressId):
        stressedSyllableIdx = syllableIdx
    # After the loop, the current syllable should be non-empty, add it to the list of syllables
    if len(currentSyllable) != 0:
      syllables.append(currentSyllable)
    # Nothing but separators: there is no word to learn from
    if len(syllables) == 0:
      return
    # Single syllable case
    if len(syllables) == 1:
      if stressedSyllableIdx == 0:
        self.rules['word'].addTo(tuple([stressId, 'single']))
      else:
        self.rules['word'].addTo(tuple(['single']))
      self.rules['single'].addTo(tuple(syllables[0]))
    # Other cases
    else:
      wordPattern = []
      lastIdx = len(syllables) - 1
      for idx, syllable in enumerate(syllables):
        separator = syllableBreakId
        if idx == 0:
          rule = 'initial'
        elif idx == lastIdx:
          rule = 'final'
        else:
          rule = 'middle'
        if idx == stressedSyllableIdx:
          rule = rule + '-stressed'
          separator = stressId
        # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
        if (separator == stressId) or (idx > 0):
          wordPattern.append(separator)
        # Add the rule to the pattern
        wordPattern.append(rule)
        # The syllable is added to the corresponding rule
        self.rules[rule].addTo(tuple(syllable))
      self.rules['word'].addTo(tuple(wordPattern))

  def splitSyllableRule(self, syllableRule, phonology):
    """Replace syllable rules with onset/nucleus/coda pattern

    Each pattern of the rule is cut in three consecutive parts: the onset
    (up to the first phoneme for which phonology.isNucleus is true), the
    nucleus (up to the first following phoneme for which phonology.isCoda is
    true) and the coda (the rest). The parts go to the rules
    '<syllableRule>-onset', '-nucleus' and '-coda' and the pattern is replaced
    with the sequence of the rules of its non-empty parts.
    """
    newDist = Distribution()
    oldDist = self.rules[syllableRule]
    # Add onset/nucleus/coda rules
    onsetRule = syllableRule + '-onset'
    nucleusRule = syllableRule + '-nucleus'
    codaRule = syllableRule + '-coda'
    partRules = (onsetRule, nucleusRule, codaRule)
    for rule in partRules:
      self.rules[rule] = Distribution()
    # For each pattern, split into onset/nucleus/coda
    for pattern, occurences in oldDist.items.items():
      parts = {rule: [] for rule in partRules}
      current = onsetRule
      for phoneme in pattern:
        # Check if there is a change of element
        if (current == onsetRule) and phonology.isNucleus(phoneme):
          current = nucleusRule
        elif (current == nucleusRule) and phonology.isCoda(phoneme):
          current = codaRule
        # Add to the respective list
        parts[current].append(phoneme)
      # Add to the specific distributions and determine the pattern in new distribution
      distPattern = []
      for rule in partRules:
        if len(parts[rule]) != 0:
          distPattern.append(rule)
          self.rules[rule].addTo(tuple(parts[rule]), occurences)
      # Add patterns to distributions
      newDist.addTo(tuple(distPattern), occurences)
    # Replace the old rules with the new rules
    self.rules[syllableRule] = newDist

  def fromExamples(self, file, phonology):
    """Train a rule generator on an example file

    The file holds one word per line, as phoneme ids separated by spaces.
    Raises Exception if the phonology has no stress or syllable break phoneme
    (see phonology.getStress / getSyllableBreak) or if an id of the file is
    not in the phonology.
    """
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    # Words are modelled as lists of syllables, with one of those being stressed (optionally)
    # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
    # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
    #
    # Add the 'word' rule, and syllable rules, initialized with an empty distribution
    self.rules['word'] = Distribution()
    syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
    for x in syllableRules:
      self.rules[x] = Distribution()
    # Step 1: open the file and find how words look like
    # utf-8 to match PhonagenFile.load (ids may be IPA); newline='' as required by the csv module
    with open(file, encoding='utf-8', newline='') as exampleFile:
      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
      for row in fileReader:
        # Drop the empty fields (e.g. produced by a trailing space): they are not phoneme ids
        items = [item for item in row if item != '']
        if len(items) == 0:
          continue
        # Check the items in row
        for item in items:
          if not phonology.has(item):
            raise Exception('In row {}: {} is not an id in phonology {}'.format(row, item, phonology.id))
        # Process the row
        self.processRowFromExample(items, stressId, syllableBreakId)
    # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
    for x in syllableRules:
      self.splitSyllableRule(x, phonology)
    # Step 3: remove the empty rules
    self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}

# Generator class to instantiate for each value of the 'type' field
generatorTypeToClass = {
  'chains': ChainGenerator,
  'rules': RuleGenerator,
}
def makeGenerator(struct):
  """Build a generator from a JSON structure.

  The class is chosen from struct['type']; an unknown type gives a plain
  Generator. The instance is then filled with its fromJsonStruct method.
  """
  generatorClass = generatorTypeToClass.get(struct['type'], Generator)
  instance = generatorClass()
  instance.fromJsonStruct(struct)
  return instance

class PhonagenFile:
  """A phonagen file, with phonologies and generators

  Attributes:
    phonologies: dict of phonology id -> phonology
    generators: dict of generator id -> generator
  """
  def __init__(self):
    self.phonologies = {}
    self.generators = {}

  def addPhonology(self, phonology):
    """Add a phonology (replacing the one with the same id); invalid phonologies are ignored"""
    if phonology.isValid():
      self.phonologies[phonology.id] = phonology

  def addGenerator(self, generator):
    """Add a generator (replacing the one with the same id); invalid generators are ignored"""
    if generator.isValid():
      self.generators[generator.id] = generator

  def getPhonology(self, id):
    """Return the phonology with the given id (KeyError if absent)"""
    return self.phonologies[id]

  def getGenerator(self, id):
    """Return the generator with the given id (KeyError if absent)"""
    return self.generators[id]

  def load(self, file):
    """Load from a JSON file"""
    with open(file, 'r', encoding='utf-8') as inputFile:
      jsonStruct = json.load(inputFile)
    # Load phonologies
    for struct in jsonStruct['phonologies']:
      phonology = Phonology()
      phonology.fromJsonStruct(struct)
      self.addPhonology(phonology)
    # Load generators
    for struct in jsonStruct['generators']:
      self.addGenerator(makeGenerator(struct))

  def writeTo(self, file = ''):
    """Output to a JSON file (or stdout if file is empty)"""
    outputStruct = { 'phonologies': [x.toJsonStruct() for x in self.phonologies.values()],
                     'generators': [x.toJsonStruct() for x in self.generators.values()] }
    if file == '':
      json.dump(outputStruct, sys.stdout, ensure_ascii=False)
    else:
      with open(file, 'w', encoding='utf-8') as outputFile:
        json.dump(outputStruct, outputFile, ensure_ascii=False,  indent=2)

  def mergeFrom(self, otherFile):
    """Add all phonologies and generators from the other file into this one."""
    for phonology in otherFile.phonologies.values():
      self.addPhonology(phonology)
    for generator in otherFile.generators.values():
      self.addGenerator(generator)

  def generateWord(self, generator = ''):
    """Generate a word and format it with the phonology of the generator

    generator is the id of the generator to use; if empty, one of the
    generators is picked at random. Returns the result of the formatWord
    method of the phonology referenced by the generator. Raises KeyError if
    the generator or its phonology is not in the file.
    """
    gen = generator
    if gen == '':
      gen = random.choice(list(self.generators))
    chosen = self.generators[gen]
    idList = chosen.generateWord()
    phonology = self.phonologies[chosen.phonology]
    return phonology.formatWord(idList)