phonagen/py-phonagen/phonagen.py

"""Common functions and classes for phonagen tools"""
import json
import io
import sys
import csv
import random
import unicodedata

class Phonology:
  """A phonology: a table of phoneme entries written in several transcriptions.

  Each entry is a dict expected to hold at least 'id', 'description' and 'phoneme',
  plus one key per additional transcription. Tags found in the description (such as
  #onset, #vowel, #stress or #syllable-break) drive the predicates of this class.
  """
  def __init__(self, id = '', description = '', mainTranscription = ''):
    self.id = id
    self.description = description
    self.transcriptions = [] # names of the transcription columns (including 'phoneme' when read from Csv)
    self.mainTranscription = mainTranscription
    self.entries = {} # id -> entry

  def isValid(self):
    """A phonology is valid as soon as it has a non-empty id"""
    return self.id != ''

  def has(self, id):
    """Check if an id is the id of an entry of the phonology"""
    return id in self.entries

  def toJsonStruct(self):
    """Convert a Phonology to a Json structure"""
    return { 'id': self.id,
             'description': self.description,
             'transcriptions': self.transcriptions,
             'main-transcription': self.mainTranscription,
             'entries': list(self.entries.values()) }

  def fromJsonStruct(self, struct):
    """Fill a Phonology from a Json structure"""
    self.id = struct['id']
    self.description = struct['description']
    self.transcriptions = struct['transcriptions']
    self.mainTranscription = struct['main-transcription']
    self.entries = {x['id']: x for x in struct['entries']}

  def fromCsv(self, file):
    """Fill a Phonology from a Csv file

    The first row is the header: every column which is not 'id' or 'description' is a
    transcription, and a 'phoneme' column is mandatory. Raises an Exception if the
    header is not usable or if the main transcription is not one of the columns.
    """
    # Phonemes are IPA characters: read the file as UTF-8 whatever the platform default is
    # (newline='' is the mode recommended by the csv module)
    with open(file, encoding='utf-8', newline='') as csvfile:
      fileReader = csv.reader(csvfile)
      # get csv header
      header = next(fileReader)
      # get the transcriptions (header items not id or description)
      self.transcriptions = [x for x in header if x not in ['id', 'description']]
      # Check: self.transcriptions should contain 'phoneme'
      if 'phoneme' not in self.transcriptions:
        raise Exception('phoneme column not found in ', file)
      # Check: self.transcriptions should have at least two items
      if len(self.transcriptions) < 2:
        raise Exception('No transcription found outside phoneme in file ', file, 'Did you named it id or description ?')
      # get the first header item which is not one of those: id, description, phoneme
      guessedMainTranscription = next(x for x in header if x not in ['id', 'description', 'phoneme'])
      # If main-transcription was not given on the command line, use the guess as main-transcription
      if self.mainTranscription == '':
        self.mainTranscription = guessedMainTranscription
      # Check: self.mainTranscription should be in self.transcriptions
      if self.mainTranscription not in self.transcriptions:
        raise Exception('main-transcription', self.mainTranscription, 'not in list of transcriptions')
      # If id was not given on the command line, use the mainTranscription as the id
      if self.id == '':
        self.id = self.mainTranscription
      # parse entries
      hasId = 'id' in header
      hasDescription = 'description' in header
      for row in fileReader:
        # All absent elements are set to '', cells beyond the header are ignored
        entry = {column: '' for column in header}
        entry.update(zip(header, row))
        # if both phoneme and main-transcription are empty, skip the row
        if (entry['phoneme'] == '') and (entry[self.mainTranscription] == ''):
          continue
        # if id is not provided, generate it
        if not hasId:
          entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
        # if description is not provided, add an empty one
        if not hasDescription:
          entry['description'] = ''
        self.entries[entry['id']] = entry

  def formatWord(self, idList):
    """Return a table of transcription -> string corresponding to the same word"""
    result = {x: "" for x in self.transcriptions}
    for x in idList:
      phoneme = self.entries[x]
      for y in result:
        result[y] = result[y] + phoneme[y]
    return result

  def isStress(self, id):
    """Check if an id corresponds to a stress marker (tagged #stress, or written with an apostrophe or a primary stress sign)"""
    entry = self.entries[id]
    description = entry['description']
    phoneme = entry['phoneme']
    # '#stressed' contains '#stress': it must be excluded explicitly
    return (('#stress' in description) and ('#stressed' not in description)) or ("'" in phoneme) or ("ˈ" in phoneme)

  def getStress(self):
    """Return the phoneme id of the stress phoneme (raise an Exception if there is none)"""
    # search for #stress tag in description
    found = [x['id'] for x in self.entries.values() if ('#stress' in x['description']) and ('#stressed' not in x['description'])]
    if len(found) == 0:
      # if not tagged, search for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in phoneme transcription
      found = [x['id'] for x in self.entries.values() if ("'" in x['phoneme']) or ("ˈ" in x['phoneme'])]
    if len(found) == 0:
      raise Exception('No stress phoneme in phonology', self.id)
    return found[0]

  def isSyllableBreak(self, id):
    """Check if an id corresponds to a syllable break (tagged #syllable-break, or written with a full stop)"""
    entry = self.entries[id]
    description = entry['description']
    phoneme = entry['phoneme']
    return ('#syllable-break' in description) or ("." in phoneme)

  def getSyllableBreak(self):
    """Return the phoneme id of the syllable break phoneme (raise an Exception if there is none)"""
    # search for #syllable-break tag in description
    found = [x['id'] for x in self.entries.values() if '#syllable-break' in x['description']]
    if len(found) == 0:
      # if not tagged, search for '.' (full stop, u+002E) in phoneme transcription
      found = [x['id'] for x in self.entries.values() if '.' in x['phoneme']]
    if len(found) == 0:
      raise Exception('No syllable break phoneme in phonology', self.id)
    return found[0]

  vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
  @staticmethod
  def isVowel(phoneme):
    """Check if the first character of a phonemic transcription (after NFD decomposition) is a vowel"""
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)

  consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"
  @staticmethod
  def isConsonant(phoneme):
    """Check if the first character of a phonemic transcription (after NFD decomposition) is a consonant"""
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)

  def isOnset(self, id):
    """Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#onset' in description) or ('#consonant' in description))
    # Only guess from the transcription when no contradicting tag is present
    if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#coda' not in description):
      result = Phonology.isConsonant(entry['phoneme'])
    return result

  def isNucleus(self, id):
    """Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#nucleus' in description) or ('#vowel' in description))
    # Only guess from the transcription when no contradicting tag is present
    if (not result) and ('#consonant' not in description) and ('#onset' not in description) and ('#coda' not in description):
      result = Phonology.isVowel(entry['phoneme'])
    return result

  def isCoda(self, id):
    """Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#coda' in description) or ('#consonant' in description))
    # Only guess from the transcription when no contradicting tag is present
    if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#onset' not in description):
      result = Phonology.isConsonant(entry['phoneme'])
    return result

  def isInSingleSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in a single syllable, from description"""
    description = self.entries[id]['description']
    if ('#single' in description) or ('#initial' in description) or ('#final' in description):
      return True
    # No position tag allowing it: accepted unless the phoneme is restricted to middle syllables
    return '#middle' not in description

  def isInInitialSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in an initial syllable, from description"""
    description = self.entries[id]['description']
    if '#initial' in description:
      return True
    # Without any position tag, the phoneme is allowed everywhere
    return ('#single' not in description) and ('#middle' not in description) and ('#final' not in description)

  def isInMiddleSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in a middle syllable, from description"""
    description = self.entries[id]['description']
    if '#middle' in description:
      return True
    # Without any position tag, the phoneme is allowed everywhere
    return ('#single' not in description) and ('#initial' not in description) and ('#final' not in description)

  def isInFinalSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in a final syllable, from description"""
    description = self.entries[id]['description']
    if '#final' in description:
      return True
    # Without any position tag, the phoneme is allowed everywhere
    return ('#single' not in description) and ('#initial' not in description) and ('#middle' not in description)

  def isInStressedSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in a stressed syllable, from description"""
    entry = self.entries[id]
    description = entry['description']
    return ('#stressed' in description) or ('#unstressed' not in description)

  def isInUnstressedSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in an unstressed syllable, from description"""
    entry = self.entries[id]
    description = entry['description']
    return ('#unstressed' in description) or ('#stressed' not in description)

  def getPhonemesFromTags(self, tags):
    """Return a list of phoneme id verifying the tag list

    The stress and syllable break phonemes are never part of the result. An unknown
    tag raises a KeyError, and a phonology without stress or syllable break phoneme
    raises an Exception (see getStress and getSyllableBreak).
    """
    if len(self.entries) == 0:
      return []
    tagToPredicate = {
      '#onset': Phonology.isOnset,
      '#nucleus': Phonology.isNucleus,
      '#coda': Phonology.isCoda,
      '#single': Phonology.isInSingleSyllables,
      '#initial': Phonology.isInInitialSyllables,
      '#middle': Phonology.isInMiddleSyllables,
      '#final': Phonology.isInFinalSyllables,
      '#stressed': Phonology.isInStressedSyllables,
      '#unstressed': Phonology.isInUnstressedSyllables
    }
    # The separators do not depend on the entry being tested: look them up once
    stressId = self.getStress()
    syllableBreakId = self.getSyllableBreak()
    predicates = [tagToPredicate[t] for t in tags]
    phonemeList = []
    for id in self.entries:
      # skip stress and syllable break
      if (id == stressId) or (id == syllableBreakId):
        continue
      if all(predicate(self, id) for predicate in predicates):
        phonemeList.append(id)
    return phonemeList

  def hasStressedVowels(self):
    """Check if at least one nucleus phoneme can be in a stressed syllable (i.e. not all vowels are tagged #unstressed)"""
    return any(self.isNucleus(id) and self.isInStressedSyllables(id) for id in self.entries)

class Distribution:
  """Discrete distribution: each value is associated to a number of occurences used as its weight"""
  def __init__(self):
    self.items = {} # value -> occurences

  def addTo(self, value, occurences = 1):
    """Add occurences to a value, creating the value if it is not known yet"""
    total = (occurences + self.items[value]) if value in self.items else occurences
    self.items[value] = total

  def pickFrom(self):
    """Pick one value at random, weighted by the occurences"""
    values = list(self.items)
    weights = list(self.items.values())
    picked = random.choices(values, weights)
    return picked[0]

  def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
    """Convert to a list of dicts, using itemRef and occurencesRef as keys"""
    return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]

  def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
    """Replace the content with the one of a list of dicts, read through the itemRef and occurencesRef keys"""
    collected = {}
    self.items = collected
    for record in struct:
      collected[record[itemRef]] = record[occurencesRef]

  def isEmpty(self):
    """Check if the distribution has no value at all"""
    return not self.items

class Generator:
  """Base class shared by every generator: holds the id, the description and the id of the phonology"""
  def __init__(self, id = '', description = '', phonology = ''):
    self.id, self.description, self.phonology = id, description, phonology
    # Concrete generators switch this flag on in their own constructor
    self.isTyped = False

  def isValid(self):
    """A generator is valid when it has an id and a concrete type"""
    if self.id == '':
      return False
    return self.isTyped

  def toJsonStruct(self):
    """Convert the common part of a generator to a Json structure"""
    struct = {}
    struct['id'] = self.id
    struct['description'] = self.description
    struct['phonology'] = self.phonology
    return struct

  def fromJsonStruct(self, struct):
    """Fill the common part of a generator from a Json structure"""
    for key in ('id', 'description', 'phonology'):
      setattr(self, key, struct[key])

  def generateWord(self):
    """Not available on the base class: always raise an Exception"""
    raise Exception('Word generation not supported on abstract generator')

class ChainGenerator(Generator):
  """Chains-based generator

  self.chains maps a tuple of the last `order` phoneme ids to the Distribution of the
  ids that may follow. The empty string is used both as padding before the first
  phoneme of a word and as the terminator after the last one.
  """
  def __init__(self, order = 1, **kwargs):
    super().__init__(**kwargs)
    self.order = order
    self.chains = {} # input -> distribution of outputs
    self.isTyped = True

  def toJsonStruct(self):
    """Convert the generator to a Json structure"""
    struct = super().toJsonStruct()
    struct.update({'type': 'chains',
                   'order': self.order,
                   'chains': [{'input': x, 'possible-outputs': self.chains[x].toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')} for x in self.chains]})
    return struct

  def fromJsonStruct(self, struct):
    """Fill the generator from a Json structure"""
    super().fromJsonStruct(struct)
    self.order = struct['order']
    for chainStruct in struct['chains']:
      dist = Distribution()
      dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
      # Json lists are not hashable: the input is stored as a tuple
      self.chains.update({tuple(chainStruct['input']): dist})

  def fromExamples(self, file, phonology):
    """Train a chain generator on an example file

    Each line of the file is a word, written as phoneme ids separated by spaces.
    If the phonology is valid, an id which is not in the phonology raises an Exception.
    """
    checkIds = phonology.isValid()
    # Phoneme ids may contain IPA characters: read the file as UTF-8 whatever the platform default is
    # (newline='' is the mode recommended by the csv module)
    with open(file, encoding='utf-8', newline='') as exampleFile:
      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
      for row in fileReader:
        # A trailing space produces an empty item, which would be taken for a terminator: drop it
        items = [item for item in row if item != '']
        if len(items) == 0:
          continue
        # Check the whole row first, so that a rejected row leaves the chains untouched
        for item in items:
          if checkIds and (not phonology.has(item)):
            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
        previous = ('',) * self.order # Initial sequence (empty strings, length = self.order)
        for item in items + ['']: # The terminator element (empty string) ends the word
          if previous not in self.chains:
            self.chains[previous] = Distribution()
          self.chains[previous].addTo(item)
          # Slide the window, keeping exactly self.order items (also correct for order 0)
          previous = (previous + (item,))[1:]

  def generateWord(self):
    """Generate a word as a list of phoneme ids, following the chains until the terminator is picked"""
    outputIdList = []
    previous = ('',) * self.order # Initial sequence (empty strings, length = self.order)
    while True:
      nextItem = self.chains[previous].pickFrom()
      if nextItem == '':
        return outputIdList
      outputIdList.append(nextItem)
      # Slide the window, keeping exactly self.order items (also correct for order 0)
      previous = (previous + (nextItem,))[1:]

class RuleGenerator(Generator):
  """Rules-based generator

  self.rules maps a rule id to a Distribution of patterns. A pattern is a tuple whose
  items are either rule ids (expanded recursively) or anything else, emitted as is
  (phoneme ids). Word generation starts from the 'word' rule.
  """
  def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.rules = {} # rule id -> distribution of patterns
    self.isTyped = True

  def toJsonStruct(self):
    """Convert the generator to a Json structure"""
    struct = super().toJsonStruct()
    struct.update({'type': 'rules',
                   'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')} for x in self.rules]})
    return struct

  def fromJsonStruct(self, struct):
    """Fill the generator from a Json structure"""
    super().fromJsonStruct(struct)
    for ruleStruct in struct['rules']:
      dist = Distribution()
      # The pattern should be converted from a list to a tuple
      dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences':x['occurences']} for x in ruleStruct['distribution']], itemRef = 'pattern', occurencesRef = 'occurences')
      self.rules.update({ruleStruct['id']: dist})

  def generatePattern(self, pattern):
    """Expand a pattern into a list of ids: rule ids are replaced by a pattern picked from the rule, other items are kept"""
    output = []
    for x in pattern:
      if x in self.rules:
        output.extend(self.generatePattern(self.rules[x].pickFrom()))
      else:
        output.append(x)
    return output

  def generateWord(self):
    """Generate a word as a list of phoneme ids, starting from the 'word' rule"""
    return self.generatePattern(self.rules['word'].pickFrom())

  def processRowFromExample(self, row, stressId, syllableBreakId):
    """Record one example word (a list of phoneme ids) in the 'word' rule and in the syllable rules

    The rules must have been created beforehand (see fromExamples). Examples with more
    than one stress, or without any phoneme, are skipped.
    """
    # Check the number of stress
    nbStress = row.count(stressId)
    if nbStress > 1:
      print("Too much stress in " + str(row) + ": skip the example")
      return
    # Build the syllable list
    syllables = []
    currentSyllable = []
    stressedSyllableIdx = -1
    syllableIdx = 0
    for x in row:
      # Append to the current syllable if not a syllable separator
      if (x != stressId) and (x != syllableBreakId):
        currentSyllable.append(x)
      # In case of syllable separator, only add the syllable to the list if it is not empty
      elif len(currentSyllable) != 0:
        syllables.append(currentSyllable)
        currentSyllable = []
        syllableIdx = syllableIdx + 1
      # If current id is stress, remember the position of the stressed syllable
      # (the stress is placed before the syllable it applies to)
      if (x == stressId):
        stressedSyllableIdx = syllableIdx
    # After the loop, the current syllable should be non-empty, add it to the list of syllables
    if len(currentSyllable) != 0:
      syllables.append(currentSyllable)
    # A row made only of separators is not a word: nothing to learn from it
    if len(syllables) == 0:
      return
    # Single syllable case
    if len(syllables) == 1:
      if stressedSyllableIdx == 0:
        self.rules['word'].addTo(tuple([stressId, 'single']))
      else:
        self.rules['word'].addTo(tuple(['single']))
      self.rules['single'].addTo(tuple(syllables[0]))
      return
    # Other cases
    wordPattern = []
    lastIdx = len(syllables) - 1
    for idx, syllable in enumerate(syllables):
      separator = syllableBreakId
      if idx == 0:
        rule = 'initial'
      elif idx == lastIdx:
        rule = 'final'
      else:
        rule = 'middle'
      if idx == stressedSyllableIdx:
        rule = rule + '-stressed'
        separator = stressId
      # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
      if (separator == stressId) or (idx > 0):
        wordPattern.append(separator)
      # Add the rule to the pattern
      wordPattern.append(rule)
      # The syllable is added to the corresponding rule
      self.rules[rule].addTo(tuple(syllable))
    self.rules['word'].addTo(tuple(wordPattern))

  def splitSyllableRule(self, syllableRule, phonology):
    """Replace syllable rules with onset/nucleus/coda pattern"""
    newDist = Distribution()
    oldDist = self.rules[syllableRule]
    # Add onset/nucleus/coda rules
    partRules = (syllableRule + '-onset', syllableRule + '-nucleus', syllableRule + '-coda')
    for partRule in partRules:
      self.rules[partRule] = Distribution()
    # For each pattern, split into onset/nucleus/coda
    for pattern in oldDist.items:
      parts = ([], [], []) # onset, nucleus, coda
      stage = 0 # index in parts of the element being filled
      for phoneme in pattern:
        # Check is there is a change of element: the onset ends on the first nucleus phoneme,
        # the nucleus ends on the first coda phoneme, everything after goes to the coda
        if (stage == 0) and phonology.isNucleus(phoneme):
          stage = 1
        elif (stage == 1) and phonology.isCoda(phoneme):
          stage = 2
        parts[stage].append(phoneme)
      # Add to the specific distributions and determine the pattern in new distribution
      occurences = oldDist.items[pattern]
      distPattern = []
      for partRule, part in zip(partRules, parts):
        if len(part) != 0:
          distPattern.append(partRule)
          self.rules[partRule].addTo(tuple(part), occurences)
      # Add patterns to distributions
      newDist.addTo(tuple(distPattern), occurences)
    # Replace the old rules with the new rules
    self.rules[syllableRule] = newDist

  def cleanRules(self):
    """Remove the empty rules"""
    self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}

  def fromExamples(self, file, phonology):
    """Train a rule generator on an example file

    Each line of the file is a word, written as phoneme ids separated by spaces.
    An id which is not in the phonology raises an Exception.
    """
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    # Words are modelled as lists of syllables, with one of those being stressed (optionally)
    # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
    # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
    #
    # Add the 'word' rule, and syllable rules, initialized with an empty distribution
    self.rules.update({'word': Distribution()})
    syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
    for x in syllableRules:
      self.rules.update({x: Distribution()})
    # Step 1: open the file and find how words look like
    # Phoneme ids may contain IPA characters: read the file as UTF-8 whatever the platform default is
    # (newline='' is the mode recommended by the csv module)
    with open(file, encoding='utf-8', newline='') as exampleFile:
      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
      for row in fileReader:
        # A trailing space produces an empty item, which is not a phoneme id: drop it
        items = [item for item in row if item != '']
        if len(items) == 0:
          continue
        # Check the items in row
        for item in items:
          if not phonology.has(item):
            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
        # Process the row
        self.processRowFromExample(items, stressId, syllableBreakId)
    # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
    for x in syllableRules:
      self.splitSyllableRule(x, phonology)
    # Step 3: remove the empty rules
    self.cleanRules()

  @staticmethod
  def randomOccurences(mean, range):
    """Generate a random number in the range [mean-range, mean+range] (both bounds included)"""
    # random.randint already includes its upper bound
    return random.randint(mean - range, mean + range)

  @staticmethod
  def isStressPosition(position, numberSyllables, stressPosition):
    """Check if a given position is the position of the stress. The position goes from 1 to numberSyllables included."""
    if stressPosition > 0:
      # Counted from the beginning; a stress beyond the end of the word falls on the last syllable
      return position == min(stressPosition, numberSyllables)
    if stressPosition < 0:
      # Counted from the end; a stress before the beginning of the word falls on the first syllable
      return position == max(numberSyllables + 1 + stressPosition, 1)
    # stressPosition is zero: no stress
    return False

  def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
    """
    Generate a rule-based generator just from a phonology and some parameters.
    - minNumberSyllables must be strictly positive.
    - maxNumberSyllables must be greater than or equal to minNumberSyllables
    - stressPosition indicates on which syllable the stress occurs.
      Positive index count from the beginning to the end (with the first syllable being at index 1).
      Negative index count from the end to the beginning (with the last syllable being at index -1)
      Set this to zero if no stress should be generated.
    - distributionMean indicates the medium value for the occurences of a phoneme
    - distributionRange indicates the maximum absolute difference from distributionMean for the occurences of phonemes and syllables.
      It must be positive or nul, and strictly lower than distributionMean.
    An Exception is raised if a parameter is out of bounds. The previous rules are discarded.
    """
    # Reinitialize
    self.phonology = phonology.id
    self.rules = {}
    # Check the parameters
    if minNumberSyllables < 1:
      raise Exception("Minimum number of syllables must be strictly positive. Given", minNumberSyllables)
    if maxNumberSyllables < minNumberSyllables:
      raise Exception("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
    if maxNumberSyllables < abs(stressPosition):
      raise Exception("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
    if distributionMean < 1:
      raise Exception("Distribution mean must be strictly positive. Given", distributionMean)
    # With mean == range, an item could get zero occurences, i.e. could never be picked
    if distributionMean <= distributionRange:
      raise Exception("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
    if distributionRange < 0:
      raise Exception("Distribution range must be positive or nul. Given", distributionRange)
    # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    isStressed = (stressPosition != 0) and phonology.hasStressedVowels()
    # Add the 'word' rule, initialized with an empty distribution
    self.rules.update({'word': Distribution()})
    # Add the syllable rules and word patterns
    syllableRules = []
    syllableRulesToTags = {}
    if minNumberSyllables == 1:
      syllableRules.append('single')
      syllableRulesToTags.update({'single': ['#single']})
      wordPattern = []
      if isStressed:
        syllableRulesToTags['single'].append('#stressed')
        wordPattern.append(stressId)
      wordPattern.append('single')
      self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    if maxNumberSyllables > 1:
      syllableRules = syllableRules + ['initial', 'middle', 'final']
      syllableRulesToTags.update({'initial': ['#initial'], 'middle': ['#middle'], 'final': ['#final']})
      if isStressed:
        syllableRules = syllableRules + ['initial-stressed', 'middle-stressed', 'final-stressed']
        syllableRulesToTags.update({'initial-stressed': ['#initial', '#stressed'], 'middle-stressed': ['#middle', '#stressed'], 'final-stressed': ['#final', '#stressed']})
        syllableRulesToTags.update({'initial': ['#initial', '#unstressed'], 'middle': ['#middle', '#unstressed'], 'final': ['#final', '#unstressed']})
      # One word pattern per number of syllables, from the minimum (at least 2 here) to the maximum
      for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
        wordPattern = []
        for position in range(1, nbSyllables + 1):
          stressHere = isStressed and RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
          # add syllable separator
          if stressHere:
            wordPattern.append(stressId)
          elif position > 1:
            wordPattern.append(syllableBreakId)
          # add syllable
          if position == 1:
            rule = 'initial'
          elif position == nbSyllables:
            rule = 'final'
          else:
            rule = 'middle'
          if stressHere:
            rule = rule + '-stressed'
          wordPattern.append(rule)
        self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 2: Generate the syllable rules
    # Add the rules in the distributions
    phonemeRules = []
    phonemeRulesToTag = {}
    for syllable in syllableRules:
      self.rules.update({syllable: Distribution()})
      onset = syllable + '-onset'
      nucleus = syllable + '-nucleus'
      coda = syllable + '-coda'
      phonemeRules = phonemeRules + [onset, nucleus, coda]
      ruleTags = syllableRulesToTags[syllable]
      phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
      # Fill the syllable rules
      # For the generated rules, initial and single syllables may not have onset
      if ('#initial' in ruleTags) or ('#single' in ruleTags):
        self.rules[syllable].addTo(tuple([nucleus]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
        self.rules[syllable].addTo(tuple([nucleus, coda]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
      self.rules[syllable].addTo(tuple([onset, nucleus]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
      self.rules[syllable].addTo(tuple([onset, nucleus, coda]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 3: Generate the phoneme distributions for each phoneme rule
    for rule in phonemeRules:
      self.rules.update({rule: Distribution()})
      tags = phonemeRulesToTag[rule]
      phonemeList = phonology.getPhonemesFromTags(tags)
      for phoneme in phonemeList:
        self.rules[rule].addTo(tuple([phoneme]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 4: Clean the rules
    self.cleanRules()


# Generator class to instantiate for each value of the 'type' field
generatorTypeToClass = {
  'chains': ChainGenerator,
  'rules': RuleGenerator,
}
def makeGenerator(struct):
  """Build the generator described by a JSON structure.

  The class is chosen from struct['type']; an unknown type gives a plain Generator.
  The new generator is then filled from the structure and returned.
  """
  generatorClass = generatorTypeToClass.get(struct['type'], Generator)
  generator = generatorClass()
  generator.fromJsonStruct(struct)
  return generator

class PhonagenFile:
  """Content of a phonagen file: a set of phonologies and a set of generators, both indexed by id"""
  def __init__(self):
    self.phonologies = {} # id -> phonology
    self.generators = {} # id -> generator

  def addPhonology(self, phonology):
    """Register a phonology under its id; invalid phonologies are ignored"""
    if not phonology.isValid():
      return
    self.phonologies[phonology.id] = phonology

  def addGenerator(self, generator):
    """Register a generator under its id; invalid generators are ignored"""
    if not generator.isValid():
      return
    self.generators[generator.id] = generator

  def getPhonology(self, id):
    """Return the phonology registered under the given id"""
    return self.phonologies[id]

  def getGenerator(self, id):
    """Return the generator registered under the given id"""
    return self.generators[id]

  def load(self, file):
    """Read a JSON file and add its phonologies and generators"""
    with open(file, 'r', encoding='utf-8') as inputFile:
      jsonStruct = json.load(inputFile)
    # Phonologies first
    for phonologyStruct in jsonStruct['phonologies']:
      loaded = Phonology()
      loaded.fromJsonStruct(phonologyStruct)
      self.addPhonology(loaded)
    # Then generators
    for generatorStruct in jsonStruct['generators']:
      self.addGenerator(makeGenerator(generatorStruct))

  def writeTo(self, file = ''):
    """Write the content as JSON, to the given file or to stdout when no file is given"""
    phonologyStructs = [p.toJsonStruct() for p in self.phonologies.values()]
    generatorStructs = [g.toJsonStruct() for g in self.generators.values()]
    outputStruct = {'phonologies': phonologyStructs, 'generators': generatorStructs}
    if file != '':
      with open(file, 'w', encoding='utf-8') as outputFile:
        json.dump(outputStruct, outputFile, ensure_ascii=False)
      return
    json.dump(outputStruct, sys.stdout, ensure_ascii=False)

  def mergeFrom(self, otherFile):
    """Register every phonology and every generator of another PhonagenFile into this one"""
    for other in otherFile.phonologies.values():
      self.addPhonology(other)
    for other in otherFile.generators.values():
      self.addGenerator(other)

  def generateWord(self, generator = ''):
    """Generate a word with the given generator id (a random one if empty) and return its transcriptions"""
    gen = generator if generator != '' else random.choice(list(self.generators))
    chosen = self.generators[gen]
    idList = chosen.generateWord()
    return self.phonologies[chosen.phonology].formatWord(idList)