phonagen/py-phonagen/phonagen.py

"""Common functions and classes for phonagen tools"""
import json
import io
import sys
import csv
import random
import unicodedata

class Phonology:
  """A phonology: a set of phoneme entries and the transcriptions they are written in.

  Each entry is a dict holding at least 'id', 'description' and 'phoneme',
  plus one string per transcription name. Tags written in the 'description'
  of an entry (e.g. '#vowel', '#onset', '#stress') drive the predicates below.
  """
  def __init__(self, id = '', description = '', mainTranscription = ''):
    self.id = id
    self.description = description
    self.transcriptions = [] # names of the transcriptions
    self.mainTranscription = mainTranscription
    self.entries = {} # id -> entry

  def isValid(self):
    """A phonology is valid as soon as its id is not empty"""
    return self.id != ''

  def has(self, id):
    """Check if id is the id of one of the entries"""
    return id in self.entries

  def toJsonStruct(self):
    """Convert a Phonology to a Json structure"""
    return { 'id': self.id,
             'description': self.description,
             'transcriptions': self.transcriptions,
             'main-transcription': self.mainTranscription,
             'entries': list(self.entries.values()) }

  def fromJsonStruct(self, struct):
    """Fill a Phonology from a Json structure (same layout as produced by toJsonStruct)"""
    self.id = struct['id']
    self.description = struct['description']
    self.transcriptions = struct['transcriptions']
    self.mainTranscription = struct['main-transcription']
    self.entries = {x['id']: x for x in struct['entries']}

  def fromCsv(self, file):
    """Fill a Phonology from a Csv file

    The first row is the header. Columns other than 'id' and 'description'
    are transcriptions; a 'phoneme' column and at least one other
    transcription are mandatory. If self.mainTranscription is empty, the
    first transcription that is not 'phoneme' is used; if self.id is empty,
    the main transcription name is used as id.
    Rows where both the phoneme and the main transcription are empty are skipped.
    Raises an Exception when the header does not satisfy those constraints.
    """
    # Read as UTF-8, like the JSON files of this module; newline='' is what the csv module expects
    with open(file, newline='', encoding='utf-8') as csvfile:
      fileReader = csv.reader(csvfile)
      # get csv header
      header = next(fileReader)
      # get the transcriptions (header items not id or description)
      self.transcriptions = [x for x in header if x not in ['id', 'description']]
      # Check: self.transcriptions should contain 'phoneme'
      if 'phoneme' not in self.transcriptions:
        raise Exception('phoneme column not found in ', file)
      # Check: self.transcriptions should have at least two items
      if len(self.transcriptions) < 2:
        raise Exception('No transcription found outside phoneme in file ', file, 'Did you named it id or description ?')
      # get the first header item which is not one of those: id, description, phoneme
      guessedMainTranscription = next(x for x in header if x not in ['id', 'description', 'phoneme'])
      # If main-transcription was not given on the command line, use the guess as main-transcription
      if self.mainTranscription == '':
        self.mainTranscription = guessedMainTranscription
      # Check: self.mainTranscription should be in self.transcriptions
      if self.mainTranscription not in self.transcriptions:
        raise Exception('main-transcription', self.mainTranscription, 'not in list of transcriptions')
      # If id was not given on the command line, use the mainTranscription as the id
      if self.id == '':
        self.id = self.mainTranscription
      hasId = 'id' in header
      hasDescription = 'description' in header
      # parse entries
      for row in fileReader:
        # All absent elements are set to ''
        entry = {column: '' for column in header}
        for i, cell in enumerate(row):
          entry[header[i]] = cell
        # if both phoneme and main-transcription are empty, skip the row
        if (entry['phoneme'] == '') and (entry[self.mainTranscription] == ''):
          continue
        # if id is not provided, generate it
        if not hasId:
          entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
        # if description is not provided, add an empty one
        if not hasDescription:
          entry['description'] = ''
        self.entries[entry['id']] = entry

  def formatWord(self, idList):
    """Return a table of transcription -> string corresponding to the same word

    idList is a sequence of entry ids; a KeyError is raised on an unknown id.
    """
    phonemes = [self.entries[x] for x in idList]
    return {t: ''.join(phoneme[t] for phoneme in phonemes) for t in self.transcriptions}

  def isStress(self, id):
    """Check if an id corresponds to a stress marker (tag #stress, or "'" / "ˈ" in the phoneme)"""
    entry = self.entries[id]
    description = entry['description']
    phoneme = entry['phoneme']
    # '#stress' is a prefix of '#stressed', hence the explicit exclusion
    return (('#stress' in description) and ('#stressed' not in description)) or ("'" in phoneme) or ("ˈ" in phoneme)

  def getStress(self):
    """Return the phoneme id of the stress phoneme (raise an Exception if there is none)"""
    # search for #stress tag in description
    found = [x['id'] for x in self.entries.values() if ('#stress' in x['description']) and ('#stressed' not in x['description'])]
    if len(found) == 0:
      # if not tagged, search for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in phoneme transcription
      found = [x['id'] for x in self.entries.values() if ("'" in x['phoneme']) or ("ˈ" in x['phoneme'])]
    if len(found) == 0:
      raise Exception('No stress phoneme in phonology', self.id)
    return found[0]

  def isSyllableBreak(self, id):
    """Check if an id corresponds to a syllable break (tag #syllable-break, or '.' in the phoneme)"""
    entry = self.entries[id]
    description = entry['description']
    phoneme = entry['phoneme']
    return ('#syllable-break' in description) or ("." in phoneme)

  def getSyllableBreak(self):
    """Return the phoneme id of the syllable break phoneme (raise an Exception if there is none)"""
    # search for #syllable-break tag in description
    found = [x['id'] for x in self.entries.values() if '#syllable-break' in x['description']]
    if len(found) == 0:
      # if not tagged, search for '.' (full stop, u+002E) in phoneme transcription
      found = [x['id'] for x in self.entries.values() if '.' in x['phoneme']]
    if len(found) == 0:
      raise Exception('No syllable break phoneme in phonology', self.id)
    return found[0]

  vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
  @staticmethod
  def isVowel(phoneme):
    """Check if the first character of a phonemic transcription (after NFD decomposition) is a known vowel"""
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)

  consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"
  @staticmethod
  def isConsonant(phoneme):
    """Check if the first character of a phonemic transcription (after NFD decomposition) is a known consonant"""
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)

  def isOnset(self, id):
    """Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#onset' in description) or ('#consonant' in description))
    # Only guess from the transcription when no contradicting tag is present
    if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#coda' not in description):
      result = Phonology.isConsonant(entry['phoneme'])
    return result

  def isNucleus(self, id):
    """Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#nucleus' in description) or ('#vowel' in description))
    # Only guess from the transcription when no contradicting tag is present
    if (not result) and ('#consonant' not in description) and ('#onset' not in description) and ('#coda' not in description):
      result = Phonology.isVowel(entry['phoneme'])
    return result

  def isCoda(self, id):
    """Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#coda' in description) or ('#consonant' in description))
    # Only guess from the transcription when no contradicting tag is present
    if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#onset' not in description):
      result = Phonology.isConsonant(entry['phoneme'])
    return result

  def isInSingleSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in a single syllable, from description

    True when tagged #single, #initial or #final, or when not tagged #middle.
    """
    description = self.entries[id]['description']
    return ('#single' in description) or ('#initial' in description) or ('#final' in description) or ('#middle' not in description)

  def isInInitialSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in an initial syllable, from description

    True when tagged #initial, or when no position tag is present at all.
    """
    description = self.entries[id]['description']
    return ('#initial' in description) or not any(tag in description for tag in ('#single', '#middle', '#final'))

  def isInMiddleSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in a middle syllable, from description

    True when tagged #middle, or when no position tag is present at all.
    """
    description = self.entries[id]['description']
    return ('#middle' in description) or not any(tag in description for tag in ('#single', '#initial', '#final'))

  def isInFinalSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in a final syllable, from description

    True when tagged #final, or when no position tag is present at all.
    """
    description = self.entries[id]['description']
    return ('#final' in description) or not any(tag in description for tag in ('#single', '#initial', '#middle'))

  def isInStressedSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in a stressed syllable, from description"""
    description = self.entries[id]['description']
    return ('#stressed' in description) or ('#unstressed' not in description)

  def isInUnstressedSyllables(self, id):
    """Check if an id corresponds to a phoneme that can be in an unstressed syllable, from description"""
    description = self.entries[id]['description']
    return ('#unstressed' in description) or ('#stressed' not in description)

  def getPhonemesFromTags(self, tags):
    """Return a list of phoneme id verifying the tag list

    tags is a list taken among '#onset', '#nucleus', '#coda', '#single',
    '#initial', '#middle', '#final', '#stressed' and '#unstressed' (a
    KeyError is raised on any other tag). The stress and syllable break
    phonemes are never returned.
    """
    phonemeList = []
    tagToPredicate = {
      '#onset': Phonology.isOnset,
      '#nucleus': Phonology.isNucleus,
      '#coda': Phonology.isCoda,
      '#single': Phonology.isInSingleSyllables,
      '#initial': Phonology.isInInitialSyllables,
      '#middle': Phonology.isInMiddleSyllables,
      '#final': Phonology.isInFinalSyllables,
      '#stressed': Phonology.isInStressedSyllables,
      '#unstressed': Phonology.isInUnstressedSyllables
    }
    for id in self.entries:
      # skip stress and syllable break
      if (id == self.getStress()) or (id == self.getSyllableBreak()):
        continue
      checklist = [tagToPredicate[t](self, id) for t in tags]
      if all(checklist):
        phonemeList.append(id)
    return phonemeList

  def hasStressedVowels(self):
    """Check if at least one nucleus phoneme can be in a stressed syllable (i.e. not all vowels are tagged #unstressed)"""
    return any(self.isNucleus(id) and self.isInStressedSyllables(id) for id in self.entries)

class Distribution:
  """Discrete distribution: maps each value to its number of occurences (used as a weight)"""
  def __init__(self):
    self.items = {} # value -> occurences

  def addTo(self, value, occurences = 1):
    """Add occurences to the count of value (the value is created if needed)"""
    if value in self.items:
      self.items[value] = occurences + self.items[value]
    else:
      self.items[value] = occurences

  def pickFrom(self):
    """Pick one value at random, each value being weighted by its occurences"""
    candidates = list(self.items.keys())
    weights = list(self.items.values())
    picked = random.choices(candidates, weights)
    return picked[0]

  def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
    """Return a list with one dict per value; itemRef and occurencesRef are the keys used in those dicts"""
    struct = []
    for value, count in self.items.items():
      struct.append({itemRef: value, occurencesRef: count})
    return struct

  def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
    """Replace the content with the items of a list as produced by toJsonStruct"""
    self.items = {}
    for record in struct:
      self.items[record[itemRef]] = record[occurencesRef]

  def isEmpty(self):
    """Check if the distribution holds no value"""
    return not len(self.items)

class Generator:
  """Parent class for all generators

  Holds the fields shared by every generator: an id, a description and the
  id of the phonology it relies on. Subclasses set isTyped to True.
  """
  def __init__(self, id = '', description = '', phonology = ''):
    self.id, self.description, self.phonology = id, description, phonology
    self.isTyped = False

  def isValid(self):
    """A generator is valid when it has a non-empty id and a concrete type"""
    if self.id == '':
      return False
    return self.isTyped

  def toJsonStruct(self):
    """Return the common fields as a Json structure"""
    struct = {}
    struct['id'] = self.id
    struct['description'] = self.description
    struct['phonology'] = self.phonology
    return struct

  def fromJsonStruct(self, struct):
    """Read the common fields from a Json structure"""
    self.id = struct['id']
    self.description = struct['description']
    self.phonology = struct['phonology']

  def generateWord(self):
    """Not available on the parent class: always raises an Exception"""
    raise Exception('Word generation not supported on abstract generator')

class ChainGenerator(Generator):
  """Chains-based generator

  self.chains maps a tuple of the self.order previous ids to the
  Distribution of the ids that may follow. The empty string is used both as
  padding before the first id and as the end-of-word marker.
  """
  def __init__(self, order = 1, **kwargs):
    super().__init__(**kwargs)
    self.order = order
    self.chains = {} # input -> distribution of outputs
    self.isTyped = True

  def toJsonStruct(self):
    """Return the generator as a Json structure of type 'chains'"""
    struct = super().toJsonStruct()
    struct.update({'type': 'chains',
                   'order': self.order,
                   'chains': [{'input': x, 'possible-outputs': self.chains[x].toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')} for x in self.chains]})
    return struct

  def fromJsonStruct(self, struct):
    """Read the generator from a Json structure (chains already present are kept unless overwritten)"""
    super().fromJsonStruct(struct)
    self.order = struct['order']
    for chainStruct in struct['chains']:
      dist = Distribution()
      dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
      # Json lists are converted to tuples so that they can be used as dict keys
      self.chains.update({tuple(chainStruct['input']): dist})

  def fromExamples(self, file, phonology):
    """Train a chain generator on an example file

    Each line of the file is one word, written as a space-separated list of
    ids. When phonology is valid, every id must be an id of that phonology,
    otherwise an Exception is raised.
    """
    # Read as UTF-8, like the JSON files of this module; newline='' is what the csv module expects
    with open(file, newline='', encoding='utf-8') as exampleFile:
      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
      for row in fileReader:
        # Drop empty items (produced e.g. by a trailing space): '' is reserved for the
        # end-of-word marker and must not appear in the middle of a word
        ids = [item for item in row if item != '']
        if len(ids) == 0:
          continue
        ids.append('') # Add terminator element (empty string)
        previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
        for item in ids:
          if (item != '') and (phonology.isValid()) and (not phonology.has(item)):
            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
          if previous not in self.chains:
            self.chains[previous] = Distribution()
          self.chains[previous].addTo(item)
          # Slide the window of previous ids
          previous = previous[1:] + (item,)

  def generateWord(self):
    """Generate a word as a list of ids, by following the chains until the end-of-word marker is picked

    A KeyError is raised if the generator has not been trained or loaded.
    """
    outputIdList = []
    previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
    while True:
      nextItem = self.chains[previous].pickFrom()
      if nextItem == '':
        return outputIdList
      outputIdList.append(nextItem)
      previous = previous[1:] + (nextItem,)

class RuleGenerator(Generator):
  """Rules-based generator

  self.rules maps a rule id to a Distribution of patterns. A pattern is a
  tuple whose items are either rule ids (expanded recursively) or any other
  value (emitted as is; normally a phoneme id). Generation starts from the
  rule named 'word'.
  """
  def __init__(self, **kwargs):
    super().__init__(**kwargs)
    self.rules = {} # rule id -> distribution of patterns
    self.isTyped = True

  def toJsonStruct(self):
    """Return the generator as a Json structure of type 'rules'"""
    struct = super().toJsonStruct()
    struct.update({'type': 'rules',
                   'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')} for x in self.rules]})
    return struct

  def fromJsonStruct(self, struct):
    """Read the generator from a Json structure (rules already present are kept unless overwritten)"""
    super().fromJsonStruct(struct)
    for ruleStruct in struct['rules']:
      dist = Distribution()
      # The pattern should be converted from a list to a tuple
      dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences':x['occurences']} for x in ruleStruct['distribution']], itemRef = 'pattern', occurencesRef = 'occurences')
      self.rules.update({ruleStruct['id']: dist})

  def generatePattern(self, pattern):
    """Expand a pattern into a list of ids: items that are rule ids are replaced by the expansion of a randomly picked pattern of that rule"""
    output = []
    for x in pattern:
      if x in self.rules:
        output.extend(self.generatePattern(self.rules[x].pickFrom()))
      else:
        output.append(x)
    return output

  def generateWord(self):
    """Generate a word as a list of ids, starting from the 'word' rule (KeyError if that rule does not exist)"""
    return self.generatePattern(self.rules['word'].pickFrom())

  def processRowFromExample(self, row, stressId, syllableBreakId):
    """Record one example word in the 'word' rule and in the syllable rules

    row is a list of ids where stressId and syllableBreakId separate the
    syllables, the stress marker being placed before the stressed syllable.
    The 'word' rule and the seven syllable rules must already exist in
    self.rules. Examples with more than one stress, or without any
    syllable, are skipped.
    """
    # Check the number of stress
    nbStress = row.count(stressId)
    if nbStress > 1:
      print("Too much stress in " + str(row) + ": skip the example")
      return
    # Build the syllable list
    syllables = []
    currentSyllable = []
    stressedSyllableIdx = -1
    for x in row:
      # Append to the current syllable if not a syllable separator
      if (x != stressId) and (x != syllableBreakId):
        currentSyllable.append(x)
      # In case of syllable separator, only add the syllable to the list if it is not empty
      elif len(currentSyllable) != 0:
        syllables.append(currentSyllable)
        currentSyllable = []
      # If current id is stress, remember the position of the stressed syllable:
      # it is the next syllable to be added, hence its index is the current length
      if (x == stressId):
        stressedSyllableIdx = len(syllables)
    # After the loop, the current syllable should be non-empty, add it to the list of syllables
    if len(currentSyllable) != 0:
      syllables.append(currentSyllable)
    # A row made only of separators holds no word: recording it would add an empty word pattern
    if len(syllables) == 0:
      return
    # Single syllable case
    if len(syllables) == 1:
      if stressedSyllableIdx == 0:
        self.rules['word'].addTo(tuple([stressId, 'single']))
      else:
        self.rules['word'].addTo(tuple(['single']))
      self.rules['single'].addTo(tuple(syllables[0]))
      return
    # Other cases
    wordPattern = []
    lastIdx = len(syllables) - 1
    for x, syllable in enumerate(syllables):
      separator = syllableBreakId
      if x == 0:
        rule = 'initial'
      elif x == lastIdx:
        rule = 'final'
      else:
        rule = 'middle'
      if x == stressedSyllableIdx:
        rule = rule + '-stressed'
        separator = stressId
      # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
      if (separator == stressId) or (x > 0):
        wordPattern.append(separator)
      # Add the rule to the pattern
      wordPattern.append(rule)
      # The syllable is added to the corresponding rule
      self.rules[rule].addTo(tuple(syllable))
    self.rules['word'].addTo(tuple(wordPattern))

  def splitSyllableRule(self, syllableRule, phonology):
    """Replace syllable rules with onset/nucleus/coda pattern

    Every pattern of the rule is cut in three consecutive parts: the onset
    runs until the first phoneme accepted by phonology.isNucleus, the
    nucleus runs until the first phoneme accepted by phonology.isCoda, and
    the coda takes everything after. The parts are stored in the rules
    '<syllableRule>-onset', '-nucleus' and '-coda'.
    """
    newDist = Distribution()
    oldDist = self.rules[syllableRule]
    # Add onset/nucleus/coda rules
    onsetRule = syllableRule + '-onset'
    nucleusRule = syllableRule +'-nucleus'
    codaRule = syllableRule + '-coda'
    self.rules[onsetRule] = Distribution()
    self.rules[nucleusRule] = Distribution()
    self.rules[codaRule] = Distribution()
    # For each pattern, split into onset/nucleus/coda
    for pattern, occurences in oldDist.items.items():
      isOnset = True
      onset = []
      isNucleus = False
      nucleus = []
      coda = []
      for phoneme in pattern:
        # Check is there is a change of element
        if isOnset and (phonology.isNucleus(phoneme)):
          isOnset = False
          isNucleus = True
        elif isNucleus and (phonology.isCoda(phoneme)):
          isNucleus = False
        # Add to the respective list (neither onset nor nucleus means coda)
        if isOnset:
          onset.append(phoneme)
        elif isNucleus:
          nucleus.append(phoneme)
        else:
          coda.append(phoneme)
      # Add to the specific distributions and determine the pattern in new distribution
      distPattern = []
      for partRule, part in ((onsetRule, onset), (nucleusRule, nucleus), (codaRule, coda)):
        if len(part) != 0:
          distPattern.append(partRule)
          self.rules[partRule].addTo(tuple(part), occurences)
      # Add patterns to distributions
      newDist.addTo(tuple(distPattern), occurences)
    # Replace the old rules with the new rules
    self.rules[syllableRule] = newDist

  def cleanRules(self):
    """Remove the empty rules"""
    self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}

  def fromExamples(self, file, phonology):
    """Train a rule generator on an example file

    Each line of the file is one word, written as a space-separated list of
    ids of the phonology (an Exception is raised on any other id). The
    phonology must have a stress phoneme and a syllable break phoneme.
    """
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    # Words are modelled as lists of syllables, with one of those being stressed (optionally)
    # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
    # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
    #
    # Add the 'word' rule, and syllable rules, initialized with an empty distribution
    self.rules.update({'word': Distribution()})
    syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
    for x in syllableRules:
      self.rules.update({x: Distribution()})
    # Step 1: open the file and find how words look like
    # Read as UTF-8, like the JSON files of this module; newline='' is what the csv module expects
    with open(file, newline='', encoding='utf-8') as exampleFile:
      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
      for row in fileReader:
        # Drop empty items (produced e.g. by a trailing space): they are not phonemes
        ids = [item for item in row if item != '']
        if len(ids) == 0:
          continue
        # Check the items in row
        for item in ids:
          if not phonology.has(item):
            raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
        # Process the row
        self.processRowFromExample(ids, stressId, syllableBreakId)
    # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
    for x in syllableRules:
      self.splitSyllableRule(x, phonology)
    # Step 3: remove the empty rules
    self.cleanRules()

  @staticmethod
  def randomOccurences(mean, range):
    """Generate a random integer in [mean-range, mean+range], both bounds included"""
    # random.randint includes its upper bound, so no "+ 1" is needed here
    return random.randint(mean - range, mean + range)

  @staticmethod
  def isStressPosition(position, numberSyllables, stressPosition):
    """Check if a given position is the position of the stress. The position goes from 1 to numberSyllables included.

    stressPosition counts from the beginning when positive and from the end
    when negative; it is clamped to the word when it points outside of it.
    Zero means no stress: the result is then always False.
    """
    if stressPosition == 0:
      return False
    if stressPosition > 0:
      # Too far after the end: stress the last syllable
      target = min(stressPosition, numberSyllables)
    else:
      # Too far before the beginning: stress the first syllable
      target = max(numberSyllables + 1 + stressPosition, 1)
    return position == target

  def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
    """
    Generate a rule-based generator just from a phonology and some parameters.
    - minNumberSyllables must be strictly positive. Single-syllable words are only generated when it is 1.
    - maxNumberSyllables must be greater than or equal to minNumberSyllables
    - stressPosition indicates on which syllable the stress occurs.
      Positive index count from the beginning to the end (with the first syllable being at index 1).
      Negative index count from the end to the beginning (with the last syllable being at index -1)
      Set this to zero if no stress should be generated.
    - distributionMean indicates the medium value for the occurences of a phoneme
    - distributionRange indicates the maximum absolute difference from distributionMean for the occurences of phonemes and syllables
    The phonology must have a stress phoneme and a syllable break phoneme.
    An Exception is raised when the parameters are inconsistent.
    """
    # Reinitialize
    self.phonology = phonology.id
    self.rules = {}
    # Check the parameters
    if maxNumberSyllables < minNumberSyllables:
      raise Exception("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
    if maxNumberSyllables < abs(stressPosition):
      raise Exception("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
    if distributionMean < 1:
      raise Exception("Distribution mean must be strictly positive. Given", distributionMean)
    if distributionMean < distributionRange:
      raise Exception("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
    if distributionRange < 0:
      raise Exception("Distribution range must be positive or nul. Given", distributionRange)
    # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    isStressed = (stressPosition != 0) and phonology.hasStressedVowels()
    # Add the 'word' rule, initialized with an empty distribution
    self.rules.update({'word': Distribution()})
    # Add the syllable rules and word patterns
    syllableRules = []
    syllableRulesToTags = {}
    if minNumberSyllables == 1:
      syllableRules.append('single')
      syllableRulesToTags.update({'single': ['#single']})
      wordPattern = []
      if isStressed:
        syllableRulesToTags['single'].append('#stressed')
        wordPattern.append(stressId)
      wordPattern.append('single')
      self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    if maxNumberSyllables > 1:
      syllableRules = syllableRules + ['initial', 'middle', 'final']
      syllableRulesToTags.update({'initial': ['#initial'], 'middle': ['#middle'], 'final': ['#final']})
      if isStressed:
        syllableRules = syllableRules + ['initial-stressed', 'middle-stressed', 'final-stressed']
        syllableRulesToTags.update({'initial-stressed': ['#initial', '#stressed'], 'middle-stressed': ['#middle', '#stressed'], 'final-stressed': ['#final', '#stressed']})
        syllableRulesToTags.update({'initial': ['#initial', '#unstressed'], 'middle': ['#middle', '#unstressed'], 'final': ['#final', '#unstressed']})
      # One word pattern per number of syllables; words of one syllable are handled by the
      # 'single' rule above, and lengths below minNumberSyllables must not be generated
      for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
        wordPattern = []
        for position in range(1, nbSyllables + 1):
          stressedHere = isStressed and RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
          # add syllable separator
          if stressedHere:
            wordPattern.append(stressId)
          elif position > 1:
            wordPattern.append(syllableBreakId)
          # add syllable
          if position == 1:
            syllableRule = 'initial'
          elif position == nbSyllables:
            syllableRule = 'final'
          else:
            syllableRule = 'middle'
          if stressedHere:
            syllableRule = syllableRule + '-stressed'
          wordPattern.append(syllableRule)
        self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 2: Generate the syllable rules
    # Add the rules in the distributions
    phonemeRules = []
    phonemeRulesToTag = {}
    for syllable in syllableRules:
      self.rules.update({syllable: Distribution()})
      onset = syllable + '-onset'
      nucleus = syllable + '-nucleus'
      coda = syllable + '-coda'
      phonemeRules = phonemeRules + [onset, nucleus, coda]
      ruleTags = syllableRulesToTags[syllable]
      phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
      # Fill the syllable rules
      # For the generated rules, initial and single syllables may not have onset
      if ('#initial' in ruleTags) or ('#single' in ruleTags):
        self.rules[syllable].addTo(tuple([nucleus]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
        self.rules[syllable].addTo(tuple([nucleus, coda]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
      self.rules[syllable].addTo(tuple([onset, nucleus]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
      self.rules[syllable].addTo(tuple([onset, nucleus, coda]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 3: Generate the phoneme distributions for each phoneme rule
    for rule in phonemeRules:
      self.rules.update({rule: Distribution()})
      tags = phonemeRulesToTag[rule]
      phonemeList = phonology.getPhonemesFromTags(tags)
      for phoneme in phonemeList:
        self.rules[rule].addTo(tuple([phoneme]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 4: Clean the rules
    self.cleanRules()


# Value of the 'type' field of a generator structure -> class implementing it
generatorTypeToClass = {
  'chains': ChainGenerator,
  'rules': RuleGenerator,
}
def makeGenerator(struct):
  """Build a generator from a JSON structure.

  The class is selected from struct['type']; an unknown type falls back to
  the plain Generator class. The instance is then filled with
  fromJsonStruct and returned.
  """
  generatorClass = generatorTypeToClass.get(struct['type'], Generator)
  generator = generatorClass()
  generator.fromJsonStruct(struct)
  return generator

class PhonagenFile:
  """In-memory content of a phonagen file: its phonologies and its generators"""
  def __init__(self):
    self.phonologies = {} # phonology id -> phonology
    self.generators = {} # generator id -> generator

  def addPhonology(self, phonology):
    """Store a phonology under its id; a phonology that is not valid is ignored"""
    if not phonology.isValid():
      return
    self.phonologies[phonology.id] = phonology

  def addGenerator(self, generator):
    """Store a generator under its id; a generator that is not valid is ignored"""
    if not generator.isValid():
      return
    self.generators[generator.id] = generator

  def getPhonology(self, id):
    """Return the phonology stored under id (KeyError if there is none)"""
    return self.phonologies[id]

  def getGenerator(self, id):
    """Return the generator stored under id (KeyError if there is none)"""
    return self.generators[id]

  def load(self, file):
    """Read the phonologies and generators of a JSON file (given by path) into this object"""
    with open(file, 'r', encoding='utf-8') as inputFile:
      content = json.load(inputFile)
      # Phonologies are read first, then generators
      for phonologyStruct in content['phonologies']:
        loaded = Phonology()
        loaded.fromJsonStruct(phonologyStruct)
        self.addPhonology(loaded)
      for generatorStruct in content['generators']:
        # makeGenerator picks the generator class from the structure
        self.addGenerator(makeGenerator(generatorStruct))

  def writeTo(self, file = ''):
    """Serialize everything as JSON into the given path, or to stdout when file is ''"""
    # The whole structure is built before any output is opened
    phonologyStructs = [p.toJsonStruct() for p in self.phonologies.values()]
    generatorStructs = [g.toJsonStruct() for g in self.generators.values()]
    document = { 'phonologies': phonologyStructs,
                 'generators': generatorStructs }
    if file != '':
      with open(file, 'w', encoding='utf-8') as outputFile:
        json.dump(document, outputFile, ensure_ascii=False)
      return
    json.dump(document, sys.stdout, ensure_ascii=False)

  def mergeFrom(self, otherFile):
    """Copy every phonology and generator of otherFile into this one (through addPhonology/addGenerator)."""
    for incomingPhonology in otherFile.phonologies.values():
      self.addPhonology(incomingPhonology)
    for incomingGenerator in otherFile.generators.values():
      self.addGenerator(incomingGenerator)

  def generateWord(self, generator = ''):
    """Generate a word with the generator of the given id, or a randomly chosen one when generator is ''

    The id list produced by the generator is formatted by the phonology
    the generator refers to, and that formatted result is returned.
    """
    chosenId = random.choice(list(self.generators)) if generator == '' else generator
    chosen = self.generators[chosenId]
    idList = chosen.generateWord()
    return self.phonologies[chosen.phonology].formatWord(idList)
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
+								"""Common functions and classes for phonagen tools"""
 								import json
 								import io
 								import sys
 								import csv
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								import random
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								import unicodedata
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
 								class Phonology:
 								  """Phonology class"""
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								  def __init__(self, id = '', description = '', mainTranscription = ''):
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
+								    self.id = id
 								    self.description = description
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								    self.transcriptions = []
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
+								    self.mainTranscription = mainTranscription
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								    self.entries = {} # id -> entry
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
 								  def isValid(self):
 								    return self.id != ''
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								  def has(self, id):
 								    return id in self.entries
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
+								  def toJsonStruct(self):
 								    """Convert a Phonology to a Json structure"""
 								    return { 'id': self.id,
 								             'description': self.description,
 								             'transcriptions': self.transcriptions,
 								             'main-transcription': self.mainTranscription,
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								             'entries': [x for x in self.entries.values()] }
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
 								  def fromJsonStruct(self, struct):
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								    """Fill a Phonology from a Json structure"""
 								    self.id = struct['id']
 								    self.description = struct['description']
 								    self.transcriptions = struct['transcriptions']
 								    self.mainTranscription = struct['main-transcription']
 								    self.entries = {x['id']: x for x in struct['entries']}
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
 								  def fromCsv(self, file):
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								    """Fill a Phonology from a Csv file"""
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
+								    with open(file) as csvfile:
 								      fileReader = csv.reader(csvfile)
 								      # get csv header
 								      header = next(fileReader)
 								      # get the transcriptions (header items not id or description)
 								      self.transcriptions = [x for x in header if x not in ['id', 'description']]
 								      # Check: self.transcriptions should contain 'phoneme'
 								      if 'phoneme' not in self.transcriptions:
 								        raise Exception('phoneme column not found in ', file)
 								      # Check: self.transcriptions should have at least two items
 								      if len(self.transcriptions) < 2:
 								        raise Exception('No transcription found outside phoneme in file ', file, 'Did you named it id or description ?')
 								      # get the first header item which is not one of those: id, description, phoneme
 								      guessedMainTranscription = next(x for x in header if x not in ['id', 'description', 'phoneme'])
 								      # If main-transcription was not given on the command line, use the guess as main-transcription
 								      if self.mainTranscription == '':
 								        self.mainTranscription = guessedMainTranscription
 								      # Check: self.mainTranscription should be in self.transcriptions
 								      if self.mainTranscription not in self.mainTranscription:
 								        raise Exception('main-transcription', self.mainTranscription, 'not in list of transcriptions')
 								      # If id was not given on the command line, use the mainTranscription as the id
 								      if self.id == '':
 								        self.id = self.mainTranscription
 								      # parse entries
 								      for row in fileReader:
 								        entry = dict()
 								        for i in range(len(row)):
 								          entry.update({header[i]: row[i]})
 								        # All absent elements are set to ''
 								        for i in range(len(row), len(header)):
 								          entry.update({header[i]: ''})
 								        # if both phoneme and main-transcription are empty, skip the rest
 								        if (entry['phoneme'] != '') or (entry[self.mainTranscription] != ''):
 								          # if id is not provided, generate it
 								          if 'id' not in header:
 								            entry.update({'id': entry['phoneme'] + '-' + entry[self.mainTranscription]})
 								          # if description is not provided, add an empty one
 								          if 'description' not in header:
 								            entry.update({'description': ''})
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								          self.entries.update({entry['id']: entry})
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								  def formatWord(self, idList):
 								    """Return a table of transcription -> string corresponding to the same word"""
 								    result = {x: "" for x in self.transcriptions}
 								    for x in idList:
 								      phoneme = self.entries[x]
 								      for y in result:
-												Fix a few things when generating words

											
										
										
											2018-06-23 03:33:46 +02:00
+								        result[y] = result[y] + phoneme[y]
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    return result
-												Add some conditions to not confuse #stress and #stressed tags. Add restrictions to which phoneme can be onset/nucleus/coda.

											
										
										
											2018-06-15 19:31:20 +02:00
+								  def isStress(self, id):
 								    entry = self.entries[id]
 								    description = entry['description']
 								    phoneme = entry['phoneme']
 								    return (('#stress' in description) and ('#stressed' not in description)) or ("'" in phoneme) or ("ˈ" in phoneme)
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								  def getStress(self):
 								    """Return the phoneme id of the stress phoneme"""
 								    # search for #stress tag in description
-												Add some conditions to not confuse #stress and #stressed tags. Add restrictions to which phoneme can be onset/nucleus/coda.

											
										
										
											2018-06-15 19:31:20 +02:00
+								    found = [x['id'] for x in self.entries.values() if ('#stress' in x['description']) and ('#stressed' not in x['description'])]
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    if len(found) == 0:
 								      # if not tagged, search for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in phoneme transcription
 								      found = [x['id'] for x in self.entries.values() if ("'" in x['phoneme']) or ("ˈ" in x['phoneme'])]
 								    if len(found) == 0:
 								      raise Exception('No stress phoneme in phonology', self.id)
 								    return found[0]
-												Add some conditions to not confuse #stress and #stressed tags. Add restrictions to which phoneme can be onset/nucleus/coda.

											
										
										
											2018-06-15 19:31:20 +02:00
+								  def isSyllableBreak(self, id):
 								    entry = self.entries[id]
 								    description = entry['description']
 								    phoneme = entry['phoneme']
 								    return ('#syllable-break' in description) or ("." in phoneme)
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								  def getSyllableBreak(self):
 								    """Return the phoneme id of the syllable break phoneme"""
 								    # search for #syllable-break tag in description
 								    found = [x['id'] for x in self.entries.values() if '#syllable-break' in x['description']]
 								    if len(found) == 0:
 								      # if not tagged, search for '.' (full stop, u+002E) in phoneme transcription
 								      found = [x['id'] for x in self.entries.values() if '.' in x['phoneme']]
 								    if len(found) == 0:
 								      raise Exception('No syllable break phoneme in phonology', self.id)
 								    return found[0]
 								  vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
 								  def isVowel(phoneme):
 								    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)
 								  consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"
 								  def isConsonant(phoneme):
 								    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
 								  def isOnset(self, id):
-												Add an algorithm to build a rule generator from a phonology, without examples.

											
										
										
											2018-06-14 00:19:27 +02:00
+								    """Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    entry = self.entries[id]
 								    description = entry['description']
-												Add some conditions to not confuse #stress and #stressed tags. Add restrictions to which phoneme can be onset/nucleus/coda.

											
										
										
											2018-06-15 19:31:20 +02:00
+								    result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#onset' in description) or ('#consonant' in description))
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#coda' not in description):
 								      result = Phonology.isConsonant(entry['phoneme'])
 								    return result
 								  def isNucleus(self, id):
-												Add an algorithm to build a rule generator from a phonology, without examples.

											
										
										
											2018-06-14 00:19:27 +02:00
+								    """Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    entry = self.entries[id]
 								    description = entry['description']
-												Add some conditions to not confuse #stress and #stressed tags. Add restrictions to which phoneme can be onset/nucleus/coda.

											
										
										
											2018-06-15 19:31:20 +02:00
+								    result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#nucleus' in description) or ('#vowel' in description))
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    if (not result) and ('#consonant' not in description) and ('#onset' not in description) and ('#coda' not in description):
 								      result = Phonology.isVowel(entry['phoneme'])
 								    return result
 								  def isCoda(self, id):
-												Add an algorithm to build a rule generator from a phonology, without examples.

											
										
										
											2018-06-14 00:19:27 +02:00
+								    """Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    entry = self.entries[id]
 								    description = entry['description']
-												Add some conditions to not confuse #stress and #stressed tags. Add restrictions to which phoneme can be onset/nucleus/coda.

											
										
										
											2018-06-15 19:31:20 +02:00
+								    result = (not self.isSyllableBreak(id)) and (not self.isStress(id)) and (('#coda' in description) or ('#consonant' in description))
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#onset' not in description):
 								      result = Phonology.isConsonant(entry['phoneme'])
 								    return result
-												Add an algorithm to build a rule generator from a phonology, without examples.

											
										
										
											2018-06-14 00:19:27 +02:00
+								  def isInSingleSyllables(self, id):
 								    """Check if an id corresponds to a phoneme that can be in a single syllable, from description"""
 								    entry = self.entries[id]
 								    description = entry['description']
 								    result = ('#single' in description) or ('#initial' in description) or ('#final' in description)
 								    if (not result) and ('#middle' not in description):
 								      result = True
 								    return result
 								  def isInInitialSyllables(self, id):
 								    """Check if an id corresponds to a phoneme that can be in an initial syllable, from description"""
 								    entry = self.entries[id]
 								    description = entry['description']
 								    result = ('#initial' in description)
 								    if (not result) and ('#single' not in description) and ('#middle' not in description) and ('#final' not in description):
 								      result = True
 								    return result
 								  def isInMiddleSyllables(self, id):
 								    """Check if an id corresponds to a phoneme that can be in a middle syllable, from description"""
 								    entry = self.entries[id]
 								    description = entry['description']
 								    result = ('#middle' in description)
 								    if (not result) and ('#single' not in description) and ('#initial' not in description) and ('#final' not in description):
 								      result = True
 								    return result
 								  def isInFinalSyllables(self, id):
 								    """Check if an id corresponds to a phoneme that can be in a final syllable, from description"""
 								    entry = self.entries[id]
 								    description = entry['description']
 								    result = ('#final' in description)
 								    if (not result) and ('#single' not in description) and ('#initial' not in description) and ('#middle' not in description):
 								      result = True
 								    return result
 								  def isInStressedSyllables(self, id):
 								    """Check if an id corresponds to a phoneme that can be in a stressed syllable, from description"""
 								    entry = self.entries[id]
 								    description = entry['description']
 								    return ('#stressed' in description) or ('#unstressed' not in description)
 								  def isInUnstressedSyllables(self, id):
 								    """Check if an id corresponds to a phoneme that can be in an unstressed syllable, from description"""
 								    entry = self.entries[id]
 								    description = entry['description']
 								    return ('#unstressed' in description) or ('#stressed' not in description)
 								  def getPhonemesFromTags(self, tags):
 								    """Return a list of phoneme id verifying the tag list"""
 								    phonemeList = []
 								    tagToPredicate = {
 								      '#onset': Phonology.isOnset,
 								      '#nucleus': Phonology.isNucleus,
 								      '#coda': Phonology.isCoda,
 								      '#single': Phonology.isInSingleSyllables,
 								      '#initial': Phonology.isInInitialSyllables,
 								      '#middle': Phonology.isInMiddleSyllables,
 								      '#final': Phonology.isInFinalSyllables,
 								      '#stressed': Phonology.isInStressedSyllables,
 								      '#unstressed': Phonology.isInUnstressedSyllables
 								    }
 								    for id in self.entries:
 								      # skip stress and syllable break
 								      if (id == self.getStress()) or (id == self.getSyllableBreak()):
 								        pass
 								      checklist = [tagToPredicate[t](self, id) for t in tags]
 								      if all(checklist):
 								        phonemeList.append(id)
 								    return phonemeList
-												Ignore stress in rulemaker if all vowels of a phonology is tagged unstressed

											
										
										
											2018-06-23 00:24:55 +02:00
+								  def hasStressedVowels(self):
 								    """Check if all vowels are tagged #unstressed"""
 								    hasStressed = False
 								    for id in self.entries:
 								      if self.isNucleus(id) and self.isInStressedSyllables(id):
 								        hasStressed = True
 								        break
 								    return hasStressed
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								class Distribution:
 								  """Discrete distribution"""
 								  def __init__(self):
 								    self.items = {}
 								  def addTo(self, value, occurences = 1):
 								    oc = occurences
 								    if value in self.items:
 								      oc = oc + self.items[value]
 								    self.items.update({value: oc})
 								  def pickFrom(self):
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    return random.choices([k for k in self.items.keys()], [v for v in self.items.values()])[0]
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
 								  def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
 								    return [{itemRef: x, occurencesRef: self.items[x]} for x in self.items]
 								  def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
 								    self.items = {}
 								    for item in struct:
 								      self.items.update({item[itemRef]: item[occurencesRef]})
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								  def isEmpty(self):
 								    return len(self.items) == 0
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								class Generator:
 								  """Parent class for all generators"""
 								  def __init__(self, id = '', description = '', phonology = ''):
 								    self.id = id
 								    self.description = description
 								    self.phonology = phonology
 								    self.isTyped = False
 								  def isValid(self):
 								    return (self.id != '') and self.isTyped
 								  def toJsonStruct(self):
 								    return { 'id': self.id,
 								             'description': self.description,
 								             'phonology': self.phonology }
 								  def fromJsonStruct(self, struct):
 								    self.id = struct['id']
 								    self.description = struct['description']
 								    self.phonology = struct['phonology']
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								  def generateWord(self):
 								    raise Exception('Word generation not supported on abstract generator')
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								class ChainGenerator(Generator):
 								  """Chains-based generator"""
 								  def __init__(self, order = 1, **kwargs):
 								    super().__init__(**kwargs)
 								    self.order = order
 								    self.chains = {} # input -> distribution of outputs
 								    self.isTyped = True
 								  def toJsonStruct(self):
 								    struct = super().toJsonStruct()
 								    struct.update({'type': 'chains',
 								                   'order': self.order,
 								                   'chains': [{'input': x, 'possible-outputs': self.chains[x].toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')} for x in self.chains]})
 								    return struct
 								  def fromJsonStruct(self, struct):
 								    super().fromJsonStruct(struct)
 								    self.order = struct['order']
 								    for chainStruct in struct['chains']:
 								      dist = Distribution()
 								      dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
-												Add tool for merging phonagen json files into one.

											
										
										
											2018-06-09 20:17:47 +02:00
+								      self.chains.update({tuple(chainStruct['input']): dist})
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
 								  def fromExamples(self, file, phonology):
 								    """Train a chain generator on an example file"""
 								    with open(file) as exampleFile:
 								      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
 								      for row in fileReader:
 								        if len(row) != 0:
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								          row.append('') # Add terminator element (empty string)
 								          previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								          for item in row:
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								            if (item != '') and (phonology.isValid()) and (not phonology.has(item)):
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								              raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
 								            if previous in self.chains:
 								              self.chains[previous].addTo(item)
 								            else:
 								              dist = Distribution()
 								              dist.addTo(item)
 								              self.chains.update({previous: dist})
 								            previous = previous[1:] + (item,)
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								  def generateWord(self):
 								    outputIdList = []
 								    nextItem = '.' #
 								    previous = tuple('' for i in range(self.order)) # Initial sequence (a list of empty string of length = self.order)
 								    while nextItem != '':
 								      nextItem = self.chains[previous].pickFrom()
 								      if nextItem != '':
 								        outputIdList.append(nextItem)
 								        previous = previous[1:] + (nextItem,)
 								    return outputIdList
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								class RuleGenerator(Generator):
 								  """Rules-based generator"""
 								  def __init__(self, **kwargs):
 								    super().__init__(**kwargs)
 								    self.rules = {}
 								    self.isTyped = True
 								  def toJsonStruct(self):
 								    struct = super().toJsonStruct()
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    struct.update({'type': 'rules',
 								                   'rules': [{'id': x, 'distribution': self.rules[x].toJsonStruct(itemRef = 'pattern', occurencesRef = 'occurences')} for x in self.rules]})
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								    return struct
 								  def fromJsonStruct(self, struct):
 								    super().fromJsonStruct(struct)
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								    for ruleStruct in struct['rules']:
 								      dist = Distribution()
 								      # The pattern should be converted from a list to a tuple
 								      dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences':x['occurences']} for x in ruleStruct['distribution']], itemRef = 'pattern', occurencesRef = 'occurences')
 								      self.rules.update({ruleStruct['id']: dist})
 								  def generatePattern(self, pattern):
 								    output = []
 								    for x in pattern:
 								      if x in self.rules:
-												Fix a few things when generating words

											
										
										
											2018-06-23 03:33:46 +02:00
+								        output = output + self.generatePattern(self.rules[x].pickFrom())
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								      else:
 								        output.append(x)
 								    return output
 								  def generateWord(self):
 								    return self.generatePattern(self.rules['word'].pickFrom())
 								  def processRowFromExample(self, row, stressId, syllableBreakId):
 								    """
 								    Register one example word into the 'word' rule and the syllable rules.
 								    - row is the list of ids of the example. Empty ids are ignored.
 								    - stressId is the id placed in front of the stressed syllable.
 								    - syllableBreakId is the id placed in front of the other syllables.
 								    The example is skipped if it has more than one stress, or no phoneme at all.
 								    """
 								    # Check the number of stress
 								    if row.count(stressId) > 1:
 								      print("Too much stress in " + str(row) + ": skip the example")
 								      return
 								    # Build the syllable list
 								    syllables = []
 								    currentSyllable = []
 								    stressedSyllableIdx = -1
 								    for x in row:
 								      if (x != stressId) and (x != syllableBreakId):
 								        # An empty id (e.g. produced by a trailing space in the file) is not a phoneme
 								        if x != '':
 								          currentSyllable.append(x)
 								        continue
 								      # In case of syllable separator, only add the syllable to the list if it is not empty
 								      if currentSyllable:
 								        syllables.append(currentSyllable)
 								        currentSyllable = []
 								      # If current id is stress, the stressed syllable is the one about to start
 								      if x == stressId:
 								        stressedSyllableIdx = len(syllables)
 								    # After the loop, the current syllable should be non-empty, add it to the list of syllables
 								    if currentSyllable:
 								      syllables.append(currentSyllable)
 								    # No phoneme at all: do not register an empty word pattern
 								    if not syllables:
 								      return
 								    # Single syllable case
 								    if len(syllables) == 1:
 								      if stressedSyllableIdx == 0:
 								        self.rules['word'].addTo(tuple([stressId, 'single']))
 								      else:
 								        self.rules['word'].addTo(tuple(['single']))
 								      self.rules['single'].addTo(tuple(syllables[0]))
 								      return
 								    # Other cases
 								    wordPattern = []
 								    lastIdx = len(syllables) - 1
 								    for idx, syllable in enumerate(syllables):
 								      if idx == 0:
 								        rule = 'initial'
 								      elif idx == lastIdx:
 								        rule = 'final'
 								      else:
 								        rule = 'middle'
 								      separator = syllableBreakId
 								      if idx == stressedSyllableIdx:
 								        rule = rule + '-stressed'
 								        separator = stressId
 								      # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
 								      if (separator == stressId) or (idx > 0):
 								        wordPattern.append(separator)
 								      # Add the rule to the pattern
 								      wordPattern.append(rule)
 								      # The syllable is added to the corresponding rule
 								      self.rules[rule].addTo(tuple(syllable))
 								    self.rules['word'].addTo(tuple(wordPattern))
 								  def splitSyllableRule(self, syllableRule, phonology):
 								    """
 								    Rewrite a syllable rule as a sequence of onset/nucleus/coda sub-rules.
 								    Each phoneme pattern of the rule is cut in up to three parts, the parts are
 								    added to the '<rule>-onset', '<rule>-nucleus' and '<rule>-coda' distributions,
 								    and the rule itself is replaced by the distribution of the sub-rule sequences.
 								    """
 								    newDist = Distribution()
 								    oldDist = self.rules[syllableRule]
 								    # Sub-rule names, in the order the parts occur in a syllable
 								    partRules = (syllableRule + '-onset', syllableRule + '-nucleus', syllableRule + '-coda')
 								    for partRule in partRules:
 								      self.rules[partRule] = Distribution()
 								    for pattern in oldDist.items:
 								      # parts[0] is the onset, parts[1] the nucleus, parts[2] the coda
 								      parts = ([], [], [])
 								      stage = 0
 								      for phoneme in pattern:
 								        # The stage only moves forward: onset -> nucleus -> coda
 								        if stage == 0:
 								          if phonology.isNucleus(phoneme):
 								            stage = 1
 								        elif stage == 1:
 								          if phonology.isCoda(phoneme):
 								            stage = 2
 								        parts[stage].append(phoneme)
 								      # Every non-empty part inherits the occurences of the whole pattern
 								      occurences = oldDist.items[pattern]
 								      distPattern = []
 								      for partRule, phonemes in zip(partRules, parts):
 								        if len(phonemes) != 0:
 								          distPattern.append(partRule)
 								          self.rules[partRule].addTo(tuple(phonemes), occurences)
 								      newDist.addTo(tuple(distPattern), occurences)
 								    # The syllable rule now refers to the sub-rules instead of the phonemes
 								    self.rules[syllableRule] = newDist
-												Add an algorithm to build a rule generator from a phonology, without examples.

											
										
										
											2018-06-14 00:19:27 +02:00
+								  def cleanRules(self):
 								    """Remove the empty rules"""
 								    self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
+								  def fromExamples(self, file, phonology):
 								    """Train a rule generator on an example file"""
 								    stressId = phonology.getStress()
 								    syllableBreakId = phonology.getSyllableBreak()
 								    # Words are modelled as lists of syllables, with one of those being stressed (optionally)
 								    # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
 								    # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
 								    #
 								    # Add the 'word' rule, and syllable rules, initialized with an empty distribution
 								    self.rules.update({'word': Distribution()})
 								    syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
 								    for x in syllableRules:
 								      self.rules.update({x: Distribution()})
 								    # Step 1: open the file and find how words look like
 								    with open(file) as exampleFile:
 								      fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
 								      for row in fileReader:
 								        if len(row) != 0:
 								          # Check the items in row
 								          for item in row:
 								            if (item != '') and (not phonology.has(item)):
 								              raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
 								          # Process the row
 								          self.processRowFromExample(row, stressId, syllableBreakId)
 								    # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
 								    for x in syllableRules:
 								      self.splitSyllableRule(x, phonology)
 								    # Step 3: remove the empty rules
-												Add an algorithm to build a rule generator from a phonology, without examples.

											
										
										
											2018-06-14 00:19:27 +02:00
+								    self.cleanRules()
 								  def randomOccurences(mean, range):
 								    """Draw a random integer between mean-range and mean+range+1, both bounds included"""
 								    lowest = mean - range
 								    highest = mean + range + 1
 								    return random.randint(lowest, highest)
 								  def isStressPosition(position, numberSyllables, stressPosition):
 								    """
 								    Tell whether the syllable at the given position carries the stress.
 								    The position goes from 1 to numberSyllables included. A stress position lying
 								    outside of the word is moved to the last syllable (positive index) or to the
 								    first syllable (negative index). A stress position of zero never matches.
 								    """
 								    # Positive index inside the word: count from the beginning
 								    if (stressPosition > 0) and (stressPosition <= numberSyllables):
 								      return position == stressPosition
 								    # Negative index inside the word: count from the end
 								    if (stressPosition < 0) and (abs(stressPosition) <= numberSyllables):
 								      return position == (numberSyllables + 1 + stressPosition)
 								    # Index outside of the word: fall back on the last or on the first syllable
 								    onLastSyllable = (position == numberSyllables) and (stressPosition > numberSyllables)
 								    onFirstSyllable = (position == 1) and (stressPosition < 0) and (abs(stressPosition) > numberSyllables)
 								    return onLastSyllable or onFirstSyllable
 								  def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
 								    """
 								    Generate a rule-based generator just from a phonology and some parameters.
 								    - minNumberSyllables must be strictly positive. No word pattern shorter than this is generated.
 								    - maxNumberSyllables must be greater than or equal to minNumberSyllables
 								    - stressPosition indicates on which syllable the stress occurs.
 								      Positive index count from the beginning to the end (with the first syllable being at index 1).
 								      Negative index count from the end to the beginning (with the last syllable being at index -1)
 								      Set this to zero if no stress should be generated.
 								    - distributionMean indicates the medium value for the occurences of a phoneme.
 								      It must be strictly positive and strictly higher than distributionRange.
 								    - distributionRange indicates the spread around distributionMean of the random occurences
 								      given to phonemes and syllables. It must be positive or null.
 								    An Exception is raised when one of the parameters breaks those constraints.
 								    """
 								    # Reinitialize
 								    self.phonology = phonology.id
 								    self.rules = {}
 								    # Check the parameters
 								    if minNumberSyllables < 1:
 								      raise Exception("Minimum number of syllables must be strictly positive. Given", minNumberSyllables)
 								    if maxNumberSyllables < minNumberSyllables:
 								      raise Exception("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
 								    if maxNumberSyllables < abs(stressPosition):
 								      raise Exception("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
 								    if distributionMean < 1:
 								      raise Exception("Distribution mean must be strictly positive. Given", distributionMean)
 								    # Strict comparison, as the message says: an equal mean and range would allow zero occurences
 								    if distributionMean <= distributionRange:
 								      raise Exception("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
 								    if distributionRange < 0:
 								      raise Exception("Distribution range must be positive or nul. Given", distributionRange)
 								    # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
 								    stressId = phonology.getStress()
 								    syllableBreakId = phonology.getSyllableBreak()
 								    isStressed = (stressPosition != 0) and phonology.hasStressedVowels()
 								    # Add the 'word' rule, initialized with an empty distribution
 								    self.rules.update({'word': Distribution()})
 								    # Add the syllable rules and word patterns
 								    syllableRules = []
 								    syllableRulesToTags = {}
 								    if minNumberSyllables == 1:
 								      syllableRules.append('single')
 								      syllableRulesToTags.update({'single': ['#single']})
 								      wordPattern = []
 								      if isStressed:
 								        syllableRulesToTags['single'].append('#stressed')
 								        wordPattern.append(stressId)
 								      wordPattern.append('single')
 								      self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
 								    if maxNumberSyllables > 1:
 								      syllableRules = syllableRules + ['initial', 'middle', 'final']
 								      syllableRulesToTags.update({'initial': ['#initial'], 'middle': ['#middle'], 'final': ['#final']})
 								      if isStressed:
 								        syllableRules = syllableRules + ['initial-stressed', 'middle-stressed', 'final-stressed']
 								        syllableRulesToTags.update({'initial-stressed': ['#initial', '#stressed'], 'middle-stressed': ['#middle', '#stressed'], 'final-stressed': ['#final', '#stressed']})
 								        syllableRulesToTags.update({'initial': ['#initial', '#unstressed'], 'middle': ['#middle', '#unstressed'], 'final': ['#final', '#unstressed']})
 								      # One word pattern per number of syllables, from the minimum (at least 2 here) to the maximum
 								      for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
 								        wordPattern = []
 								        for position in range(1, nbSyllables + 1):
 								          isStressPosition = isStressed and RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
 								          # add syllable separator
 								          if isStressPosition:
 								            wordPattern.append(stressId)
 								          elif position > 1:
 								            wordPattern.append(syllableBreakId)
 								          # add syllable
 								          if position == 1:
 								            if isStressPosition:
 								              wordPattern.append('initial-stressed')
 								            else:
 								              wordPattern.append('initial')
 								          elif position == nbSyllables:
 								            if isStressPosition:
 								              wordPattern.append('final-stressed')
 								            else:
 								              wordPattern.append('final')
 								          else:
 								            if isStressPosition:
 								              wordPattern.append('middle-stressed')
 								            else:
 								              wordPattern.append('middle')
 								        self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
 								    # Step 2: Generate the syllable rules
 								    # Add the rules in the distributions
 								    phonemeRules = []
 								    phonemeRulesToTag = {}
 								    for syllable in syllableRules:
 								      self.rules.update({syllable: Distribution()})
 								      onset = syllable + '-onset'
 								      nucleus = syllable + '-nucleus'
 								      coda = syllable + '-coda'
 								      phonemeRules = phonemeRules + [onset, nucleus, coda]
 								      ruleTags = syllableRulesToTags[syllable]
 								      phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
 								      # Fill the syllable rules
 								      # For the generated rules, initial and single syllables may not have onset
 								      if ('#initial' in ruleTags) or ('#single' in ruleTags):
 								        self.rules[syllable].addTo(tuple([nucleus]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
 								        self.rules[syllable].addTo(tuple([nucleus, coda]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
 								      self.rules[syllable].addTo(tuple([onset, nucleus]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
 								      self.rules[syllable].addTo(tuple([onset, nucleus, coda]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
 								    # Step 3: Generate the phoneme distributions for each phoneme rule
 								    for rule in phonemeRules:
 								      self.rules.update({rule: Distribution()})
 								      tags = phonemeRulesToTag[rule]
 								      phonemeList = phonology.getPhonemesFromTags(tags)
 								      for phoneme in phonemeList:
 								        self.rules[rule].addTo(tuple([phoneme]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
 								    # Step 4: Clean the rules
 								    self.cleanRules()
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
 								# Class to instanciate for each value of the 'type' field of a generator JSON structure
 								generatorTypeToClass = {
 								  'chains': ChainGenerator,
 								  'rules': RuleGenerator,
 								}
 								def makeGenerator(struct):
 								  """
 								  Instanciate a generator from a JSON structure.
 								  The class is looked up in generatorTypeToClass with struct['type'];
 								  a type not listed there gives a plain Generator.
 								  The instance is filled with fromJsonStruct before being returned.
 								  """
 								  generatorClass = generatorTypeToClass.get(struct['type'], Generator)
 								  generator = generatorClass()
 								  generator.fromJsonStruct(struct)
 								  return generator
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
 								class PhonagenFile:
 								  """A phonagen file, with phonologies and generators"""
 								  def __init__(self):
 								    self.phonologies = {}
 								    self.generators = {}
 								  def addPhonology(self, phonology):
 								    """Store a phonology under its id. An invalid phonology is ignored."""
 								    if not phonology.isValid():
 								      return
 								    self.phonologies[phonology.id] = phonology
 								  def addGenerator(self, generator):
 								    """Store a generator under its id. An invalid generator is ignored."""
 								    if not generator.isValid():
 								      return
 								    self.generators[generator.id] = generator
 								  def getPhonology(self, id):
 								    """Return the phonology stored under the given id"""
 								    phonology = self.phonologies[id]
 								    return phonology
 								  def getGenerator(self, id):
 								    """Return the generator stored under the given id"""
 								    generator = self.generators[id]
 								    return generator
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								  def load(self, file):
 								    """Load from a JSON file"""
 								    with open(file, 'r', encoding='utf-8') as inputFile:
 								      jsonStruct = json.load(inputFile)
 								      # Load phonologies
 								      for struct in jsonStruct['phonologies']:
 								        phonology = Phonology()
 								        phonology.fromJsonStruct(struct)
 								        self.addPhonology(phonology)
 								      # Load generators
 								      for struct in jsonStruct['generators']:
 								        self.addGenerator(makeGenerator(struct))
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
+								  def writeTo(self, file = ''):
-												Example list to Markov chain generator.

											
										
										
											2018-06-09 18:58:46 +02:00
+								    """Output to a JSON file (or stdout)"""
-												Add phonology-csv2json tool and a Phonagen python library on which the tool is based.

											
										
										
											2018-06-09 03:02:45 +02:00
+								    outputStruct = { 'phonologies': [x.toJsonStruct() for x in self.phonologies.values()],
 								                     'generators': [x.toJsonStruct() for x in self.generators.values()] }
 								    if file == '':
 								      json.dump(outputStruct, sys.stdout, ensure_ascii=False)
 								    else:
 								      with open(file, 'w', encoding='utf-8') as outputFile:
-												Add an algorithm to build a rule generator from a phonology, without examples.

											
										
										
											2018-06-14 00:19:27 +02:00
+								        json.dump(outputStruct, outputFile, ensure_ascii=False)
-												Add tool for merging phonagen json files into one.

											
										
										
											2018-06-09 20:17:47 +02:00
 								  def mergeFrom(self, otherFile):
 								    """Register every phonology, then every generator, of another phonagen file into this one."""
 								    incomingPhonologies = otherFile.phonologies.values()
 								    for incoming in incomingPhonologies:
 								      self.addPhonology(incoming)
 								    incomingGenerators = otherFile.generators.values()
 								    for incoming in incomingGenerators:
 								      self.addGenerator(incoming)
-												Add list2rule generator (generating rules from examples)

											
										
										
											2018-06-10 22:55:04 +02:00
 								  def generateWord(self, generator = ''):
 								    """
 								    Generate a word with the generator of the given id, or with a generator
 								    picked at random if the id is empty. The ids output by the generator are
 								    formatted by the phonology the generator refers to.
 								    """
 								    generatorId = random.choice(list(self.generators)) if generator == '' else generator
 								    idList = self.generators[generatorId].generateWord()
 								    phonologyId = self.generators[generatorId].phonology
 								    return self.phonologies[phonologyId].formatWord(idList)