Add an algorithm to build a rule generator from a phonology, without examples.

2018-06-14 00:19:27 +02:00 · 2018-06-14 00:19:27 +02:00 · 3e4485e1b9
parent 4def536673
commit 3e4485e1b9
1 changed files with 205 additions and 5 deletions
--- a/py-phonagen/phonagen.py
+++ b/py-phonagen/phonagen.py
@ -120,7 +120,7 @@ class Phonology:
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)
  def isOnset(self, id):
-    """Check if an id corresponds to an onset, either from description, or if not available, guessed from the phonemic transcription"""
+    """Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = ('#onset' in description) or ('#consonant' in description)
@ -129,7 +129,7 @@ class Phonology:
    return result
  def isNucleus(self, id):
-    """Check if an id corresponds to a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
+    """Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = ('#nucleus' in description) or ('#vowel' in description)
@ -138,7 +138,7 @@ class Phonology:
    return result
  def isCoda(self, id):
-    """Check if an id corresponds to a coda, either from description, or if not available, guessed from the phonemic transcription"""
+    """Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = ('#coda' in description) or ('#consonant' in description)
@ -146,6 +146,77 @@ class Phonology:
      result = Phonology.isConsonant(entry['phoneme'])
    return result
  def isInSingleSyllables(self, id):
    """Tell whether the phoneme with this id may appear in the syllable of a one-syllable word, from its description.

    A phoneme is accepted when it is tagged '#single', '#initial' or '#final'.
    Without any of these tags, it is accepted unless it is tagged '#middle'.
    """
    description = self.entries[id]['description']
    # The only syllable of a word is both its first and its last one
    if any(tag in description for tag in ('#single', '#initial', '#final')):
      return True
    # No explicit permission: only a '#middle' restriction excludes the phoneme
    return '#middle' not in description
  def isInInitialSyllables(self, id):
    """Tell whether the phoneme with this id may appear in the first syllable of a word, from its description.

    A phoneme is accepted when it is tagged '#initial', or when it carries no position tag at all
    ('#single', '#initial', '#middle', '#final').
    """
    description = self.entries[id]['description']
    if '#initial' in description:
      return True
    # Any other position tag restricts the phoneme to that other position
    otherPositions = ('#single', '#middle', '#final')
    return not any(tag in description for tag in otherPositions)
  def isInMiddleSyllables(self, id):
    """Tell whether the phoneme with this id may appear in a syllable that is neither first nor last, from its description.

    A phoneme is accepted when it is tagged '#middle', or when it carries no position tag at all
    ('#single', '#initial', '#middle', '#final').
    """
    description = self.entries[id]['description']
    if '#middle' in description:
      return True
    # Any other position tag restricts the phoneme to that other position
    otherPositions = ('#single', '#initial', '#final')
    return not any(tag in description for tag in otherPositions)
  def isInFinalSyllables(self, id):
    """Tell whether the phoneme with this id may appear in the last syllable of a word, from its description.

    A phoneme is accepted when it is tagged '#final', or when it carries no position tag at all
    ('#single', '#initial', '#middle', '#final').
    """
    description = self.entries[id]['description']
    if '#final' in description:
      return True
    # Any other position tag restricts the phoneme to that other position
    otherPositions = ('#single', '#initial', '#middle')
    return not any(tag in description for tag in otherPositions)
  def isInStressedSyllables(self, id):
    """Tell whether the phoneme with this id may appear in a stressed syllable, from its description.

    Only a phoneme tagged '#unstressed' without also being tagged '#stressed' is rejected.
    """
    description = self.entries[id]['description']
    if '#unstressed' not in description:
      # No restriction to unstressed syllables
      return True
    # Restricted to unstressed syllables, unless stressed ones are explicitly allowed too
    return '#stressed' in description
  def isInUnstressedSyllables(self, id):
    """Tell whether the phoneme with this id may appear in an unstressed syllable, from its description.

    Only a phoneme tagged '#stressed' without also being tagged '#unstressed' is rejected.
    """
    description = self.entries[id]['description']
    if '#stressed' not in description:
      # No restriction to stressed syllables
      return True
    # Restricted to stressed syllables, unless unstressed ones are explicitly allowed too
    return '#unstressed' in description
  def getPhonemesFromTags(self, tags):
    """Return the list of phoneme ids that satisfy every tag of the tag list.

    - tags is an iterable of tags among '#onset', '#nucleus', '#coda', '#single', '#initial',
      '#middle', '#final', '#stressed' and '#unstressed'. An empty list selects every phoneme.
    The stress and syllable break entries are markers, not phonemes, and are never returned.
    The ids are returned in the iteration order of the entries.
    Raise a KeyError if a tag is not one of the supported tags.
    """
    tagToPredicate = {
      '#onset': self.isOnset,
      '#nucleus': self.isNucleus,
      '#coda': self.isCoda,
      '#single': self.isInSingleSyllables,
      '#initial': self.isInInitialSyllables,
      '#middle': self.isInMiddleSyllables,
      '#final': self.isInFinalSyllables,
      '#stressed': self.isInStressedSyllables,
      '#unstressed': self.isInUnstressedSyllables
    }
    # Resolve the predicates once, instead of once per entry
    predicates = [tagToPredicate[t] for t in tags]
    # Stress and syllable break must really be skipped ('continue' semantics, not 'pass')
    markers = (self.getStress(), self.getSyllableBreak())
    phonemeList = []
    for id in self.entries:
      if id in markers:
        continue
      if all(check(id) for check in predicates):
        phonemeList.append(id)
    return phonemeList
 class Distribution:
  """Discrete distribution"""
  def __init__(self):
@ -389,6 +460,10 @@ class RuleGenerator(Generator):
    # Replace the old rules with the new rules
    self.rules[syllableRule] = newDist
  def cleanRules(self):
    """Drop every rule whose distribution is empty"""
    keptRules = {}
    for name, distribution in self.rules.items():
      if distribution.isEmpty():
        continue
      keptRules[name] = distribution
    # Rebind to a fresh dictionary rather than deleting entries in place
    self.rules = keptRules
  def fromExamples(self, file, phonology):
    """Train a rule generator on an example file"""
    stressId = phonology.getStress()
@ -417,7 +492,132 @@ class RuleGenerator(Generator):
    for x in syllableRules:
      self.splitSyllableRule(x, phonology)
    # Step 3: remove the empty rules
-    self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
+    self.cleanRules()
  def randomOccurences(mean, range):
    """Generate a random integer in the range [mean-range, mean+range], both bounds included.

    - mean is the centre of the interval.
    - range is the maximum absolute difference from mean; with a range of 0 the result is always mean.
    This function takes no self: it is meant to be called through the class.
    """
    # random.randint already includes its upper bound, so no '+ 1' is needed to reach mean+range
    return random.randint(mean - range, mean + range)
  def isStressPosition(position, numberSyllables, stressPosition):
    """Tell whether a syllable position carries the stress.

    - position goes from 1 to numberSyllables included.
    - stressPosition counts from the start when positive and from the end when negative (-1 is the last syllable).
      A stressPosition of zero never matches.
    When the word is too short for stressPosition, the stress falls back on the last syllable
    (positive index) or on the first syllable (negative index).
    """
    if stressPosition == 0:
      return False
    if stressPosition > 0:
      # Counted from the start, clamped to the last syllable
      target = min(stressPosition, numberSyllables)
    else:
      # Counted from the end, clamped to the first syllable
      target = max(numberSyllables + 1 + stressPosition, 1)
    return position == target
  def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
    """
    Generate a rule-based generator just from a phonology and some parameters.
    - minNumberSyllables must be strictly positive. No word pattern with fewer syllables is generated.
    - maxNumberSyllables must be greater than or equal to minNumberSyllables.
    - stressPosition indicates on which syllable the stress occurs.
      Positive index count from the beginning to the end (with the first syllable being at index 1).
      Negative index count from the end to the beginning (with the last syllable being at index -1)
      Set this to zero if no stress should be generated.
      Its absolute value must not exceed maxNumberSyllables.
    - distributionMean indicates the mean value for the occurrences of a phoneme. It must be strictly positive.
    - distributionRange indicates the maximum absolute difference from distributionMean for the occurrences of phonemes and syllables.
      It must be positive or null, and strictly lower than distributionMean.
    Raise an Exception if a parameter is invalid; in that case the generator is left untouched.
    On success, the phonology id and all the rules of this generator are replaced.
    """
    # Check the parameters first, so that a rejected call does not wipe the existing rules
    if minNumberSyllables < 1:
      raise Exception("Minimum number of syllables must be strictly positive. Given", minNumberSyllables)
    if maxNumberSyllables < minNumberSyllables:
      raise Exception("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
    if maxNumberSyllables < abs(stressPosition):
      raise Exception("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
    if distributionMean < 1:
      raise Exception("Distribution mean must be strictly positive. Given", distributionMean)
    # Strict comparison, as the message says: mean == range would allow zero occurrences
    if distributionMean <= distributionRange:
      raise Exception("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
    if distributionRange < 0:
      raise Exception("Distribution range must be positive or nul. Given", distributionRange)
    # Reinitialize
    self.phonology = phonology.id
    self.rules = {}
    # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
    stressId = phonology.getStress()
    syllableBreakId = phonology.getSyllableBreak()
    isStressed = stressPosition != 0
    # Add the 'word' rule, initialized with an empty distribution
    self.rules['word'] = Distribution()
    # Add the syllable rules and word patterns
    syllableRules = []
    syllableRulesToTags = {}
    if minNumberSyllables == 1:
      # One-syllable words use a dedicated 'single' syllable rule
      singleTags = ['#single']
      wordPattern = []
      if isStressed:
        singleTags.append('#stressed')
        wordPattern.append(stressId)
      wordPattern.append('single')
      syllableRules.append('single')
      syllableRulesToTags['single'] = singleTags
      self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    if maxNumberSyllables > 1:
      syllableRules = syllableRules + ['initial', 'middle', 'final']
      if isStressed:
        # With a stress, each position gets a stressed variant and the plain one becomes unstressed-only
        syllableRules = syllableRules + ['initial-stressed', 'middle-stressed', 'final-stressed']
        syllableRulesToTags.update({'initial-stressed': ['#initial', '#stressed'], 'middle-stressed': ['#middle', '#stressed'], 'final-stressed': ['#final', '#stressed']})
        syllableRulesToTags.update({'initial': ['#initial', '#unstressed'], 'middle': ['#middle', '#unstressed'], 'final': ['#final', '#unstressed']})
      else:
        syllableRulesToTags.update({'initial': ['#initial'], 'middle': ['#middle'], 'final': ['#final']})
      # One word pattern per length; lengths below minNumberSyllables are not generated
      for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
        wordPattern = []
        for position in range(1, nbSyllables + 1):
          stressedHere = RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
          # add syllable separator: the stress mark replaces the syllable break
          if stressedHere:
            wordPattern.append(stressId)
          elif position > 1:
            wordPattern.append(syllableBreakId)
          # add syllable
          if position == 1:
            syllable = 'initial'
          elif position == nbSyllables:
            syllable = 'final'
          else:
            syllable = 'middle'
          if stressedHere:
            syllable = syllable + '-stressed'
          wordPattern.append(syllable)
        self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 2: Generate the syllable rules
    # Add the rules in the distributions
    phonemeRules = []
    phonemeRulesToTag = {}
    for syllable in syllableRules:
      self.rules[syllable] = Distribution()
      onset = syllable + '-onset'
      nucleus = syllable + '-nucleus'
      coda = syllable + '-coda'
      phonemeRules = phonemeRules + [onset, nucleus, coda]
      ruleTags = syllableRulesToTags[syllable]
      phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
      # Fill the syllable rules
      # For the generated rules, initial and single syllables may not have onset
      if ('#initial' in ruleTags) or ('#single' in ruleTags):
        self.rules[syllable].addTo(tuple([nucleus]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
        self.rules[syllable].addTo(tuple([nucleus, coda]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
      self.rules[syllable].addTo(tuple([onset, nucleus]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
      self.rules[syllable].addTo(tuple([onset, nucleus, coda]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 3: Generate the phoneme distributions for each phoneme rule
    for rule in phonemeRules:
      self.rules[rule] = Distribution()
      tags = phonemeRulesToTag[rule]
      phonemeList = phonology.getPhonemesFromTags(tags)
      for phoneme in phonemeList:
        self.rules[rule].addTo(tuple([phoneme]), RuleGenerator.randomOccurences(distributionMean, distributionRange))
    # Step 4: Clean the rules
    self.cleanRules()
 generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
 def makeGenerator(struct):
@ -470,7 +670,7 @@ class PhonagenFile:
      json.dump(outputStruct, sys.stdout, ensure_ascii=False)
    else:
      with open(file, 'w', encoding='utf-8') as outputFile:
-        json.dump(outputStruct, outputFile, ensure_ascii=False,  indent=2)
+        json.dump(outputStruct, outputFile, ensure_ascii=False)
  def mergeFrom(self, otherFile):
    """Add all phonologies and generators from the other file into this one."""