Add an algorithm to build a rule generator from a phonology, without examples.

2018-06-14 00:19:27 +02:00 · 2018-06-14 00:19:27 +02:00 · 3e4485e1b9
parent 4def536673
commit 3e4485e1b9
1 changed files with 205 additions and 5 deletions
--- a/py-phonagen/phonagen.py
+++ b/py-phonagen/phonagen.py
@ -120,7 +120,7 @@ class Phonology:
    return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)

  def isOnset(self, id):
-    """Check if an id corresponds to an onset, either from description, or if not available, guessed from the phonemic transcription"""
+    """Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = ('#onset' in description) or ('#consonant' in description)
@ -129,7 +129,7 @@ class Phonology:
    return result

  def isNucleus(self, id):
-    """Check if an id corresponds to a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
+    """Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = ('#nucleus' in description) or ('#vowel' in description)
@ -138,7 +138,7 @@ class Phonology:
    return result

  def isCoda(self, id):
-    """Check if an id corresponds to a coda, either from description, or if not available, guessed from the phonemic transcription"""
+    """Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
    entry = self.entries[id]
    description = entry['description']
    result = ('#coda' in description) or ('#consonant' in description)
@ -146,6 +146,77 @@ class Phonology:
      result = Phonology.isConsonant(entry['phoneme'])
    return result

+  def isInSingleSyllables(self, id):
+    """Tell whether the phoneme `id` may appear in the syllable of a one-syllable word, judging from its description.
+    The phoneme qualifies when it is tagged '#single', '#initial' or '#final', or when it carries no '#middle' tag."""
+    tags = self.entries[id]['description']
+    if any(marker in tags for marker in ('#single', '#initial', '#final')):
+      return True
+    # Without any of those tags, only an explicit '#middle' restriction excludes the phoneme
+    return '#middle' not in tags
+
+  def isInInitialSyllables(self, id):
+    """Tell whether the phoneme `id` may appear in the first syllable of a word, judging from its description.
+    The phoneme qualifies when it is tagged '#initial', or when it carries no syllable position tag at all."""
+    tags = self.entries[id]['description']
+    if '#initial' in tags:
+      return True
+    # A phoneme restricted to other positions is excluded; an unrestricted one is accepted
+    otherPositions = ('#single', '#middle', '#final')
+    return not any(marker in tags for marker in otherPositions)
+
+  def isInMiddleSyllables(self, id):
+    """Tell whether the phoneme `id` may appear in a middle syllable of a word, judging from its description.
+    The phoneme qualifies when it is tagged '#middle', or when it carries no syllable position tag at all."""
+    tags = self.entries[id]['description']
+    if '#middle' in tags:
+      return True
+    # A phoneme restricted to other positions is excluded; an unrestricted one is accepted
+    otherPositions = ('#single', '#initial', '#final')
+    return not any(marker in tags for marker in otherPositions)
+
+  def isInFinalSyllables(self, id):
+    """Tell whether the phoneme `id` may appear in the last syllable of a word, judging from its description.
+    The phoneme qualifies when it is tagged '#final', or when it carries no syllable position tag at all."""
+    tags = self.entries[id]['description']
+    if '#final' in tags:
+      return True
+    # A phoneme restricted to other positions is excluded; an unrestricted one is accepted
+    otherPositions = ('#single', '#initial', '#middle')
+    return not any(marker in tags for marker in otherPositions)
+
+  def isInStressedSyllables(self, id):
+    """Tell whether the phoneme `id` may appear in a stressed syllable, judging from its description.
+    Only a phoneme tagged '#unstressed' without being tagged '#stressed' is rejected."""
+    tags = self.entries[id]['description']
+    if '#stressed' in tags:
+      return True
+    return '#unstressed' not in tags
+
+  def isInUnstressedSyllables(self, id):
+    """Tell whether the phoneme `id` may appear in an unstressed syllable, judging from its description.
+    Only a phoneme tagged '#stressed' without being tagged '#unstressed' is rejected."""
+    tags = self.entries[id]['description']
+    if '#unstressed' in tags:
+      return True
+    return '#stressed' not in tags
+
+  def getPhonemesFromTags(self, tags):
+    """Return the list of phoneme ids satisfying every tag of the tag list.
+    The supported tags are '#onset', '#nucleus', '#coda', '#single', '#initial', '#middle', '#final', '#stressed' and '#unstressed'.
+    Any other tag raises a KeyError as soon as an entry is tested against it.
+    The stress and syllable break entries are never returned, since they are not phonemes."""
+    phonemeList = []
+    tagToPredicate = {
+      '#onset': Phonology.isOnset,
+      '#nucleus': Phonology.isNucleus,
+      '#coda': Phonology.isCoda,
+      '#single': Phonology.isInSingleSyllables,
+      '#initial': Phonology.isInInitialSyllables,
+      '#middle': Phonology.isInMiddleSyllables,
+      '#final': Phonology.isInFinalSyllables,
+      '#stressed': Phonology.isInStressedSyllables,
+      '#unstressed': Phonology.isInUnstressedSyllables
+    }
+    # The ids of the two markers do not depend on the entry being tested
+    markers = (self.getStress(), self.getSyllableBreak())
+    for id in self.entries:
+      # skip stress and syllable break ('continue' is required here: 'pass' would fall through to the checks below)
+      if id in markers:
+        continue
+      checklist = [tagToPredicate[t](self, id) for t in tags]
+      if all(checklist):
+        phonemeList.append(id)
+    return phonemeList
+
 class Distribution:
  """Discrete distribution"""
  def __init__(self):
@ -389,6 +460,10 @@ class RuleGenerator(Generator):
    # Replace the old rules with the new rules
    self.rules[syllableRule] = newDist

+  def cleanRules(self):
+    """Drop every rule whose distribution is empty, keeping the other rules in their current order"""
+    keptRules = {}
+    for name, distribution in self.rules.items():
+      if distribution.isEmpty():
+        continue
+      keptRules[name] = distribution
+    self.rules = keptRules
+
  def fromExamples(self, file, phonology):
    """Train a rule generator on an example file"""
    stressId = phonology.getStress()
@ -417,7 +492,132 @@ class RuleGenerator(Generator):
    for x in syllableRules:
      self.splitSyllableRule(x, phonology)
    # Step 3: remove the empty rules
-    self.rules = {x: self.rules[x] for x in self.rules if not self.rules[x].isEmpty()}
+    self.cleanRules()
+
+  def randomOccurences(mean, range):
+    """Generate a random integer in the range [mean-range, mean+range], both bounds included.
+    The result never differs from `mean` by more than `range`."""
+    # random.randint already includes its upper bound, so no '+ 1' is needed to reach mean+range
+    return random.randint(mean - range, mean + range)
+
+  def isStressPosition(position, numberSyllables, stressPosition):
+    """Tell whether the syllable at `position` (counted from 1 to numberSyllables included) carries the stress.
+    A positive stressPosition counts from the start of the word, a negative one from its end.
+    A stressPosition pointing outside of the word falls back on the last syllable (positive) or the first one (negative)."""
+    # Stress counted from the beginning, inside the word
+    if 0 < stressPosition <= numberSyllables:
+      return position == stressPosition
+    # Stress counted from the end, inside the word
+    if (stressPosition < 0) and (-stressPosition <= numberSyllables):
+      return position == (numberSyllables + 1 + stressPosition)
+    # Stress pointing outside of the word: fall back on the nearest edge
+    beyondEnd = (position == numberSyllables) and (stressPosition > numberSyllables)
+    beyondStart = (position == 1) and (stressPosition < 0) and (-stressPosition > numberSyllables)
+    return beyondEnd or beyondStart
+
+  def fromPhonology(self, phonology, minNumberSyllables = 1, maxNumberSyllables = 4, stressPosition = -2, distributionMean = 20, distributionRange = 5):
+    """
+    Generate a rule-based generator just from a phonology and some parameters.
+    - minNumberSyllables must be strictly positive.
+    - maxNumberSyllables must be greater than or equal to minNumberSyllables.
+    - stressPosition indicates on which syllable the stress occurs.
+      Positive index count from the beginning to the end (with the first syllable being at index 1).
+      Negative index count from the end to the beginning (with the last syllable being at index -1)
+      Set this to zero if no stress should be generated.
+      Its absolute value must not exceed maxNumberSyllables.
+    - distributionMean indicates the medium value for the occurrences of a phoneme. It must be strictly higher than distributionRange.
+    - distributionRange indicates the maximum absolute difference from distributionMean for the occurrences of phonemes and syllables. It must be positive or null.
+    Raises ValueError (a subclass of Exception) when a parameter is invalid; the generator is left untouched in that case.
+    Only words made of minNumberSyllables to maxNumberSyllables syllables are generated.
+    """
+    # Check the parameters first, so that a rejected call does not wipe the current rules
+    if minNumberSyllables < 1:
+      raise ValueError("Minimum number of syllables must be strictly positive. Given", minNumberSyllables)
+    if maxNumberSyllables < minNumberSyllables:
+      raise ValueError("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
+    if maxNumberSyllables < abs(stressPosition):
+      raise ValueError("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
+    if distributionRange < 0:
+      raise ValueError("Distribution range must be positive or nul. Given", distributionRange)
+    if distributionMean < 1:
+      raise ValueError("Distribution mean must be strictly positive. Given", distributionMean)
+    # A mean equal to the range would allow an entry to be drawn with zero occurrences
+    if distributionMean <= distributionRange:
+      raise ValueError("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
+    # Reinitialize
+    self.phonology = phonology.id
+    self.rules = {}
+    # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
+    stressId = phonology.getStress()
+    syllableBreakId = phonology.getSyllableBreak()
+    isStressed = stressPosition != 0
+    # Add the 'word' rule, initialized with an empty distribution
+    self.rules['word'] = Distribution()
+    # Add the syllable rules and word patterns
+    syllableRules = []
+    syllableRulesToTags = {}
+    if minNumberSyllables == 1:
+      syllableRules.append('single')
+      syllableRulesToTags['single'] = ['#single']
+      wordPattern = []
+      if isStressed:
+        # The only syllable of a word necessarily carries the stress
+        syllableRulesToTags['single'].append('#stressed')
+        wordPattern.append(stressId)
+      wordPattern.append('single')
+      self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+    if maxNumberSyllables > 1:
+      syllableRules = syllableRules + ['initial', 'middle', 'final']
+      if isStressed:
+        syllableRules = syllableRules + ['initial-stressed', 'middle-stressed', 'final-stressed']
+        syllableRulesToTags.update({'initial-stressed': ['#initial', '#stressed'], 'middle-stressed': ['#middle', '#stressed'], 'final-stressed': ['#final', '#stressed']})
+        syllableRulesToTags.update({'initial': ['#initial', '#unstressed'], 'middle': ['#middle', '#unstressed'], 'final': ['#final', '#unstressed']})
+      else:
+        syllableRulesToTags.update({'initial': ['#initial'], 'middle': ['#middle'], 'final': ['#final']})
+      # One word pattern per allowed number of syllables; words shorter than minNumberSyllables are not generated
+      for nbSyllables in range(max(minNumberSyllables, 2), maxNumberSyllables + 1):
+        wordPattern = []
+        for position in range(1, nbSyllables + 1):
+          stressedHere = RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
+          # add syllable separator: the stress mark replaces the syllable break before a stressed syllable
+          if stressedHere:
+            wordPattern.append(stressId)
+          elif position > 1:
+            wordPattern.append(syllableBreakId)
+          # add syllable
+          if position == 1:
+            syllable = 'initial'
+          elif position == nbSyllables:
+            syllable = 'final'
+          else:
+            syllable = 'middle'
+          if stressedHere:
+            syllable = syllable + '-stressed'
+          wordPattern.append(syllable)
+        self.rules['word'].addTo(tuple(wordPattern), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+    # Step 2: Generate the syllable rules
+    # Add the rules in the distributions
+    phonemeRules = []
+    phonemeRulesToTag = {}
+    for syllable in syllableRules:
+      self.rules[syllable] = Distribution()
+      onset = syllable + '-onset'
+      nucleus = syllable + '-nucleus'
+      coda = syllable + '-coda'
+      phonemeRules = phonemeRules + [onset, nucleus, coda]
+      ruleTags = syllableRulesToTags[syllable]
+      phonemeRulesToTag.update({onset: ruleTags + ['#onset'], nucleus: ruleTags + ['#nucleus'], coda: ruleTags + ['#coda']})
+      # Fill the syllable rules
+      # For the generated rules, initial and single syllables may not have onset
+      if ('#initial' in ruleTags) or ('#single' in ruleTags):
+        self.rules[syllable].addTo((nucleus,), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+        self.rules[syllable].addTo((nucleus, coda), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+      self.rules[syllable].addTo((onset, nucleus), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+      self.rules[syllable].addTo((onset, nucleus, coda), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+    # Step 3: Generate the phoneme distributions for each phoneme rule
+    for rule in phonemeRules:
+      self.rules[rule] = Distribution()
+      for phoneme in phonology.getPhonemesFromTags(phonemeRulesToTag[rule]):
+        self.rules[rule].addTo((phoneme,), RuleGenerator.randomOccurences(distributionMean, distributionRange))
+    # Step 4: Clean the rules (phoneme rules without any matching phoneme are left empty by step 3)
+    self.cleanRules()
+

 generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }
 def makeGenerator(struct):
@ -470,7 +670,7 @@ class PhonagenFile:
      json.dump(outputStruct, sys.stdout, ensure_ascii=False)
    else:
      with open(file, 'w', encoding='utf-8') as outputFile:
-        json.dump(outputStruct, outputFile, ensure_ascii=False,  indent=2)
+        json.dump(outputStruct, outputFile, ensure_ascii=False)

  def mergeFrom(self, otherFile):
    """Add all phonologies and generators from the other file into this one."""