2018-06-09 03:02:45 +02:00
""" Common functions and classes for phonagen tools """
import json
import io
import sys
import csv
2018-06-09 18:58:46 +02:00
import random
2018-06-10 22:55:04 +02:00
import unicodedata
2018-06-09 03:02:45 +02:00
class Phonology:
    """Phoneme inventory together with its transcriptions.

    Entries are dicts keyed by column name ('id', 'description', 'phoneme' and
    one key per additional transcription) and are stored in ``entries`` by id.
    The description of an entry may contain '#tag' markers; the predicate
    methods below look for them as substrings of the description.
    """

    # Symbols used to guess the class of an untagged phoneme from the first
    # character of its NFD-normalised phonemic transcription.
    vowels = "iyɨʉɯuɪʏʊɯeøɘɵɤoəɛœɜɞʌɔæɐaɶɒɑ"
    consonants = "mɱnɳɲŋɴpbtdʈɖcɟkɡgqɢʡʔszʃʒʂʐɕʑɸβfvθðçʝxɣχʁħʕhɦʋɹɻjɰⱱɾɽʙrʀʜʢɬɮlɭʎʟɺʘǀǃǂǁɓɗᶑʄɠʛɧʍwɫɥ"

    def __init__(self, id='', description='', mainTranscription=''):
        self.id = id
        self.description = description
        self.transcriptions = []
        self.mainTranscription = mainTranscription
        self.entries = {}  # id -> entry

    def isValid(self):
        """Return True when the phonology has a non-empty id."""
        return self.id != ''

    def has(self, id):
        """Return True when ``id`` is the id of a known entry."""
        return id in self.entries

    def toJsonStruct(self):
        """Convert a Phonology to a Json structure"""
        return {'id': self.id,
                'description': self.description,
                'transcriptions': self.transcriptions,
                'main-transcription': self.mainTranscription,
                'entries': list(self.entries.values())}

    def fromJsonStruct(self, struct):
        """Fill a Phonology from a Json structure"""
        self.id = struct['id']
        self.description = struct['description']
        self.transcriptions = struct['transcriptions']
        self.mainTranscription = struct['main-transcription']
        self.entries = {x['id']: x for x in struct['entries']}

    def fromCsv(self, file):
        """Fill a Phonology from a Csv file.

        The first row is the header. Every column other than 'id' and
        'description' is a transcription; a 'phoneme' column and at least one
        other transcription are required. When the main transcription or the
        id were left empty, they are derived from the header.

        Raises Exception when a required column is missing or when the main
        transcription is not one of the transcriptions of the file.
        """
        # newline='' is what the csv module asks for; utf-8 because the
        # transcriptions are expected to contain non-ASCII phonetic symbols.
        with open(file, encoding='utf-8', newline='') as csvfile:
            fileReader = csv.reader(csvfile)
            # get csv header
            header = next(fileReader)
            # get the transcriptions (header items not id or description)
            self.transcriptions = [x for x in header if x not in ('id', 'description')]
            # Check: self.transcriptions should contain 'phoneme'
            if 'phoneme' not in self.transcriptions:
                raise Exception('phoneme column not found in', file)
            # Check: self.transcriptions should have at least two items
            if len(self.transcriptions) < 2:
                raise Exception('No transcription found outside phoneme in file', file, 'Did you named it id or description ?')
            # get the first header item which is not one of those: id, description, phoneme
            guessedMainTranscription = next(x for x in header if x not in ('id', 'description', 'phoneme'))
            # If main-transcription was not given, use the guess as main-transcription
            if self.mainTranscription == '':
                self.mainTranscription = guessedMainTranscription
            # Check: self.mainTranscription should be in self.transcriptions
            if self.mainTranscription not in self.transcriptions:
                raise Exception('main-transcription', self.mainTranscription, 'not in list of transcriptions')
            # If id was not given, use the mainTranscription as the id
            if self.id == '':
                self.id = self.mainTranscription
            # parse entries
            for row in fileReader:
                # Cells beyond the header are ignored
                entry = dict(zip(header, row))
                # All absent elements are set to ''
                for column in header[len(row):]:
                    entry[column] = ''
                # if both phoneme and main-transcription are empty, skip the row
                if (entry['phoneme'] == '') and (entry[self.mainTranscription] == ''):
                    continue
                # if id is not provided, generate it
                if 'id' not in header:
                    entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
                # if description is not provided, add an empty one
                if 'description' not in header:
                    entry['description'] = ''
                self.entries[entry['id']] = entry

    def formatWord(self, idList):
        """Return a table of transcription -> string corresponding to the same word"""
        phonemes = [self.entries[x] for x in idList]
        return {t: "".join(p[t] for p in phonemes) for t in self.transcriptions}

    def isStress(self, id):
        """Check if an id corresponds to a stress marker (tagged #stress, or with "'" or "ˈ" in its phoneme)"""
        entry = self.entries[id]
        description = entry['description']
        phoneme = entry['phoneme']
        # '#stress' is a prefix of '#stressed': the latter must not count as a stress marker
        tagged = ('#stress' in description) and ('#stressed' not in description)
        return tagged or ("'" in phoneme) or ("ˈ" in phoneme)

    def getStress(self):
        """Return the phoneme id of the stress phoneme.

        Raises Exception when no entry qualifies.
        """
        # search for #stress tag in description
        found = [x['id'] for x in self.entries.values()
                 if ('#stress' in x['description']) and ('#stressed' not in x['description'])]
        if len(found) == 0:
            # if not tagged, search for "'" (apostrophe, u+0027) or "ˈ" (primary stress, u+02C8) in phoneme transcription
            found = [x['id'] for x in self.entries.values()
                     if ("'" in x['phoneme']) or ("ˈ" in x['phoneme'])]
        if len(found) == 0:
            raise Exception('No stress phoneme in phonology', self.id)
        return found[0]

    def isSyllableBreak(self, id):
        """Check if an id corresponds to a syllable break (tagged #syllable-break, or with '.' in its phoneme)"""
        entry = self.entries[id]
        return ('#syllable-break' in entry['description']) or ("." in entry['phoneme'])

    def getSyllableBreak(self):
        """Return the phoneme id of the syllable break phoneme.

        Raises Exception when no entry qualifies.
        """
        # search for #syllable-break tag in description
        found = [x['id'] for x in self.entries.values() if '#syllable-break' in x['description']]
        if len(found) == 0:
            # if not tagged, search for '.' (full stop, u+002E) in phoneme transcription
            found = [x['id'] for x in self.entries.values() if '.' in x['phoneme']]
        if len(found) == 0:
            raise Exception('No syllable break phoneme in phonology', self.id)
        return found[0]

    @staticmethod
    def isVowel(phoneme):
        """Check if the first character of the NFD-normalised phoneme is a known vowel symbol"""
        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.vowels)

    @staticmethod
    def isConsonant(phoneme):
        """Check if the first character of the NFD-normalised phoneme is a known consonant symbol"""
        return (len(phoneme) > 0) and (unicodedata.normalize('NFD', phoneme)[0] in Phonology.consonants)

    def isOnset(self, id):
        """Check if an id corresponds to a phoneme that can be in an onset, either from description, or if not available, guessed from the phonemic transcription"""
        entry = self.entries[id]
        description = entry['description']
        result = ((not self.isSyllableBreak(id)) and (not self.isStress(id))
                  and (('#onset' in description) or ('#consonant' in description)))
        # Guess from the transcription only when no syllable-part tag excludes the onset
        if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#coda' not in description):
            result = Phonology.isConsonant(entry['phoneme'])
        return result

    def isNucleus(self, id):
        """Check if an id corresponds to a phoneme that can be in a nucleus, either from description, or if not available, guessed from the phonemic transcription"""
        entry = self.entries[id]
        description = entry['description']
        result = ((not self.isSyllableBreak(id)) and (not self.isStress(id))
                  and (('#nucleus' in description) or ('#vowel' in description)))
        # Guess from the transcription only when no syllable-part tag excludes the nucleus
        if (not result) and ('#consonant' not in description) and ('#onset' not in description) and ('#coda' not in description):
            result = Phonology.isVowel(entry['phoneme'])
        return result

    def isCoda(self, id):
        """Check if an id corresponds to a phoneme that can be in a coda, either from description, or if not available, guessed from the phonemic transcription"""
        entry = self.entries[id]
        description = entry['description']
        result = ((not self.isSyllableBreak(id)) and (not self.isStress(id))
                  and (('#coda' in description) or ('#consonant' in description)))
        # Guess from the transcription only when no syllable-part tag excludes the coda
        if (not result) and ('#vowel' not in description) and ('#nucleus' not in description) and ('#onset' not in description):
            result = Phonology.isConsonant(entry['phoneme'])
        return result

    def isInSingleSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a single syllable, from description"""
        description = self.entries[id]['description']
        # Allowed when tagged #single, #initial or #final, or when not restricted to #middle
        return (('#single' in description) or ('#initial' in description)
                or ('#final' in description) or ('#middle' not in description))

    def isInInitialSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in an initial syllable, from description"""
        description = self.entries[id]['description']
        # Allowed when tagged #initial, or when no position tag is present at all
        return ('#initial' in description) or (
            ('#single' not in description) and ('#middle' not in description) and ('#final' not in description))

    def isInMiddleSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a middle syllable, from description"""
        description = self.entries[id]['description']
        # Allowed when tagged #middle, or when no position tag is present at all
        return ('#middle' in description) or (
            ('#single' not in description) and ('#initial' not in description) and ('#final' not in description))

    def isInFinalSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a final syllable, from description"""
        description = self.entries[id]['description']
        # Allowed when tagged #final, or when no position tag is present at all
        return ('#final' in description) or (
            ('#single' not in description) and ('#initial' not in description) and ('#middle' not in description))

    def isInStressedSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in a stressed syllable, from description"""
        description = self.entries[id]['description']
        return ('#stressed' in description) or ('#unstressed' not in description)

    def isInUnstressedSyllables(self, id):
        """Check if an id corresponds to a phoneme that can be in an unstressed syllable, from description"""
        description = self.entries[id]['description']
        return ('#unstressed' in description) or ('#stressed' not in description)

    def getPhonemesFromTags(self, tags):
        """Return a list of phoneme id verifying the tag list.

        The stress and syllable break phonemes are never returned.
        Raises KeyError on an unknown tag, and Exception when the phonology
        has entries but no stress or no syllable break phoneme.
        """
        tagToPredicate = {
            '#onset': Phonology.isOnset,
            '#nucleus': Phonology.isNucleus,
            '#coda': Phonology.isCoda,
            '#single': Phonology.isInSingleSyllables,
            '#initial': Phonology.isInInitialSyllables,
            '#middle': Phonology.isInMiddleSyllables,
            '#final': Phonology.isInFinalSyllables,
            '#stressed': Phonology.isInStressedSyllables,
            '#unstressed': Phonology.isInUnstressedSyllables
        }
        if not self.entries:
            return []
        # Both ids are the same for every entry: compute them once
        stressId = self.getStress()
        syllableBreakId = self.getSyllableBreak()
        phonemeList = []
        for id in self.entries:
            # skip stress and syllable break
            if (id == stressId) or (id == syllableBreakId):
                continue
            if all(tagToPredicate[t](self, id) for t in tags):
                phonemeList.append(id)
        return phonemeList

    def hasStressedVowels(self):
        """Check if at least one nucleus phoneme can appear in a stressed syllable"""
        return any(self.isNucleus(id) and self.isInStressedSyllables(id) for id in self.entries)
2018-06-09 18:58:46 +02:00
class Distribution:
    """Discrete distribution: a table of value -> number of occurences."""

    def __init__(self):
        self.items = {}  # value -> occurences

    def addTo(self, value, occurences=1):
        """Add ``occurences`` to the count of ``value``, creating it if needed."""
        self.items[value] = self.items.get(value, 0) + occurences

    def pickFrom(self):
        """Return one value picked at random, weighted by its occurences."""
        return random.choices(list(self.items.keys()), list(self.items.values()))[0]

    def toJsonStruct(self, itemRef='value', occurencesRef='occurences'):
        """Return a list of {itemRef: value, occurencesRef: occurences} dicts."""
        return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]

    def fromJsonStruct(self, struct, itemRef='value', occurencesRef='occurences'):
        """Replace the content with the items of a list produced by toJsonStruct."""
        self.items = {item[itemRef]: item[occurencesRef] for item in struct}

    def isEmpty(self):
        """Return True when no value has been added."""
        return not self.items
2018-06-09 18:58:46 +02:00
class Generator:
    """Parent class for all generators.

    Holds the id, the description and the id of the phonology the generator
    refers to. Subclasses set ``isTyped`` and implement ``generateWord``.
    """

    def __init__(self, id='', description='', phonology=''):
        self.id = id
        self.description = description
        self.phonology = phonology
        # Only typed (concrete) generators are considered valid
        self.isTyped = False

    def isValid(self):
        """Return True for a typed generator with a non-empty id."""
        return (self.id != '') and self.isTyped

    def toJsonStruct(self):
        """Convert the common generator fields to a Json structure."""
        return {'id': self.id,
                'description': self.description,
                'phonology': self.phonology}

    def fromJsonStruct(self, struct):
        """Fill the common generator fields from a Json structure."""
        self.id = struct['id']
        self.description = struct['description']
        self.phonology = struct['phonology']

    def generateWord(self):
        """Not available on the abstract generator: always raises NotImplementedError."""
        raise NotImplementedError('Word generation not supported on abstract generator')
2018-06-09 18:58:46 +02:00
class ChainGenerator(Generator):
    """Chains-based generator.

    ``chains`` maps a tuple of the ``order`` previous phoneme ids to the
    Distribution of the ids that may follow. The empty string stands for both
    the start padding and the end of a word.
    """

    def __init__(self, order=1, **kwargs):
        super().__init__(**kwargs)
        self.order = order
        self.chains = {}  # input -> distribution of outputs
        self.isTyped = True

    def toJsonStruct(self):
        """Convert the generator to a Json structure of type 'chains'."""
        struct = super().toJsonStruct()
        struct.update({'type': 'chains',
                       'order': self.order,
                       'chains': [{'input': x,
                                   'possible-outputs': self.chains[x].toJsonStruct(itemRef='value', occurencesRef='occurences')}
                                  for x in self.chains]})
        return struct

    def fromJsonStruct(self, struct):
        """Fill the generator from a Json structure; chains are added to the existing ones."""
        super().fromJsonStruct(struct)
        self.order = struct['order']
        for chainStruct in struct['chains']:
            dist = Distribution()
            dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef='value', occurencesRef='occurences')
            # The input is converted to a tuple so it can be used as a key
            self.chains[tuple(chainStruct['input'])] = dist

    def fromExamples(self, file, phonology):
        """Train a chain generator on an example file.

        Each non-empty row is one word, written as space-separated phoneme ids.
        Raises Exception when the phonology is valid and an id is unknown to it.
        """
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                if len(row) == 0:
                    continue
                row.append('')  # Add terminator element (empty string)
                previous = ('',) * self.order  # Initial sequence (empty strings, length = self.order)
                for item in row:
                    if (item != '') and phonology.isValid() and (not phonology.has(item)):
                        raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                    if previous not in self.chains:
                        self.chains[previous] = Distribution()
                    self.chains[previous].addTo(item)
                    # Slide the window of previous items
                    previous = previous[1:] + (item,)

    def generateWord(self):
        """Return a list of phoneme ids, following the chains until the terminator is picked."""
        outputIdList = []
        previous = ('',) * self.order  # Initial sequence (empty strings, length = self.order)
        while True:
            nextItem = self.chains[previous].pickFrom()
            if nextItem == '':
                return outputIdList
            outputIdList.append(nextItem)
            previous = previous[1:] + (nextItem,)
2018-06-09 18:58:46 +02:00
class RuleGenerator(Generator):
    """Rules-based generator.

    ``rules`` maps a rule id to a Distribution of patterns. A pattern is a
    tuple whose items are either rule ids (expanded recursively) or phoneme
    ids (emitted as they are). Generation starts from the 'word' rule.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.rules = {}  # rule id -> distribution of patterns
        self.isTyped = True

    def toJsonStruct(self):
        """Convert the generator to a Json structure of type 'rules'."""
        struct = super().toJsonStruct()
        struct.update({'type': 'rules',
                       'rules': [{'id': x,
                                  'distribution': self.rules[x].toJsonStruct(itemRef='pattern', occurencesRef='occurences')}
                                 for x in self.rules]})
        return struct

    def fromJsonStruct(self, struct):
        """Fill the generator from a Json structure; rules are added to the existing ones."""
        super().fromJsonStruct(struct)
        for ruleStruct in struct['rules']:
            dist = Distribution()
            # The pattern should be converted from a list to a tuple
            dist.fromJsonStruct([{'pattern': tuple(x['pattern']), 'occurences': x['occurences']}
                                 for x in ruleStruct['distribution']],
                                itemRef='pattern', occurencesRef='occurences')
            self.rules[ruleStruct['id']] = dist

    def generatePattern(self, pattern):
        """Expand a pattern into a list of phoneme ids, replacing each rule id by a pattern picked from that rule."""
        output = []
        for x in pattern:
            if x in self.rules:
                output.extend(self.generatePattern(self.rules[x].pickFrom()))
            else:
                output.append(x)
        return output

    def generateWord(self):
        """Return a list of phoneme ids generated from the 'word' rule."""
        return self.generatePattern(self.rules['word'].pickFrom())

    def processRowFromExample(self, row, stressId, syllableBreakId):
        """Record one example word (a list of ids) in the 'word' and syllable rules.

        The row is cut into syllables at the stress and syllable break ids.
        Rows with more than one stress are reported on stdout and skipped;
        rows without any phoneme are skipped. The rules written to must exist.
        """
        # Check the number of stress
        if row.count(stressId) > 1:
            print("Too much stress in " + str(row) + ": skip the example")
            return
        # Build the syllable list
        syllables = []
        currentSyllable = []
        stressedSyllableIdx = -1
        for x in row:
            # Append to the current syllable if not a syllable separator
            if (x != stressId) and (x != syllableBreakId):
                currentSyllable.append(x)
            # In case of syllable separator, only add the syllable to the list if it is not empty
            elif len(currentSyllable) != 0:
                syllables.append(currentSyllable)
                currentSyllable = []
            # If current id is stress, the stressed syllable is the next one to be completed
            if x == stressId:
                stressedSyllableIdx = len(syllables)
        # After the loop, the current syllable should be non-empty, add it to the list of syllables
        if len(currentSyllable) != 0:
            syllables.append(currentSyllable)
        # A row made only of separators carries no word: do not record an empty pattern
        if len(syllables) == 0:
            return
        # Single syllable case
        if len(syllables) == 1:
            if stressedSyllableIdx == 0:
                self.rules['word'].addTo((stressId, 'single'))
            else:
                self.rules['word'].addTo(('single',))
            self.rules['single'].addTo(tuple(syllables[0]))
            return
        # Other cases
        wordPattern = []
        lastIdx = len(syllables) - 1
        for x, syllable in enumerate(syllables):
            separator = syllableBreakId
            if x == 0:
                rule = 'initial'
            elif x == lastIdx:
                rule = 'final'
            else:
                rule = 'middle'
            if x == stressedSyllableIdx:
                rule = rule + '-stressed'
                separator = stressId
            # only add the syllable separator to the pattern if it's not the first syllable unless it's stressed
            if (separator == stressId) or (x > 0):
                wordPattern.append(separator)
            # Add the rule to the pattern
            wordPattern.append(rule)
            # The syllable is added to the corresponding rule
            self.rules[rule].addTo(tuple(syllable))
        self.rules['word'].addTo(tuple(wordPattern))

    def splitSyllableRule(self, syllableRule, phonology):
        """Replace syllable rules with onset/nucleus/coda pattern"""
        newDist = Distribution()
        oldDist = self.rules[syllableRule]
        # Add onset/nucleus/coda rules
        partRules = [syllableRule + '-onset', syllableRule + '-nucleus', syllableRule + '-coda']
        for partRule in partRules:
            self.rules[partRule] = Distribution()
        # For each pattern, split into onset/nucleus/coda
        for pattern, occurences in oldDist.items.items():
            parts = ([], [], [])  # onset, nucleus, coda
            stage = 0  # index in parts of the element being filled
            for phoneme in pattern:
                # Check if there is a change of element
                if (stage == 0) and phonology.isNucleus(phoneme):
                    stage = 1
                elif (stage == 1) and phonology.isCoda(phoneme):
                    stage = 2
                # Add to the respective list
                parts[stage].append(phoneme)
            # Add to the specific distributions and determine the pattern in new distribution
            distPattern = []
            for partRule, part in zip(partRules, parts):
                if len(part) != 0:
                    distPattern.append(partRule)
                    self.rules[partRule].addTo(tuple(part), occurences)
            # Add patterns to distributions
            newDist.addTo(tuple(distPattern), occurences)
        # Replace the old rules with the new rules
        self.rules[syllableRule] = newDist

    def cleanRules(self):
        """Remove the empty rules"""
        self.rules = {x: dist for x, dist in self.rules.items() if not dist.isEmpty()}

    def fromExamples(self, file, phonology):
        """Train a rule generator on an example file.

        Each non-empty row is one word, written as space-separated phoneme ids.
        Raises Exception when an id is unknown to the phonology.
        """
        stressId = phonology.getStress()
        syllableBreakId = phonology.getSyllableBreak()
        # Words are modelled as lists of syllables, with one of those being stressed (optionally)
        # The syllables are classed among: single (1 syllable words), initial (first syllable), final (last syllable), middle (other syllables)
        # Words are split among 7 rules (single, initial, initial-stressed, final, final-stressed, middle, middle-stressed)
        #
        # Add the 'word' rule, and syllable rules, initialized with an empty distribution
        self.rules['word'] = Distribution()
        syllableRules = ['single', 'initial', 'initial-stressed', 'final', 'final-stressed', 'middle', 'middle-stressed']
        for x in syllableRules:
            self.rules[x] = Distribution()
        # Step 1: open the file and find how words look like
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                if len(row) == 0:
                    continue
                # Check the items in row
                for item in row:
                    if (item != '') and (not phonology.has(item)):
                        raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                # Process the row
                self.processRowFromExample(row, stressId, syllableBreakId)
        # Step 2: Check the syllable rules and split them into onset/nucleus/coda rules
        for x in syllableRules:
            self.splitSyllableRule(x, phonology)
        # Step 3: remove the empty rules
        self.cleanRules()

    @staticmethod
    def randomOccurences(mean, range):
        """Generate a random integer in the range [mean-range, mean+range], both ends included"""
        # random.randint already includes its upper bound
        return random.randint(mean - range, mean + range)

    @staticmethod
    def isStressPosition(position, numberSyllables, stressPosition):
        """Check if a given position is the position of the stress. The position goes from 1 to numberSyllables included."""
        if (stressPosition > 0) and (stressPosition <= numberSyllables):
            return position == stressPosition
        if (stressPosition < 0) and (abs(stressPosition) <= numberSyllables):
            # Negative positions count from the end
            return position == (numberSyllables + 1 + stressPosition)
        # Out of bound stress positions fall back on the last or the first syllable
        if (position == numberSyllables) and (stressPosition > numberSyllables):
            return True
        if (position == 1) and (stressPosition < 0) and (abs(stressPosition) > numberSyllables):
            return True
        return False

    def fromPhonology(self, phonology, minNumberSyllables=1, maxNumberSyllables=4, stressPosition=-2, distributionMean=20, distributionRange=5):
        """
        Generate a rule-based generator just from a phonology and some parameters.
        - minNumberSyllables must be strictly positive.
        - maxNumberSyllables must be greater than or equal to minNumberSyllables
        - stressPosition indicates on which syllable the stress occurs.
          Positive index count from the beginning to the end (with the first syllable being at index 1).
          Negative index count from the end to the beginning (with the last syllable being at index -1)
          Set this to zero if no stress should be generated.
        - distributionMean indicates the medium value for the occurences of a phoneme
        - distributionRange indicates the maximum absolute difference from distributionMean for the occurences of phonemes and syllables

        Raises Exception on inconsistent parameters, or when the phonology has
        no stress or no syllable break phoneme.
        """
        # Reinitialize
        self.phonology = phonology.id
        self.rules = {}
        # Check the parameters
        if maxNumberSyllables < minNumberSyllables:
            raise Exception("Maximum number of syllables", maxNumberSyllables, "must be higher than the minimum number of syllables", minNumberSyllables)
        if maxNumberSyllables < abs(stressPosition):
            raise Exception("Can't set a stress at position", stressPosition, "with a maximum number of syllables", maxNumberSyllables)
        if distributionMean < 1:
            raise Exception("Distribution mean must be strictly positive. Given", distributionMean)
        # Strictly higher, so that no pattern can get zero occurences
        if distributionMean <= distributionRange:
            raise Exception("Distribution mean", distributionMean, "must be strictly higher than distribution range", distributionRange)
        if distributionRange < 0:
            raise Exception("Distribution range must be positive or nul. Given", distributionRange)

        def occurences():
            return RuleGenerator.randomOccurences(distributionMean, distributionRange)

        # Step 1: Generate the word rules based on the min and max number of syllables, and the presence of stress
        stressId = phonology.getStress()
        syllableBreakId = phonology.getSyllableBreak()
        isStressed = (stressPosition != 0) and phonology.hasStressedVowels()
        # Add the 'word' rule, initialized with an empty distribution
        self.rules['word'] = Distribution()
        # Add the syllable rules and word patterns
        syllableRules = []
        syllableRulesToTags = {}
        if minNumberSyllables == 1:
            syllableRules.append('single')
            syllableRulesToTags['single'] = ['#single']
            wordPattern = []
            if isStressed:
                syllableRulesToTags['single'].append('#stressed')
                wordPattern.append(stressId)
            wordPattern.append('single')
            self.rules['word'].addTo(tuple(wordPattern), occurences())
        if maxNumberSyllables > 1:
            syllableRules = syllableRules + ['initial', 'middle', 'final']
            syllableRulesToTags.update({'initial': ['#initial'], 'middle': ['#middle'], 'final': ['#final']})
            if isStressed:
                syllableRules = syllableRules + ['initial-stressed', 'middle-stressed', 'final-stressed']
                syllableRulesToTags.update({'initial-stressed': ['#initial', '#stressed'],
                                            'middle-stressed': ['#middle', '#stressed'],
                                            'final-stressed': ['#final', '#stressed']})
                syllableRulesToTags.update({'initial': ['#initial', '#unstressed'],
                                            'middle': ['#middle', '#unstressed'],
                                            'final': ['#final', '#unstressed']})
            # One word pattern per number of syllables, honouring the requested minimum
            for nbSyllables in range(max(2, minNumberSyllables), maxNumberSyllables + 1):
                wordPattern = []
                for position in range(1, nbSyllables + 1):
                    stressedHere = isStressed and RuleGenerator.isStressPosition(position, nbSyllables, stressPosition)
                    # add syllable separator
                    if stressedHere:
                        wordPattern.append(stressId)
                    elif position > 1:
                        wordPattern.append(syllableBreakId)
                    # add syllable
                    if position == 1:
                        rule = 'initial'
                    elif position == nbSyllables:
                        rule = 'final'
                    else:
                        rule = 'middle'
                    if stressedHere:
                        rule = rule + '-stressed'
                    wordPattern.append(rule)
                self.rules['word'].addTo(tuple(wordPattern), occurences())
        # Step 2: Generate the syllable rules
        # Add the rules in the distributions
        phonemeRules = []
        phonemeRulesToTag = {}
        for syllable in syllableRules:
            self.rules[syllable] = Distribution()
            onset = syllable + '-onset'
            nucleus = syllable + '-nucleus'
            coda = syllable + '-coda'
            phonemeRules = phonemeRules + [onset, nucleus, coda]
            ruleTags = syllableRulesToTags[syllable]
            phonemeRulesToTag.update({onset: ruleTags + ['#onset'],
                                      nucleus: ruleTags + ['#nucleus'],
                                      coda: ruleTags + ['#coda']})
            # Fill the syllable rules
            # For the generated rules, initial and single syllables may not have onset
            if ('#initial' in ruleTags) or ('#single' in ruleTags):
                self.rules[syllable].addTo((nucleus,), occurences())
                self.rules[syllable].addTo((nucleus, coda), occurences())
            self.rules[syllable].addTo((onset, nucleus), occurences())
            self.rules[syllable].addTo((onset, nucleus, coda), occurences())
        # Step 3: Generate the phoneme distributions for each phoneme rule
        for rule in phonemeRules:
            self.rules[rule] = Distribution()
            for phoneme in phonology.getPhonemesFromTags(phonemeRulesToTag[rule]):
                self.rules[rule].addTo((phoneme,), occurences())
        # Step 4: Clean the rules
        self.cleanRules()
2018-06-09 18:58:46 +02:00
# Generator 'type' field of the Json structure -> class to instantiate
generatorTypeToClass = {'chains': ChainGenerator, 'rules': RuleGenerator}


def makeGenerator(struct):
    """Function instantiating a generator from a JSON structure.

    The class is selected from struct['type']; an unknown type yields a plain
    Generator. The instance is then filled with fromJsonStruct.
    """
    generatorClass = generatorTypeToClass.get(struct['type'], Generator)
    generator = generatorClass()
    generator.fromJsonStruct(struct)
    return generator
2018-06-09 03:02:45 +02:00
class PhonagenFile:
    """A phonagen file, with phonologies and generators"""

    def __init__(self):
        self.phonologies = {}  # id -> phonology
        self.generators = {}  # id -> generator

    def addPhonology(self, phonology):
        """Register a phonology under its id; invalid phonologies are ignored."""
        if phonology.isValid():
            self.phonologies[phonology.id] = phonology

    def addGenerator(self, generator):
        """Register a generator under its id; invalid generators are ignored."""
        if generator.isValid():
            self.generators[generator.id] = generator

    def getPhonology(self, id):
        """Return the phonology with the given id (KeyError if unknown)."""
        return self.phonologies[id]

    def getGenerator(self, id):
        """Return the generator with the given id (KeyError if unknown)."""
        return self.generators[id]

    def load(self, file):
        """Load from a JSON file"""
        with open(file, 'r', encoding='utf-8') as inputFile:
            jsonStruct = json.load(inputFile)
        # Load phonologies
        for struct in jsonStruct['phonologies']:
            phonology = Phonology()
            phonology.fromJsonStruct(struct)
            self.addPhonology(phonology)
        # Load generators
        for struct in jsonStruct['generators']:
            self.addGenerator(makeGenerator(struct))

    def writeTo(self, file=''):
        """Output to a JSON file (or stdout when file is empty)"""
        outputStruct = {'phonologies': [x.toJsonStruct() for x in self.phonologies.values()],
                        'generators': [x.toJsonStruct() for x in self.generators.values()]}
        if file == '':
            json.dump(outputStruct, sys.stdout, ensure_ascii=False)
        else:
            with open(file, 'w', encoding='utf-8') as outputFile:
                json.dump(outputStruct, outputFile, ensure_ascii=False)

    def mergeFrom(self, otherFile):
        """Add all phonologies and generators from the other file into this one."""
        for phonology in otherFile.phonologies.values():
            self.addPhonology(phonology)
        for generator in otherFile.generators.values():
            self.addGenerator(generator)

    def generateWord(self, generator=''):
        """Generate a word and return its table of transcription -> string.

        ``generator`` is the id of the generator to use; when empty, one of
        the registered generators is picked at random.
        """
        gen = generator
        if gen == '':
            gen = random.choice(list(self.generators))
        chosen = self.generators[gen]
        idList = chosen.generateWord()
        phonology = self.phonologies[chosen.phonology]
        return phonology.formatWord(idList)