phonagen/py-phonagen/phonology-maker.py

#! /usr/bin/env python3
import argparse
import phonagen
import random

class Stress:
  """Pseudo-phoneme standing for the stress mark."""

  def __init__(self):
    # Maps a transcription name to the written form of this entry
    self.transcriptions = dict()

  def __str__(self):
    """Return the phonemic notation of the mark (U+02C8)."""
    return "\u02C8"

  def getDescription(self):
    """Return the tag string describing this entry."""
    return "#stress"

class SyllableBreak:
  """Pseudo-phoneme standing for the boundary between two syllables."""

  def __init__(self):
    # Maps a transcription name to the written form of this entry
    self.transcriptions = dict()

  def __str__(self):
    """Return the phonemic notation of the break: a single dot."""
    return "."

  def getDescription(self):
    """Return the tag string describing this entry."""
    return "#syllable-break"

###
# Vowels representation and generation
class Vowel:
  """A vowel phoneme placed on a simplified height x backness grid."""

  # IPA symbols of the grid: one row per height, one column per backness
  matrixPhoneme = [
    ["i", "y", "ɨ", "ɯ", "u"], # close
    ["e", "ø", "ə", "ɤ", "o"], # mid close
    ["ɛ", "œ", "ɐ", "ʌ", "ɔ"], # mid open
    ["æ", "ɶ", "a", "ɑ", "ɒ"], # open
  ]
  # Row indices (height)
  close = 0
  midClose = 1
  midOpen = 2
  open = 3
  # Column indices (backness, combined with roundness)
  frontUnrounded = 0
  frontRounded = 1
  central = 2
  backUnrounded = 3
  backRounded = 4

  def __init__(self, height = midClose, backness = central):
    """Create a plain (oral, short, unstressed) vowel at the given grid cell."""
    self.height = height
    self.backness = backness
    # Secondary features, all off by default
    self.isNasal = False
    self.isLong = False
    self.isStressed = False
    # Maps a transcription name to the written form of this vowel
    self.transcriptions = {}

  def __str__(self):
    """Return the IPA notation: grid symbol, then nasal mark, then length mark."""
    symbol = Vowel.matrixPhoneme[self.height][self.backness]
    nasalMark = "\u0303" if self.isNasal else "" # combining tilde
    lengthMark = "ː" if self.isLong else ""
    return symbol + nasalMark + lengthMark

  def clone(self):
    """Return a new vowel with the same position and features.

    The transcriptions of the original are not carried over.
    """
    twin = Vowel(self.height, self.backness)
    for flag in ("isNasal", "isLong", "isStressed"):
      setattr(twin, flag, getattr(self, flag))
    return twin

  def getDescription(self):
    """Return the tag string: "#vowel" followed by the stress tag."""
    stressTag = " #stressed" if self.isStressed else " #unstressed"
    return "#vowel" + stressTag

# Common vowels
# Shared instances attached to the Vowel class; the trailing comment gives
# the IPA symbol found at that cell of Vowel.matrixPhoneme.
Vowel.A = Vowel(Vowel.open, Vowel.central) # a
Vowel.E = Vowel(Vowel.midClose, Vowel.frontUnrounded) # e
Vowel.I = Vowel(Vowel.close, Vowel.frontUnrounded) # i
Vowel.O = Vowel(Vowel.midClose, Vowel.backRounded) # o
Vowel.U = Vowel(Vowel.close, Vowel.backRounded) # u
Vowel.Schwa = Vowel(Vowel.midClose, Vowel.central) # ə
# Less common vowels
Vowel.openO = Vowel(Vowel.midOpen, Vowel.backRounded) # ɔ
Vowel.openE = Vowel(Vowel.midOpen, Vowel.frontUnrounded) # ɛ
Vowel.Y = Vowel(Vowel.close, Vowel.frontRounded) # y
Vowel.W = Vowel(Vowel.close, Vowel.backUnrounded) # ɯ
Vowel.OE = Vowel(Vowel.midClose, Vowel.frontRounded) # ø
Vowel.openOE = Vowel(Vowel.midOpen, Vowel.frontRounded) # œ
Vowel.AE = Vowel(Vowel.open, Vowel.frontUnrounded) # æ
Vowel.AO = Vowel(Vowel.open, Vowel.backRounded) # ɒ

# Distributions of vowel features
# Each distribution is sampled once per generated vowel set (see generateVowelSet).
# NOTE(review): the second argument of addTo looks like a relative weight —
# confirm against phonagen.Distribution.
# Stress: is stress phonemic ? (True: 4, False: 6)
stressDistribution = phonagen.Distribution()
stressDistribution.addTo(True, 4)
stressDistribution.addTo(False, 6)
# Long vowels: is vowel length phonemic ? (True: 2, False: 8)
longVowelDistribution = phonagen.Distribution()
longVowelDistribution.addTo(True, 2)
longVowelDistribution.addTo(False, 8)
# Nasal vowels: is vowel nasality phonemic ? (True: 2, False: 8)
nasalVowelDistribution = phonagen.Distribution()
nasalVowelDistribution.addTo(True, 2)
nasalVowelDistribution.addTo(False, 8)

# Base vowels: random helper used by the vowel set generators below
def pickBoolean():
  """Return True or False, drawn with random.choice from the two values."""
  outcomes = [True, False]
  return random.choice(outcomes)

# Generative functions
def twoVowelSet():
  """Return a tuple of two vowels; a coin flip selects which contrast is used."""
  openCloseContrast = (Vowel.A, Vowel.Schwa)
  frontBackContrast = (Vowel.E, Vowel.O)
  return openCloseContrast if pickBoolean() else frontBackContrast

def threeVowelSet():
  """Return the three vowels at the extremes of the vowel triangle."""
  triangleCorners = (Vowel.A, Vowel.I, Vowel.U)
  return triangleCorners

def fourVowelSet():
  """Return a tuple of four vowels.

  A coin flip selects either the triangle corners plus the central vowel,
  or a front/back pair at each of two heights.
  """
  if pickBoolean():
    # Extremes of the triangle + central vowel
    return (Vowel.A, Vowel.I, Vowel.U, Vowel.Schwa)
  # The two heights are two rows apart: close/midOpen or midClose/open
  upper = random.choice([Vowel.close, Vowel.midClose])
  lower = upper + 2
  return tuple(Vowel(height, backness)
               for height in (upper, lower)
               for backness in (Vowel.frontUnrounded, Vowel.backRounded))

def fiveVowelSet():
  """Return a tuple of five vowels.

  A coin flip selects either the five common vowels, or the central vowel
  followed by a front/back pair at each of two heights.
  """
  if pickBoolean():
    return (Vowel.A, Vowel.E, Vowel.I, Vowel.O, Vowel.U)
  # The two heights are two rows apart: close/midOpen or midClose/open
  upper = random.choice([Vowel.close, Vowel.midClose])
  lower = upper + 2
  peripheral = tuple(Vowel(height, backness)
                     for height in (upper, lower)
                     for backness in (Vowel.frontUnrounded, Vowel.backRounded))
  return (Vowel.Schwa,) + peripheral

def sixVowelSet():
  """Return a tuple of six vowels; a coin flip selects one of two inventories."""
  withCentral = (Vowel.A, Vowel.E, Vowel.I, Vowel.O, Vowel.U, Vowel.Schwa)
  frontBackOnly = (Vowel.I, Vowel.U, Vowel.E, Vowel.O, Vowel.AE, Vowel.AO)
  return withCentral if pickBoolean() else frontBackOnly

def sevenVowelSet():
  """Return a tuple of seven vowels; a coin flip selects one of two inventories."""
  if pickBoolean():
    # Five common vowels + the two mid open ones
    return (Vowel.A, Vowel.E, Vowel.I, Vowel.O, Vowel.U,
            Vowel.openE, Vowel.openO)
  # Three front/back pairs + the central vowel
  return (Vowel.I, Vowel.U, Vowel.E, Vowel.O, Vowel.AE, Vowel.AO,
          Vowel.Schwa)

def eightVowelSet():
  """Return a tuple of eight vowels; one of three inventories is drawn at random."""
  variant = random.randrange(3)
  if variant == 0:
    return (Vowel.A, Vowel.E, Vowel.I, Vowel.O, Vowel.U,
            Vowel.openE, Vowel.openO, Vowel.Schwa)
  if variant != 1:
    # Four heights, front unrounded and back rounded
    return (Vowel.I, Vowel.E, Vowel.openE, Vowel.AE,
            Vowel.U, Vowel.O, Vowel.openO, Vowel.AO)
  # Six fixed vowels + two vowels sharing a randomly chosen column,
  # one in the upper half of the grid and one in the lower half
  column = random.choice([Vowel.central, Vowel.backUnrounded])
  lowerHeight = random.choice([Vowel.midOpen, Vowel.open])
  upperHeight = random.choice([Vowel.midClose, Vowel.close])
  return (Vowel.I, Vowel.E, Vowel.Y, Vowel.openOE,
          Vowel.U, Vowel.O,
          Vowel(upperHeight, column), Vowel(lowerHeight, column))

def nineVowelSet():
  """Return a tuple of nine vowels; a coin flip selects one of two inventories."""
  if pickBoolean():
    # Four heights front and back + the central vowel
    frontSeries = (Vowel.I, Vowel.E, Vowel.openE, Vowel.AE)
    backSeries = (Vowel.U, Vowel.O, Vowel.openO, Vowel.AO)
    return frontSeries + backSeries + (Vowel.Schwa,)
  # Three series of three vowels: front, central, back
  frontSeries = (Vowel.I, Vowel.E, Vowel.openE)
  centralSeries = (Vowel(Vowel.close, Vowel.central), Vowel.Schwa, Vowel.A)
  backSeries = (Vowel.U, Vowel.O, Vowel.openO)
  return frontSeries + centralSeries + backSeries

def tenVowelSet():
  """Return a tuple of ten vowels; a coin flip selects one of two inventories."""
  if pickBoolean():
    frontUnroundedSeries = (Vowel.I, Vowel.E, Vowel.openE)
    frontRoundedSeries = (Vowel.Y, Vowel.OE, Vowel.openOE)
    backSeries = (Vowel.U, Vowel.O, Vowel.openO)
    return frontUnroundedSeries + frontRoundedSeries + (Vowel.A,) + backSeries
  # Two rows of five vowels, one per backness column
  upperRow = (Vowel.I, Vowel.Y, Vowel.Schwa, Vowel.W, Vowel.U)
  lowerRow = (Vowel.E, Vowel.openOE, Vowel.A,
              Vowel(Vowel.midOpen, Vowel.backUnrounded), Vowel.O)
  return upperRow + lowerRow

def elevenVowelSet():
  """Return a tuple of eleven vowels; a coin flip selects one of two inventories."""
  frontRoundedSeries = (Vowel.Y, Vowel.OE, Vowel.openOE)
  if pickBoolean():
    # Three heights in three series + two central vowels
    return ((Vowel.I, Vowel.E, Vowel.openE)
            + frontRoundedSeries
            + (Vowel.A, Vowel.Schwa)
            + (Vowel.U, Vowel.O, Vowel.openO))
  # Four heights front and back + the front rounded series
  return ((Vowel.I, Vowel.E, Vowel.openE, Vowel.AE)
          + frontRoundedSeries
          + (Vowel.U, Vowel.O, Vowel.openO, Vowel.AO))

# Distribution
# Holds the generator functions themselves, not their results: the picked
# function is called afterwards to build the base vowel set (see generateVowelSet).
# NOTE(review): the second argument of addTo looks like a relative weight —
# confirm against phonagen.Distribution.
baseVowelDistribution = phonagen.Distribution()
baseVowelDistribution.addTo(twoVowelSet, 2)
baseVowelDistribution.addTo(threeVowelSet, 6)
baseVowelDistribution.addTo(fourVowelSet, 8)
baseVowelDistribution.addTo(fiveVowelSet, 10)
baseVowelDistribution.addTo(sixVowelSet, 8)
baseVowelDistribution.addTo(sevenVowelSet, 8)
baseVowelDistribution.addTo(eightVowelSet, 6)
baseVowelDistribution.addTo(nineVowelSet, 4)
baseVowelDistribution.addTo(tenVowelSet, 2)
baseVowelDistribution.addTo(elevenVowelSet, 2)


def generateVowelSet():
  """Generate the list of vowels of a phonology.

  Three features (stress, length, nasality) are drawn first, then a base
  vowel set. Each base vowel is followed in the result by its variants:
  long and/or stressed ones (each with a nasal twin when nasality is
  phonemic), then the plain nasal one.
  """
  # Language features; the draw order is significant for the random stream
  stressPhonemic = stressDistribution.pickFrom()
  lengthPhonemic = longVowelDistribution.pickFrom()
  nasalPhonemic = nasalVowelDistribution.pickFrom()
  # The distribution yields a generator function, called right away
  baseVowels = baseVowelDistribution.pickFrom()()
  # When both stress and length are phonemic, a coin flip decides whether
  # they are merged into a single "long stressed" variant
  longCarriesStress = bool(stressPhonemic and lengthPhonemic and pickBoolean())

  def derive(source, **flags):
    """Clone a vowel and set the given feature flags on the copy."""
    copy = source.clone()
    for name, value in flags.items():
      setattr(copy, name, value)
    return copy

  def withNasalTwin(variant):
    """Return the variant, followed by its nasal twin if nasality is phonemic."""
    if nasalPhonemic:
      return [variant, derive(variant, isNasal=True)]
    return [variant]

  vowels = []
  for base in baseVowels:
    vowels.append(base)
    if longCarriesStress:
      vowels.extend(withNasalTwin(derive(base, isStressed=True, isLong=True)))
    else:
      if lengthPhonemic:
        vowels.extend(withNasalTwin(derive(base, isLong=True)))
      if stressPhonemic:
        vowels.extend(withNasalTwin(derive(base, isStressed=True)))
    if nasalPhonemic:
      vowels.append(derive(base, isNasal=True))
  return vowels

###
# Consonants representation and generation
class Consonant:
  """Consonant representation.

  A consonant is a cell of a simplified manner x place grid, plus a set of
  boolean secondary features (phonation and secondary articulation).
  """

  # Simplified model: one row per manner, one column per place
  # '%' means impossible articulation
  # NOTE(review): a few cells repeat a symbol (e.g. "l" for both the dental
  # and alveolar laterals), so distinct consonants can share a string form.
  matrixPhonemes = [
    ["m", "ɱ", "n", "ɳ", "ɲ", "ŋ", "ɴ", "%"], # nasal
    ["p", "p̪", "t", "ʈ", "c", "k", "q", "ʔ"], # stop voiceless
    ["b", "b̪", "d", "ɖ", "ɟ", "ɡ", "ɢ", "%"], # stop voiced
    ["ɓ", "ɓ̪", "ɗ", "ᶑ", "ʄ", "ɠ", "ʛ", "%"], # stop implosive
    ["pf", "tθ", "ts", "ʈʂ", "tʃ", "kx", "qχ", "ʔh"], # affricate voiceless
    ["bv", "dð", "dz", "ɖʐ", "dʒ", "ɡɣ", "ɢʁ", "ʡʕ" ], # affricate voiced
    ["f", "θ", "s", "ʂ", "ʃ", "x", "χ", "h"], # fricative voiceless
    ["v", "ð", "z", "ʐ", "ʒ", "ɣ", "ʁ", "ɦ"], # fricative voiced
    ["β", "ʋ", "ɹ", "ɻ", "j", "w", "ʁ", "ʕ"], # approximant
    ["ⱱ", "ⱱ", "ɾ", "ɽ", "%", "%", "ɢ̆", "ʡ̮"], # tap/flap
    ["ʙ", "ʙ̪", "r", "ɽr", "%", "%", "ʀ", "ʢ"], # trill
    ["%", "l", "l", "ɭ", "ʎ", "ʟ", "ʟ̠", "%"], # lateral
    ["ʘ", "ǀ", "ǃ", "ǁ", "ǂ", "ʞ", "%", "%"], # click
    ]

  # left>right: place of articulation (column index):
  labial = 0
  dental = 1
  alveolar = 2
  retroflex = 3
  palatal = 4
  velar = 5
  uvular = 6
  glottal = 7
  # top>bottom: manner (row index)
  nasal = 0
  stopVoiceless = 1
  stopVoiced = 2
  implosive = 3
  affricateVoiceless = 4
  affricateVoiced = 5
  fricativeVoiceless = 6
  fricativeVoiced = 7
  approximant = 8
  tapFlap = 9
  trill = 10
  lateral = 11
  click = 12

  def __init__(self, manner = stopVoiceless, place = alveolar):
    """Constructor: a plain consonant (no secondary feature) at the given cell.

    Note the argument order: manner (row) first, then place (column).
    """
    # Primary features
    self.place = place
    self.manner = manner
    # Secondary feature
    # Phonation
    self.isEjective = False
    self.isAspirated = False # or murmured, for voiced
    self.isGlotalized = False
    # Secondary articulation
    self.isLabialized = False
    self.isPalatalized = False
    self.isVelarized = False
    self.isPharyngealized = False
    # Maps a transcription name to the written form of this consonant
    self.transcriptions = {}

  def __str__(self):
    """To String operator: Get the phoneme representation in IPA.

    The grid symbol is followed by one modifier letter per active secondary
    feature, always in the same order.
    """
    result = Consonant.matrixPhonemes[self.manner][self.place]
    if self.isEjective:
      result = result + "ʼ"
    if self.isAspirated:
      result = result + "ʰ"
    if self.isGlotalized:
      result = result + "ˀ"
    if self.isLabialized:
      result = result + "ʷ"
    if self.isPalatalized:
      result = result + "ʲ"
    if self.isVelarized:
      result = result + "ˠ"
    if self.isPharyngealized:
      result = result + "ˤ"
    #
    return result

  def clone(self):
    """Clone the consonant.

    Place, manner and every secondary feature are copied; the transcriptions
    of the original are not carried over.
    """
    result = Consonant()
    result.place = self.place
    result.manner = self.manner
    result.isEjective = self.isEjective
    result.isAspirated = self.isAspirated
    result.isGlotalized = self.isGlotalized
    result.isLabialized = self.isLabialized
    result.isPalatalized = self.isPalatalized
    result.isVelarized = self.isVelarized
    result.isPharyngealized = self.isPharyngealized
    return result

  def getDescription(self):
    """Return the tag string describing this entry."""
    return "#consonant"

  # Declared static: the function takes no instance, and without the
  # decorator a call through an instance would pass that instance as 'manner'.
  @staticmethod
  def isPossible(manner, place):
    """Tell whether the grid holds a real symbol (not '%') for this cell."""
    return '%' != Consonant.matrixPhonemes[manner][place]

# Distributions of consonant features, each sampled once per generated
# consonant set (see generateConsonantSet).
# NOTE(review): the second argument of addTo looks like a relative weight —
# confirm against phonagen.Distribution.
# Has retroflex consonants ?
retroflexDistribution = phonagen.Distribution()
retroflexDistribution.addTo(True, 5)
retroflexDistribution.addTo(False, 15)
# Has glottal consonants ?
glottalDistribution = phonagen.Distribution()
glottalDistribution.addTo(True, 2)
glottalDistribution.addTo(False, 18)
# Has uvular consonants ?
uvularDistribution = phonagen.Distribution()
uvularDistribution.addTo(True, 2)
uvularDistribution.addTo(False, 18)
# Has dental consonants ?
dentalDistribution = phonagen.Distribution()
dentalDistribution.addTo(True, 1)
dentalDistribution.addTo(False, 19)

# Are the affricates distinguished from stops ?
affricateDistribution = phonagen.Distribution()
affricateDistribution.addTo(True, 2)
affricateDistribution.addTo(False, 18)
# Are voiced distinguished from unvoiced ?
voicedDistribution = phonagen.Distribution()
voicedDistribution.addTo(True, 15)
voicedDistribution.addTo(False, 5)
# Has click ?
clickDistribution = phonagen.Distribution()
clickDistribution.addTo(True, 1)
clickDistribution.addTo(False, 69)

# Rhotic realisation
# The value is either False (no rhotic) or the manner used for the rhotic
rhoticRealisationDistribution = phonagen.Distribution()
rhoticRealisationDistribution.addTo(False, 10)
rhoticRealisationDistribution.addTo(Consonant.tapFlap, 30)
rhoticRealisationDistribution.addTo(Consonant.trill, 30)
rhoticRealisationDistribution.addTo(Consonant.approximant, 40)
rhoticRealisationDistribution.addTo(Consonant.fricativeVoiced, 20)

# Is aspiration phonemic ?
aspirationDistribution = phonagen.Distribution()
aspirationDistribution.addTo(True, 6)
aspirationDistribution.addTo(False, 14)

# TODO: other stuff ?


def generateConsonantSet():
  """Generate a set of consonants for a phonology.

  Language-wide features (available places, voicing, affricates, clicks,
  rhotic realisation, aspiration) are drawn first. The result is then built
  place by place and manner by manner, followed by an optional lateral per
  place and, if still missing, the rhotic.

  Returns:
    A list of Consonant instances.
  """
  # Places features
  hasRetroflex = retroflexDistribution.pickFrom()
  hasGlottal = glottalDistribution.pickFrom()
  hasUvular = uvularDistribution.pickFrom()
  hasDental = dentalDistribution.pickFrom()

  # Places of articulation
  # Minimal set
  places = [Consonant.labial, Consonant.alveolar, Consonant.palatal, Consonant.velar]
  # Add the other positions
  if hasRetroflex:
    places.append(Consonant.retroflex)
  if hasDental:
    places.append(Consonant.dental)
  if hasGlottal:
    places.append(Consonant.glottal)
  if hasUvular:
    places.append(Consonant.uvular)

  # Nominal place is alveolar: this place will get all the possible manners
  # Other places will be more limited
  nominalPlace = Consonant.alveolar

  # Manner features
  hasVoiced = voicedDistribution.pickFrom()
  hasSeparateAffricates = affricateDistribution.pickFrom()
  hasClick = clickDistribution.pickFrom()
  rhoticRealisation = rhoticRealisationDistribution.pickFrom()
  hasAspirated = aspirationDistribution.pickFrom()

  # rhoticRealisation is either False or a manner index. Test it explicitly:
  # False == 0 in Python, and 0 is also the index of the nasal manner.
  hasRhotic = bool(rhoticRealisation)
  # The rhotic sits on the nominal place, except the fricative one (uvular)
  if rhoticRealisation == Consonant.fricativeVoiced:
    rhoticPlace = Consonant.uvular
  else:
    rhoticPlace = nominalPlace

  # Minimal set of manners
  manners = [Consonant.nasal, Consonant.stopVoiceless, Consonant.fricativeVoiceless, Consonant.approximant]
  if hasSeparateAffricates:
    manners.append(Consonant.affricateVoiceless)
  if hasVoiced:
    manners = manners + [Consonant.stopVoiced, Consonant.fricativeVoiced]
    if hasSeparateAffricates:
      manners.append(Consonant.affricateVoiced)
  if hasClick:
    manners.append(Consonant.click)

  # Generate the set of consonants
  result = []
  rhoticAdded = False
  for pl in places:
    for mn in manners:
      # there is a small chance that a phoneme not on the nominal place will be skipped
      if Consonant.isPossible(mn, pl) and ((pl == nominalPlace) or (random.randrange(8) != 0)):
        cons = Consonant(mn, pl)
        # there may be some modifications on the manner or place depending on how contrastive are the consonants
        if (not hasSeparateAffricates) and (pl == Consonant.palatal) and (random.randrange(10) < 8):
          if (mn == Consonant.stopVoiceless):
            cons.manner = Consonant.affricateVoiceless
          elif (mn == Consonant.stopVoiced):
            cons.manner = Consonant.affricateVoiced
        # TODO : other common modifications
        result.append(cons)
        # Rhotic added ? The flag is only ever raised, never reset: a later
        # place with the same manner must not hide that the rhotic is
        # already present (it would then be appended a second time below).
        if hasRhotic and (mn == rhoticRealisation) and (pl == rhoticPlace):
          rhoticAdded = True
        # Aspirated consonants
        if hasAspirated and (mn >= Consonant.stopVoiceless) and (mn <= Consonant.fricativeVoiced):
          asp = cons.clone()
          asp.isAspirated = True
          result.append(asp)
    # lateral: likely on the nominal place, rare elsewhere, and never on a
    # place where the grid marks the lateral as an impossible articulation
    if Consonant.isPossible(Consonant.lateral, pl) and (((pl == nominalPlace) and (random.randrange(6) != 0)) or (random.randrange(20) == 0)):
      lat = Consonant(Consonant.lateral, pl)
      result.append(lat)

  # rhotic: add it if the loop above did not already produce it
  if hasRhotic and (not rhoticAdded):
    result.append(Consonant(rhoticRealisation, rhoticPlace))
  #
  return result


###
# Transcriptions
def addSimpleLatinTranscription(transcriptions, phonemeList):
  """Register the 'simple-latin' transcription and fill it for each phoneme.

  'simple-latin' is appended to the transcriptions list, and every phoneme of
  phonemeList gets a 'simple-latin' entry in its own transcriptions dict.
  Phonemes that are neither Vowel nor Consonant get an empty string.
  """
  transcriptions.append('simple-latin')
  # Same layout as Vowel.matrixPhoneme: rows are heights, columns backness
  vowelTable = [
    ["i", "ú", "ï", "í", "u"], # close
    ["e", "ê", "ë", "o", "o"], # mid close
    ["é", "ê", "ä", "ó", "ó"], # mid open
    ["á", "a", "a", "a", "â"], # open
  ]
  # Same layout as Consonant.matrixPhonemes: rows are manners, columns places
  # NOTE(review): the voiced dental stop reuses "ṕ" from the voiceless row —
  # confirm this is intended.
  consonantTable = [
    ["m", "ḿ", "n", "ň", "ñ", "ǹ", "ń", "ń"], # nasal
    ["p", "ṕ", "t", "ť", "c", "k", "q", "q"], # stop voiceless
    ["b", "ṕ", "d", "ď", "j", "g", "ǵ", "ǵ"], # stop voiced
    ["b'", "b'", "d'", "ď'", "j'", "g'", "ǵ'", "ǵ'"], # stop implosive
    ["pf", "tŝ", "ts", "tš", "tś", "kx", "qẍ", "qh"], # affricate voiceless
    ["bv", "dẑ", "dz", "dž", "dź", "ǵĝ", "ǵr", "ǵh" ], # affricate voiced
    ["f", "ŝ", "s", "š", "ś", "x", "ẍ", "h"], # fricative voiceless
    ["v", "ẑ", "z", "ž", "ź", "ĝ", "r", "h"], # fricative voiced
    ["v", "v", "r", "ř", "y", "w", "r", "h"], # approximant
    ["ṽ", "ṽ", "r", "ř", "r", "gy", "gr", "hg"], # tap/flap
    ["br", "br", "rr", "řr", "ry", "ŕr", "ŕr", "hŕ"], # trill
    ["l", "l", "l", "ľ", "ly", "ĺl", "ĺl", "ĺl"], # lateral
    ["p*", "ṕ*", "t*", "ť*", "c*", "k*", "q*", "q*"], # click
  ]
  # One nasal marker for the whole phonology:
  # combining ogonek, combining tilde below, or a plain n
  nasalMark = random.choice(["\u0328", "\u0330", "n"])
  for phoneme in phonemeList:
    if isinstance(phoneme, Vowel):
      written = vowelTable[phoneme.height][phoneme.backness]
      if phoneme.isLong:
        written = written * 2 # long vowels are doubled
      if phoneme.isNasal:
        written = written + nasalMark
    elif isinstance(phoneme, Consonant):
      written = consonantTable[phoneme.manner][phoneme.place]
      if phoneme.isAspirated:
        written = written + "h"
    else:
      written = ""
    phoneme.transcriptions['simple-latin'] = written

def makePhonology(id, description):
  """Build a phonagen.Phonology filled with a randomly generated phoneme set.

  The phoneme list holds the stress mark and the syllable break first, then
  the generated vowels, then the generated consonants. Each phoneme becomes
  one entry of the phonology, keyed by the entry id.
  """
  phonology = phonagen.Phonology(id = id, description = description)
  # Step 0: stress, syllable break
  phonemes = [Stress(), SyllableBreak()]
  # Step 1: vowels
  phonemes = phonemes + generateVowelSet()
  # Step 2: consonants
  phonemes = phonemes + generateConsonantSet()
  # Step 3: transcriptions; 'phoneme' is filled from the IPA notation below
  transcriptionNames = ['phoneme']
  addSimpleLatinTranscription(transcriptionNames, phonemes)
  phonology.transcriptions = transcriptionNames
  # TODO: change this
  phonology.mainTranscription = 'simple-latin'
  # Step 4: one phonology entry per phoneme
  for phoneme in phonemes:
    ipa = str(phoneme)
    # Stressed vowels get a leading apostrophe in their id
    stressed = isinstance(phoneme, Vowel) and phoneme.isStressed
    entryId = ("'" + ipa) if stressed else ipa
    entry = {'id': entryId, 'description': phoneme.getDescription(), 'phoneme': str(phoneme)}
    for name, written in phoneme.transcriptions.items():
      entry.update({name: written})
    phonology.entries.update({entry['id']: entry})
  return phonology


def parseArgs():
  """Parse the command line and return the argparse namespace.

  Options: --id (required), --description and --output (both default to '').
  """
  parser = argparse.ArgumentParser(description='Make a new phonology.')
  # (flag, settings) pairs, registered in this order
  optionSpecs = (
    ('--id', dict(metavar='id', help='id of the phonology', required = True)),
    ('--description', dict(metavar='description', help='description of the phonology; empty if not provided', default='')),
    ('--output', dict(metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')),
  )
  for flag, settings in optionSpecs:
    parser.add_argument(flag, **settings)
  return parser.parse_args()

# Main
# Script entry point: generate one phonology from the command-line options,
# wrap it in a phonagen file and write that file out.
if __name__ == '__main__':
  args = parseArgs()
  phonology = makePhonology(args.id, args.description)
  outputFile = phonagen.PhonagenFile()
  outputFile.addPhonology(phonology)
  # args.output is '' when --output is not given; per the option's help text
  # the file is then printed to standard output
  outputFile.writeTo(args.output)