235 lines
8.6 KiB
Python
235 lines
8.6 KiB
Python
"""Common functions and classes for phonagen tools"""
|
|
import json
|
|
import io
|
|
import sys
|
|
import csv
|
|
import random
|
|
|
|
class Phonology:
    """A phonology: a set of entries, each carrying one or more transcriptions.

    Attributes:
        id: identifier of the phonology; '' means "not set" (see isValid).
        description: free-text description.
        transcriptions: names of the transcription columns.
        mainTranscription: name of the transcription used as the main one.
        entries: dict mapping an entry id to the entry (itself a dict).
    """

    def __init__(self, id = '', description = '', mainTranscription = ''):
        self.id = id
        self.description = description
        self.transcriptions = []
        self.mainTranscription = mainTranscription
        self.entries = {} # id -> entry

    def isValid(self):
        """Return True when the phonology has a non-empty id"""
        return self.id != ''

    def has(self, id):
        """Return True when an entry with this id exists"""
        return id in self.entries

    def toJsonStruct(self):
        """Convert a Phonology to a Json structure"""
        return { 'id': self.id,
                 'description': self.description,
                 'transcriptions': self.transcriptions,
                 'main-transcription': self.mainTranscription,
                 'entries': list(self.entries.values()) }

    def fromJsonStruct(self, struct):
        """Fill a Phonology from a Json structure

        Raises KeyError when one of the expected keys is missing.
        """
        self.id = struct['id']
        self.description = struct['description']
        self.transcriptions = struct['transcriptions']
        self.mainTranscription = struct['main-transcription']
        self.entries = {x['id']: x for x in struct['entries']}

    def fromCsv(self, file):
        """Fill a Phonology from a Csv file

        The first row is the header. Every column other than 'id' and
        'description' is a transcription; a 'phoneme' column and at least one
        other transcription are required. When mainTranscription or id were
        left empty, they are guessed from the header. Rows whose phoneme and
        main transcription are both empty are skipped.

        Raises Exception when the file has no header, when a required column
        is missing, or when mainTranscription is not a column of the file.
        """
        # newline='' is what the csv module expects from its input file;
        # utf-8 matches the encoding used for the JSON files of this module
        with open(file, encoding='utf-8', newline='') as csvfile:
            fileReader = csv.reader(csvfile)
            # get csv header (an empty file has none)
            header = next(fileReader, None)
            if header is None:
                raise Exception('No header found in ', file)
            # get the transcriptions (header items not id or description)
            self.transcriptions = [x for x in header if x not in ['id', 'description']]
            # Check: self.transcriptions should contain 'phoneme'
            if 'phoneme' not in self.transcriptions:
                raise Exception('phoneme column not found in ', file)
            # Check: there should be at least one transcription besides phoneme
            otherTranscriptions = [x for x in self.transcriptions if x != 'phoneme']
            if not otherTranscriptions:
                raise Exception('No transcription found outside phoneme in file ', file, 'Did you named it id or description ?')
            # If main-transcription was not given on the command line, use the
            # first header item which is not one of those: id, description, phoneme
            if self.mainTranscription == '':
                self.mainTranscription = otherTranscriptions[0]
            # Check: self.mainTranscription should be in self.transcriptions
            if self.mainTranscription not in self.transcriptions:
                raise Exception('main-transcription', self.mainTranscription, 'not in list of transcriptions')
            # If id was not given on the command line, use the mainTranscription as the id
            if self.id == '':
                self.id = self.mainTranscription
            generateId = 'id' not in header
            addDescription = 'description' not in header
            # parse entries
            for row in fileReader:
                # All absent elements are set to ''; cells beyond the header are ignored
                padded = row + [''] * (len(header) - len(row))
                entry = dict(zip(header, padded))
                # if both phoneme and main-transcription are empty, skip the row
                if (entry['phoneme'] == '') and (entry[self.mainTranscription] == ''):
                    continue
                # if id is not provided, generate it
                if generateId:
                    entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
                # if description is not provided, add an empty one
                if addDescription:
                    entry['description'] = ''
                self.entries[entry['id']] = entry
|
|
|
|
class Distribution:
    """Discrete distribution: each value is stored with its number of occurences"""

    def __init__(self):
        self.items = {} # value -> occurences

    def addTo(self, value, occurences = 1):
        """Add occurences to the count of value, creating the value if needed"""
        self.items[value] = self.items.get(value, 0) + occurences

    def pickFrom(self):
        """Return one value picked at random, weighted by its occurences

        The call is delegated to random.choices, which raises when the
        distribution is empty.
        """
        # values() must be called: random.choices needs the weights themselves
        return random.choices(list(self.items.keys()), list(self.items.values()))[0]

    def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
        """Convert to a list of {itemRef: value, occurencesRef: count} dicts"""
        return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]

    def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
        """Replace the content with the items of a list built like toJsonStruct's output"""
        self.items = {item[itemRef]: item[occurencesRef] for item in struct}
|
|
|
|
class Generator:
    """Base class from which every generator kind derives"""

    def __init__(self, id = '', description = '', phonology = ''):
        # The bare base class carries no generator type; subclasses in this
        # file set isTyped to True in their own constructor
        self.isTyped = False
        self.phonology = phonology
        self.description = description
        self.id = id

    def isValid(self):
        """A generator is valid once it has a non-empty id and a type"""
        hasId = self.id != ''
        if not hasId:
            return False
        return self.isTyped

    def toJsonStruct(self):
        """Build the Json structure holding the fields common to all generators"""
        struct = {}
        struct['id'] = self.id
        struct['description'] = self.description
        struct['phonology'] = self.phonology
        return struct

    def fromJsonStruct(self, struct):
        """Read the fields common to all generators from a Json structure"""
        for field in ('id', 'description', 'phonology'):
            setattr(self, field, struct[field])
|
|
|
|
class ChainGenerator(Generator):
    """Chains-based generator

    Attributes:
        order: number of preceding items forming the input of a chain.
        chains: dict mapping an input (tuple of `order` items) to the
            Distribution of the items seen right after it. The empty string
            stands both for "before the first item" and for the terminator.
    """

    def __init__(self, order = 1, **kwargs):
        super().__init__(**kwargs)
        self.order = order
        self.chains = {} # input -> distribution of outputs
        self.isTyped = True

    def toJsonStruct(self):
        """Convert to a Json structure (common fields plus type, order and chains)"""
        struct = super().toJsonStruct()
        struct.update({'type': 'chains',
                       'order': self.order,
                       'chains': [{'input': chainInput,
                                   'possible-outputs': dist.toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')}
                                  for chainInput, dist in self.chains.items()]})
        return struct

    def fromJsonStruct(self, struct):
        """Fill from a Json structure; the chains read are added to self.chains"""
        super().fromJsonStruct(struct)
        self.order = struct['order']
        for chainStruct in struct['chains']:
            dist = Distribution()
            dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
            chainInput = chainStruct['input']
            # JSON has no tuple: an input written by toJsonStruct comes back as
            # a list, which cannot be a dict key, so restore the tuple form
            if isinstance(chainInput, list):
                chainInput = tuple(chainInput)
            self.chains[chainInput] = dist

    def fromExamples(self, file, phonology):
        """Train a chain generator on an example file

        Each non-empty line of the file is one example: a space-separated
        sequence of items. When phonology is valid, every item must be the id
        of one of its entries, otherwise an Exception is raised.
        """
        checkItems = phonology.isValid()
        # utf-8 matches the encoding used for the JSON files of this module
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                # The empty string is reserved for the terminator: drop the
                # empty items a trailing space would otherwise produce
                row = [item for item in row if item != ""]
                if len(row) == 0:
                    continue
                row.append("") # Add terminator element (empty string)
                previous = tuple("" for i in range(self.order)) # Initial sequence (empty strings, length = self.order)
                for item in row:
                    if (item != "") and checkItems and (not phonology.has(item)):
                        raise Exception('In row', row, ':', item, 'is not an id in phonology', phonology.id)
                    if previous not in self.chains:
                        self.chains[previous] = Distribution()
                    self.chains[previous].addTo(item)
                    # Slide the window; slicing after appending keeps the
                    # length equal to self.order even when the order is 0
                    previous = (previous + (item,))[1:]
|
|
|
|
class RuleGenerator(Generator):
    """Rules-based generator

    The rules themselves are not serialized yet (see the TODO notes below).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.rules = {}
        self.isTyped = True

    def toJsonStruct(self):
        """Convert to a Json structure (common fields plus the type tag)"""
        struct = super().toJsonStruct()
        # The type tag is what makeGenerator reads to pick the class back on
        # load; 'rules' is the key registered for this class in generatorTypeToClass
        struct.update({'type': 'rules'})
        # TODO: add rules
        return struct

    def fromJsonStruct(self, struct):
        """Fill from a Json structure (only the common fields for now)"""
        super().fromJsonStruct(struct)
        # TODO: rules
|
|
|
|
# Value of the 'type' field of a generator structure -> class to instantiate
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }

def makeGenerator(struct):
    """Function instanciating a generator from a JSON structure

    The class is chosen from struct['type']. An unknown type, or a structure
    without a 'type' field (such as the one the base Generator produces),
    gives a base Generator, which is untyped and therefore not valid.
    """
    generatorClass = generatorTypeToClass.get(struct.get('type'), Generator)
    generator = generatorClass()
    generator.fromJsonStruct(struct)
    return generator
|
|
|
|
class PhonagenFile:
    """In-memory content of a phonagen file: phonologies and generators, both indexed by id"""

    def __init__(self):
        self.phonologies = {} # id -> phonology
        self.generators = {} # id -> generator

    def addPhonology(self, phonology):
        """Register a phonology under its id; an invalid one is ignored"""
        if not phonology.isValid():
            return
        self.phonologies[phonology.id] = phonology

    def addGenerator(self, generator):
        """Register a generator under its id; an invalid one is ignored"""
        if not generator.isValid():
            return
        self.generators[generator.id] = generator

    def getPhonology(self, id):
        """Return the phonology registered under id (KeyError when absent)"""
        return self.phonologies[id]

    def getGenerator(self, id):
        """Return the generator registered under id (KeyError when absent)"""
        return self.generators[id]

    def load(self, file):
        """Read a JSON file and register the phonologies and generators it holds"""
        with open(file, 'r', encoding='utf-8') as inputFile:
            content = json.load(inputFile)
            # Phonologies first, then generators
            for phonologyStruct in content['phonologies']:
                loaded = Phonology()
                loaded.fromJsonStruct(phonologyStruct)
                self.addPhonology(loaded)
            for generatorStruct in content['generators']:
                self.addGenerator(makeGenerator(generatorStruct))

    def writeTo(self, file = ''):
        """Dump everything as JSON, to the given file or to stdout when file is ''"""
        phonologyStructs = [phonology.toJsonStruct() for phonology in self.phonologies.values()]
        generatorStructs = [generator.toJsonStruct() for generator in self.generators.values()]
        outputStruct = { 'phonologies': phonologyStructs,
                         'generators': generatorStructs }
        if file != '':
            with open(file, 'w', encoding='utf-8') as outputFile:
                json.dump(outputStruct, outputFile, ensure_ascii=False)
        else:
            json.dump(outputStruct, sys.stdout, ensure_ascii=False)
|