Example list to Markov chain generator.

This commit is contained in:
Feufochmar 2018-06-09 18:58:46 +02:00
parent bc5f677aa6
commit 5f204ce9e9
2 changed files with 175 additions and 8 deletions

View File

@@ -0,0 +1,34 @@
#! /usr/bin/env python3
import argparse
import phonagen
def parseArgs():
    """Parse the command-line arguments of the example-list converter.

    Returns:
        The argparse.Namespace obtained from sys.argv, with the attributes
        file, id, description, phonology, phonologyfile, order and output.
        Optional string arguments default to the empty string and order
        defaults to 1.
    """
    # Define argument parser
    parser = argparse.ArgumentParser(description='Convert an example list to a chain generator.')
    parser.add_argument('file', metavar='listfile', help='list file to convert')
    parser.add_argument('--id', metavar='id', help='id of the generator', required=True)
    # This value is used as the description of the generator being built,
    # so the help text must not call it the description of the phonology.
    parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
    parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which the generator is based', required=True)
    parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; optional, if provided, examples will be checked for unknown phonemes and the phonology will be present in the output', default='')
    parser.add_argument('--order', metavar='order', help='order of the chain; 1 by default', default=1, type=int)
    parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')
    # Parse arguments
    return parser.parse_args()
# Main
if __name__ == '__main__':
    options = parseArgs()
    chain = phonagen.ChainGenerator(
        id=options.id,
        description=options.description,
        phonology=options.phonology,
        order=options.order,
    )

    # Without a phonology file a default-constructed phonology is used;
    # otherwise the phonology named on the command line is read from that file.
    if options.phonologyfile == '':
        phonology = phonagen.Phonology()
    else:
        source = phonagen.PhonagenFile()
        source.load(options.phonologyfile)
        phonology = source.getPhonology(options.phonology)

    # Train the chains on the example list, then write phonology + generator.
    chain.fromExamples(options.file, phonology)
    result = phonagen.PhonagenFile()
    result.addPhonology(phonology)
    result.addGenerator(chain)
    result.writeTo(options.output)

View File

@@ -3,32 +3,41 @@ import json
import io
import sys
import csv
import random
class Phonology:
"""Phonology class"""
def __init__(self, id = '', description = '', transcriptions = [], mainTranscription = '', entries = []):
def __init__(self, id = '', description = '', mainTranscription = ''):
self.id = id
self.description = description
self.transcriptions = transcriptions
self.transcriptions = []
self.mainTranscription = mainTranscription
self.entries = entries
self.entries = {} # id -> entry
def isValid(self):
    """Tell whether the phonology has a non-empty id."""
    hasNoId = self.id == ''
    return not hasNoId
def has(self, id):
    """Tell whether `id` is a key of the entries of this phonology."""
    knownEntries = self.entries
    return id in knownEntries
def toJsonStruct(self):
    """Convert a Phonology to a Json structure.

    Returns:
        A dict with the keys 'id', 'description', 'transcriptions',
        'main-transcription' and 'entries'.
    """
    return {'id': self.id,
            'description': self.description,
            'transcriptions': self.transcriptions,
            'main-transcription': self.mainTranscription,
            # entries is an id -> entry mapping; only the entries themselves
            # are emitted, as a list.
            'entries': list(self.entries.values())}
def fromJsonStruct(self, struct):
    """Fill a Phonology from a Json structure.

    Args:
        struct: dict with the keys 'id', 'description', 'transcriptions',
            'main-transcription' and 'entries' (a list of entries, each
            carrying an 'id').

    Raises:
        KeyError: if one of those keys, or the 'id' of an entry, is missing.
    """
    self.id = struct['id']
    self.description = struct['description']
    self.transcriptions = struct['transcriptions']
    self.mainTranscription = struct['main-transcription']
    # Index the entry list by entry id.
    self.entries = {x['id']: x for x in struct['entries']}
def fromCsv(self, file):
"""Fill a Phonology from a Csv file"""
with open(file) as csvfile:
fileReader = csv.reader(csvfile)
# get csv header
@@ -68,7 +77,118 @@ class Phonology:
# if description is not provided, add an empty one
if 'description' not in header:
entry.update({'description': ''})
self.entries.append(entry)
self.entries.update({entry['id']: entry})
class Distribution:
    """Discrete distribution: a mapping from values to occurrence counts."""

    def __init__(self):
        self.items = {}  # value -> number of occurrences

    def addTo(self, value, occurences=1):
        """Add `occurences` occurrences of `value` to the distribution."""
        self.items[value] = self.items.get(value, 0) + occurences

    def pickFrom(self):
        """Return a random value, weighted by its number of occurrences.

        Raises:
            IndexError: if the distribution is empty.
        """
        # values() must be called: handing the bound method itself to
        # random.choices as the weights raises a TypeError.
        return random.choices(list(self.items), list(self.items.values()))[0]

    def toJsonStruct(self, itemRef='value', occurencesRef='occurences'):
        """Return the distribution as a list of dicts, one per value.

        Args:
            itemRef: key under which each value is stored.
            occurencesRef: key under which each count is stored.
        """
        return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]

    def fromJsonStruct(self, struct, itemRef='value', occurencesRef='occurences'):
        """Replace the content with the items of `struct`.

        Args:
            struct: iterable of dicts as produced by toJsonStruct.
            itemRef: key under which each value is stored.
            occurencesRef: key under which each count is stored.
        """
        self.items = {}
        for item in struct:
            self.items[item[itemRef]] = item[occurencesRef]
class Generator:
    """Parent class for all generators.

    Carries the metadata shared by every generator: its id, its description
    and the id of its phonology. The isTyped flag starts out False, so a bare
    Generator is never reported as valid.
    """

    def __init__(self, id='', description='', phonology=''):
        self.isTyped = False
        self.id, self.description, self.phonology = id, description, phonology

    def isValid(self):
        """Tell whether the generator has a non-empty id and is typed."""
        if self.id == '':
            return False
        return self.isTyped

    def toJsonStruct(self):
        """Return the shared metadata as a dict."""
        return dict(id=self.id, description=self.description, phonology=self.phonology)

    def fromJsonStruct(self, struct):
        """Read the shared metadata from a dict having the same three keys."""
        for field in ('id', 'description', 'phonology'):
            setattr(self, field, struct[field])
class ChainGenerator(Generator):
    """Chains-based generator.

    Maps each input sequence (a tuple of items) to the Distribution of the
    items seen right after it in the training examples. The empty string pads
    the start of a sequence and marks its end.
    """

    def __init__(self, order=1, **kwargs):
        """Create an untrained chain generator.

        Args:
            order: number of empty strings in the initial input sequence,
                i.e. the length of the input of a chain.
            **kwargs: forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        self.order = order
        self.chains = {}  # input (tuple of items) -> distribution of outputs
        self.isTyped = True

    def toJsonStruct(self):
        """Return the generator as a dict tagged with the type 'chains'."""
        struct = super().toJsonStruct()
        struct.update({'type': 'chains',
                       'order': self.order,
                       'chains': [{'input': key,
                                   'possible-outputs': dist.toJsonStruct(itemRef='value', occurencesRef='occurences')}
                                  for key, dist in self.chains.items()]})
        return struct

    def fromJsonStruct(self, struct):
        """Read the metadata, the order and the chains from a dict.

        Chains already present are kept; a chain with the same input is
        replaced.
        """
        super().fromJsonStruct(struct)
        self.order = struct['order']
        for chainStruct in struct['chains']:
            dist = Distribution()
            dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef='value', occurencesRef='occurences')
            # A JSON array is decoded as a list, which is unhashable and could
            # not be a dict key; convert it to a tuple, the key type used
            # when training from examples.
            self.chains[tuple(chainStruct['input'])] = dist

    def fromExamples(self, file, phonology):
        """Train a chain generator on an example file.

        Each non-empty line is one example: a space-separated sequence of
        items.

        Args:
            file: path of the example file.
            phonology: object providing isValid(), has(item) and id; when it
                is valid, every item must be known to it.

        Raises:
            ValueError: if the phonology is valid and an item is not one of
                its ids.
        """
        with open(file) as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                # The empty string is reserved as the terminator, so empty
                # fields (e.g. produced by a trailing space) are dropped
                # instead of being trained as an early end of sequence.
                items = [item for item in row if item != ""]
                if not items:
                    continue
                if phonology.isValid():
                    for item in items:
                        if not phonology.has(item):
                            raise ValueError(f'In row {items}: {item!r} is not an id in phonology {phonology.id!r}')
                # Initial sequence: self.order empty strings.
                previous = tuple("" for _ in range(self.order))
                # The trailing empty string is the terminator element.
                for item in items + [""]:
                    dist = self.chains.get(previous)
                    if dist is None:
                        dist = Distribution()
                        self.chains[previous] = dist
                    dist.addTo(item)
                    # Slide the window: drop the oldest item, append the new one.
                    previous = previous[1:] + (item,)
class RuleGenerator(Generator):
    """Rules-based generator (the handling of rules is not implemented yet)."""

    def __init__(self, **kwargs):
        """Create a rule generator; kwargs go unchanged to the parent constructor."""
        super().__init__(**kwargs)
        self.rules = {}
        self.isTyped = True

    def toJsonStruct(self):
        """Return the generator as a dict tagged with the type 'rules'."""
        struct = super().toJsonStruct()
        # Without the type tag, makeGenerator cannot map the saved structure
        # back to this class when the file is loaded again.
        struct.update({'type': 'rules'})
        # TODO: add rules
        return struct

    def fromJsonStruct(self, struct):
        """Read the shared metadata from a dict; rules are not read yet."""
        super().fromJsonStruct(struct)
        # TODO: rules
# Maps the 'type' field of a generator JSON structure to the class handling it.
generatorTypeToClass = { 'chains': ChainGenerator, 'rules': RuleGenerator }


def makeGenerator(struct):
    """Instantiate a generator from a JSON structure.

    The class is chosen from the 'type' field of `struct`. A structure whose
    type is unknown or absent is loaded as a plain Generator; the absent case
    matters because Generator.toJsonStruct does not emit a type.

    Args:
        struct: dict describing one generator.

    Returns:
        The new generator, filled through its fromJsonStruct method.
    """
    generatorClass = generatorTypeToClass.get(struct.get('type'), Generator)
    generator = generatorClass()
    generator.fromJsonStruct(struct)
    return generator
class PhonagenFile:
"""A phonagen file, with phonologies and generators"""
@@ -90,8 +210,21 @@ class PhonagenFile:
def getGenerator(self, id):
    """Return the generator stored under `id`; an unknown id raises KeyError."""
    registry = self.generators
    return registry[id]
def load(self, file):
    """Load the phonologies and generators stored in a JSON file.

    Args:
        file: path of a UTF-8 encoded JSON file holding the lists
            'phonologies' and 'generators'.
    """
    with open(file, 'r', encoding='utf-8') as handle:
        content = json.load(handle)

    # Phonologies first, each filled from its own structure
    for phonologyStruct in content['phonologies']:
        loaded = Phonology()
        loaded.fromJsonStruct(phonologyStruct)
        self.addPhonology(loaded)

    # Then the generators, whose class is chosen by makeGenerator
    for generatorStruct in content['generators']:
        self.addGenerator(makeGenerator(generatorStruct))
def writeTo(self, file = ''):
"""Output a JSON file from lists of phonologies and generators"""
"""Output to a JSON file (or stdout)"""
outputStruct = { 'phonologies': [x.toJsonStruct() for x in self.phonologies.values()],
'generators': [x.toJsonStruct() for x in self.generators.values()] }
if file == '':