Example list to Markov chain generator.
This commit is contained in:
parent
bc5f677aa6
commit
5f204ce9e9
|
@ -0,0 +1,34 @@
|
|||
#! /usr/bin/env python3
|
||||
|
||||
import argparse
|
||||
import phonagen
|
||||
|
||||
def parseArgs(argv = None):
    """Parse the command-line arguments of the example-list converter.

    Parameters:
        argv: optional list of argument strings. When None (the default),
            argparse reads the arguments from sys.argv[1:].

    Returns:
        The argparse.Namespace holding file, id, description, phonology,
        phonologyfile, order and output.
    """
    # Define argument parser
    parser = argparse.ArgumentParser(description='Convert an example list to a chain generator.')
    parser.add_argument('file', metavar='listfile', help='list file to convert')
    parser.add_argument('--id', metavar='id', help='id of the generator', required = True)
    # The description is handed to the generator, not to the phonology
    parser.add_argument('--description', metavar='description', help='description of the generator; empty if not provided', default='')
    parser.add_argument('--phonology', metavar='phonology', help='id of the phonology on which the generator is based', required = True)
    parser.add_argument('--phonologyfile', metavar='phonologyfile', help='file containing the phonology; optional, if provided, examples will be checked for unknown phonemes and the phonology will be present in the output', default='')
    parser.add_argument('--order', metavar='order', help='order of the chain; 1 by default', default=1, type=int)
    parser.add_argument('--output', metavar='output-file', help='Output file for the generator. The file is printed to standard output if not given.', default='')
    # Parse arguments
    return parser.parse_args(argv)
|
||||
|
||||
# Script entry point: train a chain generator on the example list and write it out
if __name__ == '__main__':
    options = parseArgs()
    chainGenerator = phonagen.ChainGenerator(
        id = options.id,
        description = options.description,
        phonology = options.phonology,
        order = options.order,
    )

    # Start from an empty phonology; it is replaced by the one read from
    # the phonology file when such a file was given on the command line.
    basePhonology = phonagen.Phonology()
    if options.phonologyfile:
        sourceFile = phonagen.PhonagenFile()
        sourceFile.load(options.phonologyfile)
        basePhonology = sourceFile.getPhonology(options.phonology)

    # Train on the examples, then write phonology and generator together
    chainGenerator.fromExamples(options.file, basePhonology)
    resultFile = phonagen.PhonagenFile()
    resultFile.addPhonology(basePhonology)
    resultFile.addGenerator(chainGenerator)
    resultFile.writeTo(options.output)
|
|
@ -3,32 +3,41 @@ import json
|
|||
import io
|
||||
import sys
|
||||
import csv
|
||||
import random
|
||||
|
||||
class Phonology:
|
||||
"""Phonology class"""
|
||||
def __init__(self, id = '', description = '', mainTranscription = ''):
    """Create a phonology without transcriptions or entries.

    Parameters:
        id: identifier of the phonology; empty by default.
        description: free-text description; empty by default.
        mainTranscription: name of the main transcription; empty by default.
    """
    self.id = id
    self.description = description
    # Containers are created here, per instance, and not taken from default
    # arguments: a default list/dict would be shared by every instance.
    self.transcriptions = []
    self.mainTranscription = mainTranscription
    self.entries = {} # id -> entry
|
||||
|
||||
def isValid(self):
    """Tell whether the phonology has a non-empty id"""
    hasId = self.id != ''
    return hasId
|
||||
|
||||
def has(self, id):
    """Tell whether `id` is one of the keys of self.entries"""
    knownEntries = self.entries
    return id in knownEntries
|
||||
|
||||
def toJsonStruct(self):
    """Convert a Phonology to a Json structure"""
    return { 'id': self.id,
             'description': self.description,
             'transcriptions': self.transcriptions,
             'main-transcription': self.mainTranscription,
             # entries is an id -> entry dict; the Json form is the plain list of entries
             'entries': list(self.entries.values()) }
|
||||
|
||||
def fromJsonStruct(self, struct):
    """Fill a Phonology from a Json structure

    `struct` must provide the keys 'id', 'description', 'transcriptions',
    'main-transcription' and 'entries' (a list of entries, each with an 'id');
    a missing key raises KeyError.
    """
    self.id = struct['id']
    self.description = struct['description']
    self.transcriptions = struct['transcriptions']
    self.mainTranscription = struct['main-transcription']
    # Index the list of entries by their id
    self.entries = {x['id']: x for x in struct['entries']}
|
||||
|
||||
def fromCsv(self, file):
|
||||
"""Fill a Phonology from a Csv file"""
|
||||
with open(file) as csvfile:
|
||||
fileReader = csv.reader(csvfile)
|
||||
# get csv header
|
||||
|
@ -68,7 +77,118 @@ class Phonology:
|
|||
# if description is not provided, add an empty one
|
||||
if 'description' not in header:
|
||||
entry.update({'description': ''})
|
||||
self.entries.append(entry)
|
||||
self.entries.update({entry['id']: entry})
|
||||
|
||||
class Distribution:
    """Discrete distribution: maps each value to its number of occurences"""

    def __init__(self):
        self.items = {} # value -> occurences

    def addTo(self, value, occurences = 1):
        """Add `occurences` to the count of `value` (a new value starts from 0)"""
        self.items[value] = self.items.get(value, 0) + occurences

    def pickFrom(self):
        """Return one value picked at random, weighted by its occurences"""
        # values() has to be called: the bound method itself is not iterable
        return random.choices(list(self.items), list(self.items.values()))[0]

    def toJsonStruct(self, itemRef = 'value', occurencesRef = 'occurences'):
        """Convert to a list of {itemRef: value, occurencesRef: count} structures"""
        return [{itemRef: value, occurencesRef: count} for value, count in self.items.items()]

    def fromJsonStruct(self, struct, itemRef = 'value', occurencesRef = 'occurences'):
        """Replace the content by the one of a list produced by toJsonStruct"""
        self.items = {item[itemRef]: item[occurencesRef] for item in struct}
|
||||
|
||||
class Generator:
    """Base class shared by all the generators"""

    def __init__(self, id = '', description = '', phonology = ''):
        self.id, self.description, self.phonology = id, description, phonology
        # Set to True by the subclasses that implement an actual generator type
        self.isTyped = False

    def isValid(self):
        """A generator is valid when it has a non-empty id and is typed"""
        if self.id == '':
            return False
        return self.isTyped

    def toJsonStruct(self):
        """Convert the common generator fields to a Json structure"""
        return dict(id = self.id, description = self.description, phonology = self.phonology)

    def fromJsonStruct(self, struct):
        """Fill the common generator fields from a Json structure"""
        for field in ('id', 'description', 'phonology'):
            setattr(self, field, struct[field])
|
||||
|
||||
class ChainGenerator(Generator):
    """Chains-based generator

    Maps every sequence of `order` consecutive ids (a tuple) to the
    Distribution of the ids seen right after it. The empty string both pads
    the initial sequence and marks the end of an example.
    """

    def __init__(self, order = 1, **kwargs):
        """Create an untrained chain generator.

        Parameters:
            order: number of previous ids forming the input of a chain.
            **kwargs: forwarded to Generator.__init__ (id, description, phonology).
        """
        super().__init__(**kwargs)
        self.order = order
        self.chains = {} # input -> distribution of outputs
        self.isTyped = True

    def toJsonStruct(self):
        """Convert the generator to a Json structure tagged with type 'chains'"""
        struct = super().toJsonStruct()
        struct.update({'type': 'chains',
                       'order': self.order,
                       'chains': [{'input': x, 'possible-outputs': dist.toJsonStruct(itemRef = 'value', occurencesRef = 'occurences')} for x, dist in self.chains.items()]})
        return struct

    def fromJsonStruct(self, struct):
        """Fill the generator from a Json structure, replacing any previous chains"""
        super().fromJsonStruct(struct)
        self.order = struct['order']
        chains = {}
        for chainStruct in struct['chains']:
            dist = Distribution()
            dist.fromJsonStruct(chainStruct['possible-outputs'], itemRef = 'value', occurencesRef = 'occurences')
            # A decoded Json array is a list, which cannot be used as a dict
            # key; tuples are also what fromExamples uses as keys.
            chains[tuple(chainStruct['input'])] = dist
        self.chains = chains

    def fromExamples(self, file, phonology):
        """Train a chain generator on an example file

        Each non-blank row of the space-separated file is one example, i.e. a
        sequence of ids. Counts are added to the chains already present.

        Parameters:
            file: path of the example file, read as UTF-8.
            phonology: Phonology used to check the ids; the check is skipped
                when the phonology is not valid.

        Raises:
            ValueError: an example holds an id unknown to a valid phonology.
        """
        checkIds = phonology.isValid() # loop invariant
        # newline='' is what the csv module expects from the files it reads
        with open(file, encoding='utf-8', newline='') as exampleFile:
            fileReader = csv.reader(exampleFile, delimiter=' ', skipinitialspace=True)
            for row in fileReader:
                # Trailing or lone spaces yield empty fields; the empty string
                # is reserved for the terminator, so drop them.
                items = [item for item in row if item != ""]
                if not items:
                    continue
                if checkIds:
                    for item in items:
                        if not phonology.has(item):
                            raise ValueError(f'In row {row}: {item} is not an id in phonology {phonology.id}')
                items.append("") # Add terminator element (empty string)
                previous = ("",) * self.order # Initial sequence (empty strings, length = self.order)
                for item in items:
                    self.chains.setdefault(previous, Distribution()).addTo(item)
                    # Slide the window: drop the oldest id, append the new one
                    previous = previous[1:] + (item,)
|
||||
|
||||
class RuleGenerator(Generator):
    """Rules-based generator"""

    def __init__(self, **kwargs):
        """Create a rule generator without rules; kwargs go to Generator.__init__"""
        super().__init__(**kwargs)
        self.rules = {}
        self.isTyped = True

    def toJsonStruct(self):
        """Convert the generator to a Json structure tagged with type 'rules'"""
        struct = super().toJsonStruct()
        # Without the tag, makeGenerator cannot read the structure back:
        # it dispatches on struct['type'].
        struct.update({'type': 'rules'})
        # TODO: add rules
        return struct

    def fromJsonStruct(self, struct):
        """Fill the common generator fields from a Json structure"""
        super().fromJsonStruct(struct)
        # TODO: rules
|
||||
|
||||
# Value of the 'type' field of a generator structure -> class implementing it
generatorTypeToClass = {
    'chains': ChainGenerator,
    'rules': RuleGenerator,
}

def makeGenerator(struct):
    """Build the generator described by a JSON structure.

    The class is chosen from struct['type']; an unknown type gives a plain
    Generator. The instance is then filled with fromJsonStruct.
    """
    generatorClass = generatorTypeToClass.get(struct['type'], Generator)
    instance = generatorClass()
    instance.fromJsonStruct(struct)
    return instance
|
||||
|
||||
class PhonagenFile:
|
||||
"""A phonagen file, with phonologies and generators"""
|
||||
|
@ -90,8 +210,21 @@ class PhonagenFile:
|
|||
def getGenerator(self, id):
    """Return the generator stored under `id` in self.generators"""
    generators = self.generators
    return generators[id]
|
||||
|
||||
def load(self, file):
    """Read the phonologies and generators of a JSON file into this object"""
    with open(file, 'r', encoding='utf-8') as inputFile:
        content = json.load(inputFile)

    # Phonologies first: each structure fills a fresh Phonology
    for phonologyStruct in content['phonologies']:
        loaded = Phonology()
        loaded.fromJsonStruct(phonologyStruct)
        self.addPhonology(loaded)

    # Then generators, built by makeGenerator from their structure
    for generatorStruct in content['generators']:
        loadedGenerator = makeGenerator(generatorStruct)
        self.addGenerator(loadedGenerator)
|
||||
|
||||
def writeTo(self, file = ''):
|
||||
"""Output a JSON file from lists of phonologies and generators"""
|
||||
"""Output to a JSON file (or stdout)"""
|
||||
outputStruct = { 'phonologies': [x.toJsonStruct() for x in self.phonologies.values()],
|
||||
'generators': [x.toJsonStruct() for x in self.generators.values()] }
|
||||
if file == '':
|
||||
|
|
Loading…
Reference in New Issue