# ---------------------------------------------------------------------------
# py-phonagen/phonagen.py
# ---------------------------------------------------------------------------
"""Common functions and classes for phonagen tools"""
import json
import io
import sys
import csv


class Phonology:
    """A phonology: an identified list of entries, each written in several transcriptions."""

    def __init__(self, id='', description='', transcriptions=None, mainTranscription='', entries=None):
        """Create a phonology.

        Args:
            id: identifier of the phonology; an empty id makes it invalid.
            description: free-text description.
            transcriptions: list of transcription names; a new empty list
                when omitted.
            mainTranscription: name of the main transcription.
            entries: list of entry dicts; a new empty list when omitted.
        """
        self.id = id
        self.description = description
        # None stands for "not given": a literal [] default would be created
        # once and shared by every instance, and fromCsv() appends to it.
        self.transcriptions = [] if transcriptions is None else transcriptions
        self.mainTranscription = mainTranscription
        self.entries = [] if entries is None else entries

    def isValid(self):
        """Return True when the phonology has a non-empty id."""
        return self.id != ''

    def toJsonStruct(self):
        """Convert a Phonology to a Json structure"""
        return {
            'id': self.id,
            'description': self.description,
            'transcriptions': self.transcriptions,
            'main-transcription': self.mainTranscription,
            'entries': self.entries,
        }

    def fromJsonStruct(self, struct):
        """Create a Phonology from a Json structure

        Returns a new Phonology; the instance the method is called on is
        left untouched. The expected keys are the ones written by
        toJsonStruct().
        """
        return Phonology(
            struct['id'],
            struct['description'],
            struct['transcriptions'],
            struct['main-transcription'],
            struct['entries'],
        )

    def fromCsv(self, file):
        """Fill this phonology from a csv file.

        The first csv row is the header. Every header item other than 'id'
        and 'description' is a transcription; 'phoneme' must be one of them
        and at least one other transcription must exist. An empty
        mainTranscription is replaced by the first transcription that is not
        'phoneme', and an empty id is replaced by the main transcription.
        Rows whose phoneme and main transcription are both empty are
        skipped; the other rows are appended to self.entries.

        Args:
            file: path of the csv file, read as UTF-8.

        Raises:
            ValueError: if the file has no header row, if the header does
                not satisfy the rules above, or if mainTranscription is not
                one of the transcriptions.
        """
        # newline='' lets the csv module handle line endings itself; UTF-8
        # matches the encoding writeTo() uses for its output.
        with open(file, newline='', encoding='utf-8') as csvfile:
            fileReader = csv.reader(csvfile)
            # get csv header
            header = next(fileReader, None)
            if header is None:
                raise ValueError('No csv header found in {}'.format(file))
            # get the transcriptions (header items not id or description)
            self.transcriptions = [x for x in header if x not in ('id', 'description')]
            # Check: self.transcriptions should contain 'phoneme'
            if 'phoneme' not in self.transcriptions:
                raise ValueError('phoneme column not found in {}'.format(file))
            # Check: self.transcriptions should have at least two items
            if len(self.transcriptions) < 2:
                raise ValueError(
                    'No transcription found outside phoneme in file {}. '
                    'Did you named it id or description ?'.format(file))
            # If main-transcription was not given, use the first header item
            # which is not one of those: id, description, phoneme
            if self.mainTranscription == '':
                self.mainTranscription = next(
                    x for x in self.transcriptions if x != 'phoneme')
            # Check: self.mainTranscription should be in self.transcriptions
            if self.mainTranscription not in self.transcriptions:
                raise ValueError(
                    'main-transcription {} not in list of transcriptions'.format(
                        self.mainTranscription))
            # If id was not given, use the mainTranscription as the id
            if self.id == '':
                self.id = self.mainTranscription
            hasId = 'id' in header
            hasDescription = 'description' in header
            # parse entries
            for row in fileReader:
                # zip() stops at the shorter sequence, so cells beyond the
                # header width are ignored instead of raising IndexError.
                entry = dict(zip(header, row))
                # All absent elements are set to ''
                for column in header[len(row):]:
                    entry[column] = ''
                # if both phoneme and main-transcription are empty, skip the row
                if entry['phoneme'] == '' and entry[self.mainTranscription] == '':
                    continue
                # if id is not provided, generate it
                if not hasId:
                    entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
                # if description is not provided, add an empty one
                if not hasDescription:
                    entry['description'] = ''
                self.entries.append(entry)


class PhonagenFile:
    """A phonagen file, with phonologies and generators"""

    def __init__(self):
        # Both dicts are keyed by the id of the stored object.
        self.phonologies = {}
        self.generators = {}

    def addPhonology(self, phonology):
        """Store a phonology under its id; invalid phonologies are ignored."""
        if phonology.isValid():
            self.phonologies[phonology.id] = phonology

    def addGenerator(self, generator):
        """Store a generator under its id; invalid generators are ignored."""
        if generator.isValid():
            self.generators[generator.id] = generator

    def getPhonology(self, id):
        """Return the phonology stored under id (KeyError if there is none)."""
        return self.phonologies[id]

    def getGenerator(self, id):
        """Return the generator stored under id (KeyError if there is none)."""
        return self.generators[id]

    def writeTo(self, file=''):
        """Output a JSON file from lists of phonologies and generators

        Args:
            file: path of the output file, written as UTF-8; when empty the
                JSON is written to standard output instead.
        """
        outputStruct = {
            'phonologies': [x.toJsonStruct() for x in self.phonologies.values()],
            'generators': [x.toJsonStruct() for x in self.generators.values()],
        }
        if file == '':
            json.dump(outputStruct, sys.stdout, ensure_ascii=False)
        else:
            with open(file, 'w', encoding='utf-8') as outputFile:
                json.dump(outputStruct, outputFile, ensure_ascii=False)


# ---------------------------------------------------------------------------
# py-phonagen/phonology-csv2json.py
# (separate executable script in the patch; its first line is
#  "#! /usr/bin/env python3")
# ---------------------------------------------------------------------------
import argparse


def parseArgs():
    """Parse the command line of the csv-to-json phonology converter."""
    # Define argument parser
    parser = argparse.ArgumentParser(description='Convert a phonology from csv to json.')
    parser.add_argument('file', metavar='csvfile', help='csv file to convert')
    parser.add_argument('--id', metavar='id', help='id of the phonology; guessed from the csv header if not provided', default='')
    parser.add_argument('--description', metavar='description', help='description of the phonology; empty if not provided', default='')
    parser.add_argument('--main', metavar='main-transcription', help='main transcription of the phonology; must correspond to an element of the csv header (outside id and description); guessed from the csv header if not provided.', default='')
    parser.add_argument('--output', metavar='output-file', help='Output file for the phonology. The file is printed to standard output if not given.', default='')
    # Parse arguments
    return parser.parse_args()


def main():
    """Convert the csv file named on the command line to a phonagen JSON file."""
    # Imported here rather than at the top so that importing this code does
    # not require the phonagen module to be on the path.
    import phonagen

    args = parseArgs()
    phonology = phonagen.Phonology(id=args.id, description=args.description, mainTranscription=args.main)
    phonology.fromCsv(args.file)
    phonagenFile = phonagen.PhonagenFile()
    phonagenFile.addPhonology(phonology)
    phonagenFile.writeTo(args.output)


# Main
if __name__ == '__main__':
    main()