Add phonology-csv2json tool and a Phonagen python library on which the tool is based.
This commit is contained in:
parent
be47b5526c
commit
bc5f677aa6
|
@ -0,0 +1,101 @@
|
||||||
|
"""Common functions and classes for phonagen tools"""
|
||||||
|
import json
|
||||||
|
import io
|
||||||
|
import sys
|
||||||
|
import csv
|
||||||
|
|
||||||
|
class Phonology:
    """A phonology: a list of phoneme entries plus the transcriptions they use.

    Attributes:
        id: identifier of the phonology; an empty string marks it as invalid.
        description: free-text description of the phonology.
        transcriptions: names of the transcription columns.
        mainTranscription: name of the main transcription.
        entries: list of dictionaries, one per phoneme entry.
    """

    def __init__(self, id = '', description = '', transcriptions = None, mainTranscription = '', entries = None):
        """Create a phonology.

        ``transcriptions`` and ``entries`` default to a fresh empty list for
        each instance. A list passed explicitly is stored as-is (not copied).
        """
        self.id = id
        self.description = description
        # None is used as the default instead of [] so that instances never
        # share (and mutate) one default list object.
        self.transcriptions = [] if transcriptions is None else transcriptions
        self.mainTranscription = mainTranscription
        self.entries = [] if entries is None else entries

    def isValid(self):
        """Return True when the phonology has a non-empty id."""
        return self.id != ''

    def toJsonStruct(self):
        """Convert a Phonology to a Json structure"""
        return {
            'id': self.id,
            'description': self.description,
            'transcriptions': self.transcriptions,
            'main-transcription': self.mainTranscription,
            'entries': self.entries,
        }

    def fromJsonStruct(self, struct):
        """Create a Phonology from a Json structure

        ``struct`` must provide the keys written by ``toJsonStruct``. A new
        Phonology is returned; ``self`` is left untouched.

        Raises:
            KeyError: if one of the expected keys is missing.
        """
        return Phonology(
            struct['id'],
            struct['description'],
            struct['transcriptions'],
            struct['main-transcription'],
            struct['entries'],
        )

    def fromCsv(self, file):
        """Fill this phonology from the csv file at path ``file``.

        The first row is the header. Every header item other than ``id`` and
        ``description`` is a transcription; a ``phoneme`` column and at least
        one other transcription are required. When ``mainTranscription`` or
        ``id`` are still empty they are derived from the header. Each
        following row becomes one entry (a dict keyed by header item); rows
        whose phoneme and main transcription are both empty are skipped.

        Raises:
            ValueError: if the file is empty, the header is unusable, or the
                main transcription is not one of the transcriptions.
        """
        # newline='' is what the csv module expects; utf-8 matches the
        # encoding used when the JSON output is written.
        with open(file, newline='', encoding='utf-8') as csvfile:
            fileReader = csv.reader(csvfile)
            # get csv header
            header = next(fileReader, None)
            if header is None:
                raise ValueError('csv file {} is empty'.format(file))
            # get the transcriptions (header items not id or description)
            self.transcriptions = [x for x in header if x not in ('id', 'description')]
            # Check: self.transcriptions should contain 'phoneme'
            if 'phoneme' not in self.transcriptions:
                raise ValueError('phoneme column not found in {}'.format(file))
            # Check: self.transcriptions should have at least two items
            if len(self.transcriptions) < 2:
                raise ValueError(
                    'No transcription found outside phoneme in file {}. '
                    'Did you name it id or description?'.format(file))
            # If main-transcription was not given, use the first header item
            # which is not one of those: id, description, phoneme
            if self.mainTranscription == '':
                guessedMainTranscription = next(
                    (x for x in self.transcriptions if x != 'phoneme'), None)
                if guessedMainTranscription is None:
                    raise ValueError(
                        'No transcription found outside phoneme in file {}'.format(file))
                self.mainTranscription = guessedMainTranscription
            # Check: self.mainTranscription should be in self.transcriptions
            if self.mainTranscription not in self.transcriptions:
                raise ValueError(
                    'main-transcription {} not in list of transcriptions'.format(
                        self.mainTranscription))
            # If id was not given, use the mainTranscription as the id
            if self.id == '':
                self.id = self.mainTranscription
            hasId = 'id' in header
            hasDescription = 'description' in header
            columnCount = len(header)
            # parse entries
            for row in fileReader:
                # Cells beyond the header are ignored; absent cells become ''
                cells = row[:columnCount] + [''] * (columnCount - len(row))
                entry = dict(zip(header, cells))
                # if both phoneme and main-transcription are empty, skip the row
                if entry['phoneme'] == '' and entry[self.mainTranscription] == '':
                    continue
                # if id is not provided, generate it
                if not hasId:
                    entry['id'] = entry['phoneme'] + '-' + entry[self.mainTranscription]
                # if description is not provided, add an empty one
                if not hasDescription:
                    entry['description'] = ''
                self.entries.append(entry)
|
||||||
|
|
||||||
|
class PhonagenFile:
    """In-memory phonagen file holding phonologies and generators.

    Both collections are dictionaries keyed by the ``id`` attribute of the
    object they store.
    """

    def __init__(self):
        self.generators = {}
        self.phonologies = {}

    def addPhonology(self, phonology):
        """Store ``phonology`` under its id; ignored when it is not valid."""
        if not phonology.isValid():
            return
        self.phonologies[phonology.id] = phonology

    def addGenerator(self, generator):
        """Store ``generator`` under its id; ignored when it is not valid."""
        if not generator.isValid():
            return
        self.generators[generator.id] = generator

    def getPhonology(self, id):
        """Return the phonology stored under ``id`` (KeyError if unknown)."""
        return self.phonologies[id]

    def getGenerator(self, id):
        """Return the generator stored under ``id`` (KeyError if unknown)."""
        return self.generators[id]

    def writeTo(self, file = ''):
        """Serialize all phonologies and generators as one JSON document.

        The document is written to standard output when ``file`` is the empty
        string, otherwise to the file at path ``file`` (utf-8 encoded).
        """
        phonologyStructs = [item.toJsonStruct() for item in self.phonologies.values()]
        generatorStructs = [item.toJsonStruct() for item in self.generators.values()]
        document = {
            'phonologies': phonologyStructs,
            'generators': generatorStructs,
        }
        if file != '':
            with open(file, 'w', encoding='utf-8') as target:
                json.dump(document, target, ensure_ascii=False)
            return
        json.dump(document, sys.stdout, ensure_ascii=False)
|
|
@ -0,0 +1,24 @@
|
||||||
|
#! /usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import phonagen
|
||||||
|
|
||||||
|
def parseArgs():
    """Parse the command line of the csv-to-json converter.

    Returns the argparse namespace with the positional ``file`` and the
    optional ``id``, ``description``, ``main`` and ``output`` values; every
    optional value defaults to the empty string.
    """
    # (flag, metavar, help text) of each optional argument, in help order
    optionSpecs = (
        ('--id', 'id',
         'id of the phonology; guessed from the csv header if not provided'),
        ('--description', 'description',
         'description of the phonology; empty if not provided'),
        ('--main', 'main-transcription',
         'main transcription of the phonology; must correspond to an element of the csv header (outside id and description); guessed from the csv header if not provided.'),
        ('--output', 'output-file',
         'Output file for the phonology. The file is printed to standard output if not given.'),
    )
    argParser = argparse.ArgumentParser(description='Convert a phonology from csv to json.')
    argParser.add_argument('file', metavar='csvfile', help='csv file to convert')
    for flag, metavarName, helpText in optionSpecs:
        argParser.add_argument(flag, metavar=metavarName, help=helpText, default='')
    return argParser.parse_args()
|
||||||
|
|
||||||
|
# Main
|
||||||
|
if __name__ == '__main__':
    # Read the command line options
    options = parseArgs()
    # Build the phonology from the options, then load its entries from the csv file
    converted = phonagen.Phonology(
        id = options.id,
        description = options.description,
        mainTranscription = options.main)
    converted.fromCsv(options.file)
    # Wrap the phonology in a phonagen file and write it out as JSON
    container = phonagen.PhonagenFile()
    container.addPhonology(converted)
    container.writeTo(options.output)
|
Loading…
Reference in New Issue