Add extractor & generator
This commit is contained in:
parent
9c93990b21
commit
912b51abe4
|
@ -0,0 +1,421 @@
|
|||
#lang racket
|
||||
|
||||
; Extracts data from CSV and generate a json file suitable for generation
|
||||
; The output json file contains a list of regions, each containing a list of departements :
|
||||
; [ {
|
||||
; "id": "...", "name": "...",
|
||||
; "departements":
|
||||
; [ {
|
||||
; "id": "...", "name": "...",
|
||||
; "word-masculine": [{"value": "...", "count": X}, ...],
|
||||
; "word-feminine": [{"value": "...", "count": X}, ...],
|
||||
; "word-plural": [{"value": "...", "count": X}, ...],
|
||||
; "word-masculine-plural": [{"value": "...", "count": X}, ...],
|
||||
; "word-feminine-plural": [{"value": "...", "count": X}, ...],
|
||||
; "person-name-masculine": [{"value": "...", "count": X}, ...],
|
||||
; "person-name-feminine": [{"value": "...", "count": X}, ...],
|
||||
; "patterns": [{"value": [...], "count": X}, ...],
|
||||
; "locality-markov": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
|
||||
; "area-markov": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
|
||||
; "river-markov": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
|
||||
; },
|
||||
; ...
|
||||
; ],
|
||||
; },
|
||||
; ...
|
||||
; ]
|
||||
; The distributions and markov chains are trained are by departement
|
||||
|
||||
(require
|
||||
csv-reading
|
||||
json
|
||||
srfi/1
|
||||
"markov.rkt"
|
||||
"distribution.rkt")
|
||||
|
||||
; Structure for each region
|
||||
(struct region
|
||||
(id ; id of region
|
||||
name ; name of region
|
||||
departements ; table of departements (id->dep)
|
||||
))
|
||||
|
||||
; Constructor for a region
|
||||
(define (make-region id name)
|
||||
(region id name (make-hash)))
|
||||
|
||||
; Structure for each departement
|
||||
(struct departement
|
||||
(id ; id of departement
|
||||
name ; name of departement
|
||||
patterns ; distribution of patterns
|
||||
word-masculine ; distribution of masculine words
|
||||
word-feminine ; distribution of feminine words
|
||||
word-plural ; distribution of plural words
|
||||
word-masculine-plural ; distribution of masculine plural words
|
||||
word-feminine-plural ; distribution of feminine plural words
|
||||
person-name-masculine ; distributino of masculine person name
|
||||
person-name-feminine ; distribution of feminine person name
|
||||
locality-name ; markov chain for locality names
|
||||
area-name ; markov chain for area names
|
||||
river-name ; markov chain for river names
|
||||
))
|
||||
|
||||
; Constructor for a departement
|
||||
(define (make-departement id name)
|
||||
(departement
|
||||
id
|
||||
name
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-markov 3)
|
||||
(make-markov 3)
|
||||
(make-markov 3)))
|
||||
|
||||
;;
|
||||
; Extraction tools
|
||||
|
||||
; Accessors of a csv-commune object (dict of line extracted from the csv file)
|
||||
; Name
|
||||
(define (csv-commune-name obj)
|
||||
(dict-ref obj "COM_NOM"))
|
||||
; Uppercase name
|
||||
(define (csv-commune-name-maj obj)
|
||||
(dict-ref obj "COM_NOM_MAJ"))
|
||||
; Uppercase short name
|
||||
(define (csv-commune-name-maj-court obj)
|
||||
(dict-ref obj "COM_NOM_MAJ_COURT"))
|
||||
; Region Id
|
||||
(define (csv-commune-region-id obj)
|
||||
(dict-ref obj "REG_ID_OLD"))
|
||||
; Region name
|
||||
(define (csv-commune-region-name obj)
|
||||
(dict-ref obj "REG_NOM_OLD"))
|
||||
; Departement Id
|
||||
(define (csv-commune-departement-id obj)
|
||||
(dict-ref obj "DEP_ID"))
|
||||
; Departement name
|
||||
(define (csv-commune-departement-name obj)
|
||||
(dict-ref obj "DEP_NOM"))
|
||||
; Indicate if a commune is active (still exists)
|
||||
(define (csv-commune-active? obj)
|
||||
(not (equal? "SO" (dict-ref obj "UUCR_ID"))))
|
||||
|
||||
; Function removing all diacrtics of latin-based languages
|
||||
(define (remove-diacritics str)
|
||||
(list->string
|
||||
(filter
|
||||
(lambda (c)
|
||||
(< (char->integer c) 256))
|
||||
(string->list
|
||||
(string-normalize-nfd str)))))
|
||||
|
||||
; Return the short form of a name
|
||||
(define (short-form str)
|
||||
(string-join
|
||||
(map
|
||||
(lambda (x)
|
||||
(cond
|
||||
((equal? x "SAINT") "ST")
|
||||
((equal? x "SAINTE") "STE")
|
||||
(#t x)))
|
||||
(string-split
|
||||
str
|
||||
#rx"[-']"))
|
||||
" "))
|
||||
|
||||
; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name
|
||||
(define (csv-commune-coherent? obj)
|
||||
(define name (csv-commune-name obj))
|
||||
(define name-maj (csv-commune-name-maj obj))
|
||||
(define name-maj-court (csv-commune-name-maj-court obj))
|
||||
(and
|
||||
; uppercase name should be name but capitalized and diacritics removed
|
||||
(equal? name-maj
|
||||
(string-upcase (remove-diacritics name)))
|
||||
; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE "
|
||||
(equal? name-maj-court
|
||||
(short-form name-maj))))
|
||||
|
||||
; Add a region to the extractor
|
||||
(define (add-region extractor obj)
|
||||
(define id (csv-commune-region-id obj))
|
||||
(when (not (hash-has-key? extractor id))
|
||||
(hash-set! extractor id (make-region id (csv-commune-region-name obj)))))
|
||||
|
||||
; Add a departement to the extractor
|
||||
(define (add-departement extractor obj)
|
||||
(define reg (hash-ref extractor (csv-commune-region-id obj)))
|
||||
(define id (csv-commune-departement-id obj))
|
||||
(when (not (hash-has-key? (region-departements reg) id))
|
||||
(hash-set! (region-departements reg) id (make-departement id (csv-commune-departement-name obj)))))
|
||||
|
||||
; Remove elision
|
||||
(define *elisions*
|
||||
'(("-l'" . "-le-") ; masculine is used - though "l'" can also be an elision for the feminine "la"
|
||||
("-d'" . "-de-")
|
||||
("-des-" . "-de-les-")
|
||||
("-du-" . "-de-le-")
|
||||
("-del-" . "-de-el-") ; Catalan
|
||||
("-dels-" . "-de-los-") ; Catalan
|
||||
("-au-" . "-à-le-")
|
||||
("-aux-" . "-à-les-")
|
||||
("Sant'" . "Santo-") ; Corsican
|
||||
; Initials articles are replaced with lowercase variants
|
||||
("La " . "la-")
|
||||
("Le " . "le-")
|
||||
("Les " . "les-")
|
||||
("L'" . "le-")
|
||||
("Los " . "los-") ; Catalan
|
||||
))
|
||||
(define (remove-elision str)
|
||||
(foldl
|
||||
(lambda (item result)
|
||||
(string-replace result (car item) (cdr item)))
|
||||
str
|
||||
*elisions*))
|
||||
|
||||
; Transform preposition into binary operators between lists:
|
||||
; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z))
|
||||
; Note: recursive on lists
|
||||
(define *prepositions*
|
||||
'("et"
|
||||
"à"
|
||||
"en"
|
||||
"de"
|
||||
"di" ; Corsican
|
||||
"sur"
|
||||
"sous"
|
||||
"lès"
|
||||
"lez"
|
||||
"près"))
|
||||
(define (preposition-pass prep tokens)
|
||||
(if (list? tokens)
|
||||
(let ((idx (index-of tokens prep)))
|
||||
(if (and idx (< (+ idx 1) (length tokens))) ; preposition cannot be in last position
|
||||
(list
|
||||
(string->symbol prep)
|
||||
(take tokens idx)
|
||||
(preposition-pass prep (drop tokens (+ 1 idx))))
|
||||
(map (lambda (x) (preposition-pass prep x)) tokens)))
|
||||
tokens))
|
||||
|
||||
; Transform articles (and article-like words as "Saint") into unary operators
|
||||
; ex: ("le" X Y) => ((le X) Y)
|
||||
(define *articles*
|
||||
'("le"
|
||||
"el" ; Catalan/Spanish
|
||||
"la"
|
||||
"les"
|
||||
"las" ; Catalan
|
||||
"los" ; Catalan
|
||||
"Saint"
|
||||
"Santo" ; Corsican
|
||||
"San" ; Corsican
|
||||
"Sainte"
|
||||
"Santa" ; Corsican
|
||||
"sans"))
|
||||
(define (article-pass art tokens)
|
||||
(if (list? tokens)
|
||||
(let ((idx (index-of tokens art)))
|
||||
(if (and idx (< (+ idx 1) (length tokens))) ; article cannot be in last position
|
||||
(list
|
||||
(take tokens idx)
|
||||
(list
|
||||
(string->symbol art)
|
||||
(list-ref tokens (+ 1 idx)))
|
||||
(article-pass art (drop tokens (+ 2 idx))))
|
||||
(map (lambda (x) (article-pass art x)) tokens)))
|
||||
tokens))
|
||||
|
||||
; Simplify: remove empty lists
|
||||
(define (remove-empty tokens)
|
||||
(if (list? tokens)
|
||||
(map
|
||||
remove-empty
|
||||
(filter (lambda (x) (not (null? x))) tokens))
|
||||
tokens))
|
||||
|
||||
; Simplify: lists of single elements are replaced by that element
|
||||
(define (simplify-single tokens)
|
||||
(if (list? tokens)
|
||||
(if (eq? 1 (length tokens))
|
||||
(simplify-single (car tokens))
|
||||
(map simplify-single tokens))
|
||||
tokens))
|
||||
|
||||
; Extract the strings and generate a pattern
|
||||
; The string are extracted into the different tables according to the following rules
|
||||
; Single string
|
||||
; X => X = locality name
|
||||
; Unary operators
|
||||
; (le X), (el X), (sans X) => X = masculine word
|
||||
; (la X) => X = feminine word
|
||||
; (les X) => X = plural word
|
||||
; (las X) => X = feminine plural word
|
||||
; (los X) => X = masculine plural word
|
||||
; (Saint X), (Santo X), (San X) => X = masculine name
|
||||
; (Sainte X), (Santa X) => X = feminine name
|
||||
; Binary operators
|
||||
; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name
|
||||
; (à X Y), (en X Y), (de X Y), (di X Y) => X = locality name, Y = area name
|
||||
; (sur X Y) => X = locality name, Y = river name
|
||||
|
||||
; During extraction, the strings are replaced by symbols of the corresponding type to form patterns
|
||||
(define (extract-to-distribution dep token dist sym)
|
||||
(if (string? token)
|
||||
(begin
|
||||
(case sym
|
||||
(($LocalityName$ $AreaName$ $RiverName$)
|
||||
; Those are added to a markov chain
|
||||
(markov-add-example! dist (string->list (string-titlecase token))))
|
||||
(else
|
||||
; added to a distribution
|
||||
(distribution-add-to! dist token)))
|
||||
sym)
|
||||
(extract-strings dep token)))
|
||||
|
||||
; Extraction of unary operators
|
||||
(define (unary-operator dep token dist sym)
|
||||
(list (car token) (extract-to-distribution dep (cadr token) dist sym)))
|
||||
|
||||
; Extraction of binary operators
|
||||
(define (binary-operator dep token dist1 sym1 dist2 sym2)
|
||||
(list (car token) (extract-to-distribution dep (cadr token) dist1 sym1) (extract-to-distribution dep (caddr token) dist2 sym2)))
|
||||
|
||||
(define (extract-strings dep token)
|
||||
(cond
|
||||
( (list? token)
|
||||
(case (car token)
|
||||
((le el sans)
|
||||
(unary-operator dep token (departement-word-masculine dep) '$WordMasculine$))
|
||||
((la)
|
||||
(unary-operator dep token (departement-word-feminine dep) '$WordFeminine$))
|
||||
((les)
|
||||
(unary-operator dep token (departement-word-plural dep) '$WordPlural$))
|
||||
((las)
|
||||
(unary-operator dep token (departement-word-feminine-plural dep) '$WordFemininePlural$))
|
||||
((los)
|
||||
(unary-operator dep token (departement-word-masculine-plural dep) '$WordMasculinePlural$))
|
||||
((Saint Santo San)
|
||||
(unary-operator dep token (departement-person-name-masculine dep) '$PersonNameMasculine$))
|
||||
((Sainte Santa)
|
||||
(unary-operator dep token (departement-person-name-feminine dep) '$PersonNameFeminine$))
|
||||
((et sous lès lez près)
|
||||
(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-locality-name dep) '$LocalityName$))
|
||||
((à en de di)
|
||||
(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-area-name dep) '$AreaName$))
|
||||
((sur)
|
||||
(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-river-name dep) '$RiverName$))
|
||||
(else
|
||||
(map (lambda (x) (extract-strings dep x)) token))
|
||||
))
|
||||
( (string? token)
|
||||
(extract-to-distribution dep token (departement-locality-name dep) '$LocalityName$))
|
||||
(#t
|
||||
(error "Unexpected element: " token))))
|
||||
|
||||
; Add a pattern to a departement
|
||||
(define (add-pattern dep pattern)
|
||||
(distribution-add-to! (departement-patterns dep) pattern))
|
||||
|
||||
; Train the extractor on the commune name
|
||||
(define (train-commune extractor obj)
|
||||
(define reg (hash-ref extractor (csv-commune-region-id obj)))
|
||||
(define dep (hash-ref (region-departements reg) (csv-commune-departement-id obj)))
|
||||
; Tokenize the name by removing elisions and splitting along the hyphens
|
||||
(define tokens (string-split (remove-elision (csv-commune-name obj)) "-"))
|
||||
; Decompose the tokens into a tree with preposition
|
||||
(set! tokens (foldl preposition-pass tokens *prepositions*))
|
||||
; Decompose the tokens into a tree with articles
|
||||
(set! tokens (foldl article-pass tokens *articles*))
|
||||
; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item
|
||||
(set! tokens (simplify-single (remove-empty tokens)))
|
||||
; Extract the strings into a departement and generate a pattern
|
||||
(add-pattern dep (extract-strings dep tokens)))
|
||||
|
||||
; Add a commune to the extractor, an hash table of regions
|
||||
; Only add if the locality is active and its names are coherent
|
||||
(define (add-commune extractor obj)
|
||||
(when (and (csv-commune-coherent? obj)
|
||||
(csv-commune-active? obj))
|
||||
(add-region extractor obj)
|
||||
(add-departement extractor obj)
|
||||
(train-commune extractor obj)))
|
||||
|
||||
; Table of regions in which the data is stored
|
||||
(define *extractor* (make-hash))
|
||||
|
||||
; Read data file and fill tables
|
||||
; File a CSV with semi-colon (';') as field separator
|
||||
(define *source-file* "./data/fr-esr-referentiel-geographique.csv")
|
||||
(define source-reader
|
||||
(make-csv-reader
|
||||
(open-input-file *source-file*)
|
||||
'((separator-chars . (#\;))
|
||||
(strip-leading-whitespace? . #t)
|
||||
(strip-trailing-whitespace? . #t))))
|
||||
; Get the header
|
||||
(define *header* (source-reader))
|
||||
|
||||
; Fill the extractor
|
||||
(csv-for-each
|
||||
(lambda (line)
|
||||
(add-commune
|
||||
*extractor*
|
||||
(map
|
||||
(lambda (x) (apply cons x))
|
||||
(zip *header* line))))
|
||||
source-reader)
|
||||
|
||||
; Translators for json serialisation
|
||||
(define (pattern-translator x)
|
||||
(cond
|
||||
((symbol? x) (symbol->string x))
|
||||
((list? x) (map pattern-translator x))
|
||||
(else x)))
|
||||
|
||||
(define (markov-translator x)
|
||||
(cond
|
||||
((char? x) (list->string (list x)))
|
||||
(else x)))
|
||||
|
||||
; Output a departement to a json-compatible structure
|
||||
(define (departement->jsexpr dep)
|
||||
(make-immutable-hash
|
||||
`((id . ,(departement-id dep))
|
||||
(name . ,(departement-name dep))
|
||||
(patterns . ,(distribution->jsexpr (departement-patterns dep) #:translator pattern-translator))
|
||||
(word-masculine . ,(distribution->jsexpr (departement-word-masculine dep)))
|
||||
(word-feminine . ,(distribution->jsexpr (departement-word-feminine dep)))
|
||||
(word-plural . ,(distribution->jsexpr (departement-word-plural dep)))
|
||||
(word-masculine-plural . ,(distribution->jsexpr (departement-word-masculine-plural dep)))
|
||||
(word-feminine-plural . ,(distribution->jsexpr (departement-word-feminine-plural dep)))
|
||||
(person-name-masculine . ,(distribution->jsexpr (departement-person-name-masculine dep)))
|
||||
(person-name-feminine . ,(distribution->jsexpr (departement-person-name-feminine dep)))
|
||||
(locality-name . ,(markov->jsexpr (departement-locality-name dep) #:translator markov-translator))
|
||||
(area-name . ,(markov->jsexpr (departement-area-name dep) #:translator markov-translator))
|
||||
(river-name . ,(markov->jsexpr (departement-river-name dep) #:translator markov-translator))
|
||||
)))
|
||||
|
||||
; Output a region to a json-compatible structure
|
||||
(define (region->jsexpr reg)
|
||||
(make-immutable-hash
|
||||
`((id . ,(region-id reg))
|
||||
(name . ,(region-name reg))
|
||||
(departements . ,(map departement->jsexpr (hash-values (region-departements reg))))
|
||||
)))
|
||||
|
||||
; Output the extractor to json-compatible structure
|
||||
(define (extractor->jsexpr extractor)
|
||||
(map region->jsexpr (hash-values extractor)))
|
||||
|
||||
(define *output* (open-output-file "./gener-communes.json" #:exists 'truncate/replace))
|
||||
; output lisp structure
|
||||
(write-json (extractor->jsexpr *extractor*) *output*)
|
|
@ -0,0 +1,228 @@
|
|||
#lang racket
|
||||
|
||||
(require
|
||||
json
|
||||
"markov.rkt"
|
||||
"distribution.rkt")
|
||||
|
||||
; Structure for each region
|
||||
(struct region
|
||||
(id ; id of region
|
||||
name ; name of region
|
||||
departements ; table of departements (id->dep)
|
||||
))
|
||||
|
||||
; Constructor for a region
|
||||
(define (make-region id name)
|
||||
(region id name (make-hash)))
|
||||
|
||||
; Structure for each departement
|
||||
(struct departement
|
||||
(id ; id of departement
|
||||
name ; name of departement
|
||||
patterns ; distribution of patterns
|
||||
word-masculine ; distribution of masculine words
|
||||
word-feminine ; distribution of feminine words
|
||||
word-plural ; distribution of plural words
|
||||
word-masculine-plural ; distribution of masculine plural words
|
||||
word-feminine-plural ; distribution of feminine plural words
|
||||
person-name-masculine ; distributino of masculine person name
|
||||
person-name-feminine ; distribution of feminine person name
|
||||
locality-name ; markov chain for locality names
|
||||
area-name ; markov chain for area names
|
||||
river-name ; markov chain for river names
|
||||
))
|
||||
|
||||
; Constructor for a departement
|
||||
(define (make-departement id name)
|
||||
(departement
|
||||
id
|
||||
name
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-distribution)
|
||||
(make-markov 3)
|
||||
(make-markov 3)
|
||||
(make-markov 3)))
|
||||
|
||||
; Generate names
|
||||
; Replace symbols with a generated element
|
||||
(define (replace-with sym dep)
|
||||
(cond
|
||||
((symbol? sym)
|
||||
(case sym
|
||||
(($WordMasculine$) (distribution-pick-from (departement-word-masculine dep)))
|
||||
(($WordFeminine$) (distribution-pick-from (departement-word-feminine dep)))
|
||||
(($WordPlural$) (distribution-pick-from (departement-word-plural dep)))
|
||||
(($WordFemininePlural$) (distribution-pick-from (departement-word-feminine-plural dep)))
|
||||
(($WordMasculinePlural$) (distribution-pick-from (departement-word-masculine-plural dep)))
|
||||
(($PersonNameMasculine$) (distribution-pick-from (departement-person-name-masculine dep)))
|
||||
(($PersonNameFeminine$) (distribution-pick-from (departement-person-name-feminine dep)))
|
||||
(($LocalityName$) (list->string (markov-generate (departement-locality-name dep))))
|
||||
(($AreaName$) (list->string (markov-generate (departement-area-name dep))))
|
||||
(($RiverName$) (list->string (markov-generate (departement-river-name dep))))
|
||||
(else sym)))
|
||||
((list? sym)
|
||||
(map (lambda (x) (replace-with x dep)) sym))
|
||||
(#t sym)))
|
||||
|
||||
; Indicate if a string starts with a vowel
|
||||
; Accent taken into account
|
||||
(define (start-with-vowel? str)
|
||||
(member
|
||||
(car
|
||||
(string->list
|
||||
(string-normalize-nfd str)))
|
||||
(list #\A #\E #\I #\O #\U #\Y)))
|
||||
|
||||
; Transform the tree structure of a pattern back to a string
|
||||
(define (tree->string pattern)
|
||||
(cond
|
||||
( (list? pattern)
|
||||
(case (car pattern)
|
||||
((el sans la les las los Saint San Sainte Santa)
|
||||
(string-append
|
||||
(symbol->string (car pattern))
|
||||
"-"
|
||||
(tree->string (cadr pattern))))
|
||||
((le)
|
||||
; elision possible: much check the initial of the following word
|
||||
(let ((following (tree->string (cadr pattern))))
|
||||
(string-append
|
||||
(if (start-with-vowel? following)
|
||||
"l'"
|
||||
"le-")
|
||||
following)))
|
||||
((Santo)
|
||||
; elision possible: much check the initial of the following
|
||||
(let ((following (tree->string (cadr pattern))))
|
||||
(string-append
|
||||
(if (start-with-vowel? following)
|
||||
"Sant'"
|
||||
"Santo-")
|
||||
following)))
|
||||
((et sous lès lez près à en di sur)
|
||||
(string-append
|
||||
(tree->string (cadr pattern))
|
||||
"-"
|
||||
(symbol->string (car pattern))
|
||||
"-"
|
||||
(tree->string (caddr pattern))))
|
||||
((de)
|
||||
; ellision possible: must check the initial of the following word
|
||||
(let ((following (tree->string (caddr pattern))))
|
||||
(string-append
|
||||
(tree->string (cadr pattern))
|
||||
"-"
|
||||
(if (start-with-vowel? following)
|
||||
"d'"
|
||||
"de-")
|
||||
following)))
|
||||
(else
|
||||
(string-join
|
||||
(map tree->string pattern)
|
||||
"-"))))
|
||||
( (string? pattern)
|
||||
pattern)
|
||||
(#t
|
||||
(error "Unexpected element: " pattern))))
|
||||
|
||||
; Apply ellision rules
|
||||
(define (apply-ellisions str)
|
||||
(define res str)
|
||||
(set! res (string-replace res "-à-les-" "-aux-"))
|
||||
(set! res (string-replace res "-de-los-" "-dels-")) ; Catalan
|
||||
(set! res (string-replace res "-de-el-" "-del-")) ; Catalan
|
||||
(set! res (string-replace res "-de-les-" "-des-"))
|
||||
(set! res (string-replace res "-de-le-" "-du-"))
|
||||
(set! res (string-replace res "-à-le-" "-au-"))
|
||||
; Start of word
|
||||
(cond
|
||||
((string-prefix? res "la-")
|
||||
(string-replace res "la-" "La " #:all? #f))
|
||||
((string-prefix? res "le-")
|
||||
(string-replace res "le-" "Le " #:all? #f))
|
||||
((string-prefix? res "les-")
|
||||
(string-replace res "les-" "Les " #:all? #f))
|
||||
((string-prefix? res "l'")
|
||||
(string-replace res "l'" "L'" #:all? #f))
|
||||
((string-prefix? res "los-")
|
||||
(string-replace res "los-" "Los " #:all? #f))
|
||||
(#t res)))
|
||||
|
||||
; Generate a new name from a departement
|
||||
(define (departement-generate dep)
|
||||
(apply-ellisions
|
||||
(tree->string
|
||||
(replace-with
|
||||
(distribution-pick-from (departement-patterns dep))
|
||||
dep))))
|
||||
|
||||
; Generate a new name from a region
|
||||
(define (region-generate reg)
|
||||
(departement-generate (car (shuffle (hash-values (region-departements reg))))))
|
||||
|
||||
; Generate a new name
|
||||
(define (generate gen)
|
||||
(region-generate (car (shuffle (hash-values gen)))))
|
||||
|
||||
(define (pattern-translator x)
|
||||
(cond
|
||||
((string? x) (string->symbol x))
|
||||
((list? x) (map pattern-translator x))
|
||||
(#t x)))
|
||||
|
||||
(define (markov-translator x)
|
||||
(cond
|
||||
((string? x) (car (string->list x))) ; single-character strings
|
||||
(#t x)))
|
||||
|
||||
; Convert a json-compatible structure to a departement
|
||||
(define (jsexpr->departement js)
|
||||
(departement
|
||||
(hash-ref js 'id)
|
||||
(hash-ref js 'name)
|
||||
(jsexpr->distribution (hash-ref js 'patterns) #:translator pattern-translator)
|
||||
(jsexpr->distribution (hash-ref js 'word-masculine))
|
||||
(jsexpr->distribution (hash-ref js 'word-feminine))
|
||||
(jsexpr->distribution (hash-ref js 'word-plural))
|
||||
(jsexpr->distribution (hash-ref js 'word-masculine-plural))
|
||||
(jsexpr->distribution (hash-ref js 'word-feminine-plural))
|
||||
(jsexpr->distribution (hash-ref js 'person-name-masculine))
|
||||
(jsexpr->distribution (hash-ref js 'person-name-feminine))
|
||||
(jsexpr->markov (hash-ref js 'locality-name) #:translator markov-translator)
|
||||
(jsexpr->markov (hash-ref js 'area-name) #:translator markov-translator)
|
||||
(jsexpr->markov (hash-ref js 'river-name) #:translator markov-translator)))
|
||||
|
||||
; Convert a json-compatible structure to a region
|
||||
(define (jsexpr->region js)
|
||||
(define reg (make-region (hash-ref js 'id) (hash-ref js 'name)))
|
||||
(for-each
|
||||
(lambda (dep)
|
||||
(hash-set! (region-departements reg) (departement-id dep) dep))
|
||||
(map jsexpr->departement (hash-ref js 'departements)))
|
||||
reg)
|
||||
|
||||
; Convert a json-compatible structure to a generator
|
||||
(define (jsexpr->generator js)
|
||||
(define gen (make-hash))
|
||||
(for-each
|
||||
(lambda (reg)
|
||||
(hash-set! gen (region-id reg) reg))
|
||||
(map jsexpr->region js))
|
||||
gen)
|
||||
|
||||
; Read the file containing all the data
|
||||
(define *input* (open-input-file "./gener-communes.json"))
|
||||
|
||||
; Process the input and train
|
||||
(define *generator* (jsexpr->generator (read-json *input*)))
|
||||
|
||||
(for-each
|
||||
displayln
|
||||
(build-list 100 (lambda (_) (generate *generator*))))
|
Loading…
Reference in New Issue