422 lines
15 KiB
Racket
422 lines
15 KiB
Racket
#lang racket
|
|
|
|
; Extracts data from CSV and generate a json file suitable for generation
|
|
; The output json file contains a list of regions, each containing a list of departements :
|
|
; [ {
|
|
; "id": "...", "name": "...",
|
|
; "departements":
|
|
; [ {
|
|
; "id": "...", "name": "...",
|
|
; "word-masculine": [{"value": "...", "count": X}, ...],
|
|
; "word-feminine": [{"value": "...", "count": X}, ...],
|
|
; "word-plural": [{"value": "...", "count": X}, ...],
|
|
; "word-masculine-plural": [{"value": "...", "count": X}, ...],
|
|
; "word-feminine-plural": [{"value": "...", "count": X}, ...],
|
|
; "person-name-masculine": [{"value": "...", "count": X}, ...],
|
|
; "person-name-feminine": [{"value": "...", "count": X}, ...],
|
|
; "patterns": [{"value": [...], "count": X}, ...],
|
|
; "locality-name": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
|
|
; "area-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
|
|
; "river-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
|
|
; },
|
|
; ...
|
|
; ],
|
|
; },
|
|
; ...
|
|
; ]
|
|
; The distributions and markov chains are trained are by departement
|
|
|
|
(require
|
|
csv-reading
|
|
json
|
|
srfi/1
|
|
"markov.rkt"
|
|
"distribution.rkt")
|
|
|
|
; Structure for each region
|
|
(struct region
|
|
(id ; id of region
|
|
name ; name of region
|
|
departements ; table of departements (id->dep)
|
|
))
|
|
|
|
; Constructor for a region
|
|
(define (make-region id name)
|
|
(region id name (make-hash)))
|
|
|
|
; Structure for each departement
|
|
(struct departement
|
|
(id ; id of departement
|
|
name ; name of departement
|
|
patterns ; distribution of patterns
|
|
word-masculine ; distribution of masculine words
|
|
word-feminine ; distribution of feminine words
|
|
word-plural ; distribution of plural words
|
|
word-masculine-plural ; distribution of masculine plural words
|
|
word-feminine-plural ; distribution of feminine plural words
|
|
person-name-masculine ; distributino of masculine person name
|
|
person-name-feminine ; distribution of feminine person name
|
|
locality-name ; markov chain for locality names
|
|
area-name ; markov chain for area names
|
|
river-name ; markov chain for river names
|
|
))
|
|
|
|
; Constructor for a departement
|
|
(define (make-departement id name)
|
|
(departement
|
|
id
|
|
name
|
|
(make-distribution)
|
|
(make-distribution)
|
|
(make-distribution)
|
|
(make-distribution)
|
|
(make-distribution)
|
|
(make-distribution)
|
|
(make-distribution)
|
|
(make-distribution)
|
|
(make-markov 3)
|
|
(make-markov 3)
|
|
(make-markov 3)))
|
|
|
|
;;
|
|
; Extraction tools
|
|
|
|
; Accessors of a csv-commune object (dict of line extracted from the csv file)
|
|
; Name
|
|
(define (csv-commune-name obj)
|
|
(dict-ref obj "COM_NOM"))
|
|
; Uppercase name
|
|
(define (csv-commune-name-maj obj)
|
|
(dict-ref obj "COM_NOM_MAJ"))
|
|
; Uppercase short name
|
|
(define (csv-commune-name-maj-court obj)
|
|
(dict-ref obj "COM_NOM_MAJ_COURT"))
|
|
; Region Id
|
|
(define (csv-commune-region-id obj)
|
|
(dict-ref obj "REG_ID_OLD"))
|
|
; Region name
|
|
(define (csv-commune-region-name obj)
|
|
(dict-ref obj "REG_NOM_OLD"))
|
|
; Departement Id
|
|
(define (csv-commune-departement-id obj)
|
|
(dict-ref obj "DEP_ID"))
|
|
; Departement name
|
|
(define (csv-commune-departement-name obj)
|
|
(dict-ref obj "DEP_NOM"))
|
|
; Indicate if a commune is active (still exists)
|
|
(define (csv-commune-active? obj)
|
|
(not (equal? "SO" (dict-ref obj "UUCR_ID"))))
|
|
|
|
; Function removing all diacrtics of latin-based languages
|
|
(define (remove-diacritics str)
|
|
(list->string
|
|
(filter
|
|
(lambda (c)
|
|
(< (char->integer c) 256))
|
|
(string->list
|
|
(string-normalize-nfd str)))))
|
|
|
|
; Return the short form of a name
|
|
(define (short-form str)
|
|
(string-join
|
|
(map
|
|
(lambda (x)
|
|
(cond
|
|
((equal? x "SAINT") "ST")
|
|
((equal? x "SAINTE") "STE")
|
|
(#t x)))
|
|
(string-split
|
|
str
|
|
#rx"[-']"))
|
|
" "))
|
|
|
|
; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name
|
|
(define (csv-commune-coherent? obj)
|
|
(define name (csv-commune-name obj))
|
|
(define name-maj (csv-commune-name-maj obj))
|
|
(define name-maj-court (csv-commune-name-maj-court obj))
|
|
(and
|
|
; uppercase name should be name but capitalized and diacritics removed
|
|
(equal? name-maj
|
|
(string-upcase (remove-diacritics name)))
|
|
; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE "
|
|
(equal? name-maj-court
|
|
(short-form name-maj))))
|
|
|
|
; Add a region to the extractor
|
|
(define (add-region extractor obj)
|
|
(define id (csv-commune-region-id obj))
|
|
(when (not (hash-has-key? extractor id))
|
|
(hash-set! extractor id (make-region id (csv-commune-region-name obj)))))
|
|
|
|
; Add a departement to the extractor
|
|
(define (add-departement extractor obj)
|
|
(define reg (hash-ref extractor (csv-commune-region-id obj)))
|
|
(define id (csv-commune-departement-id obj))
|
|
(when (not (hash-has-key? (region-departements reg) id))
|
|
(hash-set! (region-departements reg) id (make-departement id (csv-commune-departement-name obj)))))
|
|
|
|
; Remove elision
|
|
(define *elisions*
|
|
'(("-l'" . "-le-") ; masculine is used - though "l'" can also be an elision for the feminine "la"
|
|
("-d'" . "-de-")
|
|
("-des-" . "-de-les-")
|
|
("-du-" . "-de-le-")
|
|
("-del-" . "-de-el-") ; Catalan
|
|
("-dels-" . "-de-los-") ; Catalan
|
|
("-au-" . "-à-le-")
|
|
("-aux-" . "-à-les-")
|
|
("Sant'" . "Santo-") ; Corsican
|
|
; Initials articles are replaced with lowercase variants
|
|
("La " . "la-")
|
|
("Le " . "le-")
|
|
("Les " . "les-")
|
|
("L'" . "le-")
|
|
("Los " . "los-") ; Catalan
|
|
))
|
|
(define (remove-elision str)
|
|
(foldl
|
|
(lambda (item result)
|
|
(string-replace result (car item) (cdr item)))
|
|
str
|
|
*elisions*))
|
|
|
|
; Transform preposition into binary operators between lists:
|
|
; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z))
|
|
; Note: recursive on lists
|
|
(define *prepositions*
|
|
'("et"
|
|
"à"
|
|
"en"
|
|
"de"
|
|
"di" ; Corsican
|
|
"sur"
|
|
"sous"
|
|
"lès"
|
|
"lez"
|
|
"près"))
|
|
(define (preposition-pass prep tokens)
|
|
(if (list? tokens)
|
|
(let ((idx (index-of tokens prep)))
|
|
(if (and idx (< (+ idx 1) (length tokens))) ; preposition cannot be in last position
|
|
(list
|
|
(string->symbol prep)
|
|
(take tokens idx)
|
|
(preposition-pass prep (drop tokens (+ 1 idx))))
|
|
(map (lambda (x) (preposition-pass prep x)) tokens)))
|
|
tokens))
|
|
|
|
; Transform articles (and article-like words as "Saint") into unary operators
|
|
; ex: ("le" X Y) => ((le X) Y)
|
|
(define *articles*
|
|
'("le"
|
|
"el" ; Catalan/Spanish
|
|
"la"
|
|
"les"
|
|
"las" ; Catalan
|
|
"los" ; Catalan
|
|
"Saint"
|
|
"Santo" ; Corsican
|
|
"San" ; Corsican
|
|
"Sainte"
|
|
"Santa" ; Corsican
|
|
"sans"))
|
|
(define (article-pass art tokens)
|
|
(if (list? tokens)
|
|
(let ((idx (index-of tokens art)))
|
|
(if (and idx (< (+ idx 1) (length tokens))) ; article cannot be in last position
|
|
(list
|
|
(take tokens idx)
|
|
(list
|
|
(string->symbol art)
|
|
(list-ref tokens (+ 1 idx)))
|
|
(article-pass art (drop tokens (+ 2 idx))))
|
|
(map (lambda (x) (article-pass art x)) tokens)))
|
|
tokens))
|
|
|
|
; Simplify: remove empty lists
|
|
(define (remove-empty tokens)
|
|
(if (list? tokens)
|
|
(map
|
|
remove-empty
|
|
(filter (lambda (x) (not (null? x))) tokens))
|
|
tokens))
|
|
|
|
; Simplify: lists of single elements are replaced by that element
|
|
(define (simplify-single tokens)
|
|
(if (list? tokens)
|
|
(if (eq? 1 (length tokens))
|
|
(simplify-single (car tokens))
|
|
(map simplify-single tokens))
|
|
tokens))
|
|
|
|
; Extract the strings and generate a pattern
|
|
; The string are extracted into the different tables according to the following rules
|
|
; Single string
|
|
; X => X = locality name
|
|
; Unary operators
|
|
; (le X), (el X), (sans X) => X = masculine word
|
|
; (la X) => X = feminine word
|
|
; (les X) => X = plural word
|
|
; (las X) => X = feminine plural word
|
|
; (los X) => X = masculine plural word
|
|
; (Saint X), (Santo X), (San X) => X = masculine name
|
|
; (Sainte X), (Santa X) => X = feminine name
|
|
; Binary operators
|
|
; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name
|
|
; (à X Y), (en X Y), (de X Y), (di X Y) => X = locality name, Y = area name
|
|
; (sur X Y) => X = locality name, Y = river name
|
|
|
|
; During extraction, the strings are replaced by symbols of the corresponding type to form patterns
|
|
(define (extract-to-distribution dep token dist sym)
|
|
(if (string? token)
|
|
(begin
|
|
(case sym
|
|
(($LocalityName$ $AreaName$ $RiverName$)
|
|
; Those are added to a markov chain
|
|
(markov-add-example! dist (string->list (string-titlecase token))))
|
|
(else
|
|
; added to a distribution
|
|
(distribution-add-to! dist token)))
|
|
sym)
|
|
(extract-strings dep token)))
|
|
|
|
; Extraction of unary operators
|
|
(define (unary-operator dep token dist sym)
|
|
(list (car token) (extract-to-distribution dep (cadr token) dist sym)))
|
|
|
|
; Extraction of binary operators
|
|
(define (binary-operator dep token dist1 sym1 dist2 sym2)
|
|
(list (car token) (extract-to-distribution dep (cadr token) dist1 sym1) (extract-to-distribution dep (caddr token) dist2 sym2)))
|
|
|
|
(define (extract-strings dep token)
|
|
(cond
|
|
( (list? token)
|
|
(case (car token)
|
|
((le el sans)
|
|
(unary-operator dep token (departement-word-masculine dep) '$WordMasculine$))
|
|
((la)
|
|
(unary-operator dep token (departement-word-feminine dep) '$WordFeminine$))
|
|
((les)
|
|
(unary-operator dep token (departement-word-plural dep) '$WordPlural$))
|
|
((las)
|
|
(unary-operator dep token (departement-word-feminine-plural dep) '$WordFemininePlural$))
|
|
((los)
|
|
(unary-operator dep token (departement-word-masculine-plural dep) '$WordMasculinePlural$))
|
|
((Saint Santo San)
|
|
(unary-operator dep token (departement-person-name-masculine dep) '$PersonNameMasculine$))
|
|
((Sainte Santa)
|
|
(unary-operator dep token (departement-person-name-feminine dep) '$PersonNameFeminine$))
|
|
((et sous lès lez près)
|
|
(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-locality-name dep) '$LocalityName$))
|
|
((à en de di)
|
|
(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-area-name dep) '$AreaName$))
|
|
((sur)
|
|
(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-river-name dep) '$RiverName$))
|
|
(else
|
|
(map (lambda (x) (extract-strings dep x)) token))
|
|
))
|
|
( (string? token)
|
|
(extract-to-distribution dep token (departement-locality-name dep) '$LocalityName$))
|
|
(#t
|
|
(error "Unexpected element: " token))))
|
|
|
|
; Add a pattern to a departement
|
|
(define (add-pattern dep pattern)
|
|
(distribution-add-to! (departement-patterns dep) pattern))
|
|
|
|
; Train the extractor on the commune name
|
|
(define (train-commune extractor obj)
|
|
(define reg (hash-ref extractor (csv-commune-region-id obj)))
|
|
(define dep (hash-ref (region-departements reg) (csv-commune-departement-id obj)))
|
|
; Tokenize the name by removing elisions and splitting along the hyphens
|
|
(define tokens (string-split (remove-elision (csv-commune-name obj)) "-"))
|
|
; Decompose the tokens into a tree with preposition
|
|
(set! tokens (foldl preposition-pass tokens *prepositions*))
|
|
; Decompose the tokens into a tree with articles
|
|
(set! tokens (foldl article-pass tokens *articles*))
|
|
; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item
|
|
(set! tokens (simplify-single (remove-empty tokens)))
|
|
; Extract the strings into a departement and generate a pattern
|
|
(add-pattern dep (extract-strings dep tokens)))
|
|
|
|
; Add a commune to the extractor, an hash table of regions
|
|
; Only add if the locality is active and its names are coherent
|
|
(define (add-commune extractor obj)
|
|
(when (and (csv-commune-coherent? obj)
|
|
(csv-commune-active? obj))
|
|
(add-region extractor obj)
|
|
(add-departement extractor obj)
|
|
(train-commune extractor obj)))
|
|
|
|
; Table of regions in which the data is stored
|
|
(define *extractor* (make-hash))
|
|
|
|
; Read data file and fill tables
|
|
; File a CSV with semi-colon (';') as field separator
|
|
(define *source-file* "./data/fr-esr-referentiel-geographique.csv")
|
|
(define source-reader
|
|
(make-csv-reader
|
|
(open-input-file *source-file*)
|
|
'((separator-chars . (#\;))
|
|
(strip-leading-whitespace? . #t)
|
|
(strip-trailing-whitespace? . #t))))
|
|
; Get the header
|
|
(define *header* (source-reader))
|
|
|
|
; Fill the extractor
|
|
(csv-for-each
|
|
(lambda (line)
|
|
(add-commune
|
|
*extractor*
|
|
(map
|
|
(lambda (x) (apply cons x))
|
|
(zip *header* line))))
|
|
source-reader)
|
|
|
|
; Translators for json serialisation
|
|
(define (pattern-translator x)
|
|
(cond
|
|
((symbol? x) (symbol->string x))
|
|
((list? x) (map pattern-translator x))
|
|
(else x)))
|
|
|
|
(define (markov-translator x)
|
|
(cond
|
|
((char? x) (list->string (list x)))
|
|
(else x)))
|
|
|
|
; Output a departement to a json-compatible structure
|
|
(define (departement->jsexpr dep)
|
|
(make-immutable-hash
|
|
`((id . ,(departement-id dep))
|
|
(name . ,(departement-name dep))
|
|
(patterns . ,(distribution->jsexpr (departement-patterns dep) #:translator pattern-translator))
|
|
(word-masculine . ,(distribution->jsexpr (departement-word-masculine dep)))
|
|
(word-feminine . ,(distribution->jsexpr (departement-word-feminine dep)))
|
|
(word-plural . ,(distribution->jsexpr (departement-word-plural dep)))
|
|
(word-masculine-plural . ,(distribution->jsexpr (departement-word-masculine-plural dep)))
|
|
(word-feminine-plural . ,(distribution->jsexpr (departement-word-feminine-plural dep)))
|
|
(person-name-masculine . ,(distribution->jsexpr (departement-person-name-masculine dep)))
|
|
(person-name-feminine . ,(distribution->jsexpr (departement-person-name-feminine dep)))
|
|
(locality-name . ,(markov->jsexpr (departement-locality-name dep) #:translator markov-translator))
|
|
(area-name . ,(markov->jsexpr (departement-area-name dep) #:translator markov-translator))
|
|
(river-name . ,(markov->jsexpr (departement-river-name dep) #:translator markov-translator))
|
|
)))
|
|
|
|
; Output a region to a json-compatible structure
|
|
(define (region->jsexpr reg)
|
|
(make-immutable-hash
|
|
`((id . ,(region-id reg))
|
|
(name . ,(region-name reg))
|
|
(departements . ,(map departement->jsexpr (hash-values (region-departements reg))))
|
|
)))
|
|
|
|
; Output the extractor to json-compatible structure
|
|
(define (extractor->jsexpr extractor)
|
|
(map region->jsexpr (hash-values extractor)))
|
|
|
|
(define *output* (open-output-file "./web/gener-communes.json" #:exists 'truncate/replace))
|
|
; output lisp structure
|
|
(write-json (extractor->jsexpr *extractor*) *output*)
|