gener-communes/extractor.rkt

422 lines
15 KiB
Racket

#lang racket
; Extracts data from CSV and generate a json file suitable for generation
; The output json file contains a list of regions, each containing a list of departements :
; [ {
; "id": "...", "name": "...",
; "departements":
; [ {
; "id": "...", "name": "...",
; "word-masculine": [{"value": "...", "count": X}, ...],
; "word-feminine": [{"value": "...", "count": X}, ...],
; "word-plural": [{"value": "...", "count": X}, ...],
; "word-masculine-plural": [{"value": "...", "count": X}, ...],
; "word-feminine-plural": [{"value": "...", "count": X}, ...],
; "person-name-masculine": [{"value": "...", "count": X}, ...],
; "person-name-feminine": [{"value": "...", "count": X}, ...],
; "patterns": [{"value": [...], "count": X}, ...],
; "locality-name": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
; "area-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
; "river-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
; },
; ...
; ],
; },
; ...
; ]
; The distributions and markov chains are trained are by departement
(require
csv-reading
json
srfi/1
"markov.rkt"
"distribution.rkt")
; Structure for each region
(struct region
(id ; id of region
name ; name of region
departements ; table of departements (id->dep)
))
; Constructor for a region
(define (make-region id name)
(region id name (make-hash)))
; Structure for each departement
(struct departement
(id ; id of departement
name ; name of departement
patterns ; distribution of patterns
word-masculine ; distribution of masculine words
word-feminine ; distribution of feminine words
word-plural ; distribution of plural words
word-masculine-plural ; distribution of masculine plural words
word-feminine-plural ; distribution of feminine plural words
person-name-masculine ; distributino of masculine person name
person-name-feminine ; distribution of feminine person name
locality-name ; markov chain for locality names
area-name ; markov chain for area names
river-name ; markov chain for river names
))
; Constructor for a departement
(define (make-departement id name)
(departement
id
name
(make-distribution)
(make-distribution)
(make-distribution)
(make-distribution)
(make-distribution)
(make-distribution)
(make-distribution)
(make-distribution)
(make-markov 3)
(make-markov 3)
(make-markov 3)))
;;
; Extraction tools
; Accessors of a csv-commune object (dict of line extracted from the csv file)
; Name
(define (csv-commune-name obj)
(dict-ref obj "COM_NOM"))
; Uppercase name
(define (csv-commune-name-maj obj)
(dict-ref obj "COM_NOM_MAJ"))
; Uppercase short name
(define (csv-commune-name-maj-court obj)
(dict-ref obj "COM_NOM_MAJ_COURT"))
; Region Id
(define (csv-commune-region-id obj)
(dict-ref obj "REG_ID_OLD"))
; Region name
(define (csv-commune-region-name obj)
(dict-ref obj "REG_NOM_OLD"))
; Departement Id
(define (csv-commune-departement-id obj)
(dict-ref obj "DEP_ID"))
; Departement name
(define (csv-commune-departement-name obj)
(dict-ref obj "DEP_NOM"))
; Indicate if a commune is active (still exists)
(define (csv-commune-active? obj)
(not (equal? "SO" (dict-ref obj "UUCR_ID"))))
; Function removing all diacrtics of latin-based languages
(define (remove-diacritics str)
(list->string
(filter
(lambda (c)
(< (char->integer c) 256))
(string->list
(string-normalize-nfd str)))))
; Return the short form of a name
(define (short-form str)
(string-join
(map
(lambda (x)
(cond
((equal? x "SAINT") "ST")
((equal? x "SAINTE") "STE")
(#t x)))
(string-split
str
#rx"[-']"))
" "))
; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name
(define (csv-commune-coherent? obj)
(define name (csv-commune-name obj))
(define name-maj (csv-commune-name-maj obj))
(define name-maj-court (csv-commune-name-maj-court obj))
(and
; uppercase name should be name but capitalized and diacritics removed
(equal? name-maj
(string-upcase (remove-diacritics name)))
; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE "
(equal? name-maj-court
(short-form name-maj))))
; Add a region to the extractor
(define (add-region extractor obj)
(define id (csv-commune-region-id obj))
(when (not (hash-has-key? extractor id))
(hash-set! extractor id (make-region id (csv-commune-region-name obj)))))
; Add a departement to the extractor
(define (add-departement extractor obj)
(define reg (hash-ref extractor (csv-commune-region-id obj)))
(define id (csv-commune-departement-id obj))
(when (not (hash-has-key? (region-departements reg) id))
(hash-set! (region-departements reg) id (make-departement id (csv-commune-departement-name obj)))))
; Remove elision
(define *elisions*
'(("-l'" . "-le-") ; masculine is used - though "l'" can also be an elision for the feminine "la"
("-d'" . "-de-")
("-des-" . "-de-les-")
("-du-" . "-de-le-")
("-del-" . "-de-el-") ; Catalan
("-dels-" . "-de-los-") ; Catalan
("-au-" . "-à-le-")
("-aux-" . "-à-les-")
("Sant'" . "Santo-") ; Corsican
; Initials articles are replaced with lowercase variants
("La " . "la-")
("Le " . "le-")
("Les " . "les-")
("L'" . "le-")
("Los " . "los-") ; Catalan
))
(define (remove-elision str)
(foldl
(lambda (item result)
(string-replace result (car item) (cdr item)))
str
*elisions*))
; Transform preposition into binary operators between lists:
; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z))
; Note: recursive on lists
(define *prepositions*
'("et"
"à"
"en"
"de"
"di" ; Corsican
"sur"
"sous"
"lès"
"lez"
"près"))
(define (preposition-pass prep tokens)
(if (list? tokens)
(let ((idx (index-of tokens prep)))
(if (and idx (< (+ idx 1) (length tokens))) ; preposition cannot be in last position
(list
(string->symbol prep)
(take tokens idx)
(preposition-pass prep (drop tokens (+ 1 idx))))
(map (lambda (x) (preposition-pass prep x)) tokens)))
tokens))
; Transform articles (and article-like words as "Saint") into unary operators
; ex: ("le" X Y) => ((le X) Y)
(define *articles*
'("le"
"el" ; Catalan/Spanish
"la"
"les"
"las" ; Catalan
"los" ; Catalan
"Saint"
"Santo" ; Corsican
"San" ; Corsican
"Sainte"
"Santa" ; Corsican
"sans"))
(define (article-pass art tokens)
(if (list? tokens)
(let ((idx (index-of tokens art)))
(if (and idx (< (+ idx 1) (length tokens))) ; article cannot be in last position
(list
(take tokens idx)
(list
(string->symbol art)
(list-ref tokens (+ 1 idx)))
(article-pass art (drop tokens (+ 2 idx))))
(map (lambda (x) (article-pass art x)) tokens)))
tokens))
; Simplify: remove empty lists
(define (remove-empty tokens)
(if (list? tokens)
(map
remove-empty
(filter (lambda (x) (not (null? x))) tokens))
tokens))
; Simplify: lists of single elements are replaced by that element
(define (simplify-single tokens)
(if (list? tokens)
(if (eq? 1 (length tokens))
(simplify-single (car tokens))
(map simplify-single tokens))
tokens))
; Extract the strings and generate a pattern
; The string are extracted into the different tables according to the following rules
; Single string
; X => X = locality name
; Unary operators
; (le X), (el X), (sans X) => X = masculine word
; (la X) => X = feminine word
; (les X) => X = plural word
; (las X) => X = feminine plural word
; (los X) => X = masculine plural word
; (Saint X), (Santo X), (San X) => X = masculine name
; (Sainte X), (Santa X) => X = feminine name
; Binary operators
; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name
; (à X Y), (en X Y), (de X Y), (di X Y) => X = locality name, Y = area name
; (sur X Y) => X = locality name, Y = river name
; During extraction, the strings are replaced by symbols of the corresponding type to form patterns
(define (extract-to-distribution dep token dist sym)
(if (string? token)
(begin
(case sym
(($LocalityName$ $AreaName$ $RiverName$)
; Those are added to a markov chain
(markov-add-example! dist (string->list (string-titlecase token))))
(else
; added to a distribution
(distribution-add-to! dist token)))
sym)
(extract-strings dep token)))
; Extraction of unary operators
(define (unary-operator dep token dist sym)
(list (car token) (extract-to-distribution dep (cadr token) dist sym)))
; Extraction of binary operators
(define (binary-operator dep token dist1 sym1 dist2 sym2)
(list (car token) (extract-to-distribution dep (cadr token) dist1 sym1) (extract-to-distribution dep (caddr token) dist2 sym2)))
(define (extract-strings dep token)
(cond
( (list? token)
(case (car token)
((le el sans)
(unary-operator dep token (departement-word-masculine dep) '$WordMasculine$))
((la)
(unary-operator dep token (departement-word-feminine dep) '$WordFeminine$))
((les)
(unary-operator dep token (departement-word-plural dep) '$WordPlural$))
((las)
(unary-operator dep token (departement-word-feminine-plural dep) '$WordFemininePlural$))
((los)
(unary-operator dep token (departement-word-masculine-plural dep) '$WordMasculinePlural$))
((Saint Santo San)
(unary-operator dep token (departement-person-name-masculine dep) '$PersonNameMasculine$))
((Sainte Santa)
(unary-operator dep token (departement-person-name-feminine dep) '$PersonNameFeminine$))
((et sous lès lez près)
(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-locality-name dep) '$LocalityName$))
((à en de di)
(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-area-name dep) '$AreaName$))
((sur)
(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-river-name dep) '$RiverName$))
(else
(map (lambda (x) (extract-strings dep x)) token))
))
( (string? token)
(extract-to-distribution dep token (departement-locality-name dep) '$LocalityName$))
(#t
(error "Unexpected element: " token))))
; Add a pattern to a departement
(define (add-pattern dep pattern)
(distribution-add-to! (departement-patterns dep) pattern))
; Train the extractor on the commune name
(define (train-commune extractor obj)
(define reg (hash-ref extractor (csv-commune-region-id obj)))
(define dep (hash-ref (region-departements reg) (csv-commune-departement-id obj)))
; Tokenize the name by removing elisions and splitting along the hyphens
(define tokens (string-split (remove-elision (csv-commune-name obj)) "-"))
; Decompose the tokens into a tree with preposition
(set! tokens (foldl preposition-pass tokens *prepositions*))
; Decompose the tokens into a tree with articles
(set! tokens (foldl article-pass tokens *articles*))
; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item
(set! tokens (simplify-single (remove-empty tokens)))
; Extract the strings into a departement and generate a pattern
(add-pattern dep (extract-strings dep tokens)))
; Add a commune to the extractor, an hash table of regions
; Only add if the locality is active and its names are coherent
(define (add-commune extractor obj)
(when (and (csv-commune-coherent? obj)
(csv-commune-active? obj))
(add-region extractor obj)
(add-departement extractor obj)
(train-commune extractor obj)))
; Table of regions in which the data is stored
(define *extractor* (make-hash))
; Read data file and fill tables
; File a CSV with semi-colon (';') as field separator
(define *source-file* "./data/fr-esr-referentiel-geographique.csv")
(define source-reader
(make-csv-reader
(open-input-file *source-file*)
'((separator-chars . (#\;))
(strip-leading-whitespace? . #t)
(strip-trailing-whitespace? . #t))))
; Get the header
(define *header* (source-reader))
; Fill the extractor
(csv-for-each
(lambda (line)
(add-commune
*extractor*
(map
(lambda (x) (apply cons x))
(zip *header* line))))
source-reader)
; Translators for json serialisation
(define (pattern-translator x)
(cond
((symbol? x) (symbol->string x))
((list? x) (map pattern-translator x))
(else x)))
(define (markov-translator x)
(cond
((char? x) (list->string (list x)))
(else x)))
; Output a departement to a json-compatible structure
(define (departement->jsexpr dep)
(make-immutable-hash
`((id . ,(departement-id dep))
(name . ,(departement-name dep))
(patterns . ,(distribution->jsexpr (departement-patterns dep) #:translator pattern-translator))
(word-masculine . ,(distribution->jsexpr (departement-word-masculine dep)))
(word-feminine . ,(distribution->jsexpr (departement-word-feminine dep)))
(word-plural . ,(distribution->jsexpr (departement-word-plural dep)))
(word-masculine-plural . ,(distribution->jsexpr (departement-word-masculine-plural dep)))
(word-feminine-plural . ,(distribution->jsexpr (departement-word-feminine-plural dep)))
(person-name-masculine . ,(distribution->jsexpr (departement-person-name-masculine dep)))
(person-name-feminine . ,(distribution->jsexpr (departement-person-name-feminine dep)))
(locality-name . ,(markov->jsexpr (departement-locality-name dep) #:translator markov-translator))
(area-name . ,(markov->jsexpr (departement-area-name dep) #:translator markov-translator))
(river-name . ,(markov->jsexpr (departement-river-name dep) #:translator markov-translator))
)))
; Output a region to a json-compatible structure
(define (region->jsexpr reg)
(make-immutable-hash
`((id . ,(region-id reg))
(name . ,(region-name reg))
(departements . ,(map departement->jsexpr (hash-values (region-departements reg))))
)))
; Output the extractor to json-compatible structure
(define (extractor->jsexpr extractor)
(map region->jsexpr (hash-values extractor)))
(define *output* (open-output-file "./web/gener-communes.json" #:exists 'truncate/replace))
; output lisp structure
(write-json (extractor->jsexpr *extractor*) *output*)