2019-11-04 17:23:08 +01:00
#lang racket
; Extracts data from CSV and generate a json file suitable for generation
; The output json file contains a list of regions, each containing a list of departements :
; [ {
; "id": "...", "name": "...",
; "departements":
; [ {
; "id": "...", "name": "...",
; "word-masculine": [{"value": "...", "count": X}, ...],
; "word-feminine": [{"value": "...", "count": X}, ...],
; "word-plural": [{"value": "...", "count": X}, ...],
; "word-masculine-plural": [{"value": "...", "count": X}, ...],
; "word-feminine-plural": [{"value": "...", "count": X}, ...],
; "person-name-masculine": [{"value": "...", "count": X}, ...],
; "person-name-feminine": [{"value": "...", "count": X}, ...],
; "patterns": [{"value": [...], "count": X}, ...],
2019-11-05 17:39:29 +01:00
; "locality-name": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
; "area-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
; "river-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
2019-11-04 17:23:08 +01:00
; },
; ...
; ],
; },
; ...
; ]
; The distributions and markov chains are trained are by departement
( require
csv-reading
json
srfi/1
" markov.rkt "
" distribution.rkt " )
; Structure for each region
( struct region
( id ; id of region
name ; name of region
departements ; table of departements (id->dep)
) )
; Constructor for a region
( define ( make-region id name )
( region id name ( make-hash ) ) )
; Structure for each departement
( struct departement
( id ; id of departement
name ; name of departement
patterns ; distribution of patterns
word-masculine ; distribution of masculine words
word-feminine ; distribution of feminine words
word-plural ; distribution of plural words
word-masculine-plural ; distribution of masculine plural words
word-feminine-plural ; distribution of feminine plural words
person-name-masculine ; distributino of masculine person name
person-name-feminine ; distribution of feminine person name
locality-name ; markov chain for locality names
area-name ; markov chain for area names
river-name ; markov chain for river names
) )
; Constructor for a departement
( define ( make-departement id name )
( departement
id
name
( make-distribution )
( make-distribution )
( make-distribution )
( make-distribution )
( make-distribution )
( make-distribution )
( make-distribution )
( make-distribution )
( make-markov 3 )
( make-markov 3 )
( make-markov 3 ) ) )
;;
; Extraction tools
; Accessors of a csv-commune object (dict of line extracted from the csv file)
; Name
( define ( csv-commune-name obj )
( dict-ref obj " COM_NOM " ) )
; Uppercase name
( define ( csv-commune-name-maj obj )
( dict-ref obj " COM_NOM_MAJ " ) )
; Uppercase short name
( define ( csv-commune-name-maj-court obj )
( dict-ref obj " COM_NOM_MAJ_COURT " ) )
; Region Id
( define ( csv-commune-region-id obj )
( dict-ref obj " REG_ID_OLD " ) )
; Region name
( define ( csv-commune-region-name obj )
( dict-ref obj " REG_NOM_OLD " ) )
; Departement Id
( define ( csv-commune-departement-id obj )
( dict-ref obj " DEP_ID " ) )
; Departement name
( define ( csv-commune-departement-name obj )
( dict-ref obj " DEP_NOM " ) )
; Indicate if a commune is active (still exists)
( define ( csv-commune-active? obj )
( not ( equal? " SO " ( dict-ref obj " UUCR_ID " ) ) ) )
; Function removing all diacrtics of latin-based languages
( define ( remove-diacritics str )
( list->string
( filter
( lambda ( c )
( < ( char->integer c ) 256 ) )
( string->list
( string-normalize-nfd str ) ) ) ) )
; Return the short form of a name
( define ( short-form str )
( string-join
( map
( lambda ( x )
( cond
( ( equal? x " SAINT " ) " ST " )
( ( equal? x " SAINTE " ) " STE " )
( #t x ) ) )
( string-split
str
#rx"[-']" ) )
" " ) )
; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name
( define ( csv-commune-coherent? obj )
( define name ( csv-commune-name obj ) )
( define name-maj ( csv-commune-name-maj obj ) )
( define name-maj-court ( csv-commune-name-maj-court obj ) )
( and
; uppercase name should be name but capitalized and diacritics removed
( equal? name-maj
( string-upcase ( remove-diacritics name ) ) )
; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE "
( equal? name-maj-court
( short-form name-maj ) ) ) )
; Add a region to the extractor
( define ( add-region extractor obj )
( define id ( csv-commune-region-id obj ) )
( when ( not ( hash-has-key? extractor id ) )
( hash-set! extractor id ( make-region id ( csv-commune-region-name obj ) ) ) ) )
; Add a departement to the extractor
( define ( add-departement extractor obj )
( define reg ( hash-ref extractor ( csv-commune-region-id obj ) ) )
( define id ( csv-commune-departement-id obj ) )
( when ( not ( hash-has-key? ( region-departements reg ) id ) )
( hash-set! ( region-departements reg ) id ( make-departement id ( csv-commune-departement-name obj ) ) ) ) )
; Remove elision
( define *elisions*
' ( ( " -l' " . " -le- " ) ; masculine is used - though "l'" can also be an elision for the feminine "la"
( " -d' " . " -de- " )
( " -des- " . " -de-les- " )
( " -du- " . " -de-le- " )
( " -del- " . " -de-el- " ) ; Catalan
( " -dels- " . " -de-los- " ) ; Catalan
( " -au- " . " -à-le- " )
( " -aux- " . " -à-les- " )
( " Sant' " . " Santo- " ) ; Corsican
; Initials articles are replaced with lowercase variants
( " La " . " la- " )
( " Le " . " le- " )
( " Les " . " les- " )
( " L' " . " le- " )
( " Los " . " los- " ) ; Catalan
) )
( define ( remove-elision str )
( foldl
( lambda ( item result )
( string-replace result ( car item ) ( cdr item ) ) )
str
*elisions* ) )
; Transform preposition into binary operators between lists:
; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z))
; Note: recursive on lists
( define *prepositions*
' ( " et "
" à "
" en "
" de "
" di " ; Corsican
" sur "
" sous "
" lès "
" lez "
" près " ) )
( define ( preposition-pass prep tokens )
( if ( list? tokens )
( let ( ( idx ( index-of tokens prep ) ) )
( if ( and idx ( < ( + idx 1 ) ( length tokens ) ) ) ; preposition cannot be in last position
( list
( string->symbol prep )
( take tokens idx )
( preposition-pass prep ( drop tokens ( + 1 idx ) ) ) )
( map ( lambda ( x ) ( preposition-pass prep x ) ) tokens ) ) )
tokens ) )
; Transform articles (and article-like words as "Saint") into unary operators
; ex: ("le" X Y) => ((le X) Y)
( define *articles*
' ( " le "
" el " ; Catalan/Spanish
" la "
" les "
" las " ; Catalan
" los " ; Catalan
" Saint "
" Santo " ; Corsican
" San " ; Corsican
" Sainte "
" Santa " ; Corsican
" sans " ) )
( define ( article-pass art tokens )
( if ( list? tokens )
( let ( ( idx ( index-of tokens art ) ) )
( if ( and idx ( < ( + idx 1 ) ( length tokens ) ) ) ; article cannot be in last position
( list
( take tokens idx )
( list
( string->symbol art )
( list-ref tokens ( + 1 idx ) ) )
( article-pass art ( drop tokens ( + 2 idx ) ) ) )
( map ( lambda ( x ) ( article-pass art x ) ) tokens ) ) )
tokens ) )
; Simplify: remove empty lists
( define ( remove-empty tokens )
( if ( list? tokens )
( map
remove-empty
( filter ( lambda ( x ) ( not ( null? x ) ) ) tokens ) )
tokens ) )
; Simplify: lists of single elements are replaced by that element
( define ( simplify-single tokens )
( if ( list? tokens )
( if ( eq? 1 ( length tokens ) )
( simplify-single ( car tokens ) )
( map simplify-single tokens ) )
tokens ) )
; Extract the strings and generate a pattern
; The string are extracted into the different tables according to the following rules
; Single string
; X => X = locality name
; Unary operators
; (le X), (el X), (sans X) => X = masculine word
; (la X) => X = feminine word
; (les X) => X = plural word
; (las X) => X = feminine plural word
; (los X) => X = masculine plural word
; (Saint X), (Santo X), (San X) => X = masculine name
; (Sainte X), (Santa X) => X = feminine name
; Binary operators
; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name
; (à X Y), (en X Y), (de X Y), (di X Y) => X = locality name, Y = area name
; (sur X Y) => X = locality name, Y = river name
; During extraction, the strings are replaced by symbols of the corresponding type to form patterns
( define ( extract-to-distribution dep token dist sym )
( if ( string? token )
( begin
( case sym
( ( $LocalityName$ $AreaName$ $RiverName$ )
; Those are added to a markov chain
( markov-add-example! dist ( string->list ( string-titlecase token ) ) ) )
( else
; added to a distribution
( distribution-add-to! dist token ) ) )
sym )
( extract-strings dep token ) ) )
; Extraction of unary operators
( define ( unary-operator dep token dist sym )
( list ( car token ) ( extract-to-distribution dep ( cadr token ) dist sym ) ) )
; Extraction of binary operators
( define ( binary-operator dep token dist1 sym1 dist2 sym2 )
( list ( car token ) ( extract-to-distribution dep ( cadr token ) dist1 sym1 ) ( extract-to-distribution dep ( caddr token ) dist2 sym2 ) ) )
( define ( extract-strings dep token )
( cond
( ( list? token )
( case ( car token )
( ( le el sans )
( unary-operator dep token ( departement-word-masculine dep ) ' $WordMasculine$ ) )
( ( la )
( unary-operator dep token ( departement-word-feminine dep ) ' $WordFeminine$ ) )
( ( les )
( unary-operator dep token ( departement-word-plural dep ) ' $WordPlural$ ) )
( ( las )
( unary-operator dep token ( departement-word-feminine-plural dep ) ' $WordFemininePlural$ ) )
( ( los )
( unary-operator dep token ( departement-word-masculine-plural dep ) ' $WordMasculinePlural$ ) )
( ( Saint Santo San )
( unary-operator dep token ( departement-person-name-masculine dep ) ' $PersonNameMasculine$ ) )
( ( Sainte Santa )
( unary-operator dep token ( departement-person-name-feminine dep ) ' $PersonNameFeminine$ ) )
( ( et sous lès lez près )
( binary-operator dep token ( departement-locality-name dep ) ' $LocalityName$ ( departement-locality-name dep ) ' $LocalityName$ ) )
( ( à en de di )
( binary-operator dep token ( departement-locality-name dep ) ' $LocalityName$ ( departement-area-name dep ) ' $AreaName$ ) )
( ( sur )
( binary-operator dep token ( departement-locality-name dep ) ' $LocalityName$ ( departement-river-name dep ) ' $RiverName$ ) )
( else
( map ( lambda ( x ) ( extract-strings dep x ) ) token ) )
) )
( ( string? token )
( extract-to-distribution dep token ( departement-locality-name dep ) ' $LocalityName$ ) )
( #t
( error " Unexpected element: " token ) ) ) )
; Add a pattern to a departement
( define ( add-pattern dep pattern )
( distribution-add-to! ( departement-patterns dep ) pattern ) )
; Train the extractor on the commune name
( define ( train-commune extractor obj )
( define reg ( hash-ref extractor ( csv-commune-region-id obj ) ) )
( define dep ( hash-ref ( region-departements reg ) ( csv-commune-departement-id obj ) ) )
; Tokenize the name by removing elisions and splitting along the hyphens
( define tokens ( string-split ( remove-elision ( csv-commune-name obj ) ) " - " ) )
; Decompose the tokens into a tree with preposition
( set! tokens ( foldl preposition-pass tokens *prepositions* ) )
; Decompose the tokens into a tree with articles
( set! tokens ( foldl article-pass tokens *articles* ) )
; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item
( set! tokens ( simplify-single ( remove-empty tokens ) ) )
; Extract the strings into a departement and generate a pattern
( add-pattern dep ( extract-strings dep tokens ) ) )
; Add a commune to the extractor, an hash table of regions
; Only add if the locality is active and its names are coherent
( define ( add-commune extractor obj )
( when ( and ( csv-commune-coherent? obj )
( csv-commune-active? obj ) )
( add-region extractor obj )
( add-departement extractor obj )
( train-commune extractor obj ) ) )
; Table of regions in which the data is stored
( define *extractor* ( make-hash ) )
; Read data file and fill tables
; File a CSV with semi-colon (';') as field separator
( define *source-file* " ./data/fr-esr-referentiel-geographique.csv " )
( define source-reader
( make-csv-reader
( open-input-file *source-file* )
' ( ( separator-chars . ( #\; ) )
( strip-leading-whitespace? . #t )
( strip-trailing-whitespace? . #t ) ) ) )
; Get the header
( define *header* ( source-reader ) )
; Fill the extractor
( csv-for-each
( lambda ( line )
( add-commune
*extractor*
( map
( lambda ( x ) ( apply cons x ) )
( zip *header* line ) ) ) )
source-reader )
; Translators for json serialisation
( define ( pattern-translator x )
( cond
( ( symbol? x ) ( symbol->string x ) )
( ( list? x ) ( map pattern-translator x ) )
( else x ) ) )
( define ( markov-translator x )
( cond
( ( char? x ) ( list->string ( list x ) ) )
( else x ) ) )
; Output a departement to a json-compatible structure
( define ( departement->jsexpr dep )
( make-immutable-hash
` ( ( id . , ( departement-id dep ) )
( name . , ( departement-name dep ) )
( patterns . , ( distribution->jsexpr ( departement-patterns dep ) #:translator pattern-translator ) )
( word-masculine . , ( distribution->jsexpr ( departement-word-masculine dep ) ) )
( word-feminine . , ( distribution->jsexpr ( departement-word-feminine dep ) ) )
( word-plural . , ( distribution->jsexpr ( departement-word-plural dep ) ) )
( word-masculine-plural . , ( distribution->jsexpr ( departement-word-masculine-plural dep ) ) )
( word-feminine-plural . , ( distribution->jsexpr ( departement-word-feminine-plural dep ) ) )
( person-name-masculine . , ( distribution->jsexpr ( departement-person-name-masculine dep ) ) )
( person-name-feminine . , ( distribution->jsexpr ( departement-person-name-feminine dep ) ) )
( locality-name . , ( markov->jsexpr ( departement-locality-name dep ) #:translator markov-translator ) )
( area-name . , ( markov->jsexpr ( departement-area-name dep ) #:translator markov-translator ) )
( river-name . , ( markov->jsexpr ( departement-river-name dep ) #:translator markov-translator ) )
) ) )
; Output a region to a json-compatible structure
( define ( region->jsexpr reg )
( make-immutable-hash
` ( ( id . , ( region-id reg ) )
( name . , ( region-name reg ) )
( departements . , ( map departement->jsexpr ( hash-values ( region-departements reg ) ) ) )
) ) )
; Output the extractor to json-compatible structure
( define ( extractor->jsexpr extractor )
( map region->jsexpr ( hash-values extractor ) ) )
2019-11-05 17:39:29 +01:00
( define *output* ( open-output-file " ./web/gener-communes.json " #:exists ' truncate/replace ) )
2019-11-04 17:23:08 +01:00
; output lisp structure
( write-json ( extractor->jsexpr *extractor* ) *output* )