#lang racket ; Extracts data from CSV and generate a json file suitable for generation ; The output json file contains a list of regions, each containing a list of departements : ; [ { ; "id": "...", "name": "...", ; "departements": ; [ { ; "id": "...", "name": "...", ; "word-masculine": [{"value": "...", "count": X}, ...], ; "word-feminine": [{"value": "...", "count": X}, ...], ; "word-plural": [{"value": "...", "count": X}, ...], ; "word-masculine-plural": [{"value": "...", "count": X}, ...], ; "word-feminine-plural": [{"value": "...", "count": X}, ...], ; "person-name-masculine": [{"value": "...", "count": X}, ...], ; "person-name-feminine": [{"value": "...", "count": X}, ...], ; "patterns": [{"value": [...], "count": X}, ...], ; "locality-name": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]}, ; "area-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]}, ; "river-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]}, ; }, ; ... ; ], ; }, ; ... ; ] ; The distributions and markov chains are trained are by departement (require csv-reading json srfi/1 "markov.rkt" "distribution.rkt") ; Structure for each region (struct region (id ; id of region name ; name of region departements ; table of departements (id->dep) )) ; Constructor for a region (define (make-region id name) (region id name (make-hash))) ; Structure for each departement (struct departement (id ; id of departement name ; name of departement patterns ; distribution of patterns word-masculine ; distribution of masculine words word-feminine ; distribution of feminine words word-plural ; distribution of plural words word-masculine-plural ; distribution of masculine plural words word-feminine-plural ; distribution of feminine plural words person-name-masculine ; distributino of masculine person name person-name-feminine ; distribution of feminine person name locality-name ; markov chain for locality names area-name ; markov chain for area names river-name ; markov chain for river names )) ; Constructor for a departement (define (make-departement id name) (departement id name (make-distribution) (make-distribution) (make-distribution) (make-distribution) (make-distribution) (make-distribution) (make-distribution) (make-distribution) (make-markov 3) (make-markov 3) (make-markov 3))) ;; ; Extraction tools ; Accessors of a csv-commune object (dict of line extracted from the csv file) ; Name (define (csv-commune-name obj) (dict-ref obj "COM_NOM")) ; Uppercase name (define (csv-commune-name-maj obj) (dict-ref obj "COM_NOM_MAJ")) ; Uppercase short name (define (csv-commune-name-maj-court obj) (dict-ref obj "COM_NOM_MAJ_COURT")) ; Region Id (define (csv-commune-region-id obj) (dict-ref obj "REG_ID_OLD")) ; Region name (define (csv-commune-region-name obj) (dict-ref obj "REG_NOM_OLD")) ; Departement Id (define (csv-commune-departement-id obj) (dict-ref obj "DEP_ID")) ; Departement name (define (csv-commune-departement-name obj) (dict-ref obj "DEP_NOM")) ; Indicate if a commune is active (still exists) (define (csv-commune-active? obj) (not (equal? "SO" (dict-ref obj "UUCR_ID")))) ; Function removing all diacrtics of latin-based languages (define (remove-diacritics str) (list->string (filter (lambda (c) (< (char->integer c) 256)) (string->list (string-normalize-nfd str))))) ; Return the short form of a name (define (short-form str) (string-join (map (lambda (x) (cond ((equal? x "SAINT") "ST") ((equal? x "SAINTE") "STE") (#t x))) (string-split str #rx"[-']")) " ")) ; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name (define (csv-commune-coherent? obj) (define name (csv-commune-name obj)) (define name-maj (csv-commune-name-maj obj)) (define name-maj-court (csv-commune-name-maj-court obj)) (and ; uppercase name should be name but capitalized and diacritics removed (equal? name-maj (string-upcase (remove-diacritics name))) ; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE " (equal? name-maj-court (short-form name-maj)))) ; Add a region to the extractor (define (add-region extractor obj) (define id (csv-commune-region-id obj)) (when (not (hash-has-key? extractor id)) (hash-set! extractor id (make-region id (csv-commune-region-name obj))))) ; Add a departement to the extractor (define (add-departement extractor obj) (define reg (hash-ref extractor (csv-commune-region-id obj))) (define id (csv-commune-departement-id obj)) (when (not (hash-has-key? (region-departements reg) id)) (hash-set! (region-departements reg) id (make-departement id (csv-commune-departement-name obj))))) ; Remove elision (define *elisions* '(("-l'" . "-le-") ; masculine is used - though "l'" can also be an elision for the feminine "la" ("-d'" . "-de-") ("-des-" . "-de-les-") ("-du-" . "-de-le-") ("-del-" . "-de-el-") ; Catalan ("-dels-" . "-de-los-") ; Catalan ("-au-" . "-à-le-") ("-aux-" . "-à-les-") ("Sant'" . "Santo-") ; Corsican ; Initials articles are replaced with lowercase variants ("La " . "la-") ("Le " . "le-") ("Les " . "les-") ("L'" . "le-") ("Los " . "los-") ; Catalan )) (define (remove-elision str) (foldl (lambda (item result) (string-replace result (car item) (cdr item))) str *elisions*)) ; Transform preposition into binary operators between lists: ; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z)) ; Note: recursive on lists (define *prepositions* '("et" "à" "en" "de" "di" ; Corsican "sur" "sous" "lès" "lez" "près")) (define (preposition-pass prep tokens) (if (list? tokens) (let ((idx (index-of tokens prep))) (if (and idx (< (+ idx 1) (length tokens))) ; preposition cannot be in last position (list (string->symbol prep) (take tokens idx) (preposition-pass prep (drop tokens (+ 1 idx)))) (map (lambda (x) (preposition-pass prep x)) tokens))) tokens)) ; Transform articles (and article-like words as "Saint") into unary operators ; ex: ("le" X Y) => ((le X) Y) (define *articles* '("le" "el" ; Catalan/Spanish "la" "les" "las" ; Catalan "los" ; Catalan "Saint" "Santo" ; Corsican "San" ; Corsican "Sainte" "Santa" ; Corsican "sans")) (define (article-pass art tokens) (if (list? tokens) (let ((idx (index-of tokens art))) (if (and idx (< (+ idx 1) (length tokens))) ; article cannot be in last position (list (take tokens idx) (list (string->symbol art) (list-ref tokens (+ 1 idx))) (article-pass art (drop tokens (+ 2 idx)))) (map (lambda (x) (article-pass art x)) tokens))) tokens)) ; Simplify: remove empty lists (define (remove-empty tokens) (if (list? tokens) (map remove-empty (filter (lambda (x) (not (null? x))) tokens)) tokens)) ; Simplify: lists of single elements are replaced by that element (define (simplify-single tokens) (if (list? tokens) (if (eq? 1 (length tokens)) (simplify-single (car tokens)) (map simplify-single tokens)) tokens)) ; Extract the strings and generate a pattern ; The string are extracted into the different tables according to the following rules ; Single string ; X => X = locality name ; Unary operators ; (le X), (el X), (sans X) => X = masculine word ; (la X) => X = feminine word ; (les X) => X = plural word ; (las X) => X = feminine plural word ; (los X) => X = masculine plural word ; (Saint X), (Santo X), (San X) => X = masculine name ; (Sainte X), (Santa X) => X = feminine name ; Binary operators ; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name ; (à X Y), (en X Y), (de X Y), (di X Y) => X = locality name, Y = area name ; (sur X Y) => X = locality name, Y = river name ; During extraction, the strings are replaced by symbols of the corresponding type to form patterns (define (extract-to-distribution dep token dist sym) (if (string? token) (begin (case sym (($LocalityName$ $AreaName$ $RiverName$) ; Those are added to a markov chain (markov-add-example! dist (string->list (string-titlecase token)))) (else ; added to a distribution (distribution-add-to! dist token))) sym) (extract-strings dep token))) ; Extraction of unary operators (define (unary-operator dep token dist sym) (list (car token) (extract-to-distribution dep (cadr token) dist sym))) ; Extraction of binary operators (define (binary-operator dep token dist1 sym1 dist2 sym2) (list (car token) (extract-to-distribution dep (cadr token) dist1 sym1) (extract-to-distribution dep (caddr token) dist2 sym2))) (define (extract-strings dep token) (cond ( (list? token) (case (car token) ((le el sans) (unary-operator dep token (departement-word-masculine dep) '$WordMasculine$)) ((la) (unary-operator dep token (departement-word-feminine dep) '$WordFeminine$)) ((les) (unary-operator dep token (departement-word-plural dep) '$WordPlural$)) ((las) (unary-operator dep token (departement-word-feminine-plural dep) '$WordFemininePlural$)) ((los) (unary-operator dep token (departement-word-masculine-plural dep) '$WordMasculinePlural$)) ((Saint Santo San) (unary-operator dep token (departement-person-name-masculine dep) '$PersonNameMasculine$)) ((Sainte Santa) (unary-operator dep token (departement-person-name-feminine dep) '$PersonNameFeminine$)) ((et sous lès lez près) (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-locality-name dep) '$LocalityName$)) ((à en de di) (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-area-name dep) '$AreaName$)) ((sur) (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-river-name dep) '$RiverName$)) (else (map (lambda (x) (extract-strings dep x)) token)) )) ( (string? token) (extract-to-distribution dep token (departement-locality-name dep) '$LocalityName$)) (#t (error "Unexpected element: " token)))) ; Add a pattern to a departement (define (add-pattern dep pattern) (distribution-add-to! (departement-patterns dep) pattern)) ; Train the extractor on the commune name (define (train-commune extractor obj) (define reg (hash-ref extractor (csv-commune-region-id obj))) (define dep (hash-ref (region-departements reg) (csv-commune-departement-id obj))) ; Tokenize the name by removing elisions and splitting along the hyphens (define tokens (string-split (remove-elision (csv-commune-name obj)) "-")) ; Decompose the tokens into a tree with preposition (set! tokens (foldl preposition-pass tokens *prepositions*)) ; Decompose the tokens into a tree with articles (set! tokens (foldl article-pass tokens *articles*)) ; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item (set! tokens (simplify-single (remove-empty tokens))) ; Extract the strings into a departement and generate a pattern (add-pattern dep (extract-strings dep tokens))) ; Add a commune to the extractor, an hash table of regions ; Only add if the locality is active and its names are coherent (define (add-commune extractor obj) (when (and (csv-commune-coherent? obj) (csv-commune-active? obj)) (add-region extractor obj) (add-departement extractor obj) (train-commune extractor obj))) ; Table of regions in which the data is stored (define *extractor* (make-hash)) ; Read data file and fill tables ; File a CSV with semi-colon (';') as field separator (define *source-file* "./data/fr-esr-referentiel-geographique.csv") (define source-reader (make-csv-reader (open-input-file *source-file*) '((separator-chars . (#\;)) (strip-leading-whitespace? . #t) (strip-trailing-whitespace? . #t)))) ; Get the header (define *header* (source-reader)) ; Fill the extractor (csv-for-each (lambda (line) (add-commune *extractor* (map (lambda (x) (apply cons x)) (zip *header* line)))) source-reader) ; Translators for json serialisation (define (pattern-translator x) (cond ((symbol? x) (symbol->string x)) ((list? x) (map pattern-translator x)) (else x))) (define (markov-translator x) (cond ((char? x) (list->string (list x))) (else x))) ; Output a departement to a json-compatible structure (define (departement->jsexpr dep) (make-immutable-hash `((id . ,(departement-id dep)) (name . ,(departement-name dep)) (patterns . ,(distribution->jsexpr (departement-patterns dep) #:translator pattern-translator)) (word-masculine . ,(distribution->jsexpr (departement-word-masculine dep))) (word-feminine . ,(distribution->jsexpr (departement-word-feminine dep))) (word-plural . ,(distribution->jsexpr (departement-word-plural dep))) (word-masculine-plural . ,(distribution->jsexpr (departement-word-masculine-plural dep))) (word-feminine-plural . ,(distribution->jsexpr (departement-word-feminine-plural dep))) (person-name-masculine . ,(distribution->jsexpr (departement-person-name-masculine dep))) (person-name-feminine . ,(distribution->jsexpr (departement-person-name-feminine dep))) (locality-name . ,(markov->jsexpr (departement-locality-name dep) #:translator markov-translator)) (area-name . ,(markov->jsexpr (departement-area-name dep) #:translator markov-translator)) (river-name . ,(markov->jsexpr (departement-river-name dep) #:translator markov-translator)) ))) ; Output a region to a json-compatible structure (define (region->jsexpr reg) (make-immutable-hash `((id . ,(region-id reg)) (name . ,(region-name reg)) (departements . ,(map departement->jsexpr (hash-values (region-departements reg)))) ))) ; Output the extractor to json-compatible structure (define (extractor->jsexpr extractor) (map region->jsexpr (hash-values extractor))) (define *output* (open-output-file "./web/gener-communes.json" #:exists 'truncate/replace)) ; output lisp structure (write-json (extractor->jsexpr *extractor*) *output*)