gener-communes/extractor.rkt

#lang racket

; Extracts data from CSV and generate a json file suitable for generation
; The output json file contains a list of regions, each containing a list of departements :
; [ {
;     "id": "...", "name": "...",
;     "departements":
;       [ {
;           "id": "...", "name": "...",
;           "word-masculine": [{"value": "...", "count": X}, ...],
;           "word-feminine": [{"value": "...", "count": X}, ...],
;           "word-plural": [{"value": "...", "count": X}, ...],
;           "word-masculine-plural": [{"value": "...", "count": X}, ...],
;           "word-feminine-plural": [{"value": "...", "count": X}, ...],
;           "person-name-masculine": [{"value": "...", "count": X}, ...],
;           "person-name-feminine": [{"value": "...", "count": X}, ...],
;           "patterns": [{"value": [...], "count": X}, ...],
;           "locality-name": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
;           "area-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
;           "river-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
;         },
;         ...
;      ],
;   },
;   ...
; ]
; The distributions and markov chains are trained are by departement

(require
  csv-reading
  json
  srfi/1
  "markov.rkt"
  "distribution.rkt")

; Structure for each region
(struct region
  (id ; id of region
   name ; name of region
   departements ; table of departements (id->dep)
   ))

; Constructor for a region
(define (make-region id name)
  (region id name (make-hash)))

; Structure for each departement
(struct departement
  (id ; id of departement
   name ; name of departement
   patterns ; distribution of patterns
   word-masculine ; distribution of masculine words
   word-feminine ; distribution of feminine words
   word-plural ; distribution of plural words
   word-masculine-plural ; distribution of masculine plural words
   word-feminine-plural ; distribution of feminine plural words
   person-name-masculine ; distributino of masculine person name
   person-name-feminine ; distribution of feminine person name
   locality-name ; markov chain for locality names
   area-name ; markov chain for area names
   river-name ; markov chain for river names
   ))

; Constructor for a departement
(define (make-departement id name)
  (departement
    id
    name
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-markov 3)
    (make-markov 3)
    (make-markov 3)))

;;
; Extraction tools

; Accessors of a csv-commune object (dict of line extracted from the csv file)
; Name
(define (csv-commune-name obj)
  (dict-ref obj "COM_NOM"))
; Uppercase name
(define (csv-commune-name-maj obj)
  (dict-ref obj "COM_NOM_MAJ"))
; Uppercase short name
(define (csv-commune-name-maj-court obj)
  (dict-ref obj "COM_NOM_MAJ_COURT"))
; Region Id
(define (csv-commune-region-id obj)
  (dict-ref obj "REG_ID_OLD"))
; Region name
(define (csv-commune-region-name obj)
  (dict-ref obj "REG_NOM_OLD"))
; Departement Id
(define (csv-commune-departement-id obj)
  (dict-ref obj "DEP_ID"))
; Departement name
(define (csv-commune-departement-name obj)
  (dict-ref obj "DEP_NOM"))
; Indicate if a commune is active (still exists)
(define (csv-commune-active? obj)
  (not (equal? "SO" (dict-ref obj "UUCR_ID"))))

; Function removing all diacrtics of latin-based languages
(define (remove-diacritics str)
  (list->string
      (filter
        (lambda (c)
          (< (char->integer c) 256))
        (string->list
          (string-normalize-nfd str)))))

; Return the short form of a name
(define (short-form str)
  (string-join
    (map
      (lambda (x)
        (cond
          ((equal? x "SAINT") "ST")
          ((equal? x "SAINTE") "STE")
          (#t x)))
      (string-split
        str
        #rx"[-']"))
    " "))

; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name
(define (csv-commune-coherent? obj)
  (define name (csv-commune-name obj))
  (define name-maj (csv-commune-name-maj obj))
  (define name-maj-court (csv-commune-name-maj-court obj))
  (and
    ; uppercase name should be name but capitalized and diacritics removed
    (equal? name-maj
            (string-upcase (remove-diacritics name)))
    ; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE "
    (equal? name-maj-court
            (short-form name-maj))))

; Add a region to the extractor
(define (add-region extractor obj)
  (define id (csv-commune-region-id obj))
  (when (not (hash-has-key? extractor id))
    (hash-set! extractor id (make-region id (csv-commune-region-name obj)))))

; Add a departement to the extractor
(define (add-departement extractor obj)
  (define reg (hash-ref extractor (csv-commune-region-id obj)))
  (define id (csv-commune-departement-id obj))
  (when (not (hash-has-key? (region-departements reg) id))
    (hash-set! (region-departements reg) id (make-departement id (csv-commune-departement-name obj)))))

; Remove elision
(define *elisions*
  '(("-l'" . "-le-") ; masculine is used - though "l'" can also be an elision for the feminine "la"
    ("-d'" . "-de-")
    ("-des-" . "-de-les-")
    ("-du-" . "-de-le-")
    ("-del-" . "-de-el-") ; Catalan
    ("-dels-" . "-de-los-") ; Catalan
    ("-au-" . "-à-le-")
    ("-aux-" . "-à-les-")
    ("Sant'" . "Santo-") ; Corsican
    ; Initials articles are replaced with lowercase variants
    ("La " . "la-")
    ("Le " . "le-")
    ("Les " . "les-")
    ("L'" . "le-")
    ("Los " . "los-") ; Catalan
   ))
(define (remove-elision str)
  (foldl
    (lambda (item result)
      (string-replace result (car item) (cdr item)))
    str
    *elisions*))

; Transform preposition into binary operators between lists:
; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z))
; Note: recursive on lists
(define *prepositions*
  '("et"
    "à"
    "en"
    "de"
    "di" ; Corsican
    "sur"
    "sous"
    "lès"
    "lez"
    "près"))
(define (preposition-pass prep tokens)
  (if (list? tokens)
      (let ((idx (index-of tokens prep)))
        (if (and idx (< (+ idx 1) (length tokens))) ; preposition cannot be in last position
            (list
              (string->symbol prep)
              (take tokens idx)
              (preposition-pass prep (drop tokens (+ 1 idx))))
            (map (lambda (x) (preposition-pass prep x)) tokens)))
      tokens))

; Transform articles (and article-like words as "Saint") into unary operators
; ex: ("le" X Y) => ((le X) Y)
(define *articles*
  '("le"
    "el" ; Catalan/Spanish
    "la"
    "les"
    "las" ; Catalan
    "los" ; Catalan
    "Saint"
    "Santo" ; Corsican
    "San" ; Corsican
    "Sainte"
    "Santa" ; Corsican
    "sans"))
(define (article-pass art tokens)
  (if (list? tokens)
      (let ((idx (index-of tokens art)))
        (if (and idx (< (+ idx 1) (length tokens))) ; article cannot be in last position
            (list
              (take tokens idx)
              (list
                (string->symbol art)
                (list-ref tokens (+ 1 idx)))
              (article-pass art (drop tokens (+ 2 idx))))
            (map (lambda (x) (article-pass art x)) tokens)))
      tokens))

; Simplify: remove empty lists
(define (remove-empty tokens)
  (if (list? tokens)
      (map
        remove-empty
        (filter (lambda (x) (not (null? x))) tokens))
      tokens))

; Simplify: lists of single elements are replaced by that element
(define (simplify-single tokens)
  (if (list? tokens)
      (if (eq? 1 (length tokens))
          (simplify-single (car tokens))
          (map simplify-single tokens))
      tokens))

; Extract the strings and generate a pattern
; The string are extracted into the different tables according to the following rules
; Single string
; X => X = locality name
; Unary operators
; (le X), (el X), (sans X) => X = masculine word
; (la X) => X = feminine word
; (les X) => X = plural word
; (las X) => X = feminine plural word
; (los X) => X = masculine plural word
; (Saint X), (Santo X), (San X) => X = masculine name
; (Sainte X), (Santa X) => X = feminine name
; Binary operators
; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name
; (à X Y), (en X Y), (de X Y), (di X Y)  => X = locality name, Y = area name
; (sur X Y) => X = locality name, Y = river name

; During extraction, the strings are replaced by symbols of the corresponding type to form patterns
(define (extract-to-distribution dep token dist sym)
  (if (string? token)
      (begin
        (case sym
          (($LocalityName$ $AreaName$ $RiverName$)
            ; Those are added to a markov chain
            (markov-add-example! dist (string->list (string-titlecase token))))
          (else
            ; added to a distribution
            (distribution-add-to! dist token)))
        sym)
      (extract-strings dep token)))

; Extraction of unary operators
(define (unary-operator dep token dist sym)
  (list (car token) (extract-to-distribution dep (cadr token) dist sym)))

; Extraction of binary operators
(define (binary-operator dep token dist1 sym1 dist2 sym2)
  (list (car token) (extract-to-distribution dep (cadr token) dist1 sym1) (extract-to-distribution dep (caddr token) dist2 sym2)))

(define (extract-strings dep token)
  (cond
    ( (list? token)
      (case (car token)
        ((le el sans)
         (unary-operator dep token (departement-word-masculine dep) '$WordMasculine$))
        ((la)
         (unary-operator dep token (departement-word-feminine dep) '$WordFeminine$))
        ((les)
         (unary-operator dep token (departement-word-plural dep) '$WordPlural$))
        ((las)
         (unary-operator dep token (departement-word-feminine-plural dep) '$WordFemininePlural$))
        ((los)
         (unary-operator dep token (departement-word-masculine-plural dep) '$WordMasculinePlural$))
        ((Saint Santo San)
         (unary-operator dep token (departement-person-name-masculine dep) '$PersonNameMasculine$))
        ((Sainte Santa)
         (unary-operator dep token (departement-person-name-feminine dep) '$PersonNameFeminine$))
        ((et sous lès lez près)
         (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-locality-name dep) '$LocalityName$))
        ((à en de di)
         (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-area-name dep) '$AreaName$))
        ((sur)
         (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-river-name dep) '$RiverName$))
        (else
          (map (lambda (x) (extract-strings dep x)) token))
        ))
    ( (string? token)
      (extract-to-distribution dep token (departement-locality-name dep) '$LocalityName$))
    (#t
      (error "Unexpected element: " token))))

; Add a pattern to a departement
(define (add-pattern dep pattern)
  (distribution-add-to! (departement-patterns dep) pattern))

; Train the extractor on the commune name
(define (train-commune extractor obj)
  (define reg (hash-ref extractor (csv-commune-region-id obj)))
  (define dep (hash-ref (region-departements reg) (csv-commune-departement-id obj)))
  ; Tokenize the name by removing elisions and splitting along the hyphens
  (define tokens (string-split (remove-elision (csv-commune-name obj)) "-"))
  ; Decompose the tokens into a tree with preposition
  (set! tokens (foldl preposition-pass tokens *prepositions*))
  ; Decompose the tokens into a tree with articles
  (set! tokens (foldl article-pass tokens *articles*))
  ; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item
  (set! tokens (simplify-single (remove-empty tokens)))
  ; Extract the strings into a departement and generate a pattern
  (add-pattern dep (extract-strings dep tokens)))

; Add a commune to the extractor, an hash table of regions
; Only add if the locality is active and its names are coherent
(define (add-commune extractor obj)
  (when (and (csv-commune-coherent? obj)
             (csv-commune-active? obj))
    (add-region extractor obj)
    (add-departement extractor obj)
    (train-commune extractor obj)))

; Table of regions in which the data is stored
(define *extractor* (make-hash))

; Read data file and fill tables
; File a CSV with semi-colon (';') as field separator
(define *source-file* "./data/fr-esr-referentiel-geographique.csv")
(define source-reader
  (make-csv-reader
    (open-input-file *source-file*)
    '((separator-chars . (#\;))
      (strip-leading-whitespace? . #t)
      (strip-trailing-whitespace? . #t))))
; Get the header
(define *header* (source-reader))

; Fill the extractor
(csv-for-each
  (lambda (line)
    (add-commune
      *extractor*
      (map
        (lambda (x) (apply cons x))
        (zip *header* line))))
  source-reader)

; Translators for json serialisation
(define (pattern-translator x)
  (cond
    ((symbol? x) (symbol->string x))
    ((list? x) (map pattern-translator x))
    (else x)))

(define (markov-translator x)
  (cond
    ((char? x) (list->string (list x)))
    (else x)))

; Output a departement to a json-compatible structure
(define (departement->jsexpr dep)
  (make-immutable-hash
    `((id . ,(departement-id dep))
      (name . ,(departement-name dep))
      (patterns . ,(distribution->jsexpr (departement-patterns dep) #:translator pattern-translator))
      (word-masculine . ,(distribution->jsexpr (departement-word-masculine dep)))
      (word-feminine . ,(distribution->jsexpr (departement-word-feminine dep)))
      (word-plural . ,(distribution->jsexpr (departement-word-plural dep)))
      (word-masculine-plural . ,(distribution->jsexpr (departement-word-masculine-plural dep)))
      (word-feminine-plural . ,(distribution->jsexpr (departement-word-feminine-plural dep)))
      (person-name-masculine . ,(distribution->jsexpr (departement-person-name-masculine dep)))
      (person-name-feminine . ,(distribution->jsexpr (departement-person-name-feminine dep)))
      (locality-name . ,(markov->jsexpr (departement-locality-name dep) #:translator markov-translator))
      (area-name . ,(markov->jsexpr (departement-area-name dep) #:translator markov-translator))
      (river-name . ,(markov->jsexpr (departement-river-name dep) #:translator markov-translator))
      )))

; Output a region to a json-compatible structure
(define (region->jsexpr reg)
  (make-immutable-hash
    `((id . ,(region-id reg))
      (name . ,(region-name reg))
      (departements . ,(map departement->jsexpr (hash-values (region-departements reg))))
      )))

; Output the extractor to json-compatible structure
(define (extractor->jsexpr extractor)
  (map region->jsexpr (hash-values extractor)))

(define *output* (open-output-file "./web/gener-communes.json" #:exists 'truncate/replace))
; output lisp structure
(write-json (extractor->jsexpr *extractor*) *output*)