gener-communes/extractor.rkt

#lang racket

; Extracts data from CSV and generate a json file suitable for generation
; The output json file contains a list of regions, each containing a list of departements :
; [ {
;     "id": "...", "name": "...",
;     "departements":
;       [ {
;           "id": "...", "name": "...",
;           "word-masculine": [{"value": "...", "count": X}, ...],
;           "word-feminine": [{"value": "...", "count": X}, ...],
;           "word-plural": [{"value": "...", "count": X}, ...],
;           "word-masculine-plural": [{"value": "...", "count": X}, ...],
;           "word-feminine-plural": [{"value": "...", "count": X}, ...],
;           "person-name-masculine": [{"value": "...", "count": X}, ...],
;           "person-name-feminine": [{"value": "...", "count": X}, ...],
;           "patterns": [{"value": [...], "count": X}, ...],
;           "locality-name": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
;           "area-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
;           "river-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
;         },
;         ...
;      ],
;   },
;   ...
; ]
; The distributions and markov chains are trained are by departement

(require
  csv-reading
  json
  srfi/1
  "markov.rkt"
  "distribution.rkt")

; Structure for each region
(struct region
  (id ; id of region
   name ; name of region
   departements ; table of departements (id->dep)
   ))

; Constructor for a region
(define (make-region id name)
  (region id name (make-hash)))

; Structure for each departement
(struct departement
  (id ; id of departement
   name ; name of departement
   patterns ; distribution of patterns
   word-masculine ; distribution of masculine words
   word-feminine ; distribution of feminine words
   word-plural ; distribution of plural words
   word-masculine-plural ; distribution of masculine plural words
   word-feminine-plural ; distribution of feminine plural words
   person-name-masculine ; distributino of masculine person name
   person-name-feminine ; distribution of feminine person name
   locality-name ; markov chain for locality names
   area-name ; markov chain for area names
   river-name ; markov chain for river names
   ))

; Constructor for a departement
(define (make-departement id name)
  (departement
    id
    name
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-distribution)
    (make-markov 3)
    (make-markov 3)
    (make-markov 3)))

;;
; Extraction tools

; Accessors of a csv-commune object (dict of line extracted from the csv file)
; Name
(define (csv-commune-name obj)
  (dict-ref obj "COM_NOM"))
; Uppercase name
(define (csv-commune-name-maj obj)
  (dict-ref obj "COM_NOM_MAJ"))
; Uppercase short name
(define (csv-commune-name-maj-court obj)
  (dict-ref obj "COM_NOM_MAJ_COURT"))
; Region Id
(define (csv-commune-region-id obj)
  (dict-ref obj "REG_ID_OLD"))
; Region name
(define (csv-commune-region-name obj)
  (dict-ref obj "REG_NOM_OLD"))
; Departement Id
(define (csv-commune-departement-id obj)
  (dict-ref obj "DEP_ID"))
; Departement name
(define (csv-commune-departement-name obj)
  (dict-ref obj "DEP_NOM"))
; Indicate if a commune is active (still exists)
(define (csv-commune-active? obj)
  (not (equal? "SO" (dict-ref obj "UUCR_ID"))))

; Function removing all diacrtics of latin-based languages
(define (remove-diacritics str)
  (list->string
      (filter
        (lambda (c)
          (< (char->integer c) 256))
        (string->list
          (string-normalize-nfd str)))))

; Return the short form of a name
(define (short-form str)
  (string-join
    (map
      (lambda (x)
        (cond
          ((equal? x "SAINT") "ST")
          ((equal? x "SAINTE") "STE")
          (#t x)))
      (string-split
        str
        #rx"[-']"))
    " "))

; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name
(define (csv-commune-coherent? obj)
  (define name (csv-commune-name obj))
  (define name-maj (csv-commune-name-maj obj))
  (define name-maj-court (csv-commune-name-maj-court obj))
  (and
    ; uppercase name should be name but capitalized and diacritics removed
    (equal? name-maj
            (string-upcase (remove-diacritics name)))
    ; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE "
    (equal? name-maj-court
            (short-form name-maj))))

; Add a region to the extractor
(define (add-region extractor obj)
  (define id (csv-commune-region-id obj))
  (when (not (hash-has-key? extractor id))
    (hash-set! extractor id (make-region id (csv-commune-region-name obj)))))

; Add a departement to the extractor
(define (add-departement extractor obj)
  (define reg (hash-ref extractor (csv-commune-region-id obj)))
  (define id (csv-commune-departement-id obj))
  (when (not (hash-has-key? (region-departements reg) id))
    (hash-set! (region-departements reg) id (make-departement id (csv-commune-departement-name obj)))))

; Remove elision
(define *elisions*
  '(("-l'" . "-le-") ; masculine is used - though "l'" can also be an elision for the feminine "la"
    ("-d'" . "-de-")
    ("-des-" . "-de-les-")
    ("-du-" . "-de-le-")
    ("-del-" . "-de-el-") ; Catalan
    ("-dels-" . "-de-los-") ; Catalan
    ("-au-" . "-à-le-")
    ("-aux-" . "-à-les-")
    ("Sant'" . "Santo-") ; Corsican
    ; Initials articles are replaced with lowercase variants
    ("La " . "la-")
    ("Le " . "le-")
    ("Les " . "les-")
    ("L'" . "le-")
    ("Los " . "los-") ; Catalan
   ))
(define (remove-elision str)
  (foldl
    (lambda (item result)
      (string-replace result (car item) (cdr item)))
    str
    *elisions*))

; Transform preposition into binary operators between lists:
; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z))
; Note: recursive on lists
(define *prepositions*
  '("et"
    "à"
    "en"
    "de"
    "di" ; Corsican
    "sur"
    "sous"
    "lès"
    "lez"
    "près"))
(define (preposition-pass prep tokens)
  (if (list? tokens)
      (let ((idx (index-of tokens prep)))
        (if (and idx (< (+ idx 1) (length tokens))) ; preposition cannot be in last position
            (list
              (string->symbol prep)
              (take tokens idx)
              (preposition-pass prep (drop tokens (+ 1 idx))))
            (map (lambda (x) (preposition-pass prep x)) tokens)))
      tokens))

; Transform articles (and article-like words as "Saint") into unary operators
; ex: ("le" X Y) => ((le X) Y)
(define *articles*
  '("le"
    "el" ; Catalan/Spanish
    "la"
    "les"
    "las" ; Catalan
    "los" ; Catalan
    "Saint"
    "Santo" ; Corsican
    "San" ; Corsican
    "Sainte"
    "Santa" ; Corsican
    "sans"))
(define (article-pass art tokens)
  (if (list? tokens)
      (let ((idx (index-of tokens art)))
        (if (and idx (< (+ idx 1) (length tokens))) ; article cannot be in last position
            (list
              (take tokens idx)
              (list
                (string->symbol art)
                (list-ref tokens (+ 1 idx)))
              (article-pass art (drop tokens (+ 2 idx))))
            (map (lambda (x) (article-pass art x)) tokens)))
      tokens))

; Simplify: remove empty lists
(define (remove-empty tokens)
  (if (list? tokens)
      (map
        remove-empty
        (filter (lambda (x) (not (null? x))) tokens))
      tokens))

; Simplify: lists of single elements are replaced by that element
(define (simplify-single tokens)
  (if (list? tokens)
      (if (eq? 1 (length tokens))
          (simplify-single (car tokens))
          (map simplify-single tokens))
      tokens))

; Extract the strings and generate a pattern
; The string are extracted into the different tables according to the following rules
; Single string
; X => X = locality name
; Unary operators
; (le X), (el X), (sans X) => X = masculine word
; (la X) => X = feminine word
; (les X) => X = plural word
; (las X) => X = feminine plural word
; (los X) => X = masculine plural word
; (Saint X), (Santo X), (San X) => X = masculine name
; (Sainte X), (Santa X) => X = feminine name
; Binary operators
; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name
; (à X Y), (en X Y), (de X Y), (di X Y)  => X = locality name, Y = area name
; (sur X Y) => X = locality name, Y = river name

; During extraction, the strings are replaced by symbols of the corresponding type to form patterns
(define (extract-to-distribution dep token dist sym)
  (if (string? token)
      (begin
        (case sym
          (($LocalityName$ $AreaName$ $RiverName$)
            ; Those are added to a markov chain
            (markov-add-example! dist (string->list (string-titlecase token))))
          (else
            ; added to a distribution
            (distribution-add-to! dist token)))
        sym)
      (extract-strings dep token)))

; Extraction of unary operators
(define (unary-operator dep token dist sym)
  (list (car token) (extract-to-distribution dep (cadr token) dist sym)))

; Extraction of binary operators
(define (binary-operator dep token dist1 sym1 dist2 sym2)
  (list (car token) (extract-to-distribution dep (cadr token) dist1 sym1) (extract-to-distribution dep (caddr token) dist2 sym2)))

(define (extract-strings dep token)
  (cond
    ( (list? token)
      (case (car token)
        ((le el sans)
         (unary-operator dep token (departement-word-masculine dep) '$WordMasculine$))
        ((la)
         (unary-operator dep token (departement-word-feminine dep) '$WordFeminine$))
        ((les)
         (unary-operator dep token (departement-word-plural dep) '$WordPlural$))
        ((las)
         (unary-operator dep token (departement-word-feminine-plural dep) '$WordFemininePlural$))
        ((los)
         (unary-operator dep token (departement-word-masculine-plural dep) '$WordMasculinePlural$))
        ((Saint Santo San)
         (unary-operator dep token (departement-person-name-masculine dep) '$PersonNameMasculine$))
        ((Sainte Santa)
         (unary-operator dep token (departement-person-name-feminine dep) '$PersonNameFeminine$))
        ((et sous lès lez près)
         (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-locality-name dep) '$LocalityName$))
        ((à en de di)
         (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-area-name dep) '$AreaName$))
        ((sur)
         (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-river-name dep) '$RiverName$))
        (else
          (map (lambda (x) (extract-strings dep x)) token))
        ))
    ( (string? token)
      (extract-to-distribution dep token (departement-locality-name dep) '$LocalityName$))
    (#t
      (error "Unexpected element: " token))))

; Add a pattern to a departement
(define (add-pattern dep pattern)
  (distribution-add-to! (departement-patterns dep) pattern))

; Train the extractor on the commune name
(define (train-commune extractor obj)
  (define reg (hash-ref extractor (csv-commune-region-id obj)))
  (define dep (hash-ref (region-departements reg) (csv-commune-departement-id obj)))
  ; Tokenize the name by removing elisions and splitting along the hyphens
  (define tokens (string-split (remove-elision (csv-commune-name obj)) "-"))
  ; Decompose the tokens into a tree with preposition
  (set! tokens (foldl preposition-pass tokens *prepositions*))
  ; Decompose the tokens into a tree with articles
  (set! tokens (foldl article-pass tokens *articles*))
  ; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item
  (set! tokens (simplify-single (remove-empty tokens)))
  ; Extract the strings into a departement and generate a pattern
  (add-pattern dep (extract-strings dep tokens)))

; Add a commune to the extractor, an hash table of regions
; Only add if the locality is active and its names are coherent
(define (add-commune extractor obj)
  (when (and (csv-commune-coherent? obj)
             (csv-commune-active? obj))
    (add-region extractor obj)
    (add-departement extractor obj)
    (train-commune extractor obj)))

; Table of regions in which the data is stored
(define *extractor* (make-hash))

; Read data file and fill tables
; File a CSV with semi-colon (';') as field separator
(define *source-file* "./data/fr-esr-referentiel-geographique.csv")
(define source-reader
  (make-csv-reader
    (open-input-file *source-file*)
    '((separator-chars . (#\;))
      (strip-leading-whitespace? . #t)
      (strip-trailing-whitespace? . #t))))
; Get the header
(define *header* (source-reader))

; Fill the extractor
(csv-for-each
  (lambda (line)
    (add-commune
      *extractor*
      (map
        (lambda (x) (apply cons x))
        (zip *header* line))))
  source-reader)

; Translators for json serialisation
(define (pattern-translator x)
  (cond
    ((symbol? x) (symbol->string x))
    ((list? x) (map pattern-translator x))
    (else x)))

(define (markov-translator x)
  (cond
    ((char? x) (list->string (list x)))
    (else x)))

; Output a departement to a json-compatible structure
(define (departement->jsexpr dep)
  (make-immutable-hash
    `((id . ,(departement-id dep))
      (name . ,(departement-name dep))
      (patterns . ,(distribution->jsexpr (departement-patterns dep) #:translator pattern-translator))
      (word-masculine . ,(distribution->jsexpr (departement-word-masculine dep)))
      (word-feminine . ,(distribution->jsexpr (departement-word-feminine dep)))
      (word-plural . ,(distribution->jsexpr (departement-word-plural dep)))
      (word-masculine-plural . ,(distribution->jsexpr (departement-word-masculine-plural dep)))
      (word-feminine-plural . ,(distribution->jsexpr (departement-word-feminine-plural dep)))
      (person-name-masculine . ,(distribution->jsexpr (departement-person-name-masculine dep)))
      (person-name-feminine . ,(distribution->jsexpr (departement-person-name-feminine dep)))
      (locality-name . ,(markov->jsexpr (departement-locality-name dep) #:translator markov-translator))
      (area-name . ,(markov->jsexpr (departement-area-name dep) #:translator markov-translator))
      (river-name . ,(markov->jsexpr (departement-river-name dep) #:translator markov-translator))
      )))

; Output a region to a json-compatible structure
(define (region->jsexpr reg)
  (make-immutable-hash
    `((id . ,(region-id reg))
      (name . ,(region-name reg))
      (departements . ,(map departement->jsexpr (hash-values (region-departements reg))))
      )))

; Output the extractor to json-compatible structure
(define (extractor->jsexpr extractor)
  (map region->jsexpr (hash-values extractor)))

(define *output* (open-output-file "./web/gener-communes.json" #:exists 'truncate/replace))
; output lisp structure
(write-json (extractor->jsexpr *extractor*) *output*)
Add extractor & generator 2019-11-04 17:23:08 +01:00			`#lang racket`

			`; Extracts data from CSV and generate a json file suitable for generation`
			`; The output json file contains a list of regions, each containing a list of departements :`
			`; [ {`
			`; "id": "...", "name": "...",`
			`; "departements":`
			`; [ {`
			`; "id": "...", "name": "...",`
			`; "word-masculine": [{"value": "...", "count": X}, ...],`
			`; "word-feminine": [{"value": "...", "count": X}, ...],`
			`; "word-plural": [{"value": "...", "count": X}, ...],`
			`; "word-masculine-plural": [{"value": "...", "count": X}, ...],`
			`; "word-feminine-plural": [{"value": "...", "count": X}, ...],`
			`; "person-name-masculine": [{"value": "...", "count": X}, ...],`
			`; "person-name-feminine": [{"value": "...", "count": X}, ...],`
			`; "patterns": [{"value": [...], "count": X}, ...],`
Move the json representation in the web folder 2019-11-05 17:39:29 +01:00			`; "locality-name": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},`
			`; "area-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},`
			`; "river-name": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},`
Add extractor & generator 2019-11-04 17:23:08 +01:00			`; },`
			`; ...`
			`; ],`
			`; },`
			`; ...`
			`; ]`
			`; The distributions and markov chains are trained are by departement`

			`(require`
			`csv-reading`
			`json`
			`srfi/1`
			`"markov.rkt"`
			`"distribution.rkt")`

			`; Structure for each region`
			`(struct region`
			`(id ; id of region`
			`name ; name of region`
			`departements ; table of departements (id->dep)`
			`))`

			`; Constructor for a region`
			`(define (make-region id name)`
			`(region id name (make-hash)))`

			`; Structure for each departement`
			`(struct departement`
			`(id ; id of departement`
			`name ; name of departement`
			`patterns ; distribution of patterns`
			`word-masculine ; distribution of masculine words`
			`word-feminine ; distribution of feminine words`
			`word-plural ; distribution of plural words`
			`word-masculine-plural ; distribution of masculine plural words`
			`word-feminine-plural ; distribution of feminine plural words`
			`person-name-masculine ; distributino of masculine person name`
			`person-name-feminine ; distribution of feminine person name`
			`locality-name ; markov chain for locality names`
			`area-name ; markov chain for area names`
			`river-name ; markov chain for river names`
			`))`

			`; Constructor for a departement`
			`(define (make-departement id name)`
			`(departement`
			`id`
			`name`
			`(make-distribution)`
			`(make-distribution)`
			`(make-distribution)`
			`(make-distribution)`
			`(make-distribution)`
			`(make-distribution)`
			`(make-distribution)`
			`(make-distribution)`
			`(make-markov 3)`
			`(make-markov 3)`
			`(make-markov 3)))`

			`;;`
			`; Extraction tools`

			`; Accessors of a csv-commune object (dict of line extracted from the csv file)`
			`; Name`
			`(define (csv-commune-name obj)`
			`(dict-ref obj "COM_NOM"))`
			`; Uppercase name`
			`(define (csv-commune-name-maj obj)`
			`(dict-ref obj "COM_NOM_MAJ"))`
			`; Uppercase short name`
			`(define (csv-commune-name-maj-court obj)`
			`(dict-ref obj "COM_NOM_MAJ_COURT"))`
			`; Region Id`
			`(define (csv-commune-region-id obj)`
			`(dict-ref obj "REG_ID_OLD"))`
			`; Region name`
			`(define (csv-commune-region-name obj)`
			`(dict-ref obj "REG_NOM_OLD"))`
			`; Departement Id`
			`(define (csv-commune-departement-id obj)`
			`(dict-ref obj "DEP_ID"))`
			`; Departement name`
			`(define (csv-commune-departement-name obj)`
			`(dict-ref obj "DEP_NOM"))`
			`; Indicate if a commune is active (still exists)`
			`(define (csv-commune-active? obj)`
			`(not (equal? "SO" (dict-ref obj "UUCR_ID"))))`

			`; Function removing all diacrtics of latin-based languages`
			`(define (remove-diacritics str)`
			`(list->string`
			`(filter`
			`(lambda (c)`
			`(< (char->integer c) 256))`
			`(string->list`
			`(string-normalize-nfd str)))))`

			`; Return the short form of a name`
			`(define (short-form str)`
			`(string-join`
			`(map`
			`(lambda (x)`
			`(cond`
			`((equal? x "SAINT") "ST")`
			`((equal? x "SAINTE") "STE")`
			`(#t x)))`
			`(string-split`
			`str`
			`#rx"[-']"))`
			`" "))`

			`; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name`
			`(define (csv-commune-coherent? obj)`
			`(define name (csv-commune-name obj))`
			`(define name-maj (csv-commune-name-maj obj))`
			`(define name-maj-court (csv-commune-name-maj-court obj))`
			`(and`
			`; uppercase name should be name but capitalized and diacritics removed`
			`(equal? name-maj`
			`(string-upcase (remove-diacritics name)))`
			`; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE "`
			`(equal? name-maj-court`
			`(short-form name-maj))))`

			`; Add a region to the extractor`
			`(define (add-region extractor obj)`
			`(define id (csv-commune-region-id obj))`
			`(when (not (hash-has-key? extractor id))`
			`(hash-set! extractor id (make-region id (csv-commune-region-name obj)))))`

			`; Add a departement to the extractor`
			`(define (add-departement extractor obj)`
			`(define reg (hash-ref extractor (csv-commune-region-id obj)))`
			`(define id (csv-commune-departement-id obj))`
			`(when (not (hash-has-key? (region-departements reg) id))`
			`(hash-set! (region-departements reg) id (make-departement id (csv-commune-departement-name obj)))))`

			`; Remove elision`
			`(define elisions`
			`'(("-l'" . "-le-") ; masculine is used - though "l'" can also be an elision for the feminine "la"`
			`("-d'" . "-de-")`
			`("-des-" . "-de-les-")`
			`("-du-" . "-de-le-")`
			`("-del-" . "-de-el-") ; Catalan`
			`("-dels-" . "-de-los-") ; Catalan`
			`("-au-" . "-à-le-")`
			`("-aux-" . "-à-les-")`
			`("Sant'" . "Santo-") ; Corsican`
			`; Initials articles are replaced with lowercase variants`
			`("La " . "la-")`
			`("Le " . "le-")`
			`("Les " . "les-")`
			`("L'" . "le-")`
			`("Los " . "los-") ; Catalan`
			`))`
			`(define (remove-elision str)`
			`(foldl`
			`(lambda (item result)`
			`(string-replace result (car item) (cdr item)))`
			`str`
			`elisions))`

			`; Transform preposition into binary operators between lists:`
			`; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z))`
			`; Note: recursive on lists`
			`(define prepositions`
			`'("et"`
			`"à"`
			`"en"`
			`"de"`
			`"di" ; Corsican`
			`"sur"`
			`"sous"`
			`"lès"`
			`"lez"`
			`"près"))`
			`(define (preposition-pass prep tokens)`
			`(if (list? tokens)`
			`(let ((idx (index-of tokens prep)))`
			`(if (and idx (< (+ idx 1) (length tokens))) ; preposition cannot be in last position`
			`(list`
			`(string->symbol prep)`
			`(take tokens idx)`
			`(preposition-pass prep (drop tokens (+ 1 idx))))`
			`(map (lambda (x) (preposition-pass prep x)) tokens)))`
			`tokens))`

			`; Transform articles (and article-like words as "Saint") into unary operators`
			`; ex: ("le" X Y) => ((le X) Y)`
			`(define articles`
			`'("le"`
			`"el" ; Catalan/Spanish`
			`"la"`
			`"les"`
			`"las" ; Catalan`
			`"los" ; Catalan`
			`"Saint"`
			`"Santo" ; Corsican`
			`"San" ; Corsican`
			`"Sainte"`
			`"Santa" ; Corsican`
			`"sans"))`
			`(define (article-pass art tokens)`
			`(if (list? tokens)`
			`(let ((idx (index-of tokens art)))`
			`(if (and idx (< (+ idx 1) (length tokens))) ; article cannot be in last position`
			`(list`
			`(take tokens idx)`
			`(list`
			`(string->symbol art)`
			`(list-ref tokens (+ 1 idx)))`
			`(article-pass art (drop tokens (+ 2 idx))))`
			`(map (lambda (x) (article-pass art x)) tokens)))`
			`tokens))`

			`; Simplify: remove empty lists`
			`(define (remove-empty tokens)`
			`(if (list? tokens)`
			`(map`
			`remove-empty`
			`(filter (lambda (x) (not (null? x))) tokens))`
			`tokens))`

			`; Simplify: lists of single elements are replaced by that element`
			`(define (simplify-single tokens)`
			`(if (list? tokens)`
			`(if (eq? 1 (length tokens))`
			`(simplify-single (car tokens))`
			`(map simplify-single tokens))`
			`tokens))`

			`; Extract the strings and generate a pattern`
			`; The string are extracted into the different tables according to the following rules`
			`; Single string`
			`; X => X = locality name`
			`; Unary operators`
			`; (le X), (el X), (sans X) => X = masculine word`
			`; (la X) => X = feminine word`
			`; (les X) => X = plural word`
			`; (las X) => X = feminine plural word`
			`; (los X) => X = masculine plural word`
			`; (Saint X), (Santo X), (San X) => X = masculine name`
			`; (Sainte X), (Santa X) => X = feminine name`
			`; Binary operators`
			`; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name`
			`; (à X Y), (en X Y), (de X Y), (di X Y) => X = locality name, Y = area name`
			`; (sur X Y) => X = locality name, Y = river name`

			`; During extraction, the strings are replaced by symbols of the corresponding type to form patterns`
			`(define (extract-to-distribution dep token dist sym)`
			`(if (string? token)`
			`(begin`
			`(case sym`
			`(($LocalityName$ $AreaName$ $RiverName$)`
			`; Those are added to a markov chain`
			`(markov-add-example! dist (string->list (string-titlecase token))))`
			`(else`
			`; added to a distribution`
			`(distribution-add-to! dist token)))`
			`sym)`
			`(extract-strings dep token)))`

			`; Extraction of unary operators`
			`(define (unary-operator dep token dist sym)`
			`(list (car token) (extract-to-distribution dep (cadr token) dist sym)))`

			`; Extraction of binary operators`
			`(define (binary-operator dep token dist1 sym1 dist2 sym2)`
			`(list (car token) (extract-to-distribution dep (cadr token) dist1 sym1) (extract-to-distribution dep (caddr token) dist2 sym2)))`

			`(define (extract-strings dep token)`
			`(cond`
			`( (list? token)`
			`(case (car token)`
			`((le el sans)`
			`(unary-operator dep token (departement-word-masculine dep) '$WordMasculine$))`
			`((la)`
			`(unary-operator dep token (departement-word-feminine dep) '$WordFeminine$))`
			`((les)`
			`(unary-operator dep token (departement-word-plural dep) '$WordPlural$))`
			`((las)`
			`(unary-operator dep token (departement-word-feminine-plural dep) '$WordFemininePlural$))`
			`((los)`
			`(unary-operator dep token (departement-word-masculine-plural dep) '$WordMasculinePlural$))`
			`((Saint Santo San)`
			`(unary-operator dep token (departement-person-name-masculine dep) '$PersonNameMasculine$))`
			`((Sainte Santa)`
			`(unary-operator dep token (departement-person-name-feminine dep) '$PersonNameFeminine$))`
			`((et sous lès lez près)`
			`(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-locality-name dep) '$LocalityName$))`
			`((à en de di)`
			`(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-area-name dep) '$AreaName$))`
			`((sur)`
			`(binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-river-name dep) '$RiverName$))`
			`(else`
			`(map (lambda (x) (extract-strings dep x)) token))`
			`))`
			`( (string? token)`
			`(extract-to-distribution dep token (departement-locality-name dep) '$LocalityName$))`
			`(#t`
			`(error "Unexpected element: " token))))`

			`; Add a pattern to a departement`
			`(define (add-pattern dep pattern)`
			`(distribution-add-to! (departement-patterns dep) pattern))`

			`; Train the extractor on the commune name`
			`(define (train-commune extractor obj)`
			`(define reg (hash-ref extractor (csv-commune-region-id obj)))`
			`(define dep (hash-ref (region-departements reg) (csv-commune-departement-id obj)))`
			`; Tokenize the name by removing elisions and splitting along the hyphens`
			`(define tokens (string-split (remove-elision (csv-commune-name obj)) "-"))`
			`; Decompose the tokens into a tree with preposition`
			`(set! tokens (foldl preposition-pass tokens prepositions))`
			`; Decompose the tokens into a tree with articles`
			`(set! tokens (foldl article-pass tokens articles))`
			`; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item`
			`(set! tokens (simplify-single (remove-empty tokens)))`
			`; Extract the strings into a departement and generate a pattern`
			`(add-pattern dep (extract-strings dep tokens)))`

			`; Add a commune to the extractor, an hash table of regions`
			`; Only add if the locality is active and its names are coherent`
			`(define (add-commune extractor obj)`
			`(when (and (csv-commune-coherent? obj)`
			`(csv-commune-active? obj))`
			`(add-region extractor obj)`
			`(add-departement extractor obj)`
			`(train-commune extractor obj)))`

			`; Table of regions in which the data is stored`
			`(define extractor (make-hash))`

			`; Read data file and fill tables`
			`; File a CSV with semi-colon (';') as field separator`
			`(define source-file "./data/fr-esr-referentiel-geographique.csv")`
			`(define source-reader`
			`(make-csv-reader`
			`(open-input-file source-file)`
			`'((separator-chars . (#\;))`
			`(strip-leading-whitespace? . #t)`
			`(strip-trailing-whitespace? . #t))))`
			`; Get the header`
			`(define header (source-reader))`

			`; Fill the extractor`
			`(csv-for-each`
			`(lambda (line)`
			`(add-commune`
			`extractor`
			`(map`
			`(lambda (x) (apply cons x))`
			`(zip header line))))`
			`source-reader)`

			`; Translators for json serialisation`
			`(define (pattern-translator x)`
			`(cond`
			`((symbol? x) (symbol->string x))`
			`((list? x) (map pattern-translator x))`
			`(else x)))`

			`(define (markov-translator x)`
			`(cond`
			`((char? x) (list->string (list x)))`
			`(else x)))`

			`; Output a departement to a json-compatible structure`
			`(define (departement->jsexpr dep)`
			`(make-immutable-hash`
			`((id . ,(departement-id dep))
			`(name . ,(departement-name dep))`
			`(patterns . ,(distribution->jsexpr (departement-patterns dep) #:translator pattern-translator))`
			`(word-masculine . ,(distribution->jsexpr (departement-word-masculine dep)))`
			`(word-feminine . ,(distribution->jsexpr (departement-word-feminine dep)))`
			`(word-plural . ,(distribution->jsexpr (departement-word-plural dep)))`
			`(word-masculine-plural . ,(distribution->jsexpr (departement-word-masculine-plural dep)))`
			`(word-feminine-plural . ,(distribution->jsexpr (departement-word-feminine-plural dep)))`
			`(person-name-masculine . ,(distribution->jsexpr (departement-person-name-masculine dep)))`
			`(person-name-feminine . ,(distribution->jsexpr (departement-person-name-feminine dep)))`
			`(locality-name . ,(markov->jsexpr (departement-locality-name dep) #:translator markov-translator))`
			`(area-name . ,(markov->jsexpr (departement-area-name dep) #:translator markov-translator))`
			`(river-name . ,(markov->jsexpr (departement-river-name dep) #:translator markov-translator))`
			`)))`

			`; Output a region to a json-compatible structure`
			`(define (region->jsexpr reg)`
			`(make-immutable-hash`
			`((id . ,(region-id reg))
			`(name . ,(region-name reg))`
			`(departements . ,(map departement->jsexpr (hash-values (region-departements reg))))`
			`)))`

			`; Output the extractor to json-compatible structure`
			`(define (extractor->jsexpr extractor)`
			`(map region->jsexpr (hash-values extractor)))`

Move the json representation in the web folder 2019-11-05 17:39:29 +01:00			`(define output (open-output-file "./web/gener-communes.json" #:exists 'truncate/replace))`
Add extractor & generator 2019-11-04 17:23:08 +01:00			`; output lisp structure`
			`(write-json (extractor->jsexpr extractor) output)`