Add extractor & generator

2019-11-04 17:23:08 +01:00 · 2019-11-04 17:23:08 +01:00 · 912b51abe4
parent 9c93990b21
commit 912b51abe4
2 changed files with 649 additions and 0 deletions
--- a/extractor.rkt
+++ b/extractor.rkt
@ -0,0 +1,421 @@
+#lang racket
+
+; Extracts data from CSV and generate a json file suitable for generation
+; The output json file contains a list of regions, each containing a list of departements :
+; [ {
+;     "id": "...", "name": "...",
+;     "departements":
+;       [ {
+;           "id": "...", "name": "...",
+;           "word-masculine": [{"value": "...", "count": X}, ...],
+;           "word-feminine": [{"value": "...", "count": X}, ...],
+;           "word-plural": [{"value": "...", "count": X}, ...],
+;           "word-masculine-plural": [{"value": "...", "count": X}, ...],
+;           "word-feminine-plural": [{"value": "...", "count": X}, ...],
+;           "person-name-masculine": [{"value": "...", "count": X}, ...],
+;           "person-name-feminine": [{"value": "...", "count": X}, ...],
+;           "patterns": [{"value": [...], "count": X}, ...],
+;           "locality-markov": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
+;           "area-markov": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
+;           "river-markov": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]},
+;         },
+;         ...
+;      ],
+;   },
+;   ...
+; ]
+; The distributions and markov chains are trained are by departement
+
+(require
+  csv-reading
+  json
+  srfi/1
+  "markov.rkt"
+  "distribution.rkt")
+
+; Structure for each region
+(struct region
+  (id ; id of region
+   name ; name of region
+   departements ; table of departements (id->dep)
+   ))
+
+; Constructor for a region
+(define (make-region id name)
+  (region id name (make-hash)))
+
+; Structure for each departement
+(struct departement
+  (id ; id of departement
+   name ; name of departement
+   patterns ; distribution of patterns
+   word-masculine ; distribution of masculine words
+   word-feminine ; distribution of feminine words
+   word-plural ; distribution of plural words
+   word-masculine-plural ; distribution of masculine plural words
+   word-feminine-plural ; distribution of feminine plural words
+   person-name-masculine ; distributino of masculine person name
+   person-name-feminine ; distribution of feminine person name
+   locality-name ; markov chain for locality names
+   area-name ; markov chain for area names
+   river-name ; markov chain for river names
+   ))
+
+; Constructor for a departement
+(define (make-departement id name)
+  (departement
+    id
+    name
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-markov 3)
+    (make-markov 3)
+    (make-markov 3)))
+
+;;
+; Extraction tools
+
+; Accessors of a csv-commune object (dict of line extracted from the csv file)
+; Name
+(define (csv-commune-name obj)
+  (dict-ref obj "COM_NOM"))
+; Uppercase name
+(define (csv-commune-name-maj obj)
+  (dict-ref obj "COM_NOM_MAJ"))
+; Uppercase short name
+(define (csv-commune-name-maj-court obj)
+  (dict-ref obj "COM_NOM_MAJ_COURT"))
+; Region Id
+(define (csv-commune-region-id obj)
+  (dict-ref obj "REG_ID_OLD"))
+; Region name
+(define (csv-commune-region-name obj)
+  (dict-ref obj "REG_NOM_OLD"))
+; Departement Id
+(define (csv-commune-departement-id obj)
+  (dict-ref obj "DEP_ID"))
+; Departement name
+(define (csv-commune-departement-name obj)
+  (dict-ref obj "DEP_NOM"))
+; Indicate if a commune is active (still exists)
+(define (csv-commune-active? obj)
+  (not (equal? "SO" (dict-ref obj "UUCR_ID"))))
+
+; Function removing all diacrtics of latin-based languages
+(define (remove-diacritics str)
+  (list->string
+      (filter
+        (lambda (c)
+          (< (char->integer c) 256))
+        (string->list
+          (string-normalize-nfd str)))))
+
+; Return the short form of a name
+(define (short-form str)
+  (string-join
+    (map
+      (lambda (x)
+        (cond
+          ((equal? x "SAINT") "ST")
+          ((equal? x "SAINTE") "STE")
+          (#t x)))
+      (string-split
+        str
+        #rx"[-']"))
+    " "))
+
+; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name
+(define (csv-commune-coherent? obj)
+  (define name (csv-commune-name obj))
+  (define name-maj (csv-commune-name-maj obj))
+  (define name-maj-court (csv-commune-name-maj-court obj))
+  (and
+    ; uppercase name should be name but capitalized and diacritics removed
+    (equal? name-maj
+            (string-upcase (remove-diacritics name)))
+    ; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE "
+    (equal? name-maj-court
+            (short-form name-maj))))
+
+; Add a region to the extractor
+(define (add-region extractor obj)
+  (define id (csv-commune-region-id obj))
+  (when (not (hash-has-key? extractor id))
+    (hash-set! extractor id (make-region id (csv-commune-region-name obj)))))
+
+; Add a departement to the extractor
+(define (add-departement extractor obj)
+  (define reg (hash-ref extractor (csv-commune-region-id obj)))
+  (define id (csv-commune-departement-id obj))
+  (when (not (hash-has-key? (region-departements reg) id))
+    (hash-set! (region-departements reg) id (make-departement id (csv-commune-departement-name obj)))))
+
+; Remove elision
+(define *elisions*
+  '(("-l'" . "-le-") ; masculine is used - though "l'" can also be an elision for the feminine "la"
+    ("-d'" . "-de-")
+    ("-des-" . "-de-les-")
+    ("-du-" . "-de-le-")
+    ("-del-" . "-de-el-") ; Catalan
+    ("-dels-" . "-de-los-") ; Catalan
+    ("-au-" . "-à-le-")
+    ("-aux-" . "-à-les-")
+    ("Sant'" . "Santo-") ; Corsican
+    ; Initials articles are replaced with lowercase variants
+    ("La " . "la-")
+    ("Le " . "le-")
+    ("Les " . "les-")
+    ("L'" . "le-")
+    ("Los " . "los-") ; Catalan
+   ))
+(define (remove-elision str)
+  (foldl
+    (lambda (item result)
+      (string-replace result (car item) (cdr item)))
+    str
+    *elisions*))
+
+; Transform preposition into binary operators between lists:
+; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z))
+; Note: recursive on lists
+(define *prepositions*
+  '("et"
+    "à"
+    "en"
+    "de"
+    "di" ; Corsican
+    "sur"
+    "sous"
+    "lès"
+    "lez"
+    "près"))
+(define (preposition-pass prep tokens)
+  (if (list? tokens)
+      (let ((idx (index-of tokens prep)))
+        (if (and idx (< (+ idx 1) (length tokens))) ; preposition cannot be in last position
+            (list
+              (string->symbol prep)
+              (take tokens idx)
+              (preposition-pass prep (drop tokens (+ 1 idx))))
+            (map (lambda (x) (preposition-pass prep x)) tokens)))
+      tokens))
+
+; Transform articles (and article-like words as "Saint") into unary operators
+; ex: ("le" X Y) => ((le X) Y)
+(define *articles*
+  '("le"
+    "el" ; Catalan/Spanish
+    "la"
+    "les"
+    "las" ; Catalan
+    "los" ; Catalan
+    "Saint"
+    "Santo" ; Corsican
+    "San" ; Corsican
+    "Sainte"
+    "Santa" ; Corsican
+    "sans"))
+(define (article-pass art tokens)
+  (if (list? tokens)
+      (let ((idx (index-of tokens art)))
+        (if (and idx (< (+ idx 1) (length tokens))) ; article cannot be in last position
+            (list
+              (take tokens idx)
+              (list
+                (string->symbol art)
+                (list-ref tokens (+ 1 idx)))
+              (article-pass art (drop tokens (+ 2 idx))))
+            (map (lambda (x) (article-pass art x)) tokens)))
+      tokens))
+
+; Simplify: remove empty lists
+(define (remove-empty tokens)
+  (if (list? tokens)
+      (map
+        remove-empty
+        (filter (lambda (x) (not (null? x))) tokens))
+      tokens))
+
+; Simplify: lists of single elements are replaced by that element
+(define (simplify-single tokens)
+  (if (list? tokens)
+      (if (eq? 1 (length tokens))
+          (simplify-single (car tokens))
+          (map simplify-single tokens))
+      tokens))
+
+; Extract the strings and generate a pattern
+; The string are extracted into the different tables according to the following rules
+; Single string
+; X => X = locality name
+; Unary operators
+; (le X), (el X), (sans X) => X = masculine word
+; (la X) => X = feminine word
+; (les X) => X = plural word
+; (las X) => X = feminine plural word
+; (los X) => X = masculine plural word
+; (Saint X), (Santo X), (San X) => X = masculine name
+; (Sainte X), (Santa X) => X = feminine name
+; Binary operators
+; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name
+; (à X Y), (en X Y), (de X Y), (di X Y)  => X = locality name, Y = area name
+; (sur X Y) => X = locality name, Y = river name
+
+; During extraction, the strings are replaced by symbols of the corresponding type to form patterns
+(define (extract-to-distribution dep token dist sym)
+  (if (string? token)
+      (begin
+        (case sym
+          (($LocalityName$ $AreaName$ $RiverName$)
+            ; Those are added to a markov chain
+            (markov-add-example! dist (string->list (string-titlecase token))))
+          (else
+            ; added to a distribution
+            (distribution-add-to! dist token)))
+        sym)
+      (extract-strings dep token)))
+
+; Extraction of unary operators
+(define (unary-operator dep token dist sym)
+  (list (car token) (extract-to-distribution dep (cadr token) dist sym)))
+
+; Extraction of binary operators
+(define (binary-operator dep token dist1 sym1 dist2 sym2)
+  (list (car token) (extract-to-distribution dep (cadr token) dist1 sym1) (extract-to-distribution dep (caddr token) dist2 sym2)))
+
+(define (extract-strings dep token)
+  (cond
+    ( (list? token)
+      (case (car token)
+        ((le el sans)
+         (unary-operator dep token (departement-word-masculine dep) '$WordMasculine$))
+        ((la)
+         (unary-operator dep token (departement-word-feminine dep) '$WordFeminine$))
+        ((les)
+         (unary-operator dep token (departement-word-plural dep) '$WordPlural$))
+        ((las)
+         (unary-operator dep token (departement-word-feminine-plural dep) '$WordFemininePlural$))
+        ((los)
+         (unary-operator dep token (departement-word-masculine-plural dep) '$WordMasculinePlural$))
+        ((Saint Santo San)
+         (unary-operator dep token (departement-person-name-masculine dep) '$PersonNameMasculine$))
+        ((Sainte Santa)
+         (unary-operator dep token (departement-person-name-feminine dep) '$PersonNameFeminine$))
+        ((et sous lès lez près)
+         (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-locality-name dep) '$LocalityName$))
+        ((à en de di)
+         (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-area-name dep) '$AreaName$))
+        ((sur)
+         (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-river-name dep) '$RiverName$))
+        (else
+          (map (lambda (x) (extract-strings dep x)) token))
+        ))
+    ( (string? token)
+      (extract-to-distribution dep token (departement-locality-name dep) '$LocalityName$))
+    (#t
+      (error "Unexpected element: " token))))
+
+; Add a pattern to a departement
+(define (add-pattern dep pattern)
+  (distribution-add-to! (departement-patterns dep) pattern))
+
+; Train the extractor on the commune name
+(define (train-commune extractor obj)
+  (define reg (hash-ref extractor (csv-commune-region-id obj)))
+  (define dep (hash-ref (region-departements reg) (csv-commune-departement-id obj)))
+  ; Tokenize the name by removing elisions and splitting along the hyphens
+  (define tokens (string-split (remove-elision (csv-commune-name obj)) "-"))
+  ; Decompose the tokens into a tree with preposition
+  (set! tokens (foldl preposition-pass tokens *prepositions*))
+  ; Decompose the tokens into a tree with articles
+  (set! tokens (foldl article-pass tokens *articles*))
+  ; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item
+  (set! tokens (simplify-single (remove-empty tokens)))
+  ; Extract the strings into a departement and generate a pattern
+  (add-pattern dep (extract-strings dep tokens)))
+
+; Add a commune to the extractor, an hash table of regions
+; Only add if the locality is active and its names are coherent
+(define (add-commune extractor obj)
+  (when (and (csv-commune-coherent? obj)
+             (csv-commune-active? obj))
+    (add-region extractor obj)
+    (add-departement extractor obj)
+    (train-commune extractor obj)))
+
+; Table of regions in which the data is stored
+(define *extractor* (make-hash))
+
+; Read data file and fill tables
+; File a CSV with semi-colon (';') as field separator
+(define *source-file* "./data/fr-esr-referentiel-geographique.csv")
+(define source-reader
+  (make-csv-reader
+    (open-input-file *source-file*)
+    '((separator-chars . (#\;))
+      (strip-leading-whitespace? . #t)
+      (strip-trailing-whitespace? . #t))))
+; Get the header
+(define *header* (source-reader))
+
+; Fill the extractor
+(csv-for-each
+  (lambda (line)
+    (add-commune
+      *extractor*
+      (map
+        (lambda (x) (apply cons x))
+        (zip *header* line))))
+  source-reader)
+
+; Translators for json serialisation
+(define (pattern-translator x)
+  (cond
+    ((symbol? x) (symbol->string x))
+    ((list? x) (map pattern-translator x))
+    (else x)))
+
+(define (markov-translator x)
+  (cond
+    ((char? x) (list->string (list x)))
+    (else x)))
+
+; Output a departement to a json-compatible structure
+(define (departement->jsexpr dep)
+  (make-immutable-hash
+    `((id . ,(departement-id dep))
+      (name . ,(departement-name dep))
+      (patterns . ,(distribution->jsexpr (departement-patterns dep) #:translator pattern-translator))
+      (word-masculine . ,(distribution->jsexpr (departement-word-masculine dep)))
+      (word-feminine . ,(distribution->jsexpr (departement-word-feminine dep)))
+      (word-plural . ,(distribution->jsexpr (departement-word-plural dep)))
+      (word-masculine-plural . ,(distribution->jsexpr (departement-word-masculine-plural dep)))
+      (word-feminine-plural . ,(distribution->jsexpr (departement-word-feminine-plural dep)))
+      (person-name-masculine . ,(distribution->jsexpr (departement-person-name-masculine dep)))
+      (person-name-feminine . ,(distribution->jsexpr (departement-person-name-feminine dep)))
+      (locality-name . ,(markov->jsexpr (departement-locality-name dep) #:translator markov-translator))
+      (area-name . ,(markov->jsexpr (departement-area-name dep) #:translator markov-translator))
+      (river-name . ,(markov->jsexpr (departement-river-name dep) #:translator markov-translator))
+      )))
+
+; Output a region to a json-compatible structure
+(define (region->jsexpr reg)
+  (make-immutable-hash
+    `((id . ,(region-id reg))
+      (name . ,(region-name reg))
+      (departements . ,(map departement->jsexpr (hash-values (region-departements reg))))
+      )))
+
+; Output the extractor to json-compatible structure
+(define (extractor->jsexpr extractor)
+  (map region->jsexpr (hash-values extractor)))
+
+(define *output* (open-output-file "./gener-communes.json" #:exists 'truncate/replace))
+; output lisp structure
+(write-json (extractor->jsexpr *extractor*) *output*)
--- a/name-generator.rkt
+++ b/name-generator.rkt
@ -0,0 +1,228 @@
+#lang racket
+
+(require
+  json
+  "markov.rkt"
+  "distribution.rkt")
+
+; Structure for each region
+(struct region
+  (id ; id of region
+   name ; name of region
+   departements ; table of departements (id->dep)
+   ))
+
+; Constructor for a region
+(define (make-region id name)
+  (region id name (make-hash)))
+
+; Structure for each departement
+(struct departement
+  (id ; id of departement
+   name ; name of departement
+   patterns ; distribution of patterns
+   word-masculine ; distribution of masculine words
+   word-feminine ; distribution of feminine words
+   word-plural ; distribution of plural words
+   word-masculine-plural ; distribution of masculine plural words
+   word-feminine-plural ; distribution of feminine plural words
+   person-name-masculine ; distributino of masculine person name
+   person-name-feminine ; distribution of feminine person name
+   locality-name ; markov chain for locality names
+   area-name ; markov chain for area names
+   river-name ; markov chain for river names
+   ))
+
+; Constructor for a departement
+(define (make-departement id name)
+  (departement
+    id
+    name
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-distribution)
+    (make-markov 3)
+    (make-markov 3)
+    (make-markov 3)))
+
+; Generate names
+; Replace symbols with a generated element
+(define (replace-with sym dep)
+  (cond
+    ((symbol? sym)
+     (case sym
+      (($WordMasculine$) (distribution-pick-from (departement-word-masculine dep)))
+      (($WordFeminine$) (distribution-pick-from (departement-word-feminine dep)))
+      (($WordPlural$) (distribution-pick-from (departement-word-plural dep)))
+      (($WordFemininePlural$) (distribution-pick-from (departement-word-feminine-plural dep)))
+      (($WordMasculinePlural$) (distribution-pick-from (departement-word-masculine-plural dep)))
+      (($PersonNameMasculine$) (distribution-pick-from (departement-person-name-masculine dep)))
+      (($PersonNameFeminine$) (distribution-pick-from (departement-person-name-feminine dep)))
+      (($LocalityName$) (list->string (markov-generate (departement-locality-name dep))))
+      (($AreaName$) (list->string (markov-generate (departement-area-name dep))))
+      (($RiverName$) (list->string (markov-generate (departement-river-name dep))))
+      (else sym)))
+    ((list? sym)
+     (map (lambda (x) (replace-with x dep)) sym))
+    (#t sym)))
+
+; Indicate if a string starts with a vowel
+; Accent taken into account
+(define (start-with-vowel? str)
+  (member
+    (car
+      (string->list
+        (string-normalize-nfd str)))
+    (list #\A #\E #\I #\O #\U #\Y)))
+
+; Transform the tree structure of a pattern back to a string
+(define (tree->string pattern)
+  (cond
+    ( (list? pattern)
+      (case (car pattern)
+        ((el sans la les las los Saint San Sainte Santa)
+          (string-append
+            (symbol->string (car pattern))
+            "-"
+            (tree->string (cadr pattern))))
+        ((le)
+          ; elision possible: much check the initial of the following word
+          (let ((following (tree->string (cadr pattern))))
+            (string-append
+              (if (start-with-vowel? following)
+                "l'"
+                "le-")
+               following)))
+        ((Santo)
+          ; elision possible: much check the initial of the following
+          (let ((following (tree->string (cadr pattern))))
+            (string-append
+              (if (start-with-vowel? following)
+                  "Sant'"
+                  "Santo-")
+               following)))
+        ((et sous lès lez près à en di sur)
+          (string-append
+            (tree->string (cadr pattern))
+            "-"
+            (symbol->string (car pattern))
+            "-"
+            (tree->string (caddr pattern))))
+        ((de)
+          ; ellision possible: must check the initial of the following word
+          (let ((following (tree->string (caddr pattern))))
+            (string-append
+              (tree->string (cadr pattern))
+              "-"
+              (if (start-with-vowel? following)
+                  "d'"
+                  "de-")
+              following)))
+        (else
+          (string-join
+            (map tree->string pattern)
+            "-"))))
+    ( (string? pattern)
+      pattern)
+    (#t
+      (error "Unexpected element: " pattern))))
+
+; Apply ellision rules
+(define (apply-ellisions str)
+  (define res str)
+  (set! res (string-replace res "-à-les-" "-aux-"))
+  (set! res (string-replace res "-de-los-" "-dels-")) ; Catalan
+  (set! res (string-replace res "-de-el-" "-del-")) ; Catalan
+  (set! res (string-replace res "-de-les-" "-des-"))
+  (set! res (string-replace res "-de-le-" "-du-"))
+  (set! res (string-replace res "-à-le-" "-au-"))
+  ; Start of word
+  (cond
+    ((string-prefix? res "la-")
+     (string-replace res "la-" "La " #:all? #f))
+    ((string-prefix? res "le-")
+     (string-replace res "le-" "Le " #:all? #f))
+    ((string-prefix? res "les-")
+     (string-replace res "les-" "Les " #:all? #f))
+    ((string-prefix? res "l'")
+     (string-replace res "l'" "L'" #:all? #f))
+    ((string-prefix? res "los-")
+     (string-replace res "los-" "Los " #:all? #f))
+    (#t res)))
+
+; Generate a new name from a departement
+(define (departement-generate dep)
+  (apply-ellisions
+    (tree->string
+      (replace-with
+        (distribution-pick-from (departement-patterns dep))
+        dep))))
+
+; Generate a new name from a region
+(define (region-generate reg)
+  (departement-generate (car (shuffle (hash-values (region-departements reg))))))
+
+; Generate a new name
+(define (generate gen)
+  (region-generate (car (shuffle (hash-values gen)))))
+
+(define (pattern-translator x)
+  (cond
+    ((string? x) (string->symbol x))
+    ((list? x) (map pattern-translator x))
+    (#t x)))
+
+(define (markov-translator x)
+  (cond
+    ((string? x) (car (string->list x))) ; single-character strings
+    (#t x)))
+
+; Convert a json-compatible structure to a departement
+(define (jsexpr->departement js)
+  (departement
+    (hash-ref js 'id)
+    (hash-ref js 'name)
+    (jsexpr->distribution (hash-ref js 'patterns) #:translator pattern-translator)
+    (jsexpr->distribution (hash-ref js 'word-masculine))
+    (jsexpr->distribution (hash-ref js 'word-feminine))
+    (jsexpr->distribution (hash-ref js 'word-plural))
+    (jsexpr->distribution (hash-ref js 'word-masculine-plural))
+    (jsexpr->distribution (hash-ref js 'word-feminine-plural))
+    (jsexpr->distribution (hash-ref js 'person-name-masculine))
+    (jsexpr->distribution (hash-ref js 'person-name-feminine))
+    (jsexpr->markov (hash-ref js 'locality-name) #:translator markov-translator)
+    (jsexpr->markov (hash-ref js 'area-name) #:translator markov-translator)
+    (jsexpr->markov (hash-ref js 'river-name) #:translator markov-translator)))
+
+; Convert a json-compatible structure to a region
+(define (jsexpr->region js)
+  (define reg (make-region (hash-ref js 'id) (hash-ref js 'name)))
+  (for-each
+    (lambda (dep)
+      (hash-set! (region-departements reg) (departement-id dep) dep))
+    (map jsexpr->departement (hash-ref js 'departements)))
+  reg)
+
+; Convert a json-compatible structure to a generator
+(define (jsexpr->generator js)
+  (define gen (make-hash))
+  (for-each
+    (lambda (reg)
+      (hash-set! gen (region-id reg) reg))
+    (map jsexpr->region js))
+  gen)
+
+; Read the file containing all the data
+(define *input* (open-input-file "./gener-communes.json"))
+
+; Process the input and train
+(define *generator* (jsexpr->generator (read-json *input*)))
+
+(for-each
+  displayln
+  (build-list 100 (lambda (_) (generate *generator*))))