diff --git a/extractor.rkt b/extractor.rkt new file mode 100644 index 0000000..5f977bb --- /dev/null +++ b/extractor.rkt @@ -0,0 +1,421 @@ +#lang racket + +; Extracts data from CSV and generate a json file suitable for generation +; The output json file contains a list of regions, each containing a list of departements : +; [ { +; "id": "...", "name": "...", +; "departements": +; [ { +; "id": "...", "name": "...", +; "word-masculine": [{"value": "...", "count": X}, ...], +; "word-feminine": [{"value": "...", "count": X}, ...], +; "word-plural": [{"value": "...", "count": X}, ...], +; "word-masculine-plural": [{"value": "...", "count": X}, ...], +; "word-feminine-plural": [{"value": "...", "count": X}, ...], +; "person-name-masculine": [{"value": "...", "count": X}, ...], +; "person-name-feminine": [{"value": "...", "count": X}, ...], +; "patterns": [{"value": [...], "count": X}, ...], +; "locality-markov": {"order": X, "next": [{ "sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]}, +; "area-markov": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]}, +; "river-markov": {"order": X, "next": [{"sequence": [...], "distribution": [{"value": "...", "count": X}, ...] }, ...]}, +; }, +; ... +; ], +; }, +; ... +; ] +; The distributions and markov chains are trained are by departement + +(require + csv-reading + json + srfi/1 + "markov.rkt" + "distribution.rkt") + +; Structure for each region +(struct region + (id ; id of region + name ; name of region + departements ; table of departements (id->dep) + )) + +; Constructor for a region +(define (make-region id name) + (region id name (make-hash))) + +; Structure for each departement +(struct departement + (id ; id of departement + name ; name of departement + patterns ; distribution of patterns + word-masculine ; distribution of masculine words + word-feminine ; distribution of feminine words + word-plural ; distribution of plural words + word-masculine-plural ; distribution of masculine plural words + word-feminine-plural ; distribution of feminine plural words + person-name-masculine ; distributino of masculine person name + person-name-feminine ; distribution of feminine person name + locality-name ; markov chain for locality names + area-name ; markov chain for area names + river-name ; markov chain for river names + )) + +; Constructor for a departement +(define (make-departement id name) + (departement + id + name + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-markov 3) + (make-markov 3) + (make-markov 3))) + +;; +; Extraction tools + +; Accessors of a csv-commune object (dict of line extracted from the csv file) +; Name +(define (csv-commune-name obj) + (dict-ref obj "COM_NOM")) +; Uppercase name +(define (csv-commune-name-maj obj) + (dict-ref obj "COM_NOM_MAJ")) +; Uppercase short name +(define (csv-commune-name-maj-court obj) + (dict-ref obj "COM_NOM_MAJ_COURT")) +; Region Id +(define (csv-commune-region-id obj) + (dict-ref obj "REG_ID_OLD")) +; Region name +(define (csv-commune-region-name obj) + (dict-ref obj "REG_NOM_OLD")) +; Departement Id +(define (csv-commune-departement-id obj) + (dict-ref obj "DEP_ID")) +; Departement name +(define (csv-commune-departement-name obj) + (dict-ref obj "DEP_NOM")) +; Indicate if a commune is active (still exists) +(define (csv-commune-active? obj) + (not (equal? "SO" (dict-ref obj "UUCR_ID")))) + +; Function removing all diacrtics of latin-based languages +(define (remove-diacritics str) + (list->string + (filter + (lambda (c) + (< (char->integer c) 256)) + (string->list + (string-normalize-nfd str))))) + +; Return the short form of a name +(define (short-form str) + (string-join + (map + (lambda (x) + (cond + ((equal? x "SAINT") "ST") + ((equal? x "SAINTE") "STE") + (#t x))) + (string-split + str + #rx"[-']")) + " ")) + +; Indicate if the commune name's uppercase form and uppercase short form can be computed from the commune's name +(define (csv-commune-coherent? obj) + (define name (csv-commune-name obj)) + (define name-maj (csv-commune-name-maj obj)) + (define name-maj-court (csv-commune-name-maj-court obj)) + (and + ; uppercase name should be name but capitalized and diacritics removed + (equal? name-maj + (string-upcase (remove-diacritics name))) + ; uppercase short name should be uppercase name with apostrophes and hyphens replaced by spaces, "SAINT " replaced by "ST " and "SAINTE " replaced by "STE " + (equal? name-maj-court + (short-form name-maj)))) + +; Add a region to the extractor +(define (add-region extractor obj) + (define id (csv-commune-region-id obj)) + (when (not (hash-has-key? extractor id)) + (hash-set! extractor id (make-region id (csv-commune-region-name obj))))) + +; Add a departement to the extractor +(define (add-departement extractor obj) + (define reg (hash-ref extractor (csv-commune-region-id obj))) + (define id (csv-commune-departement-id obj)) + (when (not (hash-has-key? (region-departements reg) id)) + (hash-set! (region-departements reg) id (make-departement id (csv-commune-departement-name obj))))) + +; Remove elision +(define *elisions* + '(("-l'" . "-le-") ; masculine is used - though "l'" can also be an elision for the feminine "la" + ("-d'" . "-de-") + ("-des-" . "-de-les-") + ("-du-" . "-de-le-") + ("-del-" . "-de-el-") ; Catalan + ("-dels-" . "-de-los-") ; Catalan + ("-au-" . "-à-le-") + ("-aux-" . "-à-les-") + ("Sant'" . "Santo-") ; Corsican + ; Initials articles are replaced with lowercase variants + ("La " . "la-") + ("Le " . "le-") + ("Les " . "les-") + ("L'" . "le-") + ("Los " . "los-") ; Catalan + )) +(define (remove-elision str) + (foldl + (lambda (item result) + (string-replace result (car item) (cdr item))) + str + *elisions*)) + +; Transform preposition into binary operators between lists: +; ex: (A B C "sur" X Y Z) => (sur (A B C) (X Y Z)) +; Note: recursive on lists +(define *prepositions* + '("et" + "à" + "en" + "de" + "di" ; Corsican + "sur" + "sous" + "lès" + "lez" + "près")) +(define (preposition-pass prep tokens) + (if (list? tokens) + (let ((idx (index-of tokens prep))) + (if (and idx (< (+ idx 1) (length tokens))) ; preposition cannot be in last position + (list + (string->symbol prep) + (take tokens idx) + (preposition-pass prep (drop tokens (+ 1 idx)))) + (map (lambda (x) (preposition-pass prep x)) tokens))) + tokens)) + +; Transform articles (and article-like words as "Saint") into unary operators +; ex: ("le" X Y) => ((le X) Y) +(define *articles* + '("le" + "el" ; Catalan/Spanish + "la" + "les" + "las" ; Catalan + "los" ; Catalan + "Saint" + "Santo" ; Corsican + "San" ; Corsican + "Sainte" + "Santa" ; Corsican + "sans")) +(define (article-pass art tokens) + (if (list? tokens) + (let ((idx (index-of tokens art))) + (if (and idx (< (+ idx 1) (length tokens))) ; article cannot be in last position + (list + (take tokens idx) + (list + (string->symbol art) + (list-ref tokens (+ 1 idx))) + (article-pass art (drop tokens (+ 2 idx)))) + (map (lambda (x) (article-pass art x)) tokens))) + tokens)) + +; Simplify: remove empty lists +(define (remove-empty tokens) + (if (list? tokens) + (map + remove-empty + (filter (lambda (x) (not (null? x))) tokens)) + tokens)) + +; Simplify: lists of single elements are replaced by that element +(define (simplify-single tokens) + (if (list? tokens) + (if (eq? 1 (length tokens)) + (simplify-single (car tokens)) + (map simplify-single tokens)) + tokens)) + +; Extract the strings and generate a pattern +; The string are extracted into the different tables according to the following rules +; Single string +; X => X = locality name +; Unary operators +; (le X), (el X), (sans X) => X = masculine word +; (la X) => X = feminine word +; (les X) => X = plural word +; (las X) => X = feminine plural word +; (los X) => X = masculine plural word +; (Saint X), (Santo X), (San X) => X = masculine name +; (Sainte X), (Santa X) => X = feminine name +; Binary operators +; (et X Y), (sous X Y), (lès X Y), (lez X Y), (près X Y) => X = locality name, Y = locality name +; (à X Y), (en X Y), (de X Y), (di X Y) => X = locality name, Y = area name +; (sur X Y) => X = locality name, Y = river name + +; During extraction, the strings are replaced by symbols of the corresponding type to form patterns +(define (extract-to-distribution dep token dist sym) + (if (string? token) + (begin + (case sym + (($LocalityName$ $AreaName$ $RiverName$) + ; Those are added to a markov chain + (markov-add-example! dist (string->list (string-titlecase token)))) + (else + ; added to a distribution + (distribution-add-to! dist token))) + sym) + (extract-strings dep token))) + +; Extraction of unary operators +(define (unary-operator dep token dist sym) + (list (car token) (extract-to-distribution dep (cadr token) dist sym))) + +; Extraction of binary operators +(define (binary-operator dep token dist1 sym1 dist2 sym2) + (list (car token) (extract-to-distribution dep (cadr token) dist1 sym1) (extract-to-distribution dep (caddr token) dist2 sym2))) + +(define (extract-strings dep token) + (cond + ( (list? token) + (case (car token) + ((le el sans) + (unary-operator dep token (departement-word-masculine dep) '$WordMasculine$)) + ((la) + (unary-operator dep token (departement-word-feminine dep) '$WordFeminine$)) + ((les) + (unary-operator dep token (departement-word-plural dep) '$WordPlural$)) + ((las) + (unary-operator dep token (departement-word-feminine-plural dep) '$WordFemininePlural$)) + ((los) + (unary-operator dep token (departement-word-masculine-plural dep) '$WordMasculinePlural$)) + ((Saint Santo San) + (unary-operator dep token (departement-person-name-masculine dep) '$PersonNameMasculine$)) + ((Sainte Santa) + (unary-operator dep token (departement-person-name-feminine dep) '$PersonNameFeminine$)) + ((et sous lès lez près) + (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-locality-name dep) '$LocalityName$)) + ((à en de di) + (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-area-name dep) '$AreaName$)) + ((sur) + (binary-operator dep token (departement-locality-name dep) '$LocalityName$ (departement-river-name dep) '$RiverName$)) + (else + (map (lambda (x) (extract-strings dep x)) token)) + )) + ( (string? token) + (extract-to-distribution dep token (departement-locality-name dep) '$LocalityName$)) + (#t + (error "Unexpected element: " token)))) + +; Add a pattern to a departement +(define (add-pattern dep pattern) + (distribution-add-to! (departement-patterns dep) pattern)) + +; Train the extractor on the commune name +(define (train-commune extractor obj) + (define reg (hash-ref extractor (csv-commune-region-id obj))) + (define dep (hash-ref (region-departements reg) (csv-commune-departement-id obj))) + ; Tokenize the name by removing elisions and splitting along the hyphens + (define tokens (string-split (remove-elision (csv-commune-name obj)) "-")) + ; Decompose the tokens into a tree with preposition + (set! tokens (foldl preposition-pass tokens *prepositions*)) + ; Decompose the tokens into a tree with articles + (set! tokens (foldl article-pass tokens *articles*)) + ; Simplify a bit the tokens: remove empty lists and replace single-item list by the contained item + (set! tokens (simplify-single (remove-empty tokens))) + ; Extract the strings into a departement and generate a pattern + (add-pattern dep (extract-strings dep tokens))) + +; Add a commune to the extractor, an hash table of regions +; Only add if the locality is active and its names are coherent +(define (add-commune extractor obj) + (when (and (csv-commune-coherent? obj) + (csv-commune-active? obj)) + (add-region extractor obj) + (add-departement extractor obj) + (train-commune extractor obj))) + +; Table of regions in which the data is stored +(define *extractor* (make-hash)) + +; Read data file and fill tables +; File a CSV with semi-colon (';') as field separator +(define *source-file* "./data/fr-esr-referentiel-geographique.csv") +(define source-reader + (make-csv-reader + (open-input-file *source-file*) + '((separator-chars . (#\;)) + (strip-leading-whitespace? . #t) + (strip-trailing-whitespace? . #t)))) +; Get the header +(define *header* (source-reader)) + +; Fill the extractor +(csv-for-each + (lambda (line) + (add-commune + *extractor* + (map + (lambda (x) (apply cons x)) + (zip *header* line)))) + source-reader) + +; Translators for json serialisation +(define (pattern-translator x) + (cond + ((symbol? x) (symbol->string x)) + ((list? x) (map pattern-translator x)) + (else x))) + +(define (markov-translator x) + (cond + ((char? x) (list->string (list x))) + (else x))) + +; Output a departement to a json-compatible structure +(define (departement->jsexpr dep) + (make-immutable-hash + `((id . ,(departement-id dep)) + (name . ,(departement-name dep)) + (patterns . ,(distribution->jsexpr (departement-patterns dep) #:translator pattern-translator)) + (word-masculine . ,(distribution->jsexpr (departement-word-masculine dep))) + (word-feminine . ,(distribution->jsexpr (departement-word-feminine dep))) + (word-plural . ,(distribution->jsexpr (departement-word-plural dep))) + (word-masculine-plural . ,(distribution->jsexpr (departement-word-masculine-plural dep))) + (word-feminine-plural . ,(distribution->jsexpr (departement-word-feminine-plural dep))) + (person-name-masculine . ,(distribution->jsexpr (departement-person-name-masculine dep))) + (person-name-feminine . ,(distribution->jsexpr (departement-person-name-feminine dep))) + (locality-name . ,(markov->jsexpr (departement-locality-name dep) #:translator markov-translator)) + (area-name . ,(markov->jsexpr (departement-area-name dep) #:translator markov-translator)) + (river-name . ,(markov->jsexpr (departement-river-name dep) #:translator markov-translator)) + ))) + +; Output a region to a json-compatible structure +(define (region->jsexpr reg) + (make-immutable-hash + `((id . ,(region-id reg)) + (name . ,(region-name reg)) + (departements . ,(map departement->jsexpr (hash-values (region-departements reg)))) + ))) + +; Output the extractor to json-compatible structure +(define (extractor->jsexpr extractor) + (map region->jsexpr (hash-values extractor))) + +(define *output* (open-output-file "./gener-communes.json" #:exists 'truncate/replace)) +; output lisp structure +(write-json (extractor->jsexpr *extractor*) *output*) diff --git a/name-generator.rkt b/name-generator.rkt new file mode 100644 index 0000000..1f47229 --- /dev/null +++ b/name-generator.rkt @@ -0,0 +1,228 @@ +#lang racket + +(require + json + "markov.rkt" + "distribution.rkt") + +; Structure for each region +(struct region + (id ; id of region + name ; name of region + departements ; table of departements (id->dep) + )) + +; Constructor for a region +(define (make-region id name) + (region id name (make-hash))) + +; Structure for each departement +(struct departement + (id ; id of departement + name ; name of departement + patterns ; distribution of patterns + word-masculine ; distribution of masculine words + word-feminine ; distribution of feminine words + word-plural ; distribution of plural words + word-masculine-plural ; distribution of masculine plural words + word-feminine-plural ; distribution of feminine plural words + person-name-masculine ; distributino of masculine person name + person-name-feminine ; distribution of feminine person name + locality-name ; markov chain for locality names + area-name ; markov chain for area names + river-name ; markov chain for river names + )) + +; Constructor for a departement +(define (make-departement id name) + (departement + id + name + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-distribution) + (make-markov 3) + (make-markov 3) + (make-markov 3))) + +; Generate names +; Replace symbols with a generated element +(define (replace-with sym dep) + (cond + ((symbol? sym) + (case sym + (($WordMasculine$) (distribution-pick-from (departement-word-masculine dep))) + (($WordFeminine$) (distribution-pick-from (departement-word-feminine dep))) + (($WordPlural$) (distribution-pick-from (departement-word-plural dep))) + (($WordFemininePlural$) (distribution-pick-from (departement-word-feminine-plural dep))) + (($WordMasculinePlural$) (distribution-pick-from (departement-word-masculine-plural dep))) + (($PersonNameMasculine$) (distribution-pick-from (departement-person-name-masculine dep))) + (($PersonNameFeminine$) (distribution-pick-from (departement-person-name-feminine dep))) + (($LocalityName$) (list->string (markov-generate (departement-locality-name dep)))) + (($AreaName$) (list->string (markov-generate (departement-area-name dep)))) + (($RiverName$) (list->string (markov-generate (departement-river-name dep)))) + (else sym))) + ((list? sym) + (map (lambda (x) (replace-with x dep)) sym)) + (#t sym))) + +; Indicate if a string starts with a vowel +; Accent taken into account +(define (start-with-vowel? str) + (member + (car + (string->list + (string-normalize-nfd str))) + (list #\A #\E #\I #\O #\U #\Y))) + +; Transform the tree structure of a pattern back to a string +(define (tree->string pattern) + (cond + ( (list? pattern) + (case (car pattern) + ((el sans la les las los Saint San Sainte Santa) + (string-append + (symbol->string (car pattern)) + "-" + (tree->string (cadr pattern)))) + ((le) + ; elision possible: much check the initial of the following word + (let ((following (tree->string (cadr pattern)))) + (string-append + (if (start-with-vowel? following) + "l'" + "le-") + following))) + ((Santo) + ; elision possible: much check the initial of the following + (let ((following (tree->string (cadr pattern)))) + (string-append + (if (start-with-vowel? following) + "Sant'" + "Santo-") + following))) + ((et sous lès lez près à en di sur) + (string-append + (tree->string (cadr pattern)) + "-" + (symbol->string (car pattern)) + "-" + (tree->string (caddr pattern)))) + ((de) + ; ellision possible: must check the initial of the following word + (let ((following (tree->string (caddr pattern)))) + (string-append + (tree->string (cadr pattern)) + "-" + (if (start-with-vowel? following) + "d'" + "de-") + following))) + (else + (string-join + (map tree->string pattern) + "-")))) + ( (string? pattern) + pattern) + (#t + (error "Unexpected element: " pattern)))) + +; Apply ellision rules +(define (apply-ellisions str) + (define res str) + (set! res (string-replace res "-à-les-" "-aux-")) + (set! res (string-replace res "-de-los-" "-dels-")) ; Catalan + (set! res (string-replace res "-de-el-" "-del-")) ; Catalan + (set! res (string-replace res "-de-les-" "-des-")) + (set! res (string-replace res "-de-le-" "-du-")) + (set! res (string-replace res "-à-le-" "-au-")) + ; Start of word + (cond + ((string-prefix? res "la-") + (string-replace res "la-" "La " #:all? #f)) + ((string-prefix? res "le-") + (string-replace res "le-" "Le " #:all? #f)) + ((string-prefix? res "les-") + (string-replace res "les-" "Les " #:all? #f)) + ((string-prefix? res "l'") + (string-replace res "l'" "L'" #:all? #f)) + ((string-prefix? res "los-") + (string-replace res "los-" "Los " #:all? #f)) + (#t res))) + +; Generate a new name from a departement +(define (departement-generate dep) + (apply-ellisions + (tree->string + (replace-with + (distribution-pick-from (departement-patterns dep)) + dep)))) + +; Generate a new name from a region +(define (region-generate reg) + (departement-generate (car (shuffle (hash-values (region-departements reg)))))) + +; Generate a new name +(define (generate gen) + (region-generate (car (shuffle (hash-values gen))))) + +(define (pattern-translator x) + (cond + ((string? x) (string->symbol x)) + ((list? x) (map pattern-translator x)) + (#t x))) + +(define (markov-translator x) + (cond + ((string? x) (car (string->list x))) ; single-character strings + (#t x))) + +; Convert a json-compatible structure to a departement +(define (jsexpr->departement js) + (departement + (hash-ref js 'id) + (hash-ref js 'name) + (jsexpr->distribution (hash-ref js 'patterns) #:translator pattern-translator) + (jsexpr->distribution (hash-ref js 'word-masculine)) + (jsexpr->distribution (hash-ref js 'word-feminine)) + (jsexpr->distribution (hash-ref js 'word-plural)) + (jsexpr->distribution (hash-ref js 'word-masculine-plural)) + (jsexpr->distribution (hash-ref js 'word-feminine-plural)) + (jsexpr->distribution (hash-ref js 'person-name-masculine)) + (jsexpr->distribution (hash-ref js 'person-name-feminine)) + (jsexpr->markov (hash-ref js 'locality-name) #:translator markov-translator) + (jsexpr->markov (hash-ref js 'area-name) #:translator markov-translator) + (jsexpr->markov (hash-ref js 'river-name) #:translator markov-translator))) + +; Convert a json-compatible structure to a region +(define (jsexpr->region js) + (define reg (make-region (hash-ref js 'id) (hash-ref js 'name))) + (for-each + (lambda (dep) + (hash-set! (region-departements reg) (departement-id dep) dep)) + (map jsexpr->departement (hash-ref js 'departements))) + reg) + +; Convert a json-compatible structure to a generator +(define (jsexpr->generator js) + (define gen (make-hash)) + (for-each + (lambda (reg) + (hash-set! gen (region-id reg) reg)) + (map jsexpr->region js)) + gen) + +; Read the file containing all the data +(define *input* (open-input-file "./gener-communes.json")) + +; Process the input and train +(define *generator* (jsexpr->generator (read-json *input*))) + +(for-each + displayln + (build-list 100 (lambda (_) (generate *generator*))))