Upload New File

63b62435 · Vincent Berment · 63b62435
Commit 63b62435 authored Apr 6, 2022 by Vincent Berment
--- a/Grammaire_documents_UNL.txt
+++ b/Grammaire_documents_UNL.txt
+nom SyntaxeGrapheUNL;
+// Besides complete documents, a whole range of simplified forms is accepted
+// (the original comment ended with "See" and the reference itself is missing).
+// The rule below lists the accepted forms as alternatives: a full document, bare paragraphs,
+// bare sentences, a single sentence body, a single UNL expression, or bare binary relations.
+// NOTE(review): the leading "@" presumably marks the start rule of the grammar — confirm.
+@(GrapheUNL) = . DocumentUNL . + . ParagraphesUNL . + . PhrasesUNL . + . UnePhraseUNL . + . ExpressionUNL . + . RelationsBinaires . ;
+// 1) UNL document
+// UNL specification 3.3, page 37
+// ==============================
+// To be detailed (see page 37: <dinf> = <document name> "," <author name> [ "," <document ID> "," <date> "," <email address> ])
+// For now the document information is taken as raw text up to the closing "]" (character #93).
+(InformationsDocumentUNL) = \[#93]~* ;
+// Closing tag of a document.
+(FinDeDocument) = {[/D]} ;
+// A document holds either paragraphs or bare sentences.
+// The variants without ":" after "D" are there so that texts such as "Le Petit Prince" are accepted.
+// All four alternatives end with FinDeDocument so that the closing tag is handled the same way in every form.
+(DocumentUNL) = {[D:} . InformationsDocumentUNL . {]} . ParagraphesUNL . FinDeDocument +
+				{[D} . InformationsDocumentUNL . {]} . ParagraphesUNL . FinDeDocument +
+				{[D:} . InformationsDocumentUNL . {]} . PhrasesUNL . FinDeDocument +
+				{[D} . InformationsDocumentUNL . {]} . PhrasesUNL . FinDeDocument ;
+// Source-language and target-language sentences: raw text up to the next "{" (character #123).
+(PhraseEnLangueSource) = \[#123]~* ;
+(PhraseEnLangueCible) = \[#123]~* ;
+(Code) = [A..Z,a..z]~+ ;
+// Optional "=<code>" suffix of a language tag.
+(EgalCodeLangueOuRien) = {=} . Code + {} ;
+// See page 37: <l-tag> = "ab" | "cn" | "de" | "el" | "es" | "fr" | "id" | "hd" | "it" | "jp" | "lv" | "mg" | "pg" | "ru" | "sh" | "th" /* language codes : language tags */
+// Any two letters are accepted here rather than the closed list above.
+(TagLangue) = [A..Z,a..z]~2 ;
+// Optional source sentence placed before the UNL expression: {org:<l-tag>[=<code>]} ... {/org}
+(AvantExpressionUNLOuRien) = [#123]~1 . {org:} . TagLangue . EgalCodeLangueOuRien . [#125]~1 . PhraseEnLangueSource . {{/org} . [#125]~1 + {} ;
+// One generated sentence placed after the UNL expression: {<l-tag>[=<code>][:<sinf>]} ... {/<l-tag>}
+// The closing tag carries a "/" before the language tag, like {/org} and {/unl}.
+(ApresExpressionUNLOuRien) = [#123]~1 . TagLangue . EgalCodeLangueOuRien . InformationsLangueCibleOuRien . [#125]~1 . PhraseEnLangueCible . [#123]~1 . {/} . TagLangue . [#125]~1 ;
+// Right-recursive helper: zero or more further generated sentences.
+AutresApresExpressionUNLOuRien = ApresExpressionUNLOuRien . AutresApresExpressionUNLOuRien + {} ;
+// Zero or more generated sentences.
+(PlusieursApresExpressionUNLOuRien) = ApresExpressionUNLOuRien . AutresApresExpressionUNLOuRien + {} ;
+// Trailing free text: anything up to the next "[" (character #91).
+(TexteLibreOuRien) = \[#91]~* ;
+// Body of a sentence: optional source sentence, the UNL expression, optional generated sentences, optional free text.
+(UnePhraseUNL) = AvantExpressionUNLOuRien . ExpressionUNL . PlusieursApresExpressionUNLOuRien . TexteLibreOuRien ;
+// Sentence number: a run of digits.
+(NumeroDePhraseUNL) = [0..9]~+ ;
+// Closing tag of a sentence.
+(FinDePhrase) = {[/S]} ;
+// A tagged sentence: [S:<number>] <sentence body> [/S]
+(PhraseUNL) = {[S:} . NumeroDePhraseUNL . {]} . UnePhraseUNL . FinDePhrase ;
+// Right-recursive helper: zero or more further sentences.
+AutresPhrasesUNL = PhraseUNL . AutresPhrasesUNL + {} ;
+// One or more sentences.
+(PhrasesUNL) = PhraseUNL . AutresPhrasesUNL ;
+// Paragraph number: a run of digits.
+(NumeroDeParagrapheUNL) = [0..9]~+ ;
+// Closing tag of a paragraph.
+(FinDeParagraphe) = {[/P]} ;
+// A tagged paragraph: [P:<number>] <sentences> [/P]
+(ParagrapheUNL) = {[P:} . NumeroDeParagrapheUNL . {]} . PhrasesUNL . FinDeParagraphe ;
+// Right-recursive helper: zero or more further paragraphs.
+AutresParagraphesUNL = ParagrapheUNL . AutresParagraphesUNL + {} ;
+// One or more paragraphs.
+(ParagraphesUNL) = ParagrapheUNL . AutresParagraphesUNL ;
+// 2) UNL expression
+// UNL specification 3.3, page 38
+// ==============================
+// Scope identifier used in UnScope (":<compound UW-ID>"). The specification asks for two digits
+// ("00" - "99") while the GETA graphs use one or more digits, so any non-empty run of digits is accepted.
+IdentifiantUniversalWord = [0..9]~+ ;
+// Instance number of a UW: ":" immediately followed by a number.
+(InstanceUW) = DeuxPoints Numero ;
+(InstanceUWOuRien) = InstanceUW + {} ;
+// $$$ Factor out BaliseUNL?
+// Only the "table" form is handled (not the "list" form; see page 38)
+(UnUW) = {[W]} . UniversalWord . InstanceUWOuRien . AttributsOuRien . {[/W]} ;
+(UnScope) = {[W]} . {:} . IdentifiantUniversalWord . AttributsOuRien . {[/W]} . RelationsBinaires ;
+// A UNL expression is enclosed in {unl} ... {/unl} and holds either binary relations,
+// a single UW, or a scope.
+ExpressionUNL = BaliseUNL . RelationsBinaires . FinBaliseUNL +
+				BaliseUNL . UnUW . FinBaliseUNL +
+				BaliseUNL . UnScope . FinBaliseUNL ;
+// 3) Binary relation
+// UNL specification 3.3, page 39
+// ==============================
+//<binary relation> ::= <relation> [“:”<compound UW-ID>] “(“
+//{{ <UW1> [":" <UW-ID1>]} | { “:” <compound UW-ID1> }}[<attribute list>] “,”
+//{{ <UW2> [":" <UW-ID2>]} | { “:” <compound UW-ID2> }}[<attribute list>] “)”
+//<relation> ::= a relation label, defined in “Chapter 2 Relations”
+//<UW> ::= a UW, see “Chapter 3 Universal Words”
+//<attribute list> ::= { “.” <attribute> } ...
+//<attribute> ::= an attribute, see “Chapter 4 Attributes”
+//<UW-ID> ::= two alphanumeric characters of ‘0’ - ‘9’ and ‘A’ - ‘Z’
+//<compound UW-ID> ::= two digits of “00” - “99”. “00” must be used for the main sentence and can be omitted.
+// Comment from Gilles:
+// The UNL grammar clearly specifies that a number has 2 digits (or 2 letters).
+// The GETA graphs use numbers with 1 or more digits... (to be corrected if need be).
+(DeuxPointsNumero) = DeuxPoints Numero ;
+// DeuxPointsNumeroOuRien = optional ":<compound UW-ID>" (case of a "scope" relation)
+DeuxPointsNumeroOuRien = DeuxPointsNumero + {} ;
+// A decorated UW: universal word, optional instance number, optional attributes.
+UWDecore = UniversalWord InstanceUWOuRien AttributsOuRien ;
+// Sub-graph reference: ":<number>" followed by optional attributes.
+(NoSousGraphe) = DeuxPointsNumero AttributsOuRien ;
+//{{ <UW1> [":" <UW-ID1>]} | { “:” <compound UW-ID1> }}[<attribute list>] “,”
+//{{ <UW2> [":" <UW-ID2>]} | { “:” <compound UW-ID2> }}[<attribute list>] “)”
+// A node of a relation is either a decorated UW or a sub-graph reference.
+(NoeudUNL) = UWDecore + NoSousGraphe ;
+//(NomRelation) = TokenGenerique ;
+(NomRelation) = [A..Z,a..z]~+ ;
+// DeuxPointsNumeroOuRien = ":<compound UW-ID>" (case of a "scope" relation)
+(FinDeRelationBinaire) = Fermante ;
+// <relation>[:<number>] ( <node> , <node> )
+(RelationBinaire) = NomRelation . DeuxPointsNumeroOuRien . Ouvrante . NoeudUNL . Virgule . NoeudUNL . FinDeRelationBinaire . ;
+// Right-recursive helper: zero or more further relations.
+AutresRelations = RelationBinaire AutresRelations + {} ;
+// One or more binary relations.
+(RelationsBinaires) = RelationBinaire AutresRelations ;
+// 4) Decorated UWs
+// UNL specification 3.3, page 27
+// ==============================
+//<UW> ::= <headword> [<constraint list>]
+//<headword> ::= <character>...
+//<constraint list> ::= “(“ <constraint> [ “,” <constraint>]... “)”
+//<constraint> ::= 	<relation label> { “>” | “<” } <UW> [<constraint list>] |
+//					<relation label> { “>” | “<” } <UW> [<constraint list>] [ { “>” | “<” } <UW> [<constraint list>] ] ...
+//<relation label> ::= “agt” | “and” | “aoj” | “obj” | “icl” | ...
+//<character> ::= “A” | ... | “Z” | “a” | ... | “z” | 0 | 1 | 2 | ... | 9 | “_” | ” “ | “#” | “!” | “$” | “%” | “=” | “^” | “~” | “|” | “@” | “+” | “-“ | “<” | “>” | “?”
+// An attribute: ".@" immediately followed by its name.
+(Attribut) = {.@} [A..Z,a..z,0..9,_,-]~+ ;
+// Right-recursive helper: zero or more further attributes.
+AutresAttributs = Attribut . AutresAttributs + {} ;
+(Attributs) = Attribut . AutresAttributs ;
+AttributsOuRien = Attributs + {} ;
+// Relation label used inside a restriction.
+(Label) = [A..Z,a..z,0..9,_,-,',#,&,/,.,@,#192..#255]~+ ;
+// Direction of a restriction: "<" or ">".
+(Direction) = Inferieur + Superieur ;
+// Optional continuation of a restriction: a further direction, headword and nested restrictions.
+SuiteRestrictionOuRien = Direction . Headword . RestrictionsOuRien + {} ;
+(Restriction) = Label . Direction . Headword . RestrictionsOuRien . SuiteRestrictionOuRien ;
+// Right-recursive helper: zero or more further comma-separated restrictions.
+AutresRestrictions = Virgule . Restriction . AutresRestrictions + {} ;
+// Parenthesised, comma-separated list of restrictions.
+(Restrictions) = Ouvrante . Restriction . AutresRestrictions . Fermante ;
+RestrictionsOuRien = Restrictions + {} ;
+// Handles the general case ("TokenGenerique") and the case of a word between double quotes ("DBLQUOTED"), which a priori is not provided for in version 3.3 of the UNL specification
+(Headword) = TokenGenerique + {"} MotEntreGuillemets {"} ;
+(UniversalWord) = Headword RestrictionsOuRien ;
+// Comment from Gilles:
+// This syntax is awkward.
+// A UW inside a restriction does not necessarily have the same syntax as a "normal" UW.
+// Indeed, the restriction can be factored.
+//ListeDeRestrictionsInternes = Restrictions ;
+//UWDansUneRestriction = UniversalWord ListeDeRestrictionsInternes ;
+// A) Basic constituents
+// ==============================
+Alphabetique = [A..Z,a..z]~1 ;
+Numerique = [0..9]~1 ;
+// AlphaNumerique = Alphabetique + Numerique + [_,-]~1 ;
+// AlphaNumerique = [A..Z,a..z,0..9,_,-]~1 ;
+// A single blank: a space, or one character in the range #2..#32.
+Espaces = { } + [#2..#32]~1 ;
+// Punctuation tokens.
+(Ouvrante) = {(} ;
+(Fermante) = {)} ;
+(Inferieur) = {<} ;
+(Superieur) = {>} ;
+(Virgule) = {,} ;
+(DeuxPoints) = {:} ;
+(PourCent) = {%} ;
+// Despite its name, a "number" may also contain letters, "_" and "-".
+(Numero) = [A..Z,a..z,0..9,_,-]~+ ;
+// CaracteresDeHeadword = [A..Z,a..z,0..9,_,-,',#,&,/,.,@,#192..#255]~1 ;
+// Optional ":" followed by raw text up to the closing "]" (character #93).
+(TexteOuRien) = {:} \[#93]~* + {} ;
+// To be detailed (see page 37: <uinf> = <system name> "," <post-editor name> "," <reliability> [ "," <date> "," <email address> ] )
+(InformationsExpressionUNLOuRien) = . {:} . \[#125]~* + {} ;
+// To be detailed (see page 37: <sinf> = <system name> "," <post-editor name> "," <reliability> [ "," <date> "," <email address> ] )
+(InformationsLangueCibleOuRien) = . {:} . \[#125]~* + {} ;
+// Opening tags written with explicit blanks after "[".
+// NOTE(review): Document, Paragraphe, Titre, Phrase and ListeNoeuds are not referenced by any rule visible in this file;
+// the document, paragraph and sentence rules use literal tags instead — confirm whether they are still needed.
+// NOTE(review): "Espaces*" presumably means zero or more blanks — confirm the meaning of the postfix "*".
+Document = {[} Espaces* {D} TexteOuRien {]} ;
+Paragraphe = {[} Espaces* {P} TexteOuRien {]} ;
+Titre = {[} Espaces* {T} TexteOuRien {]} ;
+Phrase = {[} Espaces* {S} TexteOuRien {]} ;
+ListeNoeuds = {[} Espaces* {W} TexteOuRien {]} ;
+// Opening tag of a UNL expression: "{unl" + optional information + "}" (#123 is "{" and #125 is "}").
+BaliseUNL = [#123]~1 {unl} InformationsExpressionUNLOuRien [#125]~1 ;
+// Closing tags matching the opening tags above (the same review note applies to FinDocument .. FinListeNoeuds).
+FinDocument = {[} Espaces* {/D} Espaces* {]} ;
+FinParagraphe = {[} Espaces* {/P} Espaces* {]} ;
+FinTitre = {[} Espaces* {/T} Espaces* {]} ;
+FinPhrase = {[} Espaces* {/S} Espaces* {]} ;
+FinListeNoeuds = {[} Espaces* {/W} Espaces* {]} ;
+// Closing tag of a UNL expression: "{/unl}".
+FinBaliseUNL = [#123]~1 {/unl} [#125]~1 ;
+// <character> ::= “A” | ... | “Z” | “a” | ... | “z” | 0 | 1 | 2 | ... | 9 | “_” | ” “ | “#” | “!” | “$” | “%” | “=” | “^” | “~” | “|” | “@” | “+” | “-“ | “<” | “>” | “?”
+// (hence incomplete: the rule below also accepts ', &, / and the characters #192..#255)
+TokenGenerique = [A..Z,a..z,0..9,_,#32,#,!,$,%,=,^,~,|,@,+,-,<,>,?,',&,/,#192..#255]~+ ;
+// Content of a double-quoted word: anything up to the next double quote.
+(MotEntreGuillemets) = \["]~* ;
+/