Commit 116f46a7 authored by Aurélien Lamercerie

Basic document parsing (with org and unl parts separation)

parent f330c405
#!/usr/bin/python3.10
# -*-coding:Utf-8 -*

#==============================================================================
# TENET: doc
#------------------------------------------------------------------------------
# Module defining the structure of a parsed document,
# with separate org (original sentence) and unl (UNL expression) parts
#==============================================================================
#==============================================================================
# Importing required modules
#==============================================================================

import sys
from subprocess import Popen, PIPE
from antlr4 import FileStream, CommonTokenStream
# from antlr.unlLexer import unlLexer
# from antlr.unlParser import unlParser

#==============================================================================
# Parameters
#==============================================================================

# None

#==============================================================================
# Document Classes
#==============================================================================

class Document:

    def __init__(self, sentence):
        self.sentence = sentence

    def to_string(self):
        return self.sentence.to_string()


class Sentence:

    def __init__(self, org_part, unl_part):
        self.org_part = org_part
        self.unl_part = unl_part

    def to_string(self):
        return self.org_part.to_string() + self.unl_part.to_string()


class OrgPart:

    def __init__(self, value):
        self.value = value

    def to_string(self):
        return self.value


class UnlPart:

    def __init__(self, value):
        self.value = value

    def to_string(self):
        return self.value
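A minimal usage sketch of these classes (the sample values are illustrative, not from the commit):

    org = OrgPart("{org:en}Some sentence.{/org}")
    unl = UnlPart("{unl}aoj( a, b ){/unl}")
    d = Document(Sentence(org, unl))
    print(d.to_string())   # concatenation of the org and unl strings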
//=============================================================================
// ANTLR Grammar for UNL Document
//=============================================================================

grammar doc;

@header {
from asd import doc
}

//=============================================================================
// Parser Grammar
//=============================================================================

//---------------------------------------------------------
// Document = list of sentences (currently a single one)
//---------------------------------------------------------

document returns [out]
    : '[D]' s=sentence '[/D]' {$out = doc.Document($s.out)}
    ;

sentence returns [out]
    : '[S:R1]' o=orgPart u=unlPart '[/S]' {$out = doc.Sentence($o.out, $u.out)}
    ;

orgPart returns [out]
    : o=ORG {$out = doc.OrgPart($o.text)}
    ;

unlPart returns [out]
    : u=UNL {$out = doc.UnlPart($u.text)}
    ;

//=============================================================================
// Lexer Grammar
//=============================================================================

// ignore whitespaces
WS : (' '|'\n'|'\t'|'\r'|'\u000C')+ -> skip ;

// other tokens
ORG : '{org:en}' (.)* '{/org}' ;
UNL : '{unl}' (.)* '{/unl}' ;
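For reference, the input shape this grammar accepts (it mirrors the test document added later in this commit):

    [D]
    [S:R1]
    {org:en} original sentence {/org}
    {unl} UNL expression {/unl}
    [/S]
    [/D]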
token literal names:
null
'[D]'
'[/D]'
'[S:R1]'
'[/S]'
null
null
null
token symbolic names:
null
null
null
null
null
WS
ORG
UNL
rule names:
document
sentence
orgPart
unlPart
atn:
[3, 24715, 42794, 33075, 47597, 16764, 15335, 30598, 22884, 3, 9, 28, 4, 2, 9, 2, 4, 3, 9, 3, 4, 4, 9, 4, 4, 5, 9, 5, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 3, 4, 3, 5, 3, 5, 3, 5, 3, 5, 2, 2, 6, 2, 4, 6, 8, 2, 2, 2, 23, 2, 10, 3, 2, 2, 2, 4, 15, 3, 2, 2, 2, 6, 21, 3, 2, 2, 2, 8, 24, 3, 2, 2, 2, 10, 11, 7, 3, 2, 2, 11, 12, 5, 4, 3, 2, 12, 13, 7, 4, 2, 2, 13, 14, 8, 2, 1, 2, 14, 3, 3, 2, 2, 2, 15, 16, 7, 5, 2, 2, 16, 17, 5, 6, 4, 2, 17, 18, 5, 8, 5, 2, 18, 19, 7, 6, 2, 2, 19, 20, 8, 3, 1, 2, 20, 5, 3, 2, 2, 2, 21, 22, 7, 8, 2, 2, 22, 23, 8, 4, 1, 2, 23, 7, 3, 2, 2, 2, 24, 25, 7, 9, 2, 2, 25, 26, 8, 5, 1, 2, 26, 9, 3, 2, 2, 2, 2]
T__0=1
T__1=2
T__2=3
T__3=4
WS=5
ORG=6
UNL=7
'[D]'=1
'[/D]'=2
'[S:R1]'=3
'[/S]'=4
token literal names:
null
'[D]'
'[/D]'
'[S:R1]'
'[/S]'
null
null
null
token symbolic names:
null
null
null
null
null
WS
ORG
UNL
rule names:
T__0
T__1
T__2
T__3
WS
ORG
UNL
channel names:
DEFAULT_TOKEN_CHANNEL
HIDDEN
mode names:
DEFAULT_MODE
atn:
[3, 24715, 42794, 33075, 47597, 16764, 15335, 30598, 22884, 2, 9, 86, 8, 1, 4, 2, 9, 2, 4, 3, 9, 3, 4, 4, 9, 4, 4, 5, 9, 5, 4, 6, 9, 6, 4, 7, 9, 7, 4, 8, 9, 8, 3, 2, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 6, 6, 6, 40, 10, 6, 13, 6, 14, 6, 41, 3, 6, 3, 6, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 7, 7, 56, 10, 7, 12, 7, 14, 7, 59, 11, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 7, 8, 75, 10, 8, 12, 8, 14, 8, 78, 11, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 2, 2, 9, 3, 3, 5, 4, 7, 5, 9, 6, 11, 7, 13, 8, 15, 9, 3, 2, 3, 5, 2, 11, 12, 14, 15, 34, 34, 2, 88, 2, 3, 3, 2, 2, 2, 2, 5, 3, 2, 2, 2, 2, 7, 3, 2, 2, 2, 2, 9, 3, 2, 2, 2, 2, 11, 3, 2, 2, 2, 2, 13, 3, 2, 2, 2, 2, 15, 3, 2, 2, 2, 3, 17, 3, 2, 2, 2, 5, 21, 3, 2, 2, 2, 7, 26, 3, 2, 2, 2, 9, 33, 3, 2, 2, 2, 11, 39, 3, 2, 2, 2, 13, 45, 3, 2, 2, 2, 15, 67, 3, 2, 2, 2, 17, 18, 7, 93, 2, 2, 18, 19, 7, 70, 2, 2, 19, 20, 7, 95, 2, 2, 20, 4, 3, 2, 2, 2, 21, 22, 7, 93, 2, 2, 22, 23, 7, 49, 2, 2, 23, 24, 7, 70, 2, 2, 24, 25, 7, 95, 2, 2, 25, 6, 3, 2, 2, 2, 26, 27, 7, 93, 2, 2, 27, 28, 7, 85, 2, 2, 28, 29, 7, 60, 2, 2, 29, 30, 7, 84, 2, 2, 30, 31, 7, 51, 2, 2, 31, 32, 7, 95, 2, 2, 32, 8, 3, 2, 2, 2, 33, 34, 7, 93, 2, 2, 34, 35, 7, 49, 2, 2, 35, 36, 7, 85, 2, 2, 36, 37, 7, 95, 2, 2, 37, 10, 3, 2, 2, 2, 38, 40, 9, 2, 2, 2, 39, 38, 3, 2, 2, 2, 40, 41, 3, 2, 2, 2, 41, 39, 3, 2, 2, 2, 41, 42, 3, 2, 2, 2, 42, 43, 3, 2, 2, 2, 43, 44, 8, 6, 2, 2, 44, 12, 3, 2, 2, 2, 45, 46, 7, 125, 2, 2, 46, 47, 7, 113, 2, 2, 47, 48, 7, 116, 2, 2, 48, 49, 7, 105, 2, 2, 49, 50, 7, 60, 2, 2, 50, 51, 7, 103, 2, 2, 51, 52, 7, 112, 2, 2, 52, 53, 7, 127, 2, 2, 53, 57, 3, 2, 2, 2, 54, 56, 11, 2, 2, 2, 55, 54, 3, 2, 2, 2, 56, 59, 3, 2, 2, 2, 57, 55, 3, 2, 2, 2, 57, 58, 3, 2, 2, 2, 58, 60, 3, 2, 2, 2, 59, 57, 3, 2, 2, 2, 60, 61, 7, 125, 2, 2, 61, 62, 7, 49, 2, 2, 62, 63, 7, 113, 2, 2, 63, 64, 7, 116, 2, 2, 64, 65, 7, 105, 2, 2, 65, 66, 7, 127, 2, 2, 66, 14, 3, 2, 2, 2, 67, 68, 7, 125, 2, 2, 68, 69, 7, 119, 2, 2, 69, 70, 7, 112, 2, 2, 70, 71, 7, 110, 2, 2, 71, 72, 7, 127, 2, 2, 72, 76, 3, 2, 2, 2, 73, 75, 11, 2, 2, 2, 74, 73, 3, 2, 2, 2, 75, 78, 3, 2, 2, 2, 76, 74, 3, 2, 2, 2, 76, 77, 3, 2, 2, 2, 77, 79, 3, 2, 2, 2, 78, 76, 3, 2, 2, 2, 79, 80, 7, 125, 2, 2, 80, 81, 7, 49, 2, 2, 81, 82, 7, 119, 2, 2, 82, 83, 7, 112, 2, 2, 83, 84, 7, 110, 2, 2, 84, 85, 7, 127, 2, 2, 85, 16, 3, 2, 2, 2, 6, 2, 41, 57, 76, 3, 8, 2, 2]
# Generated from grammar/doc/doc.g4 by ANTLR 4.9.3
from antlr4 import *
from io import StringIO
import sys
if sys.version_info[1] > 5:
    from typing import TextIO
else:
    from typing.io import TextIO

from asd import doc

def serializedATN():
    with StringIO() as buf:
        buf.write("\3\u608b\ua72a\u8133\ub9ed\u417c\u3be7\u7786\u5964\2\t")
        buf.write("V\b\1\4\2\t\2\4\3\t\3\4\4\t\4\4\5\t\5\4\6\t\6\4\7\t\7")
        buf.write("\4\b\t\b\3\2\3\2\3\2\3\2\3\3\3\3\3\3\3\3\3\3\3\4\3\4\3")
        buf.write("\4\3\4\3\4\3\4\3\4\3\5\3\5\3\5\3\5\3\5\3\6\6\6(\n\6\r")
        buf.write("\6\16\6)\3\6\3\6\3\7\3\7\3\7\3\7\3\7\3\7\3\7\3\7\3\7\3")
        buf.write("\7\7\78\n\7\f\7\16\7;\13\7\3\7\3\7\3\7\3\7\3\7\3\7\3\7")
        buf.write("\3\b\3\b\3\b\3\b\3\b\3\b\3\b\7\bK\n\b\f\b\16\bN\13\b\3")
        buf.write("\b\3\b\3\b\3\b\3\b\3\b\3\b\2\2\t\3\3\5\4\7\5\t\6\13\7")
        buf.write("\r\b\17\t\3\2\3\5\2\13\f\16\17\"\"\2X\2\3\3\2\2\2\2\5")
        buf.write("\3\2\2\2\2\7\3\2\2\2\2\t\3\2\2\2\2\13\3\2\2\2\2\r\3\2")
        buf.write("\2\2\2\17\3\2\2\2\3\21\3\2\2\2\5\25\3\2\2\2\7\32\3\2\2")
        buf.write("\2\t!\3\2\2\2\13\'\3\2\2\2\r-\3\2\2\2\17C\3\2\2\2\21\22")
        buf.write("\7]\2\2\22\23\7F\2\2\23\24\7_\2\2\24\4\3\2\2\2\25\26\7")
        buf.write("]\2\2\26\27\7\61\2\2\27\30\7F\2\2\30\31\7_\2\2\31\6\3")
        buf.write("\2\2\2\32\33\7]\2\2\33\34\7U\2\2\34\35\7<\2\2\35\36\7")
        buf.write("T\2\2\36\37\7\63\2\2\37 \7_\2\2 \b\3\2\2\2!\"\7]\2\2\"")
        buf.write("#\7\61\2\2#$\7U\2\2$%\7_\2\2%\n\3\2\2\2&(\t\2\2\2\'&\3")
        buf.write("\2\2\2()\3\2\2\2)\'\3\2\2\2)*\3\2\2\2*+\3\2\2\2+,\b\6")
        buf.write("\2\2,\f\3\2\2\2-.\7}\2\2./\7q\2\2/\60\7t\2\2\60\61\7i")
        buf.write("\2\2\61\62\7<\2\2\62\63\7g\2\2\63\64\7p\2\2\64\65\7\177")
        buf.write("\2\2\659\3\2\2\2\668\13\2\2\2\67\66\3\2\2\28;\3\2\2\2")
        buf.write("9\67\3\2\2\29:\3\2\2\2:<\3\2\2\2;9\3\2\2\2<=\7}\2\2=>")
        buf.write("\7\61\2\2>?\7q\2\2?@\7t\2\2@A\7i\2\2AB\7\177\2\2B\16\3")
        buf.write("\2\2\2CD\7}\2\2DE\7w\2\2EF\7p\2\2FG\7n\2\2GH\7\177\2\2")
        buf.write("HL\3\2\2\2IK\13\2\2\2JI\3\2\2\2KN\3\2\2\2LJ\3\2\2\2LM")
        buf.write("\3\2\2\2MO\3\2\2\2NL\3\2\2\2OP\7}\2\2PQ\7\61\2\2QR\7w")
        buf.write("\2\2RS\7p\2\2ST\7n\2\2TU\7\177\2\2U\20\3\2\2\2\6\2)9L")
        buf.write("\3\b\2\2")
        return buf.getvalue()

class docLexer(Lexer):

    atn = ATNDeserializer().deserialize(serializedATN())

    decisionsToDFA = [ DFA(ds, i) for i, ds in enumerate(atn.decisionToState) ]

    T__0 = 1
    T__1 = 2
    T__2 = 3
    T__3 = 4
    WS = 5
    ORG = 6
    UNL = 7

    channelNames = [ u"DEFAULT_TOKEN_CHANNEL", u"HIDDEN" ]

    modeNames = [ "DEFAULT_MODE" ]

    literalNames = [ "<INVALID>",
            "'[D]'", "'[/D]'", "'[S:R1]'", "'[/S]'" ]

    symbolicNames = [ "<INVALID>",
            "WS", "ORG", "UNL" ]

    ruleNames = [ "T__0", "T__1", "T__2", "T__3", "WS", "ORG", "UNL" ]

    grammarFileName = "doc.g4"

    def __init__(self, input=None, output:TextIO = sys.stdout):
        super().__init__(input, output)
        self.checkVersion("4.9.3")
        self._interp = LexerATNSimulator(self, self.atn, self.decisionsToDFA, PredictionContextCache())
        self._actions = None
        self._predicates = None
T__0=1
T__1=2
T__2=3
T__3=4
WS=5
ORG=6
UNL=7
'[D]'=1
'[/D]'=2
'[S:R1]'=3
'[/S]'=4
# Generated from grammar/doc/doc.g4 by ANTLR 4.9.3
from antlr4 import *
if __name__ is not None and "." in __name__:
    from .docParser import docParser
else:
    from docParser import docParser

from asd import doc

# This class defines a complete listener for a parse tree produced by docParser.
class docListener(ParseTreeListener):

    # Enter a parse tree produced by docParser#document.
    def enterDocument(self, ctx:docParser.DocumentContext):
        pass

    # Exit a parse tree produced by docParser#document.
    def exitDocument(self, ctx:docParser.DocumentContext):
        pass

    # Enter a parse tree produced by docParser#sentence.
    def enterSentence(self, ctx:docParser.SentenceContext):
        pass

    # Exit a parse tree produced by docParser#sentence.
    def exitSentence(self, ctx:docParser.SentenceContext):
        pass

    # Enter a parse tree produced by docParser#orgPart.
    def enterOrgPart(self, ctx:docParser.OrgPartContext):
        pass

    # Exit a parse tree produced by docParser#orgPart.
    def exitOrgPart(self, ctx:docParser.OrgPartContext):
        pass

    # Enter a parse tree produced by docParser#unlPart.
    def enterUnlPart(self, ctx:docParser.UnlPartContext):
        pass

    # Exit a parse tree produced by docParser#unlPart.
    def exitUnlPart(self, ctx:docParser.UnlPartContext):
        pass


del docParser
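A minimal sketch (not part of this commit) of how a listener subclass could be walked over a parse tree; PrintListener is hypothetical, and the import path follows the convention used in parse.py below:

    from antlr4 import ParseTreeWalker
    from grammar.doc.docListener import docListener

    class PrintListener(docListener):
        # Hypothetical listener: report each parsed sentence
        def enterSentence(self, ctx):
            print("sentence:", ctx.getText())

    # given: tree = parser.document()
    # ParseTreeWalker.DEFAULT.walk(PrintListener(), tree)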
# Generated from grammar/doc/doc.g4 by ANTLR 4.9.3
# encoding: utf-8
from antlr4 import *
from io import StringIO
import sys
if sys.version_info[1] > 5:
    from typing import TextIO
else:
    from typing.io import TextIO

from asd import doc

def serializedATN():
    with StringIO() as buf:
        buf.write("\3\u608b\ua72a\u8133\ub9ed\u417c\u3be7\u7786\u5964\3\t")
        buf.write("\34\4\2\t\2\4\3\t\3\4\4\t\4\4\5\t\5\3\2\3\2\3\2\3\2\3")
        buf.write("\2\3\3\3\3\3\3\3\3\3\3\3\3\3\4\3\4\3\4\3\5\3\5\3\5\3\5")
        buf.write("\2\2\6\2\4\6\b\2\2\2\27\2\n\3\2\2\2\4\17\3\2\2\2\6\25")
        buf.write("\3\2\2\2\b\30\3\2\2\2\n\13\7\3\2\2\13\f\5\4\3\2\f\r\7")
        buf.write("\4\2\2\r\16\b\2\1\2\16\3\3\2\2\2\17\20\7\5\2\2\20\21\5")
        buf.write("\6\4\2\21\22\5\b\5\2\22\23\7\6\2\2\23\24\b\3\1\2\24\5")
        buf.write("\3\2\2\2\25\26\7\b\2\2\26\27\b\4\1\2\27\7\3\2\2\2\30\31")
        buf.write("\7\t\2\2\31\32\b\5\1\2\32\t\3\2\2\2\2")
        return buf.getvalue()
class docParser ( Parser ):

    grammarFileName = "doc.g4"

    atn = ATNDeserializer().deserialize(serializedATN())

    decisionsToDFA = [ DFA(ds, i) for i, ds in enumerate(atn.decisionToState) ]

    sharedContextCache = PredictionContextCache()

    literalNames = [ "<INVALID>", "'[D]'", "'[/D]'", "'[S:R1]'", "'[/S]'" ]

    symbolicNames = [ "<INVALID>", "<INVALID>", "<INVALID>", "<INVALID>",
                      "<INVALID>", "WS", "ORG", "UNL" ]

    RULE_document = 0
    RULE_sentence = 1
    RULE_orgPart = 2
    RULE_unlPart = 3

    ruleNames = [ "document", "sentence", "orgPart", "unlPart" ]

    EOF = Token.EOF
    T__0 = 1
    T__1 = 2
    T__2 = 3
    T__3 = 4
    WS = 5
    ORG = 6
    UNL = 7

    def __init__(self, input:TokenStream, output:TextIO = sys.stdout):
        super().__init__(input, output)
        self.checkVersion("4.9.3")
        self._interp = ParserATNSimulator(self, self.atn, self.decisionsToDFA, self.sharedContextCache)
        self._predicates = None
    class DocumentContext(ParserRuleContext):
        __slots__ = 'parser'

        def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
            super().__init__(parent, invokingState)
            self.parser = parser
            self.out = None
            self.s = None # SentenceContext

        def sentence(self):
            return self.getTypedRuleContext(docParser.SentenceContext,0)

        def getRuleIndex(self):
            return docParser.RULE_document

        def enterRule(self, listener:ParseTreeListener):
            if hasattr( listener, "enterDocument" ):
                listener.enterDocument(self)

        def exitRule(self, listener:ParseTreeListener):
            if hasattr( listener, "exitDocument" ):
                listener.exitDocument(self)

    def document(self):

        localctx = docParser.DocumentContext(self, self._ctx, self.state)
        self.enterRule(localctx, 0, self.RULE_document)
        try:
            self.enterOuterAlt(localctx, 1)
            self.state = 8
            self.match(docParser.T__0)
            self.state = 9
            localctx.s = self.sentence()
            self.state = 10
            self.match(docParser.T__1)
            localctx.out = doc.Document(localctx.s.out)
        except RecognitionException as re:
            localctx.exception = re
            self._errHandler.reportError(self, re)
            self._errHandler.recover(self, re)
        finally:
            self.exitRule()
        return localctx
    class SentenceContext(ParserRuleContext):
        __slots__ = 'parser'

        def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
            super().__init__(parent, invokingState)
            self.parser = parser
            self.out = None
            self.o = None # OrgPartContext
            self.u = None # UnlPartContext

        def orgPart(self):
            return self.getTypedRuleContext(docParser.OrgPartContext,0)

        def unlPart(self):
            return self.getTypedRuleContext(docParser.UnlPartContext,0)

        def getRuleIndex(self):
            return docParser.RULE_sentence

        def enterRule(self, listener:ParseTreeListener):
            if hasattr( listener, "enterSentence" ):
                listener.enterSentence(self)

        def exitRule(self, listener:ParseTreeListener):
            if hasattr( listener, "exitSentence" ):
                listener.exitSentence(self)

    def sentence(self):

        localctx = docParser.SentenceContext(self, self._ctx, self.state)
        self.enterRule(localctx, 2, self.RULE_sentence)
        try:
            self.enterOuterAlt(localctx, 1)
            self.state = 13
            self.match(docParser.T__2)
            self.state = 14
            localctx.o = self.orgPart()
            self.state = 15
            localctx.u = self.unlPart()
            self.state = 16
            self.match(docParser.T__3)
            localctx.out = doc.Sentence(localctx.o.out, localctx.u.out)
        except RecognitionException as re:
            localctx.exception = re
            self._errHandler.reportError(self, re)
            self._errHandler.recover(self, re)
        finally:
            self.exitRule()
        return localctx
    class OrgPartContext(ParserRuleContext):
        __slots__ = 'parser'

        def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
            super().__init__(parent, invokingState)
            self.parser = parser
            self.out = None
            self.o = None # Token

        def ORG(self):
            return self.getToken(docParser.ORG, 0)

        def getRuleIndex(self):
            return docParser.RULE_orgPart

        def enterRule(self, listener:ParseTreeListener):
            if hasattr( listener, "enterOrgPart" ):
                listener.enterOrgPart(self)

        def exitRule(self, listener:ParseTreeListener):
            if hasattr( listener, "exitOrgPart" ):
                listener.exitOrgPart(self)

    def orgPart(self):

        localctx = docParser.OrgPartContext(self, self._ctx, self.state)
        self.enterRule(localctx, 4, self.RULE_orgPart)
        try:
            self.enterOuterAlt(localctx, 1)
            self.state = 19
            localctx.o = self.match(docParser.ORG)
            localctx.out = doc.OrgPart((None if localctx.o is None else localctx.o.text))
        except RecognitionException as re:
            localctx.exception = re
            self._errHandler.reportError(self, re)
            self._errHandler.recover(self, re)
        finally:
            self.exitRule()
        return localctx
    class UnlPartContext(ParserRuleContext):
        __slots__ = 'parser'

        def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
            super().__init__(parent, invokingState)
            self.parser = parser
            self.out = None
            self.u = None # Token

        def UNL(self):
            return self.getToken(docParser.UNL, 0)

        def getRuleIndex(self):
            return docParser.RULE_unlPart

        def enterRule(self, listener:ParseTreeListener):
            if hasattr( listener, "enterUnlPart" ):
                listener.enterUnlPart(self)

        def exitRule(self, listener:ParseTreeListener):
            if hasattr( listener, "exitUnlPart" ):
                listener.exitUnlPart(self)

    def unlPart(self):

        localctx = docParser.UnlPartContext(self, self._ctx, self.state)
        self.enterRule(localctx, 6, self.RULE_unlPart)
        try:
            self.enterOuterAlt(localctx, 1)
            self.state = 22
            localctx.u = self.match(docParser.UNL)
            localctx.out = doc.UnlPart((None if localctx.u is None else localctx.u.text))
        except RecognitionException as re:
            localctx.exception = re
            self._errHandler.reportError(self, re)
            self._errHandler.recover(self, re)
        finally:
            self.exitRule()
        return localctx
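A minimal driving sketch (not part of the commit; assumes the generated modules are importable as in parse.py below, and the inline document string is illustrative):

    from antlr4 import InputStream, CommonTokenStream
    from grammar.doc.docLexer import docLexer
    from grammar.doc.docParser import docParser

    text = "[D] [S:R1] {org:en}Some sentence.{/org} {unl}aoj( a, b ){/unl} [/S] [/D]"
    parser = docParser(CommonTokenStream(docLexer(InputStream(text))))
    tree = parser.document()
    print(tree.out.to_string())   # concatenated org and unl parts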
//=============================================================================
// ANTLR Grammar for UNL Document
//=============================================================================

grammar org;

//=============================================================================
// Parser Grammar
//=============================================================================

//---------------------------------------------------------
// Origin NL sentence
//---------------------------------------------------------

orgPart
    : ORG
    ;

//=============================================================================
// Lexer Grammar
//=============================================================================

// ignore whitespaces
WS : (' '|'\n'|'\t'|'\r'|'\u000C')+ -> skip ;

// other tokens
ORG : '{org:en}' (.)* '{/org}' ;
//=============================================================================
// ANTLR Grammar for UNL Document
//=============================================================================

grammar unl;

//=============================================================================
// Parser Grammar
//=============================================================================

//---------------------------------------------------------
// UNL representation
//---------------------------------------------------------

unlPart
    : UNL
    ;

//=============================================================================
// Lexer Grammar
//=============================================================================

// ignore whitespaces
WS : (' '|'\n'|'\t'|'\r'|'\u000C')+ -> skip ;

// other tokens
ORG : '{org:en}' (.)* '{/org}' ;
UNL : '{unl}' (.)* '{/unl}' ;
[D]
[S:R1]
{org:en}
The system allows a radio channel to take on two states: Listening Traffic and.
{/org}
{unl}
aoj( a, b )
{/unl}
[/S]
[/D]
parse.py 0 → 100644
#!/usr/bin/python3.10
# -*-coding:Utf-8 -*

#==============================================================================
# unlAnt: parse
#------------------------------------------------------------------------------
# Main script to parse a UNL document file
#==============================================================================

#==============================================================================
# Importing required modules
#==============================================================================

import sys
from subprocess import Popen, PIPE
from antlr4 import FileStream, CommonTokenStream, InputStream

#==============================================================================
# Parameters
#==============================================================================

# ANTLR Grammar
doc_grammar = 'grammar/doc/doc.g4'
org_grammar = 'grammar/org/org.g4'
unl_grammar = 'grammar/unl/unl.g4'

#==============================================================================
# Utilities
#==============================================================================

def run_command(cmd):
    """ Run an external command and capture its return code and output """
    with Popen(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
        # communicate() waits for the process and reads both streams
        stdout, stderr = p.communicate()
        return p.returncode, stdout, stderr
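# Example (illustrative, not part of the commit): run_command accepts any
# argv-style command list, e.g. on a POSIX system:
#   code, out, err = run_command(['echo', 'hello'])
#   print(code, out.strip())   # -> 0 hello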
#==============================================================================
# Parsing Functions
#==============================================================================

def create_lexer_parser_with_antlr(grammar_file):
    """ Create Python lexer and parser using ANTLR4 """
    cmd = ['antlr4', '-Dlanguage=Python3', grammar_file]
    print("--- Create python lexer and parser (run command: " + str(cmd) + ")")
    code, out, err = run_command(cmd)
    if code != 0:
        print("Error in grammar: \n\n" + err)

def instantiate_lexer_parser(input, antLexer, antParser):
    """ Instantiate lexer and parser """
    print("--- Instantiate lexer and parser")
    lexer = antLexer(input)
    stream = CommonTokenStream(lexer)
    parser = antParser(stream)
    return parser

def parse_document(input):
    # -- Create python lexer and parser
    create_lexer_parser_with_antlr(doc_grammar)
    # -- Import lexer/parser (after creation by ANTLR4)
    from grammar.doc.docLexer import docLexer
    from grammar.doc.docParser import docParser
    # -- Parse document
    parser = instantiate_lexer_parser(input, docLexer, docParser)
    print("--- Parse document to separate org part and unl part")
    tree = parser.document()
    print("----- resulting tree:\n" + tree.toStringTree(recog=parser))
    document = tree.out
    return document

def parse_org(input):
    pass

def parse_unl(input):
    pass
#==============================================================================
# Main Function
#==============================================================================

def main(argv):
    # -- Read input file
    input_file = argv[1]
    input = FileStream(input_file)
    # -- Document Parsing
    print("-- Document Parsing ")
    document = parse_document(input)
    org_part = document.sentence.org_part.to_string()
    unl_part = document.sentence.unl_part.to_string()
    print("----- org_part:\n" + org_part)
    print("----- unl_part:\n" + unl_part)

if __name__ == '__main__':
    main(sys.argv)
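Expected invocation of the script (the input path is illustrative):

    python3 parse.py path/to/document.unl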