diff --git a/asd/__pycache__/doc.cpython-310.pyc b/asd/__pycache__/doc.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..85a8c8992eda54bf56db9080f47bf63f2375f1ec
Binary files /dev/null and b/asd/__pycache__/doc.cpython-310.pyc differ
diff --git a/asd/doc.py b/asd/doc.py
new file mode 100644
index 0000000000000000000000000000000000000000..bbfdebb11a324d0aaf1b2ddfd7674d75d4bef155
--- /dev/null
+++ b/asd/doc.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python3.10
+# -*-coding:Utf-8 -*
+
+#==============================================================================
+# unlAnt: doc
+#------------------------------------------------------------------------------
+# Classes representing a parsed UNL document: a document holds a sentence,
+# which pairs an original-language (org) part with its UNL part
+#==============================================================================
+
+#==============================================================================
+# Importing required modules
+#==============================================================================
+
+# None
+
+
+#==============================================================================
+# Parameters
+#==============================================================================
+
+# None
+
+
+#==============================================================================
+# Document Class
+#==============================================================================
+
+class Document:
+
+    def __init__(self, sentence):
+        self.sentence = sentence
+
+    def to_string(self):
+        return self.sentence.to_string()
+
+
+class Sentence:
+
+    def __init__(self, org_part, unl_part):
+        self.org_part = org_part
+        self.unl_part = unl_part
+
+    def to_string(self):
+        return self.org_part.to_string() + self.unl_part.to_string()
+
+
+class OrgPart:
+
+    def __init__(self, value):
+        self.value = value
+
+    def to_string(self):
+        return self.value
+
+
+class UnlPart:
+
+    def __init__(self, value):
+        self.value = value
+
+    def to_string(self):
+        return self.value
\ No newline at end of file
diff --git a/grammar/doc/__pycache__/docLexer.cpython-310.pyc b/grammar/doc/__pycache__/docLexer.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..997817b3aabec262a7798f35c9b0ab317b3e40f1
Binary files /dev/null and b/grammar/doc/__pycache__/docLexer.cpython-310.pyc differ
diff --git a/grammar/doc/__pycache__/docParser.cpython-310.pyc b/grammar/doc/__pycache__/docParser.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6356f2ab138d2e5b8efa078c6b549fa9e9274a89
Binary files /dev/null and b/grammar/doc/__pycache__/docParser.cpython-310.pyc differ
diff --git a/grammar/doc/doc.g4 b/grammar/doc/doc.g4
new file mode 100644
index 0000000000000000000000000000000000000000..7912b9c97e7d4832eb4925370fcc48eb6ebdb171
--- /dev/null
+++ b/grammar/doc/doc.g4
@@ -0,0 +1,51 @@
+//=============================================================================
+// ANTLR Grammar for UNL Document
+//=============================================================================
+
+grammar doc;
+
+@header {
+from asd import doc
+}
+
+
+//=============================================================================
+// Parser Grammar
+//=============================================================================
+
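+// A well-formed input (cf. input.txt added in this change) has the shape:
+//   [D] [S:R1] {org:en} ... {/org} {unl} ... {/unl} [/S] [/D]
+// with exactly one sentence per document for now.
+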
+//---------------------------------------------------------
+// Document = a single sentence (org part + unl part)
+//---------------------------------------------------------
+
+document returns [out]
+    : '[D]' s=sentence '[/D]' {$out = doc.Document($s.out)}
+    ;
+
+sentence returns [out]
+    : '[S:R1]' o=orgPart u=unlPart '[/S]' {$out = doc.Sentence($o.out, $u.out)}
+    ;
+
+orgPart returns [out]
+    : o=ORG {$out = doc.OrgPart($o.text)}
+    ;
+
+unlPart returns [out]
+    : u=UNL {$out = doc.UnlPart($u.text)}
+    ;
+
+
+//=============================================================================
+// Lexer Grammar
+//=============================================================================
+
+// ignore whitespaces
+WS : (' '|'\n'|'\t'|'\r'|'\u000C')+ -> skip ;
+
+// content tokens (note: (.)* is greedy, so each tagged block is expected once per input)
+ORG : '{org:en}' (.)* '{/org}' ;
+UNL : '{unl}' (.)* '{/unl}' ;
+
diff --git a/grammar/doc/doc.interp b/grammar/doc/doc.interp
new file mode 100644
index 0000000000000000000000000000000000000000..69c2f70ff15922fc502882fd4bd144e580aa4efb
--- /dev/null
+++ b/grammar/doc/doc.interp
@@ -0,0 +1,29 @@
+token literal names:
+null
+'[D]'
+'[/D]'
+'[S:R1]'
+'[/S]'
+null
+null
+null
+
+token symbolic names:
+null
+null
+null
+null
+null
+WS
+ORG
+UNL
+
+rule names:
+document
+sentence
+orgPart
+unlPart
+
+
+atn:
+[3, 24715, 42794, 33075, 47597, 16764, 15335, 30598, 22884, 3, 9, 28, 4, 2, 9, 2, 4, 3, 9, 3, 4, 4, 9, 4, 4, 5, 9, 5, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 3, 4, 3, 5, 3, 5, 3, 5, 3, 5, 2, 2, 6, 2, 4, 6, 8, 2, 2, 2, 23, 2, 10, 3, 2, 2, 2, 4, 15, 3, 2, 2, 2, 6, 21, 3, 2, 2, 2, 8, 24, 3, 2, 2, 2, 10, 11, 7, 3, 2, 2, 11, 12, 5, 4, 3, 2, 12, 13, 7, 4, 2, 2, 13, 14, 8, 2, 1, 2, 14, 3, 3, 2, 2, 2, 15, 16, 7, 5, 2, 2, 16, 17, 5, 6, 4, 2, 17, 18, 5, 8, 5, 2, 18, 19, 7, 6, 2, 2, 19, 20, 8, 3, 1, 2, 20, 5, 3, 2, 2, 2, 21, 22, 7, 8, 2, 2, 22, 23, 8, 4, 1, 2, 23, 7, 3, 2, 2, 2, 24, 25, 7, 9, 2, 2, 25, 26, 8, 5, 1, 2, 26, 9, 3, 2, 2, 2, 2]
\ No newline at end of file
diff --git a/grammar/doc/doc.tokens b/grammar/doc/doc.tokens
new file mode 100644
index 0000000000000000000000000000000000000000..dc0d799667a2a666cd9632133e0f07f718f5074a
--- /dev/null
+++ b/grammar/doc/doc.tokens
@@ -0,0 +1,11 @@
+T__0=1
+T__1=2
+T__2=3
+T__3=4
+WS=5
+ORG=6
+UNL=7
+'[D]'=1
+'[/D]'=2
+'[S:R1]'=3
+'[/S]'=4
diff --git a/grammar/doc/docLexer.interp b/grammar/doc/docLexer.interp
new file mode 100644
index 0000000000000000000000000000000000000000..eb836f741a31c84e17c7ff9a784964fd9ede3458
--- /dev/null
+++ b/grammar/doc/docLexer.interp
@@ -0,0 +1,38 @@
+token literal names:
+null
+'[D]'
+'[/D]'
+'[S:R1]'
+'[/S]'
+null
+null
+null
+
+token symbolic names:
+null
+null
+null
+null
+null
+WS
+ORG
+UNL
+
+rule names:
+T__0
+T__1
+T__2
+T__3
+WS
+ORG
+UNL
+
+channel names:
+DEFAULT_TOKEN_CHANNEL
+HIDDEN
+
+mode names:
+DEFAULT_MODE
+
+atn:
+[3, 24715, 42794, 33075, 47597, 16764, 15335, 30598, 22884, 2, 9, 86, 8, 1, 4, 2, 9, 2, 4, 3, 9, 3, 4, 4, 9, 4, 4, 5, 9, 5, 4, 6, 9, 6, 4, 7, 9, 7, 4, 8, 9, 8, 3, 2, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 5, 3, 5, 3, 5, 3, 5, 3, 5, 3, 6, 6, 6, 40, 10, 6, 13, 6, 14, 6, 41, 3, 6, 3, 6, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 7, 7, 56, 10, 7, 12, 7, 14, 7, 59, 11, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 7, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 7, 8, 75, 10, 8, 12, 8, 14, 8, 78, 11, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 3, 8, 2, 2, 9, 3, 3, 5, 4, 7, 5, 9, 6, 11, 7, 13, 8, 15, 9, 3, 2, 3, 5, 2, 11, 12, 14, 15, 34, 34, 2, 88, 2, 3, 3, 2, 2, 2, 2, 5, 3, 2, 2, 2, 2, 7, 3, 2, 2, 2, 2, 9, 3, 2, 2, 2, 2, 11, 3, 2, 2, 2, 2, 13, 3, 2, 2, 2, 2, 15, 3, 2, 2, 2, 3, 17, 3, 2, 2, 2, 5, 21, 3, 2, 2, 2, 7, 26, 3, 2, 2, 2, 9, 33, 3, 2, 2, 2, 11, 39, 3, 2, 2, 2, 13, 45, 3, 2, 2, 2, 15, 67, 3, 2, 2, 2, 17, 18, 7, 93, 2, 2, 18, 19, 7, 70, 2, 2, 19, 20, 7, 95, 2, 2, 20, 4, 3, 2, 2, 2, 21, 22, 7, 93, 2, 2, 22, 23, 7, 49, 2, 2, 23, 24, 7, 70, 2, 2, 24, 25, 7, 95, 2, 2, 25, 6, 3, 2, 2, 2, 26, 27, 7, 93, 2, 2, 27, 28, 7, 85, 2, 2, 28, 29, 7, 60, 2, 2, 29, 30, 7, 84, 2, 2, 30, 31, 7, 51, 2, 2, 31, 32, 7, 95, 2, 2, 32, 8, 3, 2, 2, 2, 33, 34, 7, 93, 2, 2, 34, 35, 7, 49, 2, 2, 35, 36, 7, 85, 2, 2, 36, 37, 7, 95, 2, 2, 37, 10, 3, 2, 2, 2, 38, 40, 9, 2, 2, 2, 39, 38, 3, 2, 2, 2, 40, 41, 3, 2, 2, 2, 41, 39, 3, 2, 2, 2, 41, 42, 3, 2, 2, 2, 42, 43, 3, 2, 2, 2, 43, 44, 8, 6, 2, 2, 44, 12, 3, 2, 2, 2, 45, 46, 7, 125, 2, 2, 46, 47, 7, 113, 2, 2, 47, 48, 7, 116, 2, 2, 48, 49, 7, 105, 2, 2, 49, 50, 7, 60, 2, 2, 50, 51, 7, 103, 2, 2, 51, 52, 7, 112, 2, 2, 52, 53, 7, 127, 2, 2, 53, 57, 3, 2, 2, 2, 54, 56, 11, 2, 2, 2, 55, 54, 3, 2, 2, 2, 56, 59, 3, 2, 2, 2, 57, 55, 3, 2, 2, 2, 57, 58, 3, 2, 2, 2, 58, 60, 3, 2, 2, 2, 59, 57, 3, 2, 2, 2, 60, 61, 7, 125, 2, 2, 61, 62, 7, 49, 2, 2, 62, 63, 7, 113, 2, 2, 63, 64, 7, 116, 2, 2, 64, 65, 7, 105, 2, 2, 65, 66, 7, 127, 2, 2, 66, 14, 3, 2, 2, 2, 67, 68, 7, 125, 2, 2, 68, 69, 7, 119, 2, 2, 69, 70, 7, 112, 2, 2, 70, 71, 7, 110, 2, 2, 71, 72, 7, 127, 2, 2, 72, 76, 3, 2, 2, 2, 73, 75, 11, 2, 2, 2, 74, 73, 3, 2, 2, 2, 75, 78, 3, 2, 2, 2, 76, 74, 3, 2, 2, 2, 76, 77, 3, 2, 2, 2, 77, 79, 3, 2, 2, 2, 78, 76, 3, 2, 2, 2, 79, 80, 7, 125, 2, 2, 80, 81, 7, 49, 2, 2, 81, 82, 7, 119, 2, 2, 82, 83, 7, 112, 2, 2, 83, 84, 7, 110, 2, 2, 84, 85, 7, 127, 2, 2, 85, 16, 3, 2, 2, 2, 6, 2, 41, 57, 76, 3, 8, 2, 2]
\ No newline at end of file
diff --git a/grammar/doc/docLexer.py b/grammar/doc/docLexer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffe37e265ad7d98c28a17335464c504b7ce0b288
--- /dev/null
+++ b/grammar/doc/docLexer.py
@@ -0,0 +1,83 @@
+# Generated from grammar/doc/doc.g4 by ANTLR 4.9.3
+from antlr4 import *
+from io import StringIO
+import sys
+if sys.version_info[1] > 5:
+    from typing import TextIO
+else:
+    from typing.io import TextIO
+
+
+from asd import doc
+
+
+
+def serializedATN():
+    with StringIO() as buf:
+        buf.write("\3\u608b\ua72a\u8133\ub9ed\u417c\u3be7\u7786\u5964\2\t")
+        buf.write("V\b\1\4\2\t\2\4\3\t\3\4\4\t\4\4\5\t\5\4\6\t\6\4\7\t\7")
+        buf.write("\4\b\t\b\3\2\3\2\3\2\3\2\3\3\3\3\3\3\3\3\3\3\3\4\3\4\3")
+        buf.write("\4\3\4\3\4\3\4\3\4\3\5\3\5\3\5\3\5\3\5\3\6\6\6(\n\6\r")
+        buf.write("\6\16\6)\3\6\3\6\3\7\3\7\3\7\3\7\3\7\3\7\3\7\3\7\3\7\3")
+        buf.write("\7\7\78\n\7\f\7\16\7;\13\7\3\7\3\7\3\7\3\7\3\7\3\7\3\7")
+        buf.write("\3\b\3\b\3\b\3\b\3\b\3\b\3\b\7\bK\n\b\f\b\16\bN\13\b\3")
+        buf.write("\b\3\b\3\b\3\b\3\b\3\b\3\b\2\2\t\3\3\5\4\7\5\t\6\13\7")
+        buf.write("\r\b\17\t\3\2\3\5\2\13\f\16\17\"\"\2X\2\3\3\2\2\2\2\5")
+        buf.write("\3\2\2\2\2\7\3\2\2\2\2\t\3\2\2\2\2\13\3\2\2\2\2\r\3\2")
+        buf.write("\2\2\2\17\3\2\2\2\3\21\3\2\2\2\5\25\3\2\2\2\7\32\3\2\2")
+        buf.write("\2\t!\3\2\2\2\13\'\3\2\2\2\r-\3\2\2\2\17C\3\2\2\2\21\22")
+        buf.write("\7]\2\2\22\23\7F\2\2\23\24\7_\2\2\24\4\3\2\2\2\25\26\7")
+        buf.write("]\2\2\26\27\7\61\2\2\27\30\7F\2\2\30\31\7_\2\2\31\6\3")
+        buf.write("\2\2\2\32\33\7]\2\2\33\34\7U\2\2\34\35\7<\2\2\35\36\7")
+        buf.write("T\2\2\36\37\7\63\2\2\37 \7_\2\2 \b\3\2\2\2!\"\7]\2\2\"")
+        buf.write("#\7\61\2\2#$\7U\2\2$%\7_\2\2%\n\3\2\2\2&(\t\2\2\2\'&\3")
buf.write("\2\2\2()\3\2\2\2)\'\3\2\2\2)*\3\2\2\2*+\3\2\2\2+,\b\6") + buf.write("\2\2,\f\3\2\2\2-.\7}\2\2./\7q\2\2/\60\7t\2\2\60\61\7i") + buf.write("\2\2\61\62\7<\2\2\62\63\7g\2\2\63\64\7p\2\2\64\65\7\177") + buf.write("\2\2\659\3\2\2\2\668\13\2\2\2\67\66\3\2\2\28;\3\2\2\2") + buf.write("9\67\3\2\2\29:\3\2\2\2:<\3\2\2\2;9\3\2\2\2<=\7}\2\2=>") + buf.write("\7\61\2\2>?\7q\2\2?@\7t\2\2@A\7i\2\2AB\7\177\2\2B\16\3") + buf.write("\2\2\2CD\7}\2\2DE\7w\2\2EF\7p\2\2FG\7n\2\2GH\7\177\2\2") + buf.write("HL\3\2\2\2IK\13\2\2\2JI\3\2\2\2KN\3\2\2\2LJ\3\2\2\2LM") + buf.write("\3\2\2\2MO\3\2\2\2NL\3\2\2\2OP\7}\2\2PQ\7\61\2\2QR\7w") + buf.write("\2\2RS\7p\2\2ST\7n\2\2TU\7\177\2\2U\20\3\2\2\2\6\2)9L") + buf.write("\3\b\2\2") + return buf.getvalue() + + +class docLexer(Lexer): + + atn = ATNDeserializer().deserialize(serializedATN()) + + decisionsToDFA = [ DFA(ds, i) for i, ds in enumerate(atn.decisionToState) ] + + T__0 = 1 + T__1 = 2 + T__2 = 3 + T__3 = 4 + WS = 5 + ORG = 6 + UNL = 7 + + channelNames = [ u"DEFAULT_TOKEN_CHANNEL", u"HIDDEN" ] + + modeNames = [ "DEFAULT_MODE" ] + + literalNames = [ "<INVALID>", + "'[D]'", "'[/D]'", "'[S:R1]'", "'[/S]'" ] + + symbolicNames = [ "<INVALID>", + "WS", "ORG", "UNL" ] + + ruleNames = [ "T__0", "T__1", "T__2", "T__3", "WS", "ORG", "UNL" ] + + grammarFileName = "doc.g4" + + def __init__(self, input=None, output:TextIO = sys.stdout): + super().__init__(input, output) + self.checkVersion("4.9.3") + self._interp = LexerATNSimulator(self, self.atn, self.decisionsToDFA, PredictionContextCache()) + self._actions = None + self._predicates = None + + diff --git a/grammar/doc/docLexer.tokens b/grammar/doc/docLexer.tokens new file mode 100644 index 0000000000000000000000000000000000000000..dc0d799667a2a666cd9632133e0f07f718f5074a --- /dev/null +++ b/grammar/doc/docLexer.tokens @@ -0,0 +1,11 @@ +T__0=1 +T__1=2 +T__2=3 +T__3=4 +WS=5 +ORG=6 +UNL=7 +'[D]'=1 +'[/D]'=2 +'[S:R1]'=3 +'[/S]'=4 diff --git a/grammar/doc/docListener.py b/grammar/doc/docListener.py new file mode 100644 index 0000000000000000000000000000000000000000..b71589a5a60a50708e7fc382607812430e28b9f9 --- /dev/null +++ b/grammar/doc/docListener.py @@ -0,0 +1,51 @@ +# Generated from grammar/doc/doc.g4 by ANTLR 4.9.3 +from antlr4 import * +if __name__ is not None and "." in __name__: + from .docParser import docParser +else: + from docParser import docParser + +from asd import doc + + +# This class defines a complete listener for a parse tree produced by docParser. +class docListener(ParseTreeListener): + + # Enter a parse tree produced by docParser#document. + def enterDocument(self, ctx:docParser.DocumentContext): + pass + + # Exit a parse tree produced by docParser#document. + def exitDocument(self, ctx:docParser.DocumentContext): + pass + + + # Enter a parse tree produced by docParser#sentence. + def enterSentence(self, ctx:docParser.SentenceContext): + pass + + # Exit a parse tree produced by docParser#sentence. + def exitSentence(self, ctx:docParser.SentenceContext): + pass + + + # Enter a parse tree produced by docParser#orgPart. + def enterOrgPart(self, ctx:docParser.OrgPartContext): + pass + + # Exit a parse tree produced by docParser#orgPart. + def exitOrgPart(self, ctx:docParser.OrgPartContext): + pass + + + # Enter a parse tree produced by docParser#unlPart. + def enterUnlPart(self, ctx:docParser.UnlPartContext): + pass + + # Exit a parse tree produced by docParser#unlPart. 
+    def exitUnlPart(self, ctx:docParser.UnlPartContext):
+        pass
+
+
+
+del docParser
\ No newline at end of file
diff --git a/grammar/doc/docParser.py b/grammar/doc/docParser.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab5ba95cf9c7a82246d8a65c3a9b532bdb1af21f
--- /dev/null
+++ b/grammar/doc/docParser.py
@@ -0,0 +1,264 @@
+# Generated from grammar/doc/doc.g4 by ANTLR 4.9.3
+# encoding: utf-8
+from antlr4 import *
+from io import StringIO
+import sys
+if sys.version_info[1] > 5:
+    from typing import TextIO
+else:
+    from typing.io import TextIO
+
+
+from asd import doc
+
+
+def serializedATN():
+    with StringIO() as buf:
+        buf.write("\3\u608b\ua72a\u8133\ub9ed\u417c\u3be7\u7786\u5964\3\t")
+        buf.write("\34\4\2\t\2\4\3\t\3\4\4\t\4\4\5\t\5\3\2\3\2\3\2\3\2\3")
+        buf.write("\2\3\3\3\3\3\3\3\3\3\3\3\3\3\4\3\4\3\4\3\5\3\5\3\5\3\5")
+        buf.write("\2\2\6\2\4\6\b\2\2\2\27\2\n\3\2\2\2\4\17\3\2\2\2\6\25")
+        buf.write("\3\2\2\2\b\30\3\2\2\2\n\13\7\3\2\2\13\f\5\4\3\2\f\r\7")
+        buf.write("\4\2\2\r\16\b\2\1\2\16\3\3\2\2\2\17\20\7\5\2\2\20\21\5")
+        buf.write("\6\4\2\21\22\5\b\5\2\22\23\7\6\2\2\23\24\b\3\1\2\24\5")
+        buf.write("\3\2\2\2\25\26\7\b\2\2\26\27\b\4\1\2\27\7\3\2\2\2\30\31")
+        buf.write("\7\t\2\2\31\32\b\5\1\2\32\t\3\2\2\2\2")
+        return buf.getvalue()
+
+
+class docParser ( Parser ):
+
+    grammarFileName = "doc.g4"
+
+    atn = ATNDeserializer().deserialize(serializedATN())
+
+    decisionsToDFA = [ DFA(ds, i) for i, ds in enumerate(atn.decisionToState) ]
+
+    sharedContextCache = PredictionContextCache()
+
+    literalNames = [ "<INVALID>", "'[D]'", "'[/D]'", "'[S:R1]'", "'[/S]'" ]
+
+    symbolicNames = [ "<INVALID>", "<INVALID>", "<INVALID>", "<INVALID>",
+                      "<INVALID>", "WS", "ORG", "UNL" ]
+
+    RULE_document = 0
+    RULE_sentence = 1
+    RULE_orgPart = 2
+    RULE_unlPart = 3
+
+    ruleNames = [ "document", "sentence", "orgPart", "unlPart" ]
+
+    EOF = Token.EOF
+    T__0=1
+    T__1=2
+    T__2=3
+    T__3=4
+    WS=5
+    ORG=6
+    UNL=7
+
+    def __init__(self, input:TokenStream, output:TextIO = sys.stdout):
+        super().__init__(input, output)
+        self.checkVersion("4.9.3")
+        self._interp = ParserATNSimulator(self, self.atn, self.decisionsToDFA, self.sharedContextCache)
+        self._predicates = None
+
+
+
+
+    class DocumentContext(ParserRuleContext):
+        __slots__ = 'parser'
+
+        def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
+            super().__init__(parent, invokingState)
+            self.parser = parser
+            self.out = None
+            self.s = None # SentenceContext
+
+        def sentence(self):
+            return self.getTypedRuleContext(docParser.SentenceContext,0)
+
+
+        def getRuleIndex(self):
+            return docParser.RULE_document
+
+        def enterRule(self, listener:ParseTreeListener):
+            if hasattr( listener, "enterDocument" ):
+                listener.enterDocument(self)
+
+        def exitRule(self, listener:ParseTreeListener):
+            if hasattr( listener, "exitDocument" ):
+                listener.exitDocument(self)
+
+
+
+
+    def document(self):
+
+        localctx = docParser.DocumentContext(self, self._ctx, self.state)
+        self.enterRule(localctx, 0, self.RULE_document)
+        try:
+            self.enterOuterAlt(localctx, 1)
+            self.state = 8
+            self.match(docParser.T__0)
+            self.state = 9
+            localctx.s = self.sentence()
+            self.state = 10
+            self.match(docParser.T__1)
+            localctx.out = doc.Document(localctx.s.out)
+        except RecognitionException as re:
+            localctx.exception = re
+            self._errHandler.reportError(self, re)
+            self._errHandler.recover(self, re)
+        finally:
+            self.exitRule()
+        return localctx
+
+
+    class SentenceContext(ParserRuleContext):
+        __slots__ = 'parser'
+
+        def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
+            super().__init__(parent, invokingState)
+            self.parser = parser
+            self.out = None
+            self.o = None # OrgPartContext
+            self.u = None # UnlPartContext
+
+        def orgPart(self):
+            return self.getTypedRuleContext(docParser.OrgPartContext,0)
+
+
+        def unlPart(self):
+            return self.getTypedRuleContext(docParser.UnlPartContext,0)
+
+
+        def getRuleIndex(self):
+            return docParser.RULE_sentence
+
+        def enterRule(self, listener:ParseTreeListener):
+            if hasattr( listener, "enterSentence" ):
+                listener.enterSentence(self)
+
+        def exitRule(self, listener:ParseTreeListener):
+            if hasattr( listener, "exitSentence" ):
+                listener.exitSentence(self)
+
+
+
+
+    def sentence(self):
+
+        localctx = docParser.SentenceContext(self, self._ctx, self.state)
+        self.enterRule(localctx, 2, self.RULE_sentence)
+        try:
+            self.enterOuterAlt(localctx, 1)
+            self.state = 13
+            self.match(docParser.T__2)
+            self.state = 14
+            localctx.o = self.orgPart()
+            self.state = 15
+            localctx.u = self.unlPart()
+            self.state = 16
+            self.match(docParser.T__3)
+            localctx.out = doc.Sentence(localctx.o.out, localctx.u.out)
+        except RecognitionException as re:
+            localctx.exception = re
+            self._errHandler.reportError(self, re)
+            self._errHandler.recover(self, re)
+        finally:
+            self.exitRule()
+        return localctx
+
+
+    class OrgPartContext(ParserRuleContext):
+        __slots__ = 'parser'
+
+        def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
+            super().__init__(parent, invokingState)
+            self.parser = parser
+            self.out = None
+            self.o = None # Token
+
+        def ORG(self):
+            return self.getToken(docParser.ORG, 0)
+
+        def getRuleIndex(self):
+            return docParser.RULE_orgPart
+
+        def enterRule(self, listener:ParseTreeListener):
+            if hasattr( listener, "enterOrgPart" ):
+                listener.enterOrgPart(self)
+
+        def exitRule(self, listener:ParseTreeListener):
+            if hasattr( listener, "exitOrgPart" ):
+                listener.exitOrgPart(self)
+
+
+
+
+    def orgPart(self):
+
+        localctx = docParser.OrgPartContext(self, self._ctx, self.state)
+        self.enterRule(localctx, 4, self.RULE_orgPart)
+        try:
+            self.enterOuterAlt(localctx, 1)
+            self.state = 19
+            localctx.o = self.match(docParser.ORG)
+            localctx.out = doc.OrgPart((None if localctx.o is None else localctx.o.text))
+        except RecognitionException as re:
+            localctx.exception = re
+            self._errHandler.reportError(self, re)
+            self._errHandler.recover(self, re)
+        finally:
+            self.exitRule()
+        return localctx
+
+
+    class UnlPartContext(ParserRuleContext):
+        __slots__ = 'parser'
+
+        def __init__(self, parser, parent:ParserRuleContext=None, invokingState:int=-1):
+            super().__init__(parent, invokingState)
+            self.parser = parser
+            self.out = None
+            self.u = None # Token
+
+        def UNL(self):
+            return self.getToken(docParser.UNL, 0)
+
+        def getRuleIndex(self):
+            return docParser.RULE_unlPart
+
+        def enterRule(self, listener:ParseTreeListener):
+            if hasattr( listener, "enterUnlPart" ):
+                listener.enterUnlPart(self)
+
+        def exitRule(self, listener:ParseTreeListener):
+            if hasattr( listener, "exitUnlPart" ):
+                listener.exitUnlPart(self)
+
+
+
+
+    def unlPart(self):
+
+        localctx = docParser.UnlPartContext(self, self._ctx, self.state)
+        self.enterRule(localctx, 6, self.RULE_unlPart)
+        try:
+            self.enterOuterAlt(localctx, 1)
+            self.state = 22
+            localctx.u = self.match(docParser.UNL)
+            localctx.out = doc.UnlPart((None if localctx.u is None else localctx.u.text))
+        except RecognitionException as re:
+            localctx.exception = re
+            self._errHandler.reportError(self, re)
+            self._errHandler.recover(self, re)
+        finally:
+            self.exitRule()
+        return localctx
+
+
+
+
+
+
diff --git a/grammar/org/org.g4 b/grammar/org/org.g4
new file mode 100644
index 0000000000000000000000000000000000000000..729dcd8a51927d0790717e9b456d7d68fd3e6d24
--- /dev/null
+++ b/grammar/org/org.g4
@@ -0,0 +1,30 @@
+//=============================================================================
+// ANTLR Grammar for the org part of a UNL Document
+//=============================================================================
+
+grammar org;
+
+
+//=============================================================================
+// Parser Grammar
+//=============================================================================
+
+//---------------------------------------------------------
+// Original NL sentence
+//---------------------------------------------------------
+
+orgPart
+    : ORG
+    ;
+
+
+//=============================================================================
+// Lexer Grammar
+//=============================================================================
+
+// ignore whitespaces
+WS : (' '|'\n'|'\t'|'\r'|'\u000C')+ -> skip ;
+
+// other tokens
+ORG : '{org:en}' (.)* '{/org}' ;
+
diff --git a/grammar/unl/unl.g4 b/grammar/unl/unl.g4
new file mode 100644
index 0000000000000000000000000000000000000000..dac12c8a9c08bdbc1410f868779c2ad89b123f1d
--- /dev/null
+++ b/grammar/unl/unl.g4
@@ -0,0 +1,30 @@
+//=============================================================================
+// ANTLR Grammar for the unl part of a UNL Document
+//=============================================================================
+
+grammar unl;
+
+
+//=============================================================================
+// Parser Grammar
+//=============================================================================
+
+//---------------------------------------------------------
+// UNL representation
+//---------------------------------------------------------
+
+unlPart
+    : UNL
+    ;
+
+
+//=============================================================================
+// Lexer Grammar
+//=============================================================================
+
+// ignore whitespaces
+WS : (' '|'\n'|'\t'|'\r'|'\u000C')+ -> skip ;
+
+// other tokens
+ORG : '{org:en}' (.)* '{/org}' ;
+UNL : '{unl}' (.)* '{/unl}' ;
diff --git a/input.txt b/input.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ed7718c27cd1653d56d397bacd6ffcac059635b7
--- /dev/null
+++ b/input.txt
@@ -0,0 +1,10 @@
+[D]
+[S:R1]
+{org:en}
+The system allows a radio channel to take on two states: Listening Traffic and.
+{/org}
+{unl}
+aoj( a, b )
+{/unl}
+[/S]
+[/D]
diff --git a/parse.py b/parse.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa573f047d1eb13cf4972ebc3610ba28c71235b8
--- /dev/null
+++ b/parse.py
@@ -0,0 +1,117 @@
+#!/usr/bin/python3.10
+# -*-coding:Utf-8 -*
+
+#==============================================================================
+# unlAnt: parse
+#------------------------------------------------------------------------------
+# Main script to parse a UNL document file
+#==============================================================================
+
+#==============================================================================
+# Importing required modules
+#==============================================================================
+
+import sys
+from subprocess import Popen, PIPE
+from antlr4 import FileStream, CommonTokenStream
+
+
+#==============================================================================
+# Parameters
+#==============================================================================
+
+# ANTLR Grammar
+
+doc_grammar = 'grammar/doc/doc.g4'
+org_grammar = 'grammar/org/org.g4'
+unl_grammar = 'grammar/unl/unl.g4'
+
+
+#==============================================================================
+# Utilities
+#==============================================================================
+
+def run_command(cmd):
+    with Popen(cmd, stdout=PIPE, stderr=PIPE, universal_newlines=True) as p:
+        # communicate() waits for the process to exit and drains both pipes
+        stdout, stderr = p.communicate()
+        return p.returncode, stdout, stderr
+
+
+#==============================================================================
+# Parsing Functions
+#==============================================================================
+
+def create_lexer_parser_with_antlr(grammar_file):
+    """ Create python lexer and parser using ANTLR4 """
+
+    cmd = ['antlr4', '-Dlanguage=Python3', grammar_file]
+    print("--- Create python lexer and parser (run command: " + str(cmd) + ")")
+    code, out, err = run_command(cmd)
+    if code != 0:
+        print("Error in grammar: \n\n" + err)
+        sys.exit(1)
+
+
+def instantiate_lexer_parser(input_stream, antLexer, antParser):
+    """ Instantiate lexer and parser """
+
+    print("--- Instantiate lexer and parser")
+    lexer = antLexer(input_stream)
+    stream = CommonTokenStream(lexer)
+    parser = antParser(stream)
+
+    return parser
+
+
+def parse_document(input_stream):
+
+    # -- Create python lexer and parser
+    create_lexer_parser_with_antlr(doc_grammar)
+
+    # -- Import Lexer/Parser (after creation by ANTLR4)
+    from grammar.doc.docLexer import docLexer
+    from grammar.doc.docParser import docParser
+
+    # -- Parse document
+    parser = instantiate_lexer_parser(input_stream, docLexer, docParser)
+    print("--- Parse document to separate org part and unl part")
+    tree = parser.document()
+    print("----- resulting tree:\n" + tree.toStringTree(recog=parser))
+    document = tree.out
+
+    return document
+
+
+def parse_org(input_stream):
+    pass
+
+
+def parse_unl(input_stream):
+    pass
+
+
+#==============================================================================
+# Main Function
+#==============================================================================
+
+def main(argv):
+
+    # -- Read input file
+    if len(argv) < 2:
+        print("Usage: parse.py <input_file>")
+        sys.exit(1)
+    input_file = argv[1]
+    input_stream = FileStream(input_file)
+
+    # -- Document Parsing
+    print("-- Document Parsing")
+    document = parse_document(input_stream)
+    org_part = document.sentence.org_part.to_string()
+    unl_part = document.sentence.unl_part.to_string()
+    print("----- org_part:\n" + org_part)
+    print("----- unl_part:\n" + unl_part)
+
+
+if __name__ == '__main__':
+    main(sys.argv)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7545404907866e05b070ec90199690d5c441c641
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+antlr4-python3-runtime==4.9.3
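diff --git a/README b/README
new file mode 100644
--- /dev/null
+++ b/README
@@ -0,0 +1,14 @@
+unlAnt - parsing of UNL document files
+
+Usage sketch (assumes the ANTLR 4.9.3 tool is installed and available on the
+PATH as `antlr4`, which is the command parse.py invokes):
+
+    # install the matching ANTLR Python runtime
+    pip install -r requirements.txt
+
+    # parse the sample document: prints the parse tree, then the org part
+    # and the unl part of the sentence
+    python3 parse.py input.txt
+
+The grammars live under grammar/ (doc, org, unl); parse.py regenerates the
+Python lexer/parser from grammar/doc/doc.g4 on each run.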