From c974963db773bf3c08e8b492fe79c79dd54deb59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Lamercerie?= <aurelien.lamercerie@tetras-libre.fr> Date: Tue, 23 Aug 2022 09:58:48 +0200 Subject: [PATCH] Dev: regular expression test and amr analyzer init --- .gitignore | 1 + inputData/test-amr-graph-1.penman | 18 +++++ lib/amr_analyzer.py | 108 ++++++++++++++++++++++++++++++ lib/propbank_analyzer.py | 2 +- lib/re_test.py | 57 ++++++++++++++++ 5 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 inputData/test-amr-graph-1.penman create mode 100644 lib/amr_analyzer.py create mode 100644 lib/re_test.py diff --git a/.gitignore b/.gitignore index 4ffefd07..ae30e025 100755 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ *-env/* lib/amrld/wk/* +*__pycache__* diff --git a/inputData/test-amr-graph-1.penman b/inputData/test-amr-graph-1.penman new file mode 100644 index 00000000..ec5a85a1 --- /dev/null +++ b/inputData/test-amr-graph-1.penman @@ -0,0 +1,18 @@ +# ::id SSC-01-01 +# ::snt The Solar System is the gravitationally bound system of the Sun and the objects that orbit it, either directly or indirectly. +(s / system + :domain (p / planet + :name (n / name + :op1 "Solar" + :op2 "System")) + :ARG1-of (b / bind-01 + :ARG0 (g / gravitation)) + :part (a / and + :op1 (s2 / sun) + :op2 (o / object + :ARG0-of (o2 / orbit-01 + :ARG1 s2 + :manner (o3 / or + :op1 (d / direct-02) + :op2 (d2 / direct-02 + :polarity -)))))) \ No newline at end of file diff --git a/lib/amr_analyzer.py b/lib/amr_analyzer.py new file mode 100644 index 00000000..81a8b5a5 --- /dev/null +++ b/lib/amr_analyzer.py @@ -0,0 +1,108 @@ +#!/usr/bin/python3.10 +# -*-coding:Utf-8 -* + +#============================================================================== +# C.M. Tool: AMR Graph (penman) Analzer +#------------------------------------------------------------------------------ +# Module to analyze AMR Graph in penman format +#============================================================================== + +#============================================================================== +# Importing required modules +#============================================================================== + +import sys +import glob +import re +import propbank_analyzer + +from bs4 import BeautifulSoup + + +#============================================================================== +# Parameters +#============================================================================== + +# Input/Output Directories +INPUT_DIR = "../inputData/" +OUTPUT_DIR = "../outputData/" + +# Data +PROPBANK_FRAMES_DIR = "../propbankFrames/" +PBF_DIGITS = 2 +AMR_PREDICATE_FORM = ['[a-z]+-0\d'] +ROLE_SEARCH_RE = '(?<=[a-z]+-0\d):ARG\d(?=[[a-z]+-0\d)|$]' +AMR_CORE_ROLE_FORM = [':ARG\d'] +PARENTHICAL_EXPRESSION = '\((?>\((?<c>)|[^()]+|\)(?<-c>))*(?(c)(?!))\)' +AMR_PREDICATE_SCOPE_FORM = ['(^())*'] + + +#============================================================================== +# Functions to find AMR predicates and AMR core roles +#============================================================================== + +def get_amr_predicate_list(amr_graph): + amr_predicate_list = [] + for target_re in AMR_PREDICATE_FORM: + found_predicates = re.findall(target_re, amr_graph) + amr_predicate_list.extend(found_predicates) + return amr_predicate_list + +def get_parenthical_expression(amr_graph): + result_list = [] + result_list.extend(re.findall(PARENTHICAL_EXPRESSION, amr_graph)) + return result_list + +def get_core_role_list_of_predicate(amr_graph, predicate): + amr_core_role_list = [] + for target_re in AMR_PREDICATE_FORM: + found_roles = re.findall(target_re, amr_graph) + amr_core_role_list.extend(found_roles) + return amr_core_role_list + + + +#============================================================================== +# Main function +#============================================================================== + +def main(amr_graph_file): + + print("\n" + "[CMT] AMR Graph Analyzer") + + print(re.findall(ROLE_SEARCH_RE, 'test (d \ bind-01 :ARG1)')) + + # amr_graph_file = INPUT_DIR + amr_graph_file + # print("-- Reading file " + amr_graph_file) + # with open(amr_graph_file, 'r') as f: + # amr_graph = f.read() + # print("----- AMR Graph: \n" + amr_graph) + + # print("-- Analyzing graph ") + # amr_predicate_list = get_amr_predicate_list(amr_graph) + # print("--- predicates found: ") + # if len(amr_predicate_list) > 0: + # for p in amr_predicate_list: + # print("----- " + p) + # else: + # print("None") + + # parenthical_expression_list = get_parenthical_expression(amr_graph) + # print("-- Parenthical expression found: ") + # if len(parenthical_expression_list) > 0: + # for e in parenthical_expression_list: + # print("----- " + e) + # else: + # print("None") + + # -- Ending print + print("\n" + "[SSC] Done") + + +if __name__ == "__main__": + main(sys.argv[1]) + + + + + diff --git a/lib/propbank_analyzer.py b/lib/propbank_analyzer.py index 9d4b29ae..52d8916d 100644 --- a/lib/propbank_analyzer.py +++ b/lib/propbank_analyzer.py @@ -2,7 +2,7 @@ # -*-coding:Utf-8 -* #============================================================================== -# C.M. Tool: prop +# C.M. Tool: PropBank Frame Analyzer #------------------------------------------------------------------------------ # Module to analyze PropBank frames #============================================================================== diff --git a/lib/re_test.py b/lib/re_test.py new file mode 100644 index 00000000..9c3c53f6 --- /dev/null +++ b/lib/re_test.py @@ -0,0 +1,57 @@ +import regex as re + +text = ''' # ::id SSC-01-01 +# ::snt The Solar System is the gravitationally bound system of the Sun and the objects that orbit it, either directly or indirectly. +(s / system + :domain (p / planet + :name (n / name + :op1 "Solar" + :op2 "System")) + :ARG1-of (b / bind-01 + :ARG0 (g / gravitation)) + :part (a / and + :op1 (s2 / sun) + :op2 (o / object + :ARG0-of (o2 / orbit-01 + :ARG1 s2 + :manner (o3 / or + :op1 (d / direct-02) + :op2 (d2 / direct-02 + :polarity -))))))''' + +rx = re.compile(r''' + \([^.]*\) (*SKIP)(*FAIL) # match anything in parentheses and "throw it away" + | # or + :ARG\d # match :ARGi + ''', re.VERBOSE) + +rx_2 = re.compile(r''' + \([^.]*\) (*SKIP)(*FAIL) # match anything in parentheses and "throw it away" + | # or + :ARG\d-of # match :ARGi-of + ''', re.VERBOSE) + + +pred_pattern = '[a-z]+-0\d' +arg_of_pattern = ':ARG\d-of' + +result = [] + +# -- argument pour chaque prédicat +for pred_match in re.finditer(pred_pattern, text): + print(pred_match) + arg_match_list = rx.findall(text[pred_match.end():]) + print(arg_match_list) + for arg_match in arg_match_list: + result.append((pred_match.group(), arg_match)) + +# -- prédicat pour chaque ARGi-of +for arg_match in re.finditer(arg_of_pattern, text): + print(arg_match) + pred_match = re.findall(pred_pattern, text[arg_match.end():]) + print(pred_match[0]) + result.append((pred_match[0], arg_match.group())) + +print("Result:") +for r in result: + print(r) \ No newline at end of file -- GitLab