diff --git a/inputData/test-amr-graph-1.penman b/inputData/test-amr-graph-1.penman index ec5a85a1808c81906f7bd917a8842ba30b41484e..523685788e7bf0425cbce72359ce05e32d42508b 100644 --- a/inputData/test-amr-graph-1.penman +++ b/inputData/test-amr-graph-1.penman @@ -1,5 +1,3 @@ -# ::id SSC-01-01 -# ::snt The Solar System is the gravitationally bound system of the Sun and the objects that orbit it, either directly or indirectly. (s / system :domain (p / planet :name (n / name @@ -15,4 +13,4 @@ :manner (o3 / or :op1 (d / direct-02) :op2 (d2 / direct-02 - :polarity -)))))) \ No newline at end of file + :polarity -)))))) diff --git a/lib/amr_analyzer.py b/lib/amr_analyzer.py index 81a8b5a5a9c9e6f90c02af792bc19dffead8b391..f14708ba9d3cd23899334f04fe150fd0ce2b83af 100644 --- a/lib/amr_analyzer.py +++ b/lib/amr_analyzer.py @@ -13,7 +13,7 @@ import sys import glob -import re +import regex as re import propbank_analyzer from bs4 import BeautifulSoup @@ -27,80 +27,262 @@ from bs4 import BeautifulSoup INPUT_DIR = "../inputData/" OUTPUT_DIR = "../outputData/" -# Data -PROPBANK_FRAMES_DIR = "../propbankFrames/" -PBF_DIGITS = 2 -AMR_PREDICATE_FORM = ['[a-z]+-0\d'] -ROLE_SEARCH_RE = '(?<=[a-z]+-0\d):ARG\d(?=[[a-z]+-0\d)|$]' -AMR_CORE_ROLE_FORM = [':ARG\d'] -PARENTHICAL_EXPRESSION = '\((?>\((?<c>)|[^()]+|\)(?<-c>))*(?(c)(?!))\)' -AMR_PREDICATE_SCOPE_FORM = ['(^())*'] +# Regular expressions for AMR graph analysis +AMR_PRED_RE = '[a-z]+-0\d' + +AMR_ARGOF_RE = ':ARG\d-of' + +AMR_ARG_RE = re.compile(r''' + \([^.]*\) (*SKIP)(*FAIL) # match anything in parentheses and "throw it away" + | # or + :ARG\d # match :ARGi + ''', re.VERBOSE) + + + +#============================================================================== +# Functions to find AMR predicate/argument relations +#============================================================================== + +def find_pred_arg_relations(graph, relation_list): + """ Find all direct predicat/argument relations in a graph + (argument as :ARGi), and add found relations in the input list. + """ + + for pred_match in re.finditer(AMR_PRED_RE, graph): + for arg_match in AMR_ARG_RE.finditer(graph[pred_match.end():]): + pred = pred_match.group() + arg = arg_match.group() + relation_list.append((pred, arg)) + + return relation_list + + +def find_argof_pred_relations(graph, relation_list): + """ Find all undirect predicat/argument relations in a graph + (argument as :ARGi-of), and add found relations in the input list. + """ + + for arg_match in re.finditer(AMR_ARGOF_RE, graph): + pred_match = re.findall(AMR_PRED_RE, graph[arg_match.end():]) + pred = pred_match[0] + arg = arg_match.group() + relation_list.append((pred, arg)) + + return relation_list + + +def find_all_pred_arg_relations(graph, relation_list): + """ Find all predicat/argument relations in a graph + (argument as :ARGi or :ARGi-of), and add found relations + in the input list. + """ + + relation_list = find_pred_arg_relations(graph, relation_list) + relation_list = find_argof_pred_relations(graph, relation_list) + + return relation_list + + +#============================================================================== +# Functions to update relation list with probbank roles (from propbank frames) +#============================================================================== + +def update_relation_list_with_propbank_role(old_relation_list): + + new_relation_list = [] + + for (pred, orig_arg) in old_relation_list: + + orig_role = orig_arg[0:5] + new_role = propbank_analyzer.find_pb_role(pred, orig_role) + + if new_role is not None: + + new_arg = orig_arg[0:5] + '-' + new_role + if len(orig_arg) >= 8: + new_arg += orig_arg[5:8] + + new_relation_list.append((pred, orig_arg, new_arg)) + + else: + print("*** relation (" + pred + ", " + orig_role + ") " + + "no found in PropBank frames ***") + + return new_relation_list #============================================================================== -# Functions to find AMR predicates and AMR core roles +# Functions to substitute arguments in AMR graph #============================================================================== -def get_amr_predicate_list(amr_graph): - amr_predicate_list = [] - for target_re in AMR_PREDICATE_FORM: - found_predicates = re.findall(target_re, amr_graph) - amr_predicate_list.extend(found_predicates) - return amr_predicate_list +def sub_betwenn_pos(text, start, end, new_str): + result = text[:start] + result += new_str + result += text[end:] + return result + -def get_parenthical_expression(amr_graph): - result_list = [] - result_list.extend(re.findall(PARENTHICAL_EXPRESSION, amr_graph)) - return result_list +def substitute_pred_arg_relations(graph, relation_list): + """ Substitute direct predicat/argument relations in a given AMR graph. + """ -def get_core_role_list_of_predicate(amr_graph, predicate): - amr_core_role_list = [] - for target_re in AMR_PREDICATE_FORM: - found_roles = re.findall(target_re, amr_graph) - amr_core_role_list.extend(found_roles) - return amr_core_role_list + for (pred, old_arg, new_arg) in relation_list: + + for pred_match in re.finditer(AMR_PRED_RE, graph): + + for arg_match in AMR_ARG_RE.finditer(graph[pred_match.end():]): + + start = pred_match.end() + arg_match.start() + end = pred_match.end() + arg_match.end() + + if ((pred == pred_match.group()) & + (arg_match.group() == old_arg)): + + graph = sub_betwenn_pos(graph, start, end, new_arg) + + return graph +def substitute_argof_pred_relations(graph, relation_list): + """ Substitute undirect predicat/argument relations in a given AMR graph. + """ + + for (pred, old_arg, new_arg) in relation_list: + + for arg_match in re.finditer(AMR_ARGOF_RE, graph): + + pred_match = re.findall(AMR_PRED_RE, graph[arg_match.end():]) + start = arg_match.start() + end = arg_match.end() + + if ((pred == pred_match[0]) & + (arg_match.group() == old_arg)): + + graph = sub_betwenn_pos(graph, start, end, new_arg) + + return graph + + +def substitute_all_pred_arg_relations(graph, relation_list): + """ Substitute all predicat/argument relations in a given AMR graph. + """ + + graph = substitute_pred_arg_relations(graph, relation_list) + graph = substitute_argof_pred_relations(graph, relation_list) + + return graph + #============================================================================== -# Main function +# Main Function(s) #============================================================================== -def main(amr_graph_file): +def enrich_amr_graph_with_propbank_role(graph): + """ + Enrich an AMR graph with PropBank roles. + + Parameters + ---------- + graph : STRING + AMR graph in PENMAN form. + + Returns + ------- + graph : STRING + AMR graph enriched with PropBank roles. - print("\n" + "[CMT] AMR Graph Analyzer") + """ - print(re.findall(ROLE_SEARCH_RE, 'test (d \ bind-01 :ARG1)')) + relation_list = [] - # amr_graph_file = INPUT_DIR + amr_graph_file - # print("-- Reading file " + amr_graph_file) - # with open(amr_graph_file, 'r') as f: - # amr_graph = f.read() - # print("----- AMR Graph: \n" + amr_graph) + relation_list = find_all_pred_arg_relations(graph, relation_list) + relation_list = update_relation_list_with_propbank_role(relation_list) - # print("-- Analyzing graph ") - # amr_predicate_list = get_amr_predicate_list(amr_graph) - # print("--- predicates found: ") - # if len(amr_predicate_list) > 0: - # for p in amr_predicate_list: - # print("----- " + p) - # else: - # print("None") + graph = substitute_all_pred_arg_relations(graph, relation_list) + + return graph + + +#============================================================================== +# *** Dev Test *** +#============================================================================== + +def dev_analyze(amr_graph_file): + + print("\n" + "[CMT-Dev] AMR Graph Analyzer") + + print("\n-- Start data") + amr_graph_file = INPUT_DIR + amr_graph_file + print("----- Reading file " + amr_graph_file) + with open(amr_graph_file, 'r') as f: + amr_graph_1 = f.read() + amr_graph_2 = ''.join(amr_graph_1) + print("----- AMR Graph 1: \n" + amr_graph_1) + print("----- AMR Graph 2: \n" + amr_graph_2) + rel_list_1 = [] + nb_relation_1 = len(rel_list_1) + print("----- Relation list 1 (init): " + str(rel_list_1)) + print("----- Number of relations in list 1: " + str(nb_relation_1)) + rel_list_2 = [] + nb_relation_2 = len(rel_list_2) + print("----- Relation list 2 (init): " + str(rel_list_2)) + print("----- Number of relations in list 2: " + str(nb_relation_2)) + + print("\n-- Finding AMR predicate/argument relations (step-by-step)") + rel_list_1 = find_pred_arg_relations(amr_graph_1, rel_list_1) + if len(rel_list_1) > nb_relation_1: + nb_relation_1 = len(rel_list_1) + print("----- some relations found ") + print("----- Relation list (update): " + str(rel_list_1)) + print("----- Number of relations in list 1: " + str(nb_relation_1)) + else: + print("----- no relation found ") + rel_list_1 = find_argof_pred_relations(amr_graph_1, rel_list_1) + if len(rel_list_1) > nb_relation_1: + nb_relation_1 = len(rel_list_1) + print("----- some relations found ") + print("----- Relation list (update): " + str(rel_list_1)) + print("----- Number of relations in list 1: " + str(nb_relation_1)) + else: + print("----- no relation found ") + + print("\n-- Finding AMR predicate/argument relations (all-in)") + rel_list_2 = find_all_pred_arg_relations(amr_graph_1, rel_list_2) + if len(rel_list_2) > nb_relation_2: + nb_relation_2 = len(rel_list_2) + print("----- some relations found ") + print("----- Relation list (update): " + str(rel_list_2)) + print("----- Number of relations: " + str(nb_relation_2)) + else: + print("----- no relation found ") + + print("\n-- Update relation list with probbank roles (from propbank frames)") + rel_list_3 = update_relation_list_with_propbank_role(rel_list_2) + nb_relation_3 = len(rel_list_3) + if nb_relation_3 >= nb_relation_2: + print("----- All relation update (good!)") + else: + print("----- Update imperfect") + print("----- Relation list (update): " + str(rel_list_3)) + print("----- Number of relations: " + str(nb_relation_3)) + + print("\n-- Enrich AMR graph with PropBank roles (step-by-step)") + amr_graph_1 = substitute_pred_arg_relations(amr_graph_1, rel_list_3) + print("----- AMR Graph 1 (update after step 1): \n" + amr_graph_1) + amr_graph_1 = substitute_argof_pred_relations(amr_graph_1, rel_list_3) + print("----- AMR Graph 1 (update after step 2): \n" + amr_graph_1) - # parenthical_expression_list = get_parenthical_expression(amr_graph) - # print("-- Parenthical expression found: ") - # if len(parenthical_expression_list) > 0: - # for e in parenthical_expression_list: - # print("----- " + e) - # else: - # print("None") + print("\n-- Enrich AMR graph with PropBank roles (main function)") + amr_graph_2 = enrich_amr_graph_with_propbank_role(amr_graph_2) + print("----- AMR Graph 2 (update): \n" + amr_graph_2) # -- Ending print print("\n" + "[SSC] Done") -if __name__ == "__main__": - main(sys.argv[1]) +def dev_test_1(): + dev_analyze('test-amr-graph-1.penman') diff --git a/lib/propbank_analyzer.py b/lib/propbank_analyzer.py index 4a04bedf8bb9de2f08431e3924dcb7de6a9df71e..bb6fbe35b89a0837e1f890eaf0c69905c4c2869d 100644 --- a/lib/propbank_analyzer.py +++ b/lib/propbank_analyzer.py @@ -187,7 +187,7 @@ def find_pb_role(amr_predicate, amr_role): def dev_analyze(amr_predicate, amr_role): - print("\n" + "[CMT] PropBank Frame Analyzer") + print("\n" + "[CMT-Dev] PropBank Frame Analyzer") # -- Analyze and adapt the target description print("-- Analyzing given data to specify the targetted data") diff --git a/lib/re_test.py b/lib/re_test.py deleted file mode 100644 index 73e4d186401aa993f745f47a8bd22420036db5a6..0000000000000000000000000000000000000000 --- a/lib/re_test.py +++ /dev/null @@ -1,181 +0,0 @@ -import regex as re -import propbank_analyzer as pba - -print("[DEV] Regular Expression Test") - -# -- Données de test -print("\n-- Données de test") - -GRAPH_INIT = ''' (s / system - :domain (p / planet - :name (n / name - :op1 "Solar" - :op2 "System")) - :ARG1-of (b / bind-01 - :ARG0 (g / gravitation)) - :part (a / and - :op1 (s2 / sun) - :op2 (o / object - :ARG0-of (o2 / orbit-01 - :ARG1 s2 - :manner (o3 / or - :op1 (d / direct-02) - :op2 (d2 / direct-02 - :polarity -))))))''' - -print("----- graphe AMR traité : " + GRAPH_INIT) - -substitutions = [] -substitutions.append(('bind-01', ':ARG0', ':ARG0-AGT')) -substitutions.append(('orbit-01', ':ARG1', ':ARG1-PPT')) -substitutions.append(('bind-01', ':ARG1-of', ':ARG1-PPT-of')) -substitutions.append(('orbit-01', ':ARG0-of', ':ARG0-GOL-of')) - -print("----- substitutions visées : " + str(substitutions)) - - -rx = re.compile(r''' - \([^.]*\) (*SKIP)(*FAIL) # match anything in parentheses and "throw it away" - | # or - :ARG\d # match :ARGi - ''', re.VERBOSE) - -rx_2 = re.compile(r''' - \([^.]*\) (*SKIP)(*FAIL) # match anything in parentheses and "throw it away" - | # or - :ARG\d-of # match :ARGi-of - ''', re.VERBOSE) - - -PRED_PATTERN = '[a-z]+-0\d' -ARGOF_PATTERN = ':ARG\d-of' - - -# -- Recherche des relations (predicat, argument) - -print("\n-- Recherche des relations (predicat, argument)") - -graph_1 = GRAPH_INIT - -pred_arg_relation_list = [] - - -# ----- argument pour chaque prédicat -def find_pred_arg_relations(graph, pred_arg_relation_list): - for pred_match in re.finditer(PRED_PATTERN, graph): - print("----- Match pour prédicat: " + str(pred_match)) - for arg_match in rx.finditer(graph[pred_match.end():]): - print("-------- Match pour argument de type ARGi: " + str(arg_match)) - arg_pos_start = pred_match.end() + arg_match.start() - arg_pos_end = pred_match.end() + arg_match.end() - pred_arg_relation_list.append((pred_match.group(), arg_match.group(), - arg_pos_start, arg_pos_end)) - return pred_arg_relation_list - -pred_arg_relation_list = find_pred_arg_relations(graph_1, pred_arg_relation_list) - -# ----- prédicat pour chaque ARGi-of -def find_argof_pred_relations(graph, pred_arg_relation_list): - for arg_match in re.finditer(ARGOF_PATTERN, graph): - print("----- Match pour argument de type ARGi-of: " + str(arg_match)) - pred_match = re.findall(PRED_PATTERN, graph[arg_match.end():]) - print("-------- Prédicat correspondant: " + pred_match[0]) - arg_pos_start = arg_match.start() - arg_pos_end = arg_match.end() - pred_arg_relation_list.append((pred_match[0], arg_match.group(), - arg_pos_start, arg_pos_end)) - return pred_arg_relation_list - -find_argof_pred_relations(graph_1, pred_arg_relation_list) - -print("----- Resultat (matchs trouvés) :") -for r in pred_arg_relation_list: - print(r) - - -# -- Substitution des arguments dans le graphe - -print("\n-- Substitution des arguments dans le graphe") - -graph_2 = GRAPH_INIT - -def sub_betwenn_pos(text, start, end, new_str): - result = text[:start] - result += new_str - result += text[end:] - return result - - -# ----- argument pour chaque prédicat -def sub_pred_arg_relations(graph, sub_list): - for (pred, old_arg, new_arg) in sub_list: - for pred_match in re.finditer(PRED_PATTERN, graph): - for arg_match in rx.finditer(graph[pred_match.end():]): - arg_pos_start = pred_match.end() + arg_match.start() - arg_pos_end = pred_match.end() + arg_match.end() - if (pred == pred_match.group()) & (arg_match.group() == old_arg): - print("----- substition de " + new_arg + - " sur le segment [" + str(arg_pos_start) + - ", " + str(arg_pos_end) + "]") - graph = sub_betwenn_pos(graph, - arg_pos_start, - arg_pos_end, - new_arg) - return graph - -graph_2 = sub_pred_arg_relations(graph_2, substitutions) - -# ----- prédicat pour chaque ARGi-of -def sub_argof_pred_relations(graph, sub_list): - for (pred, old_arg, new_arg) in sub_list: - for arg_match in re.finditer(ARGOF_PATTERN, graph): - pred_match = re.findall(PRED_PATTERN, graph[arg_match.end():]) - arg_pos_start = arg_match.start() - arg_pos_end = arg_match.end() - if (pred == pred_match[0]) & (arg_match.group() == old_arg): - print("----- substition de " + new_arg + - " sur le segment [" + str(arg_pos_start) + - ", " + str(arg_pos_end) + "]") - graph = sub_betwenn_pos(graph, - arg_pos_start, - arg_pos_end, - new_arg) - return graph - -graph_2 = sub_argof_pred_relations(graph_2, substitutions) - - -print("----- Résultat (graphe après substitutions) :" + graph_2) - -# -- Substitution des arguments dans le graphe - -print("\n-- Test avec l'analyseur des cadres ProbBank (pba)") - -graph_3 = GRAPH_INIT - -init_relations = pred_arg_relation_list - -substitutions_from_pb = [] - -for (pred, orig_arg, _, _) in init_relations: - orig_role = orig_arg[0:5] - print("----- find pb role for: " + pred + " and " + orig_role) - new_role = pba.find_pb_role(pred, orig_role) - if new_role is not None: - print("----- pb role found: " + new_role) - new_arg = orig_arg[0:5] + '-' + new_role - if len(orig_arg) >= 8: - new_arg += orig_arg[5:8] - print("----- substitution add: " + pred + - ", " + orig_arg + ", " + new_arg) - substitutions_from_pb.append((pred, orig_arg, new_arg)) - else: - print("----- pb role not found") - -print("----- origin relations: " + str(init_relations)) -print("----- substitutions list: " + str(substitutions_from_pb)) - -graph_3 = sub_pred_arg_relations(graph_3, substitutions_from_pb) -result_graph = sub_argof_pred_relations(graph_3, substitutions_from_pb) - -print("----- Result: " + result_graph) \ No newline at end of file