From c974963db773bf3c08e8b492fe79c79dd54deb59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Lamercerie?=
 <aurelien.lamercerie@tetras-libre.fr>
Date: Tue, 23 Aug 2022 09:58:48 +0200
Subject: [PATCH] Dev: regular expression test and amr analyzer init

---
 .gitignore                        |   1 +
 inputData/test-amr-graph-1.penman |  18 +++++
 lib/amr_analyzer.py               | 108 ++++++++++++++++++++++++++++++
 lib/propbank_analyzer.py          |   2 +-
 lib/re_test.py                    |  57 ++++++++++++++++
 5 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 inputData/test-amr-graph-1.penman
 create mode 100644 lib/amr_analyzer.py
 create mode 100644 lib/re_test.py

diff --git a/.gitignore b/.gitignore
index 4ffefd07..ae30e025 100755
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 *-env/*
 lib/amrld/wk/*
+*__pycache__*
diff --git a/inputData/test-amr-graph-1.penman b/inputData/test-amr-graph-1.penman
new file mode 100644
index 00000000..ec5a85a1
--- /dev/null
+++ b/inputData/test-amr-graph-1.penman
@@ -0,0 +1,18 @@
+# ::id SSC-01-01
+# ::snt The Solar System is the gravitationally bound system of the Sun and the objects that orbit it, either directly or indirectly.
+(s / system
+      :domain (p / planet
+            :name (n / name
+                  :op1 "Solar"
+                  :op2 "System"))
+      :ARG1-of (b / bind-01
+            :ARG0 (g / gravitation))
+      :part (a / and
+            :op1 (s2 / sun)
+            :op2 (o / object
+                  :ARG0-of (o2 / orbit-01
+                        :ARG1 s2
+                        :manner (o3 / or
+                              :op1 (d / direct-02)
+                              :op2 (d2 / direct-02
+                                    :polarity -))))))
\ No newline at end of file
diff --git a/lib/amr_analyzer.py b/lib/amr_analyzer.py
new file mode 100644
index 00000000..81a8b5a5
--- /dev/null
+++ b/lib/amr_analyzer.py
@@ -0,0 +1,108 @@
+#!/usr/bin/python3.10
+# -*-coding:Utf-8 -*
+
+#==============================================================================
+# C.M. Tool: AMR Graph (penman) Analyzer
+#------------------------------------------------------------------------------
+# Module to analyze AMR Graph in penman format
+#==============================================================================
+
+#==============================================================================
+# Importing required modules
+#==============================================================================
+
+import sys
+import glob
+import re
+import propbank_analyzer
+
+from bs4 import BeautifulSoup
+
+
+#==============================================================================
+# Parameters
+#==============================================================================
+
+# Input/Output Directories
+INPUT_DIR = "../inputData/"
+OUTPUT_DIR = "../outputData/"
+
+# Data (regexes are raw strings: `\d`, `\(` are regex escapes, not string escapes)
+PROPBANK_FRAMES_DIR = "../propbankFrames/"
+PBF_DIGITS = 2
+AMR_PREDICATE_FORM = [r'[a-z]+-0\d']
+ROLE_SEARCH_RE = r'(?<=[a-z]+-0\d):ARG\d(?=[[a-z]+-0\d)|$]'
+AMR_CORE_ROLE_FORM = [r':ARG\d']
+PARENTHICAL_EXPRESSION = r'\((?>\((?<c>)|[^()]+|\)(?<-c>))*(?(c)(?!))\)'
+AMR_PREDICATE_SCOPE_FORM = ['(^())*']
+
+
+#==============================================================================
+# Functions to find AMR predicates and AMR core roles
+#==============================================================================
+
+def get_amr_predicate_list(amr_graph):
+    """Return every match of each AMR_PREDICATE_FORM pattern in amr_graph, in order."""
+    return [
+        predicate
+        for predicate_re in AMR_PREDICATE_FORM
+        for predicate in re.findall(predicate_re, amr_graph)]
+
+def get_parenthical_expression(amr_graph):
+    """Return the list of PARENTHICAL_EXPRESSION matches found in amr_graph."""
+    # NOTE(review): the pattern uses .NET balancing groups; `re` looks unable to compile it.
+    return list(re.findall(PARENTHICAL_EXPRESSION, amr_graph))
+
+def get_core_role_list_of_predicate(amr_graph, predicate):
+    """Return every core role (AMR_CORE_ROLE_FORM match, e.g. ':ARG1') in amr_graph.
+
+    NOTE: `predicate` is accepted but not used yet; roles are not scoped to it."""
+    return [role for role_re in AMR_CORE_ROLE_FORM
+            for role in re.findall(role_re, amr_graph)]
+
+    
+
+#==============================================================================
+# Main function
+#==============================================================================
+
+def main(amr_graph_file):
+    """Run a ROLE_SEARCH_RE trial between two banners; amr_graph_file is not used yet."""
+    print("\n" + "[CMT] AMR Graph Analyzer")
+    # NOTE(review): `re` rejects ROLE_SEARCH_RE (variable-width look-behind) -- confirm.
+    print(re.findall(ROLE_SEARCH_RE, 'test (d \ bind-01 :ARG1)'))
+    # -- Disabled draft: read the graph file, list predicates and parenthetical expressions
+    # amr_graph_file = INPUT_DIR + amr_graph_file
+    # print("-- Reading file " + amr_graph_file)
+    # with open(amr_graph_file, 'r') as f:
+    #     amr_graph = f.read()
+    # print("----- AMR Graph: \n" + amr_graph)
+
+    # print("-- Analyzing graph ")
+    # amr_predicate_list = get_amr_predicate_list(amr_graph)
+    # print("--- predicates found: ")
+    # if len(amr_predicate_list) > 0:
+    #     for p in amr_predicate_list:
+    #         print("----- " + p)
+    # else:
+    #     print("None")
+
+    # parenthical_expression_list = get_parenthical_expression(amr_graph)
+    # print("-- Parenthical expression found: ")
+    # if len(parenthical_expression_list) > 0:
+    #     for e in parenthical_expression_list:
+    #         print("----- " + e)
+    # else:
+    #     print("None")
+
+    # -- Ending print
+    print("\n" + "[SSC] Done")
+    
+
+if __name__ == "__main__":
+    # Exit with a usage message instead of an IndexError when no file is given.
+    if len(sys.argv) < 2:
+        sys.exit("Usage: amr_analyzer.py <amr_graph_file>")
+    main(sys.argv[1])
+    
+    
diff --git a/lib/propbank_analyzer.py b/lib/propbank_analyzer.py
index 9d4b29ae..52d8916d 100644
--- a/lib/propbank_analyzer.py
+++ b/lib/propbank_analyzer.py
@@ -2,7 +2,7 @@
 # -*-coding:Utf-8 -*
 
 #==============================================================================
-# C.M. Tool: prop
+# C.M. Tool: PropBank Frame Analyzer
 #------------------------------------------------------------------------------
 # Module to analyze PropBank frames
 #==============================================================================
diff --git a/lib/re_test.py b/lib/re_test.py
new file mode 100644
index 00000000..9c3c53f6
--- /dev/null
+++ b/lib/re_test.py
@@ -0,0 +1,57 @@
+import regex as re
+
+text = ''' # ::id SSC-01-01
+# ::snt The Solar System is the gravitationally bound system of the Sun and the objects that orbit it, either directly or indirectly.
+(s / system
+      :domain (p / planet
+            :name (n / name
+                  :op1 "Solar"
+                  :op2 "System"))
+      :ARG1-of (b / bind-01
+            :ARG0 (g / gravitation))
+      :part (a / and
+            :op1 (s2 / sun)
+            :op2 (o / object
+                  :ARG0-of (o2 / orbit-01
+                        :ARG1 s2
+                        :manner (o3 / or
+                              :op1 (d / direct-02)
+                              :op2 (d2 / direct-02
+                                    :polarity -))))))'''
+
+rx = re.compile(r'''
+    \([^.]*\) (*SKIP)(*FAIL)  # match anything in parentheses and "throw it away"
+    |                          # or
+    :ARG\d                     # match :ARGi
+    ''', re.VERBOSE)
+
+rx_2 = re.compile(r'''
+    \([^.]*\) (*SKIP)(*FAIL)  # match anything in parentheses and "throw it away"
+    |                          # or
+    :ARG\d-of                     # match :ARGi-of
+    ''', re.VERBOSE)
+
+# Raw strings, so `\d` reaches the regex engine instead of being an invalid string escape.
+pred_pattern = r'[a-z]+-0\d'
+arg_of_pattern = r':ARG\d-of'
+
+result = []
+
+# -- arguments for each predicate: every :ARGi that rx finds in the text after it
+for pred_match in re.finditer(pred_pattern, text):
+    print(pred_match)
+    arg_match_list = rx.findall(text[pred_match.end():])
+    print(arg_match_list)
+    result.extend((pred_match.group(), arg) for arg in arg_match_list)
+
+# -- predicate for each :ARGi-of: the first predicate found after the role
+for arg_match in re.finditer(arg_of_pattern, text):
+    print(arg_match)
+    next_pred = re.search(pred_pattern, text[arg_match.end():])
+    if next_pred is None:  # no predicate after this role: skip it, don't crash
+        continue
+    print(next_pred.group())
+    result.append((next_pred.group(), arg_match.group()))
+print("Result:")
+for r in result:
+    print(r)
\ No newline at end of file
-- 
GitLab