diff --git a/lib/propbank_analyzer.py b/lib/propbank_analyzer.py
index fcfe6426c1b9504df43a42ca33f3758832912e04..9d4b29aed49e749a7b83624b1e34c61cfcf9430e 100644
--- a/lib/propbank_analyzer.py
+++ b/lib/propbank_analyzer.py
@@ -13,6 +13,7 @@
 
 import sys
 import glob
+import re
 from bs4 import BeautifulSoup
 
 
@@ -28,7 +29,7 @@ OUTPUT_DIR = "../outputData/"
 # Data
 PROPBANK_FRAMES_DIR = "../propbankFrames/"
 PBF_DIGITS = 2
-
+AMR_CORE_ROLE_FORM = [r':ARG\d$', r'ARG\d$', r'\d$']  # raw strings: \d is a regex class
 
 
 #==============================================================================
@@ -64,7 +65,11 @@ def get_roleset_id_from_amr_predicate(amr_predicate):
 
 
 def get_number_from_amr_role(amr_role):
-    return 1
+    role_number = -1  # stays -1 when amr_role matches no core role format
+    for role_format in AMR_CORE_ROLE_FORM:
+        if re.match(role_format, amr_role):
+            role_number = int(amr_role[-1])  # every format ends with one digit
+    return role_number
 
 
 #==============================================================================
@@ -79,16 +84,16 @@ def find_frame_of_lemma(lemma):
     frame_filepath = glob.glob(target_file, recursive=True)
 
     if len(frame_filepath) >= 1:
-        is_found = True
         frame_filepath = frame_filepath[0]
         with open(frame_filepath, 'r') as f:
             xml_data = f.read()
         frame_data = BeautifulSoup(xml_data, 'xml')
     else:
-        is_found = False
         frame_filepath = ''
         frame_data = None
 
+    is_found = frame_data is not None
+
     return is_found, frame_filepath, frame_data
 
 
@@ -96,35 +101,36 @@ def find_frame_of_lemma(lemma):
 # Functions to analyze a frame data
 #==============================================================================
 
-def get_roleset_in_frame(frame_data, lemma, roleset_id):
-    """ Get a roleset in a given frame data
+def find_roleset_in_frame(frame_data, lemma, roleset_id):
+    """ Find the roleset corresponding to a lemma and an id in a frame data
     """
 
     try:
         lemma_data = frame_data.find('predicate', {'lemma':lemma})
         roleset_data = lemma_data.find('roleset', {'id':roleset_id})
-        is_found = True
 
     except:
+        lemma_data = None
         roleset_data = None
-        is_found = False
+
+    is_found = (lemma_data is not None) and (roleset_data is not None)
 
     return is_found, roleset_data
 
 
 
 
-def get_role_in_roleset(roleset_data, role_number):
+def find_role_in_roleset(roleset_data, role_number):
+    """ Find the role corresponding to a given number in a roleset data
+    """
 
     try:
         role_data = roleset_data.find('role', {'n':role_number})
-        is_found = True
 
     except:
-        roleset_data = None
-        is_found = False
-
-    return is_found, role_data
+        role_data = None
+    is_found = (role_data is not None)
+    return is_found, role_data
 
 
 
@@ -152,30 +158,37 @@ def main(amr_predicate, amr_role):
     if frame_found:
         print("----- frame xml file found: " + frame_filepath)
     else:
-        print("----- frame xml file not found")
+        print("----- frame xml file not found for lemma " + lemma)
+
+    if frame_found:
+        # -- Analyze frame data to get information
+        print("-- Analyzing frame data")
+        rs_found, rs_data = find_roleset_in_frame(frame_data, lemma, roleset_id)
+        nb_roles = -1
 
-    # -- Analyze frame data to get informations
-    print("-- Analyzing frame data")
-    rs_found, rs_data = get_roleset_in_frame(frame_data, lemma, roleset_id)
-    nb_roles = -1
-    if rs_found:
-        print("----- roleset id: " + rs_data.get('id'))
-        print("----- roleset name: " + rs_data.get('name'))
-        nb_roles = len(rs_data.find_all('role'))
-        print("----- number of roles: " + str(nb_roles))
-        for n in range(nb_roles):
-            _, role_data = get_role_in_roleset(rs_data, n)
-            print("----- role " + str(n) + ': ' + role_data.get('f') +
+        if rs_found:
+            print("----- roleset id: " + rs_data.get('id'))
+            print("----- roleset name: " + rs_data.get('name'))
+            nb_roles = len(rs_data.find_all('role'))
+            print("----- number of roles: " + str(nb_roles))
+            for role_data in rs_data.find_all('role'):  # only roles that exist
+                n = role_data.get('n')
+                print("----- role " + str(n) + ': ' + role_data.get('f') +
                   ', ' + role_data.get('descr'))
-    else:
-        print("----- roleset not found")
+        else:
+            print("----- roleset " + roleset_id + " not found")
 
-    # -- Analyze frame data to get informations
-    if rs_found & role_number in range(nb_roles):
-        print("-- Finding role")
-        _, role_data = get_role_in_roleset(rs_data, role_number)
-        print("----- role found: " + role_data.get('f') +
-              ', ' + role_data.get('descr'))
+        # -- Find the role matching the requested role number
+        if rs_found and role_number in range(nb_roles):
+            print("-- Finding role")
+            print("----- role number: " + str(role_number))
+            r_found, role_data = find_role_in_roleset(rs_data, role_number)
+            if r_found:
+                print("----- role " + str(role_number) + " found: " +
+                      role_data.get('f') +
+                      ', ' + role_data.get('descr'))
+            else:
+                print("----- role " + str(role_number) + " not found")
 
     # -- Ending print
     print("\n" + "[SSC] Done")
diff --git a/requirements.txt b/requirements.txt
index 28b538c75dd1e29dd0e0be5eb38a57a8ff62277e..3d9c233eb1eec6330f70da26a2abe00ea80cc98b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,5 @@ argparse
 numpy
 rdflib
 graphviz
+beautifulsoup4
+lxml