# File: lib/propbank_analyzer.py

import sys
import glob
import re

#==============================================================================
# Parameters
#==============================================================================

OUTPUT_DIR = "../outputData/"

# Data
PROPBANK_FRAMES_DIR = "../propbankFrames/"
PBF_DIGITS = 2  # width of the zero-padded roleset number in a roleset id


#==============================================================================
# Functions to analyze and adapt the target description
#==============================================================================

def itemize_amr_predicate(amr_predicate):
    """ Split an AMR predicate into its lemma and its roleset number

    Only a trailing '-<digits>' suffix is treated as the roleset number, so
    lemmas that themselves contain hyphens are kept whole. When there is no
    numeric suffix, the whole string is the lemma and the number is 1.

    Returns a (lemma, roleset_number) tuple.
    """

    head, sep, tail = amr_predicate.rpartition('-')
    # isdecimal() accepts exactly the strings that int() can convert
    if sep and head and tail.isdecimal():
        return head, int(tail)
    return amr_predicate, 1


def get_lemma_from_amr_predicate(amr_predicate):
    """ Return the lemma part of an AMR predicate
    """

    lemma, _ = itemize_amr_predicate(amr_predicate)
    return lemma


def get_role_ref_from_amr_predicate(amr_predicate):
    """ Return the roleset number of an AMR predicate as a string,
        left-padded with zeros to PBF_DIGITS characters
    """

    _, roleset_number = itemize_amr_predicate(amr_predicate)
    return str(roleset_number).zfill(PBF_DIGITS)


def get_roleset_id_from_amr_predicate(amr_predicate):
    """ Build the roleset id '<lemma>.<padded number>' of an AMR predicate
    """

    lemma = get_lemma_from_amr_predicate(amr_predicate)
    roleset_ref = get_role_ref_from_amr_predicate(amr_predicate)
    return lemma + '.' + roleset_ref


def get_number_from_amr_role(amr_role):
    """ Extract the argument number from an AMR role

    The number is read from an 'ARG<digits>' pattern anywhere in the role
    (case-insensitive). When the role carries no such pattern, 1 is returned,
    which is the value this function previously returned for every input.
    """

    match = re.search(r'ARG(\d+)', amr_role, re.IGNORECASE)
    if match is None:
        return 1
    return int(match.group(1))


#==============================================================================
# Functions to find the XML description corresponding to a roleset
#==============================================================================

def find_frame_of_lemma(lemma):
    """ Find the Frame XML data corresponding to a given lemma

    Looks for '<lemma>.xml' in PROPBANK_FRAMES_DIR and parses the first match.

    Returns an (is_found, frame_filepath, frame_data) tuple; when no file
    matches, it is (False, '', None).
    """

    target_file = PROPBANK_FRAMES_DIR + lemma + '.xml'
    candidates = glob.glob(target_file, recursive=True)
    if not candidates:
        return False, '', None

    # Imported here so that the helpers above, and the "not found" path,
    # remain usable when the XML parsing stack is not installed.
    from bs4 import BeautifulSoup

    frame_filepath = candidates[0]
    # Read raw bytes and let the parser detect the encoding, instead of
    # decoding with the platform's locale encoding.
    with open(frame_filepath, 'rb') as f:
        xml_data = f.read()
    frame_data = BeautifulSoup(xml_data, 'xml')

    return True, frame_filepath, frame_data


#==============================================================================
# Functions to analyze a frame data
#==============================================================================

def get_roleset_in_frame(frame_data, lemma, roleset_id):
    """ Get a roleset in a given frame data

    Returns an (is_found, roleset_data) tuple. is_found is False, and
    roleset_data is None, when frame_data is None, when the frame has no
    'predicate' element for the lemma, or when that predicate has no
    'roleset' element with the given id.
    """

    if frame_data is None:
        return False, None

    lemma_data = frame_data.find('predicate', {'lemma': lemma})
    if lemma_data is None:
        return False, None

    roleset_data = lemma_data.find('roleset', {'id': roleset_id})
    return roleset_data is not None, roleset_data


def get_role_in_roleset(roleset_data, role_number):
    """ Get the role having a given number in a given roleset

    Returns an (is_found, role_data) tuple; (False, None) when roleset_data
    is None or when it has no 'role' element whose 'n' attribute matches.
    """

    if roleset_data is None:
        return False, None

    # XML attribute values are strings, so compare against the string form
    role_data = roleset_data.find('role', {'n': str(role_number)})
    return role_data is not None, role_data


def _describe_role(role_data):
    """ Format the 'f' and 'descr' attributes of a role for printing
    """

    # str() keeps the print working when an attribute is absent
    return str(role_data.get('f')) + ', ' + str(role_data.get('descr'))


#==============================================================================
# Main function
#==============================================================================

def main(amr_predicate, amr_role):
    """ Print the PropBank roleset and role matching an AMR predicate and role
    """

    print("\n" + "[CMT] PropBank Frame Analyzer")

    # -- Analyze and adapt the target description
    print("-- Analyzing given data to specify the targetted data")
    print("----- given data: " + amr_predicate + ', ' + amr_role)
    lemma = get_lemma_from_amr_predicate(amr_predicate)
    print("----- lemma: " + lemma)
    roleset_id = get_roleset_id_from_amr_predicate(amr_predicate)
    print("----- roleset id: " + roleset_id)
    role_number = get_number_from_amr_role(amr_role)
    print("----- role number: " + str(role_number))

    # -- Find the Frame XML data corresponding to a given lemma
    print("-- Finding frame data")
    frame_found, frame_filepath, frame_data = find_frame_of_lemma(lemma)
    if frame_found:
        print("----- frame xml file found: " + frame_filepath)
    else:
        print("----- frame xml file not found")

    # -- Analyze frame data to get information about the roleset
    print("-- Analyzing frame data")
    rs_found, rs_data = get_roleset_in_frame(frame_data, lemma, roleset_id)
    if rs_found:
        print("----- roleset id: " + str(rs_data.get('id')))
        print("----- roleset name: " + str(rs_data.get('name')))
        roles = rs_data.find_all('role')
        print("----- number of roles: " + str(len(roles)))
        # Iterate over the role elements themselves: their 'n' attributes
        # are not guaranteed to be the consecutive integers 0..len-1.
        for role_data in roles:
            print("----- role " + str(role_data.get('n')) + ': '
                  + _describe_role(role_data))
    else:
        print("----- roleset not found")

    # -- Find the requested role in the roleset
    if rs_found:
        print("-- Finding role")
        role_found, role_data = get_role_in_roleset(rs_data, role_number)
        if role_found:
            print("----- role found: " + _describe_role(role_data))
        else:
            print("----- role not found")

    # -- Ending print
    print("\n" + "[SSC] Done")


if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: " + sys.argv[0] + " <amr_predicate> <amr_role>")
    main(sys.argv[1], sys.argv[2])