extract.py 6.71 KiB
    #!/usr/bin/python3.10
    # -*- coding: utf-8 -*-
    
    #==============================================================================
    # TENET: extract
    #------------------------------------------------------------------------------
    # Command to run the main extraction process
    #==============================================================================
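    # Example invocation (illustrative sketch; 'MyOntology' is a placeholder
    # target id, not a value taken from this repository):
    #   python3 extract.py --source_type amr --source_corpus samples/s1/ \
    #                      --target_id MyOntology --engine tenet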
    
    #==============================================================================
    # Importing required modules
    #==============================================================================
    
    import argparse, os, glob
    import logging.config
    from lib import config, structure
    from lib import shacl_extraction, tenet_extraction
    from rdflib import Graph
    
    
    #==============================================================================
    # Parameters
    #==============================================================================
    
    # Logging
    logging.config.fileConfig('logging.conf', disable_existing_loggers=False)
    logger = logging.getLogger('root')
    
    # Configuration
    CONFIG_FILE = "config.xml"
    
    # Default values
    DEFAULT_SOURCE_TYPE = 'amr' 
    DEFAULT_SOURCE_CORPUS = "samples/s1/"  # source corpus directory (with trailing slash)
    DEFAULT_TARGET_ID =  'DefaultTargetId'
    DEFAULT_ENGINE = 'tenet'
    
       
    #==============================================================================
    # Utilities
    #==============================================================================
    
    def control_arguments(): 
        arg_parser = argparse.ArgumentParser(
            description=("TENET - Tool for Extraction using Net Extension "
                         "by (semantic) Transduction"))
        arg_parser.add_argument("--source_type", nargs='?',
                                default=DEFAULT_SOURCE_TYPE,
                                help="source_type: amr or unl")   
        arg_parser.add_argument("--source_corpus",
                                default=DEFAULT_SOURCE_CORPUS,
                                help="source_corpus: name of the source corpus directory with slash")  
        arg_parser.add_argument("--target_id",
                                default=DEFAULT_TARGET_ID,
                                help="target_id: id for the target ontology")    
        arg_parser.add_argument("--engine",
                                default=DEFAULT_ENGINE,
                                help="engine: shacl, tenet or new")  
        args = arg_parser.parse_args()
        return args
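    # For instance (sketch based on the defaults above), running
    #   python3 extract.py --engine shacl
    # yields args.engine == 'shacl', with args.source_type == 'amr',
    # args.source_corpus == 'samples/s1/' and args.target_id == 'DefaultTargetId'.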
    
    
        
       
    #==============================================================================
    # Steps
    #==============================================================================
    
    def set_config(args):
        
        logger.info("-- Process Setting ")
        logger.info("----- Corpus source: {0} ({1})".format(args.source_corpus, 
                                                            args.source_type))
        logger.info("----- Ontology target (id): {0}".format(args.target_id))
        logger.debug("----- Current path: {0}".format(os.getcwd()))
        logger.debug("----- Config file: {0}".format(CONFIG_FILE))
        
        process_config = config.Config(CONFIG_FILE,
                                       args.target_id,
                                       args.source_corpus  # , target_ontology
                                       )
        process_config.source_type = args.source_type
        # config.output_ontology_namespace = target_ontology_namespace
        
        process_config.engine = args.engine
        
        logger.debug(process_config.get_full_config())
        
        return process_config
    
    
    def init_process(config):
        
        logger.info("-- Creating output target directory: " + config.output_dir)
        os.makedirs(config.output_dir, exist_ok=True)
        
        logger.debug("-- Counting number of graph files (sentences) ")
        sentence_count = len(glob.glob(config.source_sentence_file, recursive=True))
        logger.debug("----- Graph count: {0}".format(sentence_count))
        
        
    def run_shacl_extraction(config):
        logger.debug("-- Process level: document")    
        work_graph = structure.prepare_work_graph_at_document_level(config)
        shacl_extraction.apply(config, work_graph)
            
        
    def run_tenet_extraction(config):
         
        if config.process_level == 'sentence':
            logger.debug("-- Process level: sentence")  
                    
            sentence_dir = config.source_sentence_file
            sentence_count = 0
            result_triple_list = []
            for sentence_file in glob.glob(sentence_dir, recursive = True):
                sentence_count += 1
                config.sentence_output_dir = '-' + str(sentence_count)
                logger.info("     *** sentence {0} *** ".format(sentence_count))
                os.makedirs(config.sentence_output_dir, exist_ok=True)
                work_graph = structure.prepare_sentence_work(config, sentence_file)
                # Run the extraction engine on this sentence's work graph
                _, new_triple_list = tenet_extraction.apply(config, work_graph)
                result_triple_list.extend(new_triple_list)
             
            logger.info(' === Final Ontology Generation  === ')   
            config.sentence_output_dir = ''
            logger.info("-- Making complete factoid graph by merging sentence factoid graphs")
            factoid_graph = Graph()
            for new_triple in result_triple_list:
                factoid_graph.add(new_triple) 
            logger.info("----- Total factoid number: " + str(len(new_triple_list)))    
            uuid_str = config.uuid_str
            base_ref = "http://" + uuid_str + '/' + 'factoid'
            logger.info("----- Graph base: {0}".format(base_ref)) 
            factoid_file = config.output_file.replace('.ttl', '_factoid.ttl')
            logger.info("-- Serializing graph to factoid file ({0})".format(factoid_file))
            factoid_graph.serialize(destination=factoid_file, 
                                    base=base_ref, 
                                    format='turtle')
            
        else: # config.process_level == 'document'
            logger.debug("-- Process level: document")  
            work_graph = structure.prepare_document_work(config)
            shacl_extraction.apply(config, work_graph)
        
    
    #==============================================================================
    # Main processing
    #==============================================================================
    
    
    def run(args):
        
        logger.info('[TENET] Extraction Processing')    
        
        # -- Process Initialization
        logger.info(' === Process Initialization === ')
        config = set_config(args)
        init_process(config)
        
        # -- Extraction Processing using TENET Engine    
        if config.engine == "shacl":
            logger.info(' === Extraction Processing using SHACL Engine === ')
            run_shacl_extraction(config)
        else:  # config.engine == "tenet" (or "new")
            logger.info(' === Extraction Processing using New TENET Engine === ')
            run_tenet_extraction(config)
            
        logger.info(' === Done === ')
    
    
    if __name__ == '__main__':
        args = control_arguments()
        run(args)