#!/usr/bin/env python3
"""Print the RDF produced by unl2rdf for selected elements of an XML file."""

import click
from lxml import etree, objectify
from unlizeXml import remove_namespace, unlize, nestedBody2Str, unl2rdf


@click.command()
@click.argument('input', nargs=1,
                type=click.Path(dir_okay=False, exists=True))
@click.argument('output', nargs=1,
                type=click.Path(dir_okay=False, writable=True))
@click.option('--lang', default='en',
              type=click.Choice(['en', 'ru']))
@click.option('--dry-run/--no-dry-run', default=False,
              help='if true do not send request to unl.ru')
def unlizeXmlRdf(input, output, lang, dry_run):
    """Print the RDF translation of the text elements found in INPUT.

    Every ``title``, ``text_body``, ``term`` and ``meaning`` element is
    visited. An element that has text of its own is passed through
    ``unlize`` first; otherwise the text of its ``unl`` sub-element (if any)
    is used. The result of ``unl2rdf`` is printed for each element.

    The parameter names (including ``input``, which shadows the builtin) are
    fixed by the click argument declarations above.
    """
    # TODO(review): OUTPUT is accepted and validated but nothing is written
    # to it yet; results only go to stdout.
    parser = etree.XMLParser(remove_comments=True)
    doc = objectify.parse(input, parser=parser)
    remove_namespace(doc)

    tags = ['title', 'text_body', 'term', 'meaning']
    for t in tags:
        for node in doc.xpath('//' + t):
            # The parent's @id used to be looked up here and never used; the
            # lookup raised IndexError for parents without an id, so it is
            # gone.
            if node.text:
                unl = unlize(nestedBody2Str(node), lang, dry_run)
            else:
                unl = getText(node, 'unl')
            print(unl2rdf(unl))


def getText(node, tag):
    """Return the text of the first match of XPath *tag* relative to *node*.

    Returns an empty string when nothing matches or when the matched element
    has no text, so callers always receive a ``str``.
    """
    try:
        return node.xpath(tag)[0].text or ''
    except IndexError:
        return ''


if __name__ == '__main__':
    unlizeXmlRdf()
def unl2rdf(text, path='./unl-tools-main-0.9.jar'):
    """Run the unl-tools jar on *text* with ``--output-type rdf``.

    The text is written to a temporary input file and ``java -jar path`` is
    invoked on it. The tool is pointed at a private temporary directory for
    its output file, and that directory is removed before returning, whether
    the run succeeded or not.

    Args:
        text: UNL source to convert. CRLF line endings are rewritten to LF
            before the text is handed to java.
        path: Location of the unl-tools jar.

    Returns:
        The decoded output captured from the java process (stdout and stderr
        merged) when it exits with status 0. An empty string when *text* is
        empty/None or when the process fails; in the failure case the process
        output, the UNL input and any generated ``.ttl`` content are printed
        as diagnostics.
    """
    if not text:
        # Nothing to convert: avoid spawning a JVM (and a crash on None).
        return ""

    out_name = 'unl2rdf'
    with tempfile.TemporaryDirectory() as out_dir, \
            tempfile.NamedTemporaryFile(mode="w") as in_file:
        # Remove CRLF and flush output to avoid java errors
        in_file.write(text.replace("\r\n", "\n"))
        in_file.flush()

        # Run java parser
        cmd = ['java', '-jar', path,
               '--input-file', in_file.name,
               '--output-Dir', out_dir, '--output-file', out_name,
               '--output-type', 'rdf']
        # File the tool is expected to write inside out_dir
        fname = os.path.join(out_dir, out_name + '.ttl')

        with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:
            # communicate() drains the pipe while waiting; wait() followed by
            # read() can deadlock once the child fills the pipe buffer.
            raw, _ = p.communicate()
        captured = raw.decode()

        if p.returncode == 0:
            # NOTE(review): the success path returns the captured process
            # output, not the contents of the .ttl file -- confirm that the
            # jar prints the RDF on stdout.
            return captured

        print("Error in unl2rdf: \n\n" + captured)
        print('UNL:')
        print(text)
        try:
            with open(fname) as f:
                print('RDF:')
                print(f.read())
        except FileNotFoundError:
            # The tool failed before producing any output file.
            pass
    return ""
lang, dry_run)) - if(svg and not dry_run): - addSubElement(node, 'svg', unl2dot(unl.text, unltools_path)) + if output_type=='dot': + addSubElement(node, 'orig', node.text) + node.text = "" + unl = addSubElement(node, 'unl', unlize(nestedBody2Str(node), lang, dry_run)) + if(svg and not dry_run): + addSubElement(node, 'svg', unl2dot(unl.text, unltools_path)) + elif output_type=='rdf': + unl2rdf(unl.text, unltools_path) with open(output, 'w') as out: out.write(etree.tostring(doc, pretty_print=True).decode('utf-8')) diff --git a/scripts/unlizeXmlNbSample.ipynb b/scripts/unlizeXmlNbSample.ipynb index 9dea92044f3c413a5ff7ffcfda213dfc89963c4e..f3752cd99f25032a6959986c8a734a9280e84aee 100644 --- a/scripts/unlizeXmlNbSample.ipynb +++ b/scripts/unlizeXmlNbSample.ipynb @@ -118,7 +118,7 @@ " # Keep one of the two lines below depending if you want to use a local jar or a webservice for unltools\n", " try:\n", " svg = unl2dotWeb(unl)\n", - " #svg = unl2dot(unl, \"unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar\")\n", + " #svg = unl2dot(unl, \"unl-tools-main-0.9.jar\")\n", " display(HTML(insert_in_html_template(svg)))\n", " #return(insert_in_html_template(svg))\n", " except Exception as e :\n",