#!/usr/bin/env python3
"""Annotate the text elements of an XML document with their UNL conversion.

For every ``title``, ``text_body``, ``term`` and ``meaning`` element of the
input document the script adds:

* an ``orig`` child holding the element's own leading text (when non-empty);
* a ``unl`` child holding the answer of the unl.ru conversion service;
* unless disabled, a ``dot`` child holding an SVG rendering of the UNL graph.

``unlize`` and ``nestedBody2Str`` were carried over from the notebook
``unlizeXML-snippets.ipynb``, which was removed when this script was added.
"""

import os
import tempfile
from subprocess import PIPE, STDOUT, Popen

import click
import requests
from lxml import etree

# Endpoint of the UNL conversion CGI.
UNL_URL = "http://unl.ru/etap-cgi/etap-cgi-old/cgiunl.exe"

# Seconds to wait for the conversion service before giving up.
UNL_TIMEOUT = 120

# Elements whose text gets converted, in processing order.
TAGS = ('title', 'text_body', 'term', 'meaning')


def unlize(text, lang, dry_run=False, timeout=UNL_TIMEOUT):
    """Convert *text* to UNL through the unl.ru web service.

    Args:
        text: The sentence(s) to convert.
        lang: Language code sent in the ``language`` form field.
        dry_run: When true, no request is sent and a placeholder string that
            embeds *text* is returned instead.
        timeout: Seconds to wait for the service; without a timeout
            ``requests`` would block forever on an unresponsive server.

    Returns:
        The response body starting at its first ``'['``, or ``''`` when the
        body contains no ``'['``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
    """
    if dry_run:
        return 'I UNLized the following text : ###' + text + '###'

    # Form fields posted to the CGI endpoint; values are kept exactly as in
    # the original request.
    data = {
        'DOMAIN': 'SPORT',
        'password': 'guest',
        'TAGERROR': 'NO',
        'username': 'UNL_guest',
        'conversion': 'true',
        'language': lang,
        'data': text,
        'outputmode': 'text',
        'coding': 'utf-8',
        'translate': 'Process',
    }

    response = requests.post(UNL_URL, data=data, timeout=timeout)
    # Fail loudly rather than parse an error page as if it were UNL.
    response.raise_for_status()

    # Remove garbage before first '['
    _, bracket, rest = response.text.partition('[')
    return bracket + rest


def nestedBody2Str(b):
    """Flatten the children of element *b* into a single string.

    Direct text nodes are stripped and appended as-is.  Every other child
    node contributes ``' '``, then its non-empty descendant texts (stripped)
    joined with ``', '``, then ``'. '``.
    """
    parts = []
    for child in b.xpath('./node()'):
        # XPath text results are ``str`` subclasses; everything else is an
        # element-like node that can be queried further.
        if isinstance(child, str):
            parts.append(str(child).strip())
        else:
            stripped = (str(t).strip() for t in child.xpath('.//text()'))
            parts.append(' ' + ', '.join(t for t in stripped if t) + '. ')
    return ''.join(parts)


def addSubElement(parent, tag, text):
    """Append a new *tag* child with content *text* to *parent*; return it."""
    sub = etree.SubElement(parent, tag)
    sub.text = text
    return sub


def _run(cmd, stderr):
    """Run *cmd* and return ``(returncode, stdout_bytes, stderr_bytes)``.

    ``communicate()`` drains the pipes while waiting for the child; calling
    ``wait()`` on a piped child can deadlock once the pipe buffer fills up.
    ``stderr_bytes`` is ``None`` when *stderr* is ``STDOUT``.
    """
    with Popen(cmd, stdout=PIPE, stderr=stderr) as proc:
        out, err = proc.communicate()
    return proc.returncode, out, err


def unl2dot(text, path):
    """Render the UNL graph *text* as SVG.

    The unltools jar at *path* converts the UNL text to a Graphviz file,
    which ``dot -Tsvg`` then renders.  All intermediate files live in a
    private temporary directory that is removed on every exit path.

    Args:
        text: UNL text, as returned by ``unlize``.
        path: Path of the unltools jar.

    Returns:
        The SVG document as a string, or ``''`` when rendering failed (the
        tool output and the inputs are then printed for diagnosis).
    """
    with tempfile.TemporaryDirectory() as work_dir:
        out_name = 'graph'
        in_path = os.path.join(work_dir, 'input.unl')
        # The jar is expected to write <output-Dir>/<output-file>.dot, as the
        # previous implementation of this function relied on.
        dot_path = os.path.join(work_dir, out_name + '.dot')

        # Remove CRLF to avoid java errors.
        # NOTE(review): written with the locale's default encoding, as
        # before -- confirm which encoding the jar expects.
        with open(in_path, 'w') as in_file:
            in_file.write(text.replace("\r\n", "\n"))

        # Run java parser
        cmd = ['java', '-jar', path,
               '--input-file', in_path,
               '--output-Dir', work_dir, '--output-file', out_name,
               '--output-type', 'dot']
        returncode, out, _ = _run(cmd, STDOUT)
        if returncode != 0:
            print("Error in unl2rdf: \n\n" + out.decode(errors='replace'))
            print('UNL:')
            print(text)
            # Deliberately no early return: the dot step below is still
            # attempted, as it was before.

        # generate dot output
        returncode, out, err = _run(['dot', '-Tsvg', dot_path], PIPE)
        if returncode == 0:
            return out.decode()

        print("Error creating svg: \n\n" + err.decode(errors='replace'))
        print('UNL:')
        print(text)
        try:
            with open(dot_path) as f:
                print('DOT:')
                print(f.read())
        except FileNotFoundError:
            # The jar produced nothing; there is no DOT source to show.
            pass

    return ""


@click.command()
@click.argument('input', nargs=1,
                type=click.Path(dir_okay=False, exists=True))
@click.argument('output', nargs=1,
                type=click.Path(dir_okay=False, writable=True))
@click.option('--lang', default='en',
              type=click.Choice(['en', 'ru']))
@click.option('--dry-run/--no-dry-run', default=False,
              help='if true do not send request to unl.ru')
@click.option('--svg/--no-svg', default=True,
              help='Add svg node representing unl graph')
@click.option('--unltools-path', nargs=1,
              type=click.Path(dir_okay=False),
              default='unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar',
              help='Path of the unltools jar')
def xml2unl(input, output, lang, dry_run, svg, unltools_path):
    """Add UNL conversions to the text elements of INPUT and write OUTPUT."""
    # Parameter names are dictated by the click argument/option names above.
    doc = etree.parse(input)

    for tag in TAGS:
        for node in doc.xpath('//' + tag):
            if node.text:
                addSubElement(node, 'orig', node.text)
                node.text = ""
            # NOTE(review): the leading text was just moved into <orig>, so
            # nestedBody2Str() picks it up through that child -- i.e. after
            # any pre-existing children and followed by '. '.  Kept as-is;
            # confirm this ordering is intended.
            unl_text = unlize(nestedBody2Str(node), lang, dry_run)
            addSubElement(node, 'unl', unl_text)
            if svg and not dry_run:
                addSubElement(node, 'dot', unl2dot(unl_text, unltools_path))

    # tostring() without an explicit encoding emits ASCII bytes, so the text
    # written here is independent of the locale's file encoding.
    with open(output, 'w') as out:
        out.write(etree.tostring(doc, pretty_print=True).decode('utf-8'))


if __name__ == '__main__':
    xml2unl()