xml2unl.py

Fix #1

xml2unl.py
9c649b08 · David Beniamine · 2727cac3 · 2727cac3 · 9c649b08
Verified Commit 9c649b08 authored 5 years ago by David Beniamine
--- a/unlizeXML-snippets.ipynb
+++ b/unlizeXML-snippets.ipynb
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from lxml import etree"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def unlize (text):\n",
-    "    return ('I UNLized the following text : ###'+text+'###')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 53,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def nestedBody2Str (b) :\n",
-    "# Takes a node and return the children text nodes\n",
-    "# Nested texts are separated by commas\n",
-    "    children = b.xpath('./node()')\n",
-    "    result = ''\n",
-    "    for child in children :\n",
-    "        if type(child) == etree._ElementUnicodeResult :\n",
-    "            result += str(child).strip()\n",
-    "        else :\n",
-    "            result += \" \"\n",
-    "            nested = child.xpath('.//text()')\n",
-    "            nestedStriped = [str(i).strip() for i in nested]\n",
-    "            nestedFiltered = filter( lambda s: not (s == ''), nestedStriped)\n",
-    "            result += ', '.join(nestedFiltered)\n",
-    "            result += \". \"\n",
-    "    return (result)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 57,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "filename = 'exemple_2007-ertms.xml'\n",
-    "\n",
-    "doc = etree.parse(filename)\n",
-    "bodies = doc.xpath('//text_body')\n",
-    "for b in bodies:\n",
-    "    #textList = b.xpath('.//')\n",
-    "    parent = b.xpath('../node()')\n",
-    "    unl_node = etree.Element('unl_body')\n",
-    "    unl_node.text=unlize (nestedBody2Str(b))\n",
-    "    parent.append(unl_node)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python3 (dev venv)",
-   "language": "python",
-   "name": "dev"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.3"
-  },
-  "widgets": {
-   "application/vnd.jupyter.widget-state+json": {
-    "state": {},
-    "version_major": 2,
-    "version_minor": 0
-   }
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
-%% Cell type:code id: tags:
-``` python
-from lxml import etree
-```
-%% Cell type:code id: tags:
-``` python
-def unlize (text):
-    return ('I UNLized the following text : ###'+text+'###')
-```
-%% Cell type:code id: tags:
-``` python
-def nestedBody2Str (b) :
-# Takes a node and return the children text nodes
-# Nested texts are separated by commas
-    children = b.xpath('./node()')
-    result = ''
-    for child in children :
-        if type(child) == etree._ElementUnicodeResult :
-            result += str(child).strip()
-        else :
-            result += " "
-            nested = child.xpath('.//text()')
-            nestedStriped = [str(i).strip() for i in nested]
-            nestedFiltered = filter( lambda s: not (s == ''), nestedStriped)
-            result += ', '.join(nestedFiltered)
-            result += ". "
-    return (result)
-```
-%% Cell type:code id: tags:
-``` python
-filename = 'exemple_2007-ertms.xml'
-doc = etree.parse(filename)
-bodies = doc.xpath('//text_body')
-for b in bodies:
-    #textList = b.xpath('.//')
-    parent = b.xpath('../node()')
-    unl_node = etree.Element('unl_body')
-    unl_node.text=unlize (nestedBody2Str(b))
-    parent.append(unl_node)
-```
-%% Cell type:code id: tags:
-``` python
-```
--- a/xml2unl.py
+++ b/xml2unl.py
+#!/bin/env python3
+from lxml import etree
+import requests
+import click
+import tempfile
+import os
+from subprocess import Popen, PIPE, STDOUT
+def unlize(text, lang, dry_run=False, timeout=300):
+    """Convert *text* to UNL using the ETAP web service at unl.ru.
+
+    Args:
+        text: Text to convert; sent as the ``data`` form field.
+        lang: Language code; sent as the ``language`` form field.
+        dry_run: If true, skip the network call and return a placeholder
+            string that embeds *text*.
+        timeout: Seconds to wait for the server, passed to
+            ``requests.post``.
+
+    Returns:
+        The response body starting at its first ``'['``, or an empty
+        string when the response contains no ``'['``.
+
+    Raises:
+        requests.RequestException: On connection failure, timeout, or a
+            4xx/5xx HTTP status.
+    """
+    if dry_run:
+        return 'I UNLized the following text : ###' + text + '###'
+    url = "http://unl.ru/etap-cgi/etap-cgi-old/cgiunl.exe"
+    data = {
+        'DOMAIN': 'SPORT',
+        'password': 'guest',
+        'TAGERROR': 'NO',
+        'username': 'UNL_guest',
+        'conversion': 'true',
+        'language': lang,
+        'data': text,
+        'outputmode': 'text',
+        'coding': 'utf-8',
+        'translate': 'Process',
+    }
+    # Without a timeout, requests waits forever on a stalled server.
+    r = requests.post(url, data=data, timeout=timeout)
+    # Fail loudly rather than storing an HTTP error page as UNL.
+    r.raise_for_status()
+    # Remove garbage before first '['
+    _, bracket, rest = r.text.partition('[')
+    return bracket + rest
+def nestedBody2Str(b):
+    """Flatten the direct children of node *b* into a single string.
+
+    Text nodes directly under *b* are stripped and appended as they are.
+    For every other child, the non-empty stripped text fragments found
+    anywhere below it are joined with ', ', preceded by a space and
+    followed by '. '.
+
+    Args:
+        b: A node exposing an lxml-style ``xpath`` method.
+
+    Returns:
+        The concatenated string ('' when *b* has no children).
+    """
+    parts = []
+    for child in b.xpath('./node()'):
+        # XPath text results are str subclasses, so isinstance covers them
+        # (and plain str) without comparing against a private lxml class.
+        if isinstance(child, str):
+            parts.append(child.strip())
+        else:
+            fragments = (str(t).strip() for t in child.xpath('.//text()'))
+            nested = ', '.join(f for f in fragments if f)
+            parts.append(" " + nested + ". ")
+    return ''.join(parts)
+def addSubElement(parent, tag, text):
+    """Append a new *tag* child holding *text* to *parent* and return it."""
+    element = etree.SubElement(parent, tag)
+    element.text = text
+    return element
+def unl2dot(text, path):
+    """Render a UNL document as an SVG graph.
+
+    The UNL *text* is converted to Graphviz dot by the jar at *path*
+    (run through ``java -jar``), then to SVG by the ``dot`` executable.
+    A non-zero exit status of either tool is reported on stdout.
+
+    Args:
+        text: UNL source to render.
+        path: Path of the unltools jar.
+
+    Returns:
+        The SVG markup as a string, or "" when ``dot`` fails.
+    """
+    # A private scratch directory gives the jar a collision-free output
+    # location and removes the dot file on every exit path.
+    with tempfile.TemporaryDirectory() as out_dir:
+        out_name = 'unl_graph'
+        with tempfile.NamedTemporaryFile(mode="w") as in_file:
+            # Remove CRLF and flush output to avoid java errors
+            in_file.write(text.replace("\r\n", "\n"))
+            in_file.flush()
+            # Run java parser
+            cmd = ['java', '-jar', path,
+                   '--input-file', in_file.name,
+                   '--output-Dir', out_dir, '--output-file', out_name,
+                   '--output-type', 'dot']
+            with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:
+                # communicate() drains the pipe while waiting; wait() alone
+                # can deadlock once the child fills the pipe buffer.
+                java_log, _ = p.communicate()
+            if p.returncode != 0:
+                print("Error in unl2rdf: \n\n"+java_log.decode())
+                print('UNL:')
+                print(text)
+        # generate dot output
+        fname = os.path.join(out_dir, out_name + '.dot')
+        cmd = ['dot', '-Tsvg', fname]
+        with Popen(cmd, stdout=PIPE, stderr=PIPE) as p:
+            svg, dot_err = p.communicate()
+        if p.returncode == 0:
+            return svg.decode()
+        print("Error creating svg: \n\n"+dot_err.decode())
+        print('UNL:')
+        print(text)
+        # Show the intermediate dot source when the jar produced one.
+        try:
+            with open(fname) as f:
+                print('DOT:')
+                print(f.read())
+        except FileNotFoundError:
+            pass
+    return ""
+# Command-line entry point: parse the INPUT XML file, attach UNL
+# conversions to selected elements and write the result to OUTPUT.
+# Documented with comments rather than a docstring so that the text
+# click shows for --help stays unchanged.
+@click.command()
+@click.argument('input', nargs=1,
+                type=click.Path(dir_okay=False, exists=True))
+@click.argument('output', nargs=1,
+                type=click.Path(dir_okay=False, writable=True))
+@click.option('--lang', default='en',
+              type=click.Choice(['en', 'ru']))
+@click.option('--dry-run/--no-dry-run', default=False,
+              help='if true do not send request to unl.ru')
+@click.option('--svg/--no-svg', default=True,
+              help='Add svg node representing unl graph')
+@click.option('--unltools-path', nargs=1,
+              type=click.Path(dir_okay=False),
+              default='unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar',
+              help='Path of the unltools jar')
+def xml2unl(input, output, lang, dry_run, svg, unltools_path):
+    doc = etree.parse(input)
+    # Names of the elements whose content is converted.
+    tags = ['title', 'text_body', 'term', 'meaning']
+    for t in tags:
+        for node in doc.xpath('//'+t):
+            # Elements whose leading text is None or "" are skipped.
+            if node.text:
+                # Keep the leading text in an <orig> child, then clear it.
+                addSubElement(node, 'orig', node.text)
+                node.text = ""
+                # NOTE(review): nestedBody2Str runs after the two lines
+                # above, so the leading text reaches it through the new
+                # <orig> child (appended after any existing children)
+                # rather than as a text node -- confirm this ordering is
+                # intended for elements that already contain children.
+                unl = addSubElement(node, 'unl', unlize(nestedBody2Str(node), lang, dry_run))
+                # The SVG goes into a <dot> child; skipped on dry runs,
+                # where unlize returns a placeholder instead of UNL.
+                if(svg and not dry_run):
+                    addSubElement(node, 'dot', unl2dot(unl.text, unltools_path))
+    # tostring() returns bytes; decode them for the text-mode file.
+    with open(output, 'w') as out:
+        out.write(etree.tostring(doc, pretty_print=True).decode('utf-8'))
+# Run the click command only when this file is executed as a script.
+if __name__ == '__main__':
+    xml2unl()