From 8288ad8038cf11191afcc79166a9d6a68ab0776c Mon Sep 17 00:00:00 2001
From: David Beniamine <david.beniamine@tetras-libre.fr>
Date: Mon, 15 Jun 2020 15:48:19 +0200
Subject: [PATCH] Scripts to transform xml file to notebook

See #2
---
 scripts/unlizeToNotebook.py     |  53 +++++++++++++++++
 scripts/unlizeXmlNbSample.ipynb | 101 ++++++++++++++++++++++++++++++++
 2 files changed, 154 insertions(+)
 create mode 100755 scripts/unlizeToNotebook.py
 create mode 100644 scripts/unlizeXmlNbSample.ipynb

diff --git a/scripts/unlizeToNotebook.py b/scripts/unlizeToNotebook.py
new file mode 100755
index 0000000..ab314d0
--- /dev/null
+++ b/scripts/unlizeToNotebook.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+import nbformat as nbf
+from nbformat.v4 import new_code_cell
+import click
+from lxml import etree, objectify
+from unlizeXml import remove_namespace, unlize, nestedBody2Str
+
+
+@click.command()  # CLI: read INPUT xml, write a notebook to OUTPUT
+@click.argument('input', nargs=1,
+                type=click.Path(dir_okay=False, exists=True))
+@click.argument('output', nargs=1,
+                type=click.Path(dir_okay=False, writable=True))
+@click.option('--template', default='unlizeXmlNbSample.ipynb',
+              type=click.Path(dir_okay=False, exists=True))
+@click.option('--lang', default='en',
+              type=click.Choice(['en', 'ru']))
+@click.option('--dry-run/--no-dry-run', default=False,
+              help='if true do not send request to unl.ru')
+def unlizeXmlNb(input, output, template, lang, dry_run):
+    nb = nbf.read(template, 4)  # load the template notebook as nbformat v4
+
+    parser = etree.XMLParser(remove_comments=True)
+    doc = objectify.parse(input, parser=parser)
+    remove_namespace(doc)  # presumably lets the bare-tag XPath below match
+    tags = ['title', 'text_body', 'term', 'meaning']
+    for t in tags:
+        for node in doc.xpath('//'+t):
+            if node.text:
+                # One cell per non-empty node: its text + the unlize() output
+                unl = unlize(nestedBody2Str(node), lang, dry_run)
+                addCell(nb, node.text, unl)
+
+    with open(output, 'w') as f:  # NOTE(review): uses the locale encoding
+        nbf.write(nb, f)
+
+
+def addCell(nb, xml, unl):  # append a code cell embedding xml and unl to nb
+    code = """
+xmldata = \"\"\"
+{xml}
+\"\"\"
+unldata = \"\"\"
+{unl}
+\"\"\"
+displayUnl(unldata)
+""".format(xml=xml, unl=unl)  # NOTE(review): xml/unl are inserted unescaped
+    nb['cells'].append(new_code_cell(code))  # mutates nb in place
+
+
+if __name__ == '__main__':
+    unlizeXmlNb()
diff --git a/scripts/unlizeXmlNbSample.ipynb b/scripts/unlizeXmlNbSample.ipynb
new file mode 100644
index 0000000..49b0f6e
--- /dev/null
+++ b/scripts/unlizeXmlNbSample.ipynb
@@ -0,0 +1,101 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tempfile\n",
+    "import os\n",
+    "from subprocess import Popen, PIPE, STDOUT\n",
+    "from IPython.core.display import SVG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def unl2dot(text, path):\n",
+    "    with tempfile.NamedTemporaryFile() as temp:\n",
+    "        out_name = os.path.basename(temp.name)\n",
+    "        out_dir = os.path.dirname(temp.name)\n",
+    "\n",
+    "    with tempfile.NamedTemporaryFile(mode=\"w\") as in_file:\n",
+    "        # Remove CRLF and flush output to avoid java errors\n",
+    "        in_file.write(text.replace(\"\\r\\n\", \"\\n\"))\n",
+    "        in_file.flush()\n",
+    "\n",
+    "        # Run java parser\n",
+    "        cmd = ['java', '-jar', path,\n",
+    "               '--input-file', in_file.name,\n",
+    "               '--output-Dir', out_dir, '--output-file', out_name,\n",
+    "               '--output-type', 'dot']\n",
+    "\n",
+    "        with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:\n",
+    "            p.wait()\n",
+    "            p.stdout.flush()\n",
+    "            if p.returncode != 0:\n",
+    "                print(\"Error in unl2rdf: \\n\\n\"+p.stdout.read().decode())\n",
+    "                print('UNL:')\n",
+    "                print(text)\n",
+    "\n",
+    "    # generate dot output\n",
+    "    fname = '{}/{}.dot'.format(out_dir, out_name)\n",
+    "    cmd = ['dot', '-Tsvg', fname]\n",
+    "    with Popen(cmd, stdout=PIPE, stderr=PIPE) as p:\n",
+    "        p.wait()\n",
+    "        if p.returncode != 0:\n",
+    "            print(\"Error creating svg: \\n\\n\"+p.stderr.read().decode())\n",
+    "            print('UNL:')\n",
+    "            print(text)\n",
+    "            try:\n",
+    "                with open(fname) as f:\n",
+    "                    print('DOT:')\n",
+    "                    print(f.read())\n",
+    "            except FileNotFoundError:\n",
+    "                pass\n",
+    "        else:\n",
+    "            svg = p.stdout.read().decode()\n",
+    "            os.remove(fname)\n",
+    "            return svg\n",
+    "    return \"\"\n",
+    "\n",
+    "\n",
+    "def displayUnl(unldata) :\n",
+    "# We generate protoSVG because when there are several sentences, \n",
+    "# a string composed of several concatenated SVG is produced (not a valid SVG).\n",
+    "# We must then split the string to obtain several valid SVG to display.\n",
+    "    protoSvg = unl2dot(unldata, \"unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar\")\n",
+    "    sep = \"</svg>\\n\"\n",
+    "    svgArray = [x+sep for x in protoSvg.split(sep)]\n",
+    "    svgArray.pop()\n",
+    "    for svg in svgArray :\n",
+    "        display(SVG(svg))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
-- 
GitLab