Scripts to transform xml file to notebook

See #2

Scripts to transform xml file to notebook
8288ad80 · David Beniamine · 464bf2dd · 8288ad80 · 8288ad80
Verified Commit 8288ad80 authored 5 years ago by David Beniamine
--- a/scripts/unlizeToNotebook.py
+++ b/scripts/unlizeToNotebook.py
+#!/bin/env python3
+import nbformat as nbf
+from nbformat.v4 import new_code_cell
+import click
+from lxml import etree, objectify
+from unlizeXml import remove_namespace, unlize, nestedBody2Str
+@click.command()
+@click.argument('input', nargs=1,
+                type=click.Path(dir_okay=False, exists=True))
+@click.argument('output', nargs=1,
+                type=click.Path(dir_okay=False, writable=True))
+@click.option('--template', default='unlizeXmlNbSample.ipynb',
+              type=click.Path(dir_okay=False, exists=True))
+@click.option('--lang', default='en',
+              type=click.Choice(['en', 'ru']))
+@click.option('--dry-run/--no-dry-run', default=False,
+              help='if true do not send request to unl.ru')
+def unlizeXmlNb(input, output, template, lang, dry_run):
+    nb = nbf.read(template, 4)
+    parser = etree.XMLParser(remove_comments=True)
+    doc = objectify.parse(input, parser=parser)
+    remove_namespace(doc)
+    tags = ['title', 'text_body', 'term', 'meaning']
+    for t in tags:
+        for node in doc.xpath('//'+t):
+            if node.text:
+                # Add
+                unl = unlize(nestedBody2Str(node), lang, dry_run)
+                addCell(nb, node.text, unl)
+    with open(output, 'w') as f:
+        nbf.write(nb, f)
+def addCell(nb, xml, unl):
+    code = """
+xmldata = \"\"\"
+{xml}
+\"\"\"
+unldata = \"\"\"
+{unl}
+\"\"\"
+displayUnl(unldata)
+""".format(xml=xml, unl=unl)
+    nb['cells'].append(new_code_cell(code))
+if __name__ == '__main__':
+    unlizeXmlNb()
--- a/scripts/unlizeXmlNbSample.ipynb
+++ b/scripts/unlizeXmlNbSample.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tempfile\n",
+    "import os\n",
+    "from subprocess import Popen, PIPE, STDOUT\n",
+    "from IPython.core.display import SVG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def unl2dot(text, path):\n",
+    "    with tempfile.NamedTemporaryFile() as temp:\n",
+    "        out_name = os.path.basename(temp.name)\n",
+    "        out_dir = os.path.dirname(temp.name)\n",
+    "\n",
+    "    with tempfile.NamedTemporaryFile(mode=\"w\") as in_file:\n",
+    "        # Remove CRLF and flush output to avoid java errors\n",
+    "        in_file.write(text.replace(\"\\r\\n\", \"\\n\"))\n",
+    "        in_file.flush()\n",
+    "\n",
+    "        # Run java parser\n",
+    "        cmd = ['java', '-jar', path,\n",
+    "               '--input-file', in_file.name,\n",
+    "               '--output-Dir', out_dir, '--output-file', out_name,\n",
+    "               '--output-type', 'dot']\n",
+    "\n",
+    "        with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:\n",
+    "            p.wait()\n",
+    "            p.stdout.flush()\n",
+    "            if p.returncode != 0:\n",
+    "                print(\"Error in unl2rdf: \\n\\n\"+p.stdout.read().decode())\n",
+    "                print('UNL;')\n",
+    "                print(text)\n",
+    "\n",
+    "    # generate dot output\n",
+    "    fname = '{}/{}.dot'.format(out_dir, out_name)\n",
+    "    cmd = ['dot', '-Tsvg', fname]\n",
+    "    with Popen(cmd, stdout=PIPE, stderr=PIPE) as p:\n",
+    "        p.wait()\n",
+    "        if p.returncode != 0:\n",
+    "            print(\"Error creating svg: \\n\\n\"+p.stderr.read().decode())\n",
+    "            print('UNL:')\n",
+    "            print(text)\n",
+    "            try:\n",
+    "                with open(fname) as f:\n",
+    "                    print('DOT:')\n",
+    "                    print(f.read())\n",
+    "            except FileNotFoundError:\n",
+    "                pass\n",
+    "        else:\n",
+    "            svg = p.stdout.read().decode()\n",
+    "            os.remove(fname)\n",
+    "            return svg\n",
+    "    return \"\"\n",
+    "\n",
+    "\n",
+    "def displayUnl(unldata) :\n",
+    "# We generate protoSVG because when there are several sentences, \n",
+    "# a string composed of several concatenated SVG is produced (not a valid SVG).\n",
+    "# We must then split the string to obtain several valid SVG to display.\n",
+    "    protoSvg = unl2dot(unldata, \"unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar\")\n",
+    "    sep = \"</svg>\\n\"\n",
+    "    svgArray = [x+sep for x in protoSvg.split(sep)]\n",
+    "    svgArray.pop()\n",
+    "    for svg in svgArray :\n",
+    "        display(SVG(svg))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:code id: tags:
+``` python
+import tempfile
+import os
+from subprocess import Popen, PIPE, STDOUT
+from IPython.core.display import SVG
+```
+%% Cell type:code id: tags:
+``` python
+def unl2dot(text, path):
+    with tempfile.NamedTemporaryFile() as temp:
+        out_name = os.path.basename(temp.name)
+        out_dir = os.path.dirname(temp.name)
+    with tempfile.NamedTemporaryFile(mode="w") as in_file:
+        # Remove CRLF and flush output to avoid java errors
+        in_file.write(text.replace("\r\n", "\n"))
+        in_file.flush()
+        # Run java parser
+        cmd = ['java', '-jar', path,
+               '--input-file', in_file.name,
+               '--output-Dir', out_dir, '--output-file', out_name,
+               '--output-type', 'dot']
+        with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:
+            p.wait()
+            p.stdout.flush()
+            if p.returncode != 0:
+                print("Error in unl2rdf: \n\n"+p.stdout.read().decode())
+                print('UNL;')
+                print(text)
+    # generate dot output
+    fname = '{}/{}.dot'.format(out_dir, out_name)
+    cmd = ['dot', '-Tsvg', fname]
+    with Popen(cmd, stdout=PIPE, stderr=PIPE) as p:
+        p.wait()
+        if p.returncode != 0:
+            print("Error creating svg: \n\n"+p.stderr.read().decode())
+            print('UNL:')
+            print(text)
+            try:
+                with open(fname) as f:
+                    print('DOT:')
+                    print(f.read())
+            except FileNotFoundError:
+                pass
+        else:
+            svg = p.stdout.read().decode()
+            os.remove(fname)
+            return svg
+    return ""
+def displayUnl(unldata) :
+# We generate protoSVG because whent there are several sentences,
+# a string composed of several concatenated SVG is produced (not a valid SVG).
+# We must then split the string to obtain several valid SVG to display.
+    protoSvg = unl2dot(unldata, "unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar")
+    sep = "</svg>\n"
+    svgArray = [x+sep for x in protoSvg.split(sep)]
+    svgArray.pop()
+    for svg in svgArray :
+        display(SVG(svg))
+```