From 8288ad8038cf11191afcc79166a9d6a68ab0776c Mon Sep 17 00:00:00 2001
From: David Beniamine <david.beniamine@tetras-libre.fr>
Date: Mon, 15 Jun 2020 15:48:19 +0200
Subject: [PATCH] Scripts to transform xml file to notebook

See #2
---
 scripts/unlizeToNotebook.py     |  70 ++++++++++++++++++++++
 scripts/unlizeXmlNbSample.ipynb | 111 ++++++++++++++++++++++++++++++++
 2 files changed, 181 insertions(+)
 create mode 100755 scripts/unlizeToNotebook.py
 create mode 100644 scripts/unlizeXmlNbSample.ipynb

diff --git a/scripts/unlizeToNotebook.py b/scripts/unlizeToNotebook.py
new file mode 100755
index 0000000..ab314d0
--- /dev/null
+++ b/scripts/unlizeToNotebook.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+"""Generate a Jupyter notebook from an XML file and its UNL version."""
+
+import nbformat as nbf
+from nbformat.v4 import new_code_cell
+import click
+from lxml import etree, objectify
+from unlizeXml import remove_namespace, unlize, nestedBody2Str
+
+
+@click.command()
+@click.argument('input', nargs=1,
+                type=click.Path(dir_okay=False, exists=True))
+@click.argument('output', nargs=1,
+                type=click.Path(dir_okay=False, writable=True))
+@click.option('--template', default='unlizeXmlNbSample.ipynb',
+              type=click.Path(dir_okay=False, exists=True))
+@click.option('--lang', default='en',
+              type=click.Choice(['en', 'ru']))
+@click.option('--dry-run/--no-dry-run', default=False,
+              help='if true do not send request to unl.ru')
+def unlizeXmlNb(input, output, template, lang, dry_run):
+    """Write to OUTPUT a copy of the template notebook extended with one
+    code cell per non-empty title, text_body, term or meaning node of INPUT.
+    """
+    # The template is read as a version 4 notebook. The generated cells call
+    # displayUnl(), so the template is expected to define it.
+    nb = nbf.read(template, 4)
+
+    parser = etree.XMLParser(remove_comments=True)
+    doc = objectify.parse(input, parser=parser)
+    remove_namespace(doc)
+    tags = ['title', 'text_body', 'term', 'meaning']
+    for t in tags:
+        for node in doc.xpath('//'+t):
+            if node.text:
+                # One cell per node: its text next to the unlize() result
+                unl = unlize(nestedBody2Str(node), lang, dry_run)
+                addCell(nb, node.text, unl)
+
+    # Always write UTF-8 instead of the locale encoding: the cells may hold
+    # non-ASCII text (e.g. with --lang ru).
+    with open(output, 'w', encoding='utf-8') as f:
+        nbf.write(nb, f)
+
+
+def _escape(text):
+    """Escape text for use inside a triple double-quoted string literal."""
+    # Without this, a backslash or a triple quote in the data would change
+    # or break the literal once the generated cell is executed.
+    return str(text).replace('\\', '\\\\').replace('"""', '\\"\\"\\"')
+
+
+def addCell(nb, xml, unl):
+    """Append to nb a code cell defining xmldata and unldata and calling
+    displayUnl(unldata)."""
+    code = """
+xmldata = \"\"\"
+{xml}
+\"\"\"
+unldata = \"\"\"
+{unl}
+\"\"\"
+displayUnl(unldata)
+""".format(xml=_escape(xml), unl=_escape(unl))
+    nb['cells'].append(new_code_cell(code))
+
+
+if __name__ == '__main__':
+    unlizeXmlNb()
diff --git a/scripts/unlizeXmlNbSample.ipynb b/scripts/unlizeXmlNbSample.ipynb
new file mode 100644
index 0000000..49b0f6e
--- /dev/null
+++ b/scripts/unlizeXmlNbSample.ipynb
@@ -0,0 +1,111 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tempfile\n",
+    "import os\n",
+    "from subprocess import Popen, PIPE, STDOUT\n",
+    "from IPython.core.display import SVG"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def unl2dot(text, path):\n",
+    "    \"\"\"Convert UNL text to SVG; return \"\" if the dot command fails.\n",
+    "\n",
+    "    text is converted to a dot file by the jar at path (run with\n",
+    "    java -jar), then dot renders that file as SVG. A failing command\n",
+    "    is reported with print().\n",
+    "    \"\"\"\n",
+    "    # temp only provides a unique name: the file itself is deleted at\n",
+    "    # the end of the with block\n",
+    "    with tempfile.NamedTemporaryFile() as temp:\n",
+    "        out_name = os.path.basename(temp.name)\n",
+    "        out_dir = os.path.dirname(temp.name)\n",
+    "\n",
+    "    with tempfile.NamedTemporaryFile(mode=\"w\") as in_file:\n",
+    "        # Remove CRLF and flush output to avoid java errors\n",
+    "        in_file.write(text.replace(\"\\r\\n\", \"\\n\"))\n",
+    "        in_file.flush()\n",
+    "\n",
+    "        # Run java parser\n",
+    "        cmd = ['java', '-jar', path,\n",
+    "               '--input-file', in_file.name,\n",
+    "               '--output-Dir', out_dir, '--output-file', out_name,\n",
+    "               '--output-type', 'dot']\n",
+    "\n",
+    "        with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:\n",
+    "            # communicate() drains the pipe while waiting; wait() alone\n",
+    "            # can deadlock once the pipe buffer is full\n",
+    "            out, _ = p.communicate()\n",
+    "            if p.returncode != 0:\n",
+    "                print(\"Error in unl2rdf: \\n\\n\"+out.decode())\n",
+    "                print('UNL;')\n",
+    "                print(text)\n",
+    "\n",
+    "    # generate dot output\n",
+    "    fname = '{}/{}.dot'.format(out_dir, out_name)\n",
+    "    cmd = ['dot', '-Tsvg', fname]\n",
+    "    with Popen(cmd, stdout=PIPE, stderr=PIPE) as p:\n",
+    "        # Same as above: read both pipes while waiting for dot\n",
+    "        out, err = p.communicate()\n",
+    "        if p.returncode != 0:\n",
+    "            print(\"Error creating svg: \\n\\n\"+err.decode())\n",
+    "            print('UNL:')\n",
+    "            print(text)\n",
+    "            try:\n",
+    "                with open(fname) as f:\n",
+    "                    print('DOT:')\n",
+    "                    print(f.read())\n",
+    "            except FileNotFoundError:\n",
+    "                pass\n",
+    "        else:\n",
+    "            svg = out.decode()\n",
+    "            os.remove(fname)\n",
+    "            return svg\n",
+    "    return \"\"\n",
+    "\n",
+    "\n",
+    "def displayUnl(unldata):\n",
+    "    \"\"\"Display every SVG document produced by unl2dot for unldata.\"\"\"\n",
+    "    # When there are several sentences, unl2dot returns several SVG\n",
+    "    # documents concatenated in one string, which is not a valid SVG.\n",
+    "    # Split it so that each document can be displayed on its own.\n",
+    "    protoSvg = unl2dot(unldata, \"unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar\")\n",
+    "    sep = \"</svg>\\n\"\n",
+    "    # The piece after the last separator is not a complete SVG: drop it\n",
+    "    for svg in protoSvg.split(sep)[:-1]:\n",
+    "        display(SVG(svg+sep))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
-- 
GitLab