Skip to content
Snippets Groups Projects
Verified Commit 8288ad80 authored by David Beniamine's avatar David Beniamine
Browse files

Scripts to transform xml file to notebook

See #2
parent 464bf2dd
Branches
No related tags found
No related merge requests found
Pipeline #200 passed
#!/bin/env python3
import nbformat as nbf
from nbformat.v4 import new_code_cell
import click
from lxml import etree, objectify
from unlizeXml import remove_namespace, unlize, nestedBody2Str
@click.command()
@click.argument('input', nargs=1,
                type=click.Path(dir_okay=False, exists=True))
@click.argument('output', nargs=1,
                type=click.Path(dir_okay=False, writable=True))
@click.option('--template', default='unlizeXmlNbSample.ipynb',
              type=click.Path(dir_okay=False, exists=True))
@click.option('--lang', default='en',
              type=click.Choice(['en', 'ru']))
@click.option('--dry-run/--no-dry-run', default=False,
              help='if true do not send request to unl.ru')
def unlizeXmlNb(input, output, template, lang, dry_run):
    """Build a notebook from the text nodes of an XML file.

    INPUT is the XML file to read and OUTPUT the notebook to write.
    The notebook given by --template is loaded, one code cell is appended
    for every non-empty title, text_body, term and meaning element, and
    the result is written to OUTPUT.
    """
    nb = nbf.read(template, 4)

    # Comments are dropped at parse time so they never show up as nodes.
    parser = etree.XMLParser(remove_comments=True)
    doc = objectify.parse(input, parser=parser)
    remove_namespace(doc)

    tags = ['title', 'text_body', 'term', 'meaning']
    for t in tags:
        for node in doc.xpath('//' + t):
            # Elements without leading text are skipped.
            if not node.text:
                continue
            # NOTE(review): the text passed to unlize() comes from
            # nestedBody2Str(node) while the cell shows node.text; confirm
            # this asymmetry is intended.
            unl = unlize(nestedBody2Str(node), lang, dry_run)
            addCell(nb, node.text, unl)

    # Notebooks are UTF-8 JSON; do not rely on the locale's default
    # encoding, which may be unable to encode non-ASCII (e.g. Russian) text.
    with open(output, 'w', encoding='utf-8') as f:
        nbf.write(nb, f)
def _escapeTripleQuoted(text):
    """Escape text so it can sit inside a triple-double-quoted literal."""
    # Backslashes first, otherwise the ones added for the quotes would be
    # escaped a second time.
    return str(text).replace('\\', '\\\\').replace('"""', '\\"\\"\\"')


def addCell(nb, xml, unl):
    """Append a code cell that defines xmldata/unldata and displays the UNL.

    Args:
        nb: notebook whose ``cells`` list receives the new cell.
        xml: source text stored in the cell as ``xmldata``.
        unl: UNL text stored in the cell as ``unldata`` and passed to
            ``displayUnl`` by the generated code.

    Both values are embedded in triple-quoted string literals, so
    backslashes and triple quotes are escaped; without that, such
    characters would change the value seen by the cell or break its syntax.
    """
    code = """
xmldata = \"\"\"
{xml}
\"\"\"
unldata = \"\"\"
{unl}
\"\"\"
displayUnl(unldata)
""".format(xml=_escapeTripleQuoted(xml), unl=_escapeTripleQuoted(unl))
    nb['cells'].append(new_code_cell(code))
# Run the click command only when executed as a script, not on import.
if "__main__" == __name__:
    unlizeXmlNb()
%% Cell type:code id: tags:
``` python
import tempfile
import os
from subprocess import Popen, PIPE, STDOUT
from IPython.core.display import SVG
```
%% Cell type:code id: tags:
``` python
def unl2dot(text, path):
    """Turn UNL text into SVG markup via the Java converter and Graphviz.

    Args:
        text: UNL source to convert.
        path: path of the jar handed to ``java -jar``.

    Returns:
        The standard output of ``dot -Tsvg`` decoded as a string, or ``""``
        when ``dot`` exits with a non-zero status. Diagnostics are printed
        for either tool that fails.
    """
    # The outer temporary file only reserves a unique base name; the jar is
    # expected to write "<out_dir>/<out_name>.dot" next to it.
    with tempfile.NamedTemporaryFile() as temp:
        out_name = os.path.basename(temp.name)
        out_dir = os.path.dirname(temp.name)
        fname = os.path.join(out_dir, out_name + '.dot')
        try:
            with tempfile.NamedTemporaryFile(mode="w") as in_file:
                # Remove CRLF and flush output to avoid java errors
                in_file.write(text.replace("\r\n", "\n"))
                in_file.flush()
                # Run java parser
                cmd = ['java', '-jar', path,
                       '--input-file', in_file.name,
                       '--output-Dir', out_dir, '--output-file', out_name,
                       '--output-type', 'dot']
                with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:
                    # communicate() drains the pipe while waiting; wait()
                    # alone can deadlock once the pipe buffer fills up.
                    java_out, _ = p.communicate()
                if p.returncode != 0:
                    print("Error in unl2rdf: \n\n" + java_out.decode())
                    print('UNL:')
                    print(text)

            # Render the dot file; this is attempted even after a java
            # error, as the dot file may still have been produced.
            cmd = ['dot', '-Tsvg', fname]
            with Popen(cmd, stdout=PIPE, stderr=PIPE) as p:
                dot_out, dot_err = p.communicate()
            if p.returncode == 0:
                return dot_out.decode()

            print("Error creating svg: \n\n" + dot_err.decode())
            print('UNL:')
            print(text)
            try:
                with open(fname) as f:
                    print('DOT:')
                    print(f.read())
            except FileNotFoundError:
                # No dot file was produced, so there is nothing to show.
                pass
            return ""
        finally:
            # Remove the intermediate dot file on success and on failure.
            try:
                os.remove(fname)
            except FileNotFoundError:
                pass
def displayUnl(unldata, jar="unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar"):
    """Display the graphs generated from UNL text as inline SVG images.

    Args:
        unldata: UNL text handed to ``unl2dot``.
        jar: path of the converter jar; defaults to the jar name this
            notebook has always used, so existing calls are unaffected.
    """
    # When there are several sentences, the converter output is several SVG
    # documents concatenated into one string, which is not a valid SVG.
    # Split it on the closing tag to get one valid SVG per item.
    protoSvg = unl2dot(unldata, jar)
    sep = "</svg>\n"
    # The last item of the split is whatever follows the final closing tag
    # (empty for well-formed output), so it is left out.
    for chunk in protoSvg.split(sep)[:-1]:
        # display() is not imported here; presumably the one IPython
        # provides in notebook sessions.
        display(SVG(chunk + sep))
```
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment