Select Git revision
WindowTopMenuButton.js
unlizeXml.py 4.76 KiB
#!/bin/env python3
from lxml import etree, objectify
import requests
import click
import tempfile
import os
from subprocess import Popen, PIPE, STDOUT
def remove_namespace(doc):
    """Strip the XML namespace from every element tag of *doc*, in place.

    Each element tag is replaced by its local name, so ``{uri}title``
    becomes ``title``.

    Args:
        doc: lxml element or element tree to rewrite; it is modified in
            place and nothing is returned.
    """
    # ``getiterator()`` is deprecated in lxml; ``iter()`` is its replacement.
    for elem in doc.iter():
        # Comments, processing instructions and entities expose a factory
        # function as ``tag`` instead of a string; there is nothing to strip.
        if not isinstance(elem.tag, str):
            continue
        elem.tag = etree.QName(elem.tag).localname
def unlize(text, lang, dry_run=False, timeout=120):
    """Convert *text* to UNL by posting it to the unl.ru ETAP web service.

    Args:
        text: the text to convert.
        lang: language code sent to the service as its ``language`` field.
        dry_run: when true, the service is not contacted and a placeholder
            string embedding *text* is returned instead.
        timeout: seconds ``requests`` waits for the server to accept the
            connection or to send data before giving up.

    Returns:
        The response body starting at its first ``[`` (an empty string when
        the body contains none), or a message starting with
        ``Error calling unl.ru`` when the request fails or the server
        answers with an HTTP error status.
    """
    if dry_run:
        return 'I UNLized the following text : ###' + text + '###'

    url = "http://unl.ru/etap-cgi/etap-cgi-old/cgiunl.exe"
    data = {
        'DOMAIN': 'SPORT',
        'password': 'guest',
        'TAGERROR': 'NO',
        'username': 'UNL_guest',
        'conversion': 'true',
        'language': lang,
        'data': text,
        'outputmode': 'text',
        'coding': 'utf-8',
        'translate': 'Process',
    }
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        r = requests.post(url, data, timeout=timeout)
        # Report HTTP errors instead of parsing an error page as UNL.
        r.raise_for_status()
    except requests.RequestException as e:
        return 'Error calling unl.ru : "{error}"'.format(error=e)

    # Remove garbage before first '['
    _, bracket, rest = r.text.partition('[')
    return bracket + rest
def nestedBody2Str(b):
    """Flatten the content of node *b* into a single string.

    Direct text children are stripped and appended as they are. For every
    other child, all of its descendant text nodes are stripped, the empty
    ones dropped, and the rest joined with ``", "``; that group is prefixed
    with a space and closed with ``". "``. Finally the punctuation pairs
    ``.,``, ``..``, ``,,`` and ``;,`` are collapsed to their first character.

    Args:
        b: node exposing an lxml-style ``xpath()`` method.

    Returns:
        The flattened text.
    """
    parts = []
    for child in b.xpath('./node()'):
        # XPath text results are ``str`` subclasses; testing against ``str``
        # avoids depending on lxml's private result class.
        if isinstance(child, str):
            parts.append(str(child).strip())
            continue
        stripped = (str(t).strip() for t in child.xpath('.//text()'))
        parts.append(' ' + ', '.join(t for t in stripped if t) + '. ')

    result = ''.join(parts)
    # Applied one after another, in this order, each on the previous result.
    for pair, single in (('.,', '.'), ('..', '.'), (',,', ','), (';,', ';')):
        result = result.replace(pair, single)
    return result
def addSubElement(parent, tag, text):
    """Append a child element *tag* to *parent* holding *text*.

    CRLF line endings in *text* are converted to LF. The text is stored as
    a CDATA section unless it contains ``]]>``, in which case it is stored
    as ordinary text.

    Args:
        parent: element that receives the new child.
        tag: tag name of the new child.
        text: string content of the new child.

    Returns:
        The newly created child element.
    """
    sub = etree.SubElement(parent, tag)
    content = text.replace("\r\n", "\n")
    # A CDATA section cannot contain its own terminator and lxml raises
    # ValueError for such text, so fall back to regular (escaped) text.
    sub.text = content if ']]>' in content else etree.CDATA(content)
    return sub
def unl2dot(text, path):
    """Render UNL *text* as an SVG graph via the unltools jar and Graphviz.

    The text is written to a temporary file, ``java -jar <path>`` is asked
    to turn it into a ``.dot`` file, and that file is rendered with
    ``dot -Tsvg``. When either command exits with a non-zero status, its
    output and the UNL input are printed to standard output.

    Args:
        text: UNL source to render.
        path: path of the unltools jar passed to ``java -jar``.

    Returns:
        The SVG document as a string, or ``""`` when ``dot`` fails.
    """
    with tempfile.NamedTemporaryFile() as temp:
        # This temporary file only reserves a unique name: the converter is
        # asked to write to that name and the result is read from
        # "<name>.dot" in the same directory.
        out_dir, out_name = os.path.split(temp.name)
        fname = os.path.join(out_dir, out_name + '.dot')
        try:
            with tempfile.NamedTemporaryFile(mode="w") as in_file:
                # Remove CRLF and flush output to avoid java errors
                in_file.write(text.replace("\r\n", "\n"))
                in_file.flush()
                # Run java parser
                cmd = ['java', '-jar', path,
                       '--input-file', in_file.name,
                       '--output-Dir', out_dir, '--output-file', out_name,
                       '--output-type', 'dot']
                # communicate() drains the pipe while waiting; wait() alone
                # can deadlock once the child fills the pipe buffer.
                with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:
                    output, _ = p.communicate()
                if p.returncode != 0:
                    print("Error in unl2rdf: \n\n" + output.decode())
                    print('UNL:')
                    print(text)

            # generate dot output
            cmd = ['dot', '-Tsvg', fname]
            with Popen(cmd, stdout=PIPE, stderr=PIPE) as p:
                svg, errors = p.communicate()
            if p.returncode == 0:
                return svg.decode()

            print("Error creating svg: \n\n" + errors.decode())
            print('UNL:')
            print(text)
            try:
                with open(fname) as f:
                    dot_source = f.read()
            except FileNotFoundError:
                # The converter produced no .dot file; nothing more to show.
                pass
            else:
                print('DOT:')
                print(dot_source)
        finally:
            # Drop the intermediate .dot file on success and failure alike.
            try:
                os.remove(fname)
            except FileNotFoundError:
                pass
    return ""
# Command-line entry point. Reads the XML file INPUT, adds 'orig', 'unl' and
# optionally 'svg' child elements to every 'title', 'text_body', 'term' and
# 'meaning' element, and writes the resulting document to OUTPUT.
@click.command()
# INPUT must be an existing file.
@click.argument('input', nargs=1,
type=click.Path(dir_okay=False, exists=True))
# OUTPUT must be a writable file path (not a directory).
@click.argument('output', nargs=1,
type=click.Path(dir_okay=False, writable=True))
# Language forwarded to unlize(); restricted to 'en' or 'ru'.
@click.option('--lang', default='en',
type=click.Choice(['en', 'ru']))
@click.option('--dry-run/--no-dry-run', default=False,
help='if true do not send request to unl.ru')
@click.option('--svg/--no-svg', default=True,
help='Add svg node representing unl graph')
@click.option('--unltools-path', nargs=1,
type=click.Path(dir_okay=False),
default='unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar',
help='Path of the unltools jar')
def unlizeXml(input, output, lang, dry_run, svg, unltools_path):
# Parse the input, discarding XML comments.
parser = etree.XMLParser(remove_comments=True)
doc = objectify.parse(input, parser=parser)
# Reduce every tag to its local name; the XPath queries below use bare names.
remove_namespace(doc)
tags = ['title', 'text_body', 'term', 'meaning']
for t in tags:
for node in doc.xpath('//'+t):
# Move the node's own leading text into an 'orig' child and clear it.
if node.text:
addSubElement(node, 'orig', node.text)
node.text = ""
# NOTE(review): indentation was lost in this copy of the file, so it is not
# visible whether the 'unl'/'svg' steps below run only when node.text is
# non-empty or for every matched node -- confirm against the original.
# The flattened text of the node is sent through unlize() and the result is
# stored in a 'unl' child.
unl = addSubElement(node, 'unl', unlize(nestedBody2Str(node), lang, dry_run))
# The graph is rendered only for real (non dry-run) conversions.
if(svg and not dry_run):
addSubElement(node, 'svg', unl2dot(unl.text, unltools_path))
# Serialise the modified document, pretty-printed, to the output file.
with open(output, 'w') as out:
out.write(etree.tostring(doc, pretty_print=True).decode('utf-8'))
if __name__ == '__main__':
    # Invoke the click command only when this file is run as a script,
    # not when it is imported as a module.
    unlizeXml()