# unlizeXml.py

#!/bin/env python3

from lxml import etree, objectify
import requests
import click
import tempfile
import os
from subprocess import Popen, PIPE, STDOUT

def remove_namespace(doc):
    """Strip the namespace from every element tag of *doc*, in place.

    Args:
        doc: lxml element or element tree; it is walked with ``iter()``.

    Only nodes whose ``tag`` is a string are renamed. Other nodes that
    ``iter()`` may yield (processing instructions, entities, comments)
    carry a non-string ``tag`` that ``etree.QName`` cannot parse, so they
    are left untouched instead of aborting the walk.
    """
    # iter() is the supported replacement for the deprecated getiterator().
    for elem in doc.iter():
        if isinstance(elem.tag, str):
            elem.tag = etree.QName(elem.tag).localname

def unlize(text, lang, dry_run=False, timeout=120):
    """Convert *text* to UNL through the unl.ru ETAP web service.

    Args:
        text: Text to convert.
        lang: Value sent as the ``language`` form field.
        dry_run: When true, no request is sent and a placeholder string
            embedding *text* is returned instead.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the run forever.

    Returns:
        The response body starting at its first ``[`` (an empty string
        when the body contains none). If the request fails, a string of
        the form ``Error calling unl.ru : "<error>"`` is returned rather
        than raising, so one failed call does not abort the caller.
    """
    if dry_run:
        return 'I UNLized the following text : ###' + text + '###'

    url = "http://unl.ru/etap-cgi/etap-cgi-old/cgiunl.exe"

    data = {
        'DOMAIN': 'SPORT',
        'password': 'guest',
        'TAGERROR': 'NO',
        'username': 'UNL_guest',
        'conversion': 'true',
        'language': lang,
        'data': text,
        'outputmode': 'text',
        'coding': 'utf-8',
        'translate': 'Process',
    }

    try:
        r = requests.post(url, data=data, timeout=timeout)
    except requests.RequestException as e:
        return 'Error calling unl.ru : "{error}"'.format(error=e)

    # Remove garbage before first '['
    _, bracket, rest = r.text.partition('[')
    return bracket + rest


def nestedBody2Str(b):
    """Flatten the child nodes of *b* into a single string.

    Direct text children are stripped and appended as they are. For every
    other child, its descendant text nodes are stripped, empty ones are
    dropped and the rest are joined with ", "; that group is preceded by
    a space and followed by ". ". Finally the doubled punctuation this can
    produce (".,", "..", ",,", ";,") is collapsed.

    Args:
        b: Node exposing an lxml-style ``xpath()`` method.

    Returns:
        The flattened text.
    """
    parts = []
    for child in b.xpath('./node()'):
        # Text results are str (sub)classes; isinstance avoids comparing
        # against a private lxml class by exact type.
        if isinstance(child, str):
            parts.append(str(child).strip())
        else:
            stripped = (str(t).strip() for t in child.xpath('.//text()'))
            parts.append(" " + ', '.join(t for t in stripped if t) + ". ")

    result = ''.join(parts)
    # Order matters: each replacement works on the output of the previous.
    for doubled, single in (('.,', '.'), ('..', '.'), (',,', ','), (';,', ';')):
        result = result.replace(doubled, single)
    return result


def addSubElement(parent, tag, text):
    """Append a *tag* child to *parent* holding *text* as a CDATA section.

    CRLF line endings in *text* are turned into LF before it is stored.
    Returns the newly created child element.
    """
    child = etree.SubElement(parent, tag)
    unix_text = "\n".join(text.split("\r\n"))
    child.text = etree.CDATA(unix_text)
    return child


def unl2dot(text, path):
    """Render UNL *text* as an SVG graph.

    The text is written to a temporary file and handed to the jar at
    *path* (run with ``java -jar``), which is asked for ``dot`` output.
    The ``dot`` program then converts that file to SVG. Failures of either
    tool are reported on standard output together with the UNL input.

    Args:
        text: UNL source to render.
        path: Path of the jar passed to ``java -jar``.

    Returns:
        The SVG produced by ``dot`` as a string, or "" when ``dot`` exits
        with a non-zero status.

    Raises:
        OSError: If the ``java`` or ``dot`` executable cannot be started.
    """
    # Borrow a unique name from a temporary file: the file itself is
    # deleted when the with-block ends, only its name is reused for output.
    with tempfile.NamedTemporaryFile() as temp:
        out_dir, out_name = os.path.split(temp.name)

    with tempfile.NamedTemporaryFile(mode="w") as in_file:
        # Remove CRLF and flush output to avoid java errors
        in_file.write(text.replace("\r\n", "\n"))
        in_file.flush()

        # Run java parser (must happen while in_file still exists)
        cmd = ['java', '-jar', path,
               '--input-file', in_file.name,
               '--output-Dir', out_dir, '--output-file', out_name,
               '--output-type', 'dot']

        with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:
            # communicate() drains the pipe while waiting; a bare wait()
            # can deadlock once the child fills the pipe buffer.
            output, _ = p.communicate()
        if p.returncode != 0:
            print("Error in unl2rdf: \n\n" + output.decode(errors="replace"))
            print('UNL:')
            print(text)
            # dot is still attempted below, as before; if no .dot file was
            # written it fails and that failure is reported as well.

    # generate dot output
    fname = os.path.join(out_dir, out_name + '.dot')
    try:
        with Popen(['dot', '-Tsvg', fname], stdout=PIPE, stderr=PIPE) as p:
            svg, errors = p.communicate()
        if p.returncode == 0:
            return svg.decode()

        print("Error creating svg: \n\n" + errors.decode(errors="replace"))
        print('UNL:')
        print(text)
        try:
            with open(fname) as f:
                dot_source = f.read()
        except FileNotFoundError:
            pass
        else:
            print('DOT:')
            print(dot_source)
    finally:
        # Do not leave the intermediate .dot file behind, whatever happened.
        try:
            os.remove(fname)
        except FileNotFoundError:
            pass

    return ""


@click.command()
@click.argument('input', nargs=1,
                type=click.Path(dir_okay=False, exists=True))
@click.argument('output', nargs=1,
                type=click.Path(dir_okay=False, writable=True))
@click.option('--lang', default='en',
              type=click.Choice(['en', 'ru']))
@click.option('--dry-run/--no-dry-run', default=False,
              help='if true do not send request to unl.ru')
@click.option('--svg/--no-svg', default=True,
              help='Add svg node representing unl graph')
@click.option('--unltools-path', nargs=1,
              type=click.Path(dir_okay=False),
              default='unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar',
              help='Path of the unltools jar')
def unlizeXml(input, output, lang, dry_run, svg, unltools_path):
    """Add UNL conversions to the text elements of an XML file.

    Reads INPUT and, for every title, text_body, term and meaning element
    that has text, moves that text into an <orig> child, adds the UNL
    conversion as a <unl> child and, unless --no-svg or --dry-run is
    given, the rendered graph as an <svg> child. The resulting document
    is written to OUTPUT.
    """
    # Parameter names (including `input`) must match the click declarations.
    parser = etree.XMLParser(remove_comments=True)
    doc = objectify.parse(input, parser=parser)
    remove_namespace(doc)

    for tag in ('title', 'text_body', 'term', 'meaning'):
        for node in doc.xpath('//' + tag):
            if not node.text:
                continue
            # Statement order matters: the flattened string is built only
            # after <orig> has been appended and node.text cleared.
            addSubElement(node, 'orig', node.text)
            node.text = ""
            unl = addSubElement(node, 'unl',
                                unlize(nestedBody2Str(node), lang, dry_run))
            if svg and not dry_run:
                addSubElement(node, 'svg', unl2dot(unl.text, unltools_path))

    # Serialise as text and write it as UTF-8. The default ASCII
    # serialisation has to fall back on character references for
    # non-ASCII text, and those are not interpreted inside the CDATA
    # sections created above.
    with open(output, 'w', encoding='utf-8') as out:
        out.write(etree.tostring(doc, pretty_print=True, encoding='unicode'))


# Run the click command only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    unlizeXml()