Skip to content
Snippets Groups Projects
Verified Commit 9c649b08 authored by David Beniamine's avatar David Beniamine
Browse files

xml2unl.py

Fix #1
parent 2727cac3
Branches
Tags
No related merge requests found
%% Cell type:code id: tags:
``` python
from lxml import etree
```
%% Cell type:code id: tags:
``` python
def unlize(text):
    """Return a placeholder "UNLized" rendering of *text*.

    No conversion takes place: the input is only wrapped in a fixed
    marker message.
    """
    prefix = 'I UNLized the following text : ###'
    suffix = '###'
    return ''.join((prefix, text, suffix))
```
%% Cell type:code id: tags:
``` python
def nestedBody2Str(b):
    """Flatten the child nodes of *b* into one string.

    Direct text children are stripped and appended unchanged. Every other
    child contributes a space, its non-empty descendant texts joined by
    ", ", and a closing ". ".

    Args:
        b: node exposing ``xpath`` (an lxml element in this notebook).

    Returns:
        The concatenated text; "" when *b* has no child nodes.
    """
    parts = []
    for child in b.xpath('./node()'):
        # isinstance is the idiomatic type test and keeps working should
        # the text-result class ever be subclassed.
        if isinstance(child, etree._ElementUnicodeResult):
            parts.append(str(child).strip())
            continue
        stripped = (str(t).strip() for t in child.xpath('.//text()'))
        # Nested texts are separated by commas; blank ones are dropped.
        parts.append(" " + ', '.join(t for t in stripped if t) + ". ")
    return ''.join(parts)
```
%% Cell type:code id: tags:
``` python
# Attach a <unl_body> element next to every <text_body> of the sample file.
filename = 'exemple_2007-ertms.xml'
doc = etree.parse(filename)
bodies = doc.xpath('//text_body')
for b in bodies:
    # getparent() yields the actual parent element. Appending to the list
    # returned by an XPath query such as '../node()' would only grow that
    # Python list and leave the document untouched.
    parent = b.getparent()
    if parent is None:
        # A root <text_body> has no parent to receive the new element.
        continue
    unl_node = etree.Element('unl_body')
    unl_node.text = unlize(nestedBody2Str(b))
    parent.append(unl_node)
```
%% Cell type:code id: tags:
``` python
```
#!/bin/env python3
from lxml import etree
import requests
import click
import tempfile
import os
from subprocess import Popen, PIPE, STDOUT
def unlize(text, lang, dry_run=False):
    """Convert *text* to UNL through the unl.ru CGI endpoint.

    Args:
        text: text to convert.
        lang: value sent as the ``language`` form field.
        dry_run: when true, no request is sent and a placeholder string
            wrapping *text* is returned instead.

    Returns:
        The response body starting at its first '[' ("" when the response
        contains no '['), or the placeholder string in dry-run mode.
    """
    if dry_run:
        return ''.join(('I UNLized the following text : ###', text, '###'))

    form = {
        'DOMAIN': 'SPORT',
        'password': 'guest',
        'TAGERROR': 'NO',
        'username': 'UNL_guest',
        'conversion': 'true',
        'language': lang,
        'data': text,
        'outputmode': 'text',
        'coding': 'utf-8',
        'translate': 'Process'
    }
    response = requests.post(
        "http://unl.ru/etap-cgi/etap-cgi-old/cgiunl.exe", form)
    # Drop whatever precedes the first '[' and keep the bracket itself.
    _, bracket, remainder = response.text.partition('[')
    return bracket + remainder
def nestedBody2Str(b):
    """Build a single string from the child nodes of *b*.

    Text nodes directly under *b* are stripped and appended as they are.
    Any other child adds a space, then its non-empty descendant texts
    separated by ", ", then ". ".

    Args:
        b: node exposing ``xpath`` (an lxml element in this script).

    Returns:
        The assembled string; "" when *b* has no child nodes.
    """
    chunks = []
    for child in b.xpath('./node()'):
        # Use isinstance rather than comparing type objects: it is the
        # idiomatic check and also accepts subclasses of the result type.
        if isinstance(child, etree._ElementUnicodeResult):
            chunks.append(str(child).strip())
        else:
            texts = [str(t).strip() for t in child.xpath('.//text()')]
            # Whitespace-only texts would only add stray commas.
            chunks.append(" " + ', '.join(t for t in texts if t) + ". ")
    # One join instead of repeated string concatenation in the loop.
    return ''.join(chunks)
def addSubElement(parent, tag, text):
    """Append a new *tag* element holding *text* to *parent*.

    Returns:
        The newly created child element.
    """
    child = etree.SubElement(parent, tag)
    child.text = text
    return child
def unl2dot(text, path):
    """Render UNL text as an SVG graph.

    The text is first passed to the unltools jar (``java -jar path``) with
    ``--output-type dot``; the resulting ``.dot`` file is then rendered with
    Graphviz ``dot -Tsvg``.

    Args:
        text: UNL source to convert.
        path: Path of the unltools jar.

    Returns:
        The SVG document emitted by ``dot`` as a string, or "" when ``dot``
        exits with a non-zero status. Failures of either tool are reported
        on standard output.
    """
    # This first temporary file is never written to: its name only supplies
    # a unique directory / base name pair for the converter's output.
    with tempfile.NamedTemporaryFile() as temp:
        out_dir, out_name = os.path.split(temp.name)
        with tempfile.NamedTemporaryFile(mode="w") as in_file:
            # Remove CRLF and flush output to avoid java errors
            in_file.write(text.replace("\r\n", "\n"))
            in_file.flush()
            # Run java parser
            cmd = ['java', '-jar', path,
                   '--input-file', in_file.name,
                   '--output-Dir', out_dir, '--output-file', out_name,
                   '--output-type', 'dot']
            with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:
                # communicate() drains the pipe while waiting; wait() alone
                # can deadlock once the child fills the OS pipe buffer.
                output, _ = p.communicate()
            if p.returncode != 0:
                print("Error in unl2rdf: \n\n"+output.decode())
                print('UNL:')
                print(text)
        # generate dot output
        # NOTE(review): presumably the jar writes <out_dir>/<out_name>.dot
        fname = '{}/{}.dot'.format(out_dir, out_name)
        try:
            with Popen(['dot', '-Tsvg', fname], stdout=PIPE, stderr=PIPE) as p:
                # Both pipes are read concurrently, so a large SVG cannot
                # block the child.
                svg, errors = p.communicate()
            if p.returncode == 0:
                return svg.decode()
            print("Error creating svg: \n\n"+errors.decode())
            print('UNL:')
            print(text)
            # Show the intermediate dot source when it exists, to help
            # diagnose the failure.
            try:
                with open(fname) as f:
                    print('DOT:')
                    print(f.read())
            except FileNotFoundError:
                pass
        finally:
            # Do not leave the intermediate file behind, whatever happened.
            try:
                os.remove(fname)
            except FileNotFoundError:
                pass
    return ""
# Command-line entry point: read INPUT, add UNL (and optionally SVG)
# sub-elements to selected nodes, and write the result to OUTPUT.
@click.command()
@click.argument('input', nargs=1,
                type=click.Path(dir_okay=False, exists=True))
@click.argument('output', nargs=1,
                type=click.Path(dir_okay=False, writable=True))
@click.option('--lang', default='en',
              type=click.Choice(['en', 'ru']))
@click.option('--dry-run/--no-dry-run', default=False,
              help='if true do not send request to unl.ru')
@click.option('--svg/--no-svg', default=True,
              help='Add svg node representing unl graph')
@click.option('--unltools-path', nargs=1,
              type=click.Path(dir_okay=False),
              default='unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar',
              help='Path of the unltools jar')
def xml2unl(input, output, lang, dry_run, svg, unltools_path):
    # Documented with comments on purpose: click would expose a docstring
    # as the command's --help text.
    # NOTE(review): statement nesting was reconstructed from a source whose
    # indentation was lost - confirm against the repository.
    doc = etree.parse(input)
    # Graph rendering needs real UNL, so it is skipped in dry-run mode.
    render_graph = svg and not dry_run
    for tag in ('title', 'text_body', 'term', 'meaning'):
        for node in doc.xpath('//' + tag):
            leading_text = node.text
            if leading_text:
                # Move the node's own leading text into an <orig> child.
                addSubElement(node, 'orig', leading_text)
                node.text = ""
            converted = unlize(nestedBody2Str(node), lang, dry_run)
            unl = addSubElement(node, 'unl', converted)
            if render_graph:
                addSubElement(node, 'dot', unl2dot(unl.text, unltools_path))
    with open(output, 'w') as out:
        serialized = etree.tostring(doc, pretty_print=True)
        out.write(serialized.decode('utf-8'))


# Run the command only when the file is executed as a script.
if __name__ == '__main__':
    xml2unl()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment