Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
nlreqdataset-unl-enco
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
UNL
nlreqdataset-unl-enco
Commits
8288ad80
Verified
Commit
8288ad80
authored
5 years ago
by
David Beniamine
Browse files
Options
Downloads
Patches
Plain Diff
Scripts to transform xml file to notebook
See
#2
parent
464bf2dd
Branches
Branches containing commit
No related tags found
No related merge requests found
Pipeline
#200
passed
5 years ago
Stage: deploy
Changes
2
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
scripts/unlizeToNotebook.py
+53
-0
53 additions, 0 deletions
scripts/unlizeToNotebook.py
scripts/unlizeXmlNbSample.ipynb
+101
-0
101 additions, 0 deletions
scripts/unlizeXmlNbSample.ipynb
with
154 additions
and
0 deletions
scripts/unlizeToNotebook.py
0 → 100755
+
53
−
0
View file @
8288ad80
#!/bin/env python3
import
nbformat
as
nbf
from
nbformat.v4
import
new_code_cell
import
click
from
lxml
import
etree
,
objectify
from
unlizeXml
import
remove_namespace
,
unlize
,
nestedBody2Str
@click.command()
@click.argument('input', nargs=1,
                type=click.Path(dir_okay=False, exists=True))
@click.argument('output', nargs=1,
                type=click.Path(dir_okay=False, writable=True))
@click.option('--template', default='unlizeXmlNbSample.ipynb',
              type=click.Path(dir_okay=False, exists=True))
@click.option('--lang', default='en', type=click.Choice(['en', 'ru']))
@click.option('--dry-run/--no-dry-run', default=False,
              help='if true do not send request to unl.ru')
def unlizeXmlNb(input, output, template, lang, dry_run):
    """Build a notebook from an XML file, one code cell per text node.

    INPUT is the XML file to read, OUTPUT the notebook file to write.
    The cells of the template notebook are kept and one cell is appended
    for every non-empty title, text_body, term and meaning element.
    """
    # NOTE: click uses the docstring above as the command's --help text, so
    # it is deliberately short and written for the command-line user.
    # The parameter names (including ``input``, which shadows the builtin)
    # are dictated by the click argument/option names and must not change.
    nb = nbf.read(template, as_version=4)

    # Comments are stripped at parse time so they never show up as nodes.
    parser = etree.XMLParser(remove_comments=True)
    doc = objectify.parse(input, parser=parser)
    # remove_namespace comes from unlizeXml; presumably it strips XML
    # namespaces so the plain '//tag' queries below match -- TODO confirm.
    remove_namespace(doc)

    # Cells are appended tag by tag (all titles first, then all text
    # bodies, ...), not in document order.
    tags = ['title', 'text_body', 'term', 'meaning']
    for tag in tags:
        for node in doc.xpath('//' + tag):
            if not node.text:
                # Elements without direct text produce no cell.
                continue
            # The UNL is computed from the flattened node body, while the
            # cell shows only the node's own leading text.
            unl = unlize(nestedBody2Str(node), lang, dry_run)
            addCell(nb, node.text, unl)

    # Notebooks are JSON documents and must be UTF-8; relying on the
    # platform default encoding breaks on non-ASCII (e.g. Russian) text.
    with open(output, 'w', encoding='utf-8') as f:
        nbf.write(nb, f)
def _escape_for_triple_quotes(text):
    """Escape *text* so it can sit between triple double quotes in Python code."""
    # Backslashes go first, otherwise the ones added for the quotes below
    # would themselves be doubled.
    return str(text).replace('\\', '\\\\').replace('"', '\\"')


def addCell(nb, xml, unl):
    """Append to *nb* a code cell that defines the data and displays the UNL.

    The generated cell assigns ``xmldata`` and ``unldata`` as triple-quoted
    string literals and then calls ``displayUnl(unldata)``; ``displayUnl``
    is expected to be defined by an earlier cell of the notebook.

    Args:
        nb: notebook node whose ``'cells'`` list is extended in place.
        xml: source text shown in the cell as ``xmldata``.
        unl: UNL text shown in the cell as ``unldata`` and passed to
            ``displayUnl``.
    """
    # Both values are pasted into Python source. Without escaping, a double
    # quote at the end of the text, a '"""' inside it, or any backslash
    # would yield a cell that fails to parse or holds different data.
    code = (
        '\n'
        'xmldata = """{xml}"""\n'
        'unldata = """{unl}"""\n'
        'displayUnl(unldata)\n'
    ).format(xml=_escape_for_triple_quotes(xml),
             unl=_escape_for_triple_quotes(unl))
    nb['cells'].append(new_code_cell(code))
# Run the click command only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    unlizeXmlNb()
This diff is collapsed.
Click to expand it.
scripts/unlizeXmlNbSample.ipynb
0 → 100644
+
101
−
0
View file @
8288ad80
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import tempfile\n",
"import os\n",
"from subprocess import Popen, PIPE, STDOUT\n",
"from IPython.core.display import SVG"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def unl2dot(text, path):\n",
" with tempfile.NamedTemporaryFile() as temp:\n",
" out_name = os.path.basename(temp.name)\n",
" out_dir = os.path.dirname(temp.name)\n",
"\n",
" with tempfile.NamedTemporaryFile(mode=\"w\") as in_file:\n",
" # Remove CRLF and flush output to avoid java errors\n",
" in_file.write(text.replace(\"\\r\\n\", \"\\n\"))\n",
" in_file.flush()\n",
"\n",
" # Run java parser\n",
" cmd = ['java', '-jar', path,\n",
" '--input-file', in_file.name,\n",
" '--output-Dir', out_dir, '--output-file', out_name,\n",
" '--output-type', 'dot']\n",
"\n",
" with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:\n",
" p.wait()\n",
" p.stdout.flush()\n",
" if p.returncode != 0:\n",
" print(\"Error in unl2rdf: \\n\\n\"+p.stdout.read().decode())\n",
" print('UNL;')\n",
" print(text)\n",
"\n",
" # generate dot output\n",
" fname = '{}/{}.dot'.format(out_dir, out_name)\n",
" cmd = ['dot', '-Tsvg', fname]\n",
" with Popen(cmd, stdout=PIPE, stderr=PIPE) as p:\n",
" p.wait()\n",
" if p.returncode != 0:\n",
" print(\"Error creating svg: \\n\\n\"+p.stderr.read().decode())\n",
" print('UNL:')\n",
" print(text)\n",
" try:\n",
" with open(fname) as f:\n",
" print('DOT:')\n",
" print(f.read())\n",
" except FileNotFoundError:\n",
" pass\n",
" else:\n",
" svg = p.stdout.read().decode()\n",
" os.remove(fname)\n",
" return svg\n",
" return \"\"\n",
"\n",
"\n",
"def displayUnl(unldata) :\n",
"# We generate protoSVG because whent there are several sentences, \n",
"# a string composed of several concatenated SVG is produced (not a valid SVG).\n",
"# We must then split the string to obtain several valid SVG to display.\n",
" protoSvg = unl2dot(unldata, \"unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar\")\n",
" sep = \"</svg>\\n\"\n",
" svgArray = [x+sep for x in protoSvg.split(sep)]\n",
" svgArray.pop()\n",
" for svg in svgArray :\n",
" display(SVG(svg))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
%% Cell type:code id: tags:
```
python
import
tempfile
import
os
from
subprocess
import
Popen
,
PIPE
,
STDOUT
from
IPython.core.display
import
SVG
```
%% Cell type:code id: tags:
```
python
def unl2dot(text, path):
    """Render UNL text as SVG using the unl2rdf jar and Graphviz ``dot``.

    The jar at *path* is run with ``--output-type dot``; the ``.dot`` file
    it is expected to produce is then converted with ``dot -Tsvg``.

    Args:
        text: UNL source; CRLF line endings are normalised to LF.
        path: path to the unl2rdf jar, passed to ``java -jar``.

    Returns:
        The SVG produced by ``dot`` as a string, or ``""`` when ``dot``
        exits with an error (diagnostics are printed in that case).

    Raises:
        FileNotFoundError: if the ``java`` or ``dot`` executable is missing.
    """
    # This temporary file is created only to obtain a unique base name and
    # the temp directory; it is deleted as soon as the block exits.
    with tempfile.NamedTemporaryFile() as temp:
        out_name = os.path.basename(temp.name)
        out_dir = os.path.dirname(temp.name)

    with tempfile.NamedTemporaryFile(mode="w") as in_file:
        # Remove CRLF and flush output to avoid java errors
        in_file.write(text.replace("\r\n", "\n"))
        in_file.flush()

        # Run java parser
        cmd = ['java', '-jar', path,
               '--input-file', in_file.name,
               '--output-Dir', out_dir, '--output-file', out_name,
               '--output-type', 'dot']

        with Popen(cmd, stdout=PIPE, stderr=STDOUT) as p:
            # communicate() drains the pipe while waiting. wait() followed
            # by read() can deadlock once the child fills the pipe buffer.
            java_output, _ = p.communicate()
            if p.returncode != 0:
                print("Error in unl2rdf: \n\n" + java_output.decode())
                print('UNL:')
                print(text)
            # A java failure is only reported: dot is still attempted below
            # and reports its own error if the .dot file is missing.

    # generate dot output
    fname = '{}/{}.dot'.format(out_dir, out_name)
    cmd = ['dot', '-Tsvg', fname]
    with Popen(cmd, stdout=PIPE, stderr=PIPE) as p:
        # Two pipes are open here, so communicate() is required to read
        # both without blocking on either.
        svg_bytes, dot_errors = p.communicate()
        if p.returncode != 0:
            print("Error creating svg: \n\n" + dot_errors.decode())
            print('UNL:')
            print(text)
            try:
                with open(fname) as f:
                    print('DOT:')
                    print(f.read())
                # Its content has been printed; do not leave it in the
                # temp directory.
                os.remove(fname)
            except FileNotFoundError:
                # The jar produced no .dot file; nothing to show or remove.
                pass
        else:
            os.remove(fname)
            return svg_bytes.decode()
    return ""
def displayUnl(unldata):
    """Display every SVG graph generated from *unldata*.

    When there are several sentences, unl2dot returns several SVG documents
    concatenated into one string, which is not a valid SVG. The string is
    therefore cut after each closing tag and each piece is displayed
    separately.
    """
    closing_tag = "</svg>\n"
    combined = unl2dot(unldata, "unl2rdf-app-1.0-SNAPSHOT-jar-with-dependencies.jar")
    # split() leaves whatever follows the last closing tag as its final
    # element; that remainder is not a document and is dropped.
    for fragment in combined.split(closing_tag)[:-1]:
        # split() consumed the closing tag, so it is put back.
        display(SVG(fragment + closing_tag))
```
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment