From 24553ca77f27fd9a2968609dbeb7d87387397b58 Mon Sep 17 00:00:00 2001 From: David Rouquet <david.rouquet@tetras-libre.fr> Date: Wed, 10 Jun 2020 16:47:22 +0200 Subject: [PATCH] =?UTF-8?q?Ajout=20d'une=20fct=20qui=20ignore=20les=20name?= =?UTF-8?q?spaces=20dans=20le=20document=20(pb=20car=20le=20NS=20avait=20?= =?UTF-8?q?=C3=A9t=C3=A9=20supprim=C3=A9=20de=20l'exemple=20alors=20qu'il?= =?UTF-8?q?=20est=20pr=C3=A9sent=20dans=20les=20autres=20XML=20du=20corpus?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 11 +---------- scripts/unlizeXml.py | 5 +++++ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index e1761b0..055ba51 100644 --- a/README.md +++ b/README.md @@ -30,16 +30,7 @@ Examples of an input anf outputs are provided in `./data/examples/` Ziped folders of "unlized" XML files of the corpus are available in the ./data folder. -:bangbang: For some reason a namespace attribute of the root node make the script crash on the documents of the corpus. -Please modify the following : -``` -<req_document xsi:schemaLocation="req_document.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="req_document.xsd"> -``` -to -``` -<req_document xsi:schemaLocation="req_document.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"> -``` -before submiting an xml of the corpus to the script. +:bangbang: `unlizeXml.py` ignores namespaces in the XML document. First clone the repo (or at least download the scripts folder): ``` diff --git a/scripts/unlizeXml.py b/scripts/unlizeXml.py index c16ce6d..210b736 100755 --- a/scripts/unlizeXml.py +++ b/scripts/unlizeXml.py @@ -7,6 +7,10 @@ import tempfile import os from subprocess import Popen, PIPE, STDOUT +def remove_namespace(doc): + #Remove namespace in the passed document in place + for elem in doc.getiterator(): + elem.tag=etree.QName(elem.tag).localname def unlize(text, lang, dry_run=False): @@ -122,6 +126,7 @@ def unl2dot(text, path): def unlizeXml(input, output, lang, dry_run, svg, unltools_path): doc = etree.parse(input) + remove_namespace(doc) tags = ['title', 'text_body', 'term', 'meaning'] for t in tags: for node in doc.xpath('//'+t): -- GitLab