From 24553ca77f27fd9a2968609dbeb7d87387397b58 Mon Sep 17 00:00:00 2001
From: David Rouquet <david.rouquet@tetras-libre.fr>
Date: Wed, 10 Jun 2020 16:47:22 +0200
Subject: [PATCH] =?UTF-8?q?Ajout=20d'une=20fct=20qui=20ignore=20les=20name?=
 =?UTF-8?q?spaces=20dans=20le=20document=20(pb=20car=20le=20NS=20avait=20?=
 =?UTF-8?q?=C3=A9t=C3=A9=20supprim=C3=A9=20de=20l'exemple=20alors=20qu'il?=
 =?UTF-8?q?=20est=20pr=C3=A9sent=20dans=20les=20autres=20XML=20du=20corpus?=
 =?UTF-8?q?)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md            | 11 +----------
 scripts/unlizeXml.py |  5 +++++
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index e1761b0..055ba51 100644
--- a/README.md
+++ b/README.md
@@ -30,16 +30,7 @@ Examples of an input anf outputs are provided in `./data/examples/`
 
 Ziped folders of "unlized" XML files of the corpus are available in the ./data folder.
 
-:bangbang: For some reason a namespace attribute of the root node make the script crash on the documents of the corpus.
-Please modify the following :
-```
-<req_document xsi:schemaLocation="req_document.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="req_document.xsd">
-```
-to
-```
-<req_document xsi:schemaLocation="req_document.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
-```
-before submiting an xml of the corpus to the script.
+:bangbang: `unlizeXml.py` strips XML namespaces from element tags before processing, so documents of the corpus can be submitted to the script unmodified.
 
 First clone the repo (or at least download the scripts folder):
 ```
diff --git a/scripts/unlizeXml.py b/scripts/unlizeXml.py
index c16ce6d..210b736 100755
--- a/scripts/unlizeXml.py
+++ b/scripts/unlizeXml.py
@@ -7,6 +7,10 @@ import tempfile
 import os
 from subprocess import Popen, PIPE, STDOUT
 
+def remove_namespace(doc):
+    """Strip the namespace from every element tag of the lxml tree `doc`, in place."""
+    for elem in doc.iter(tag=etree.Element):  # elements only: comment/PI tags are not names
+        elem.tag = etree.QName(elem.tag).localname
 
 def unlize(text, lang, dry_run=False):
 
@@ -122,6 +126,7 @@ def unl2dot(text, path):
 def unlizeXml(input, output, lang, dry_run, svg, unltools_path):
 
     doc = etree.parse(input)
+    remove_namespace(doc)
     tags = ['title', 'text_body', 'term', 'meaning']
     for t in tags:
         for node in doc.xpath('//'+t):
-- 
GitLab