def remove_namespace(doc):
    """Strip the XML namespace from every element tag of ``doc``, in place.

    Tags stored in Clark notation (``{namespace-uri}local``) are rewritten
    to their bare local name so that later un-prefixed lookups such as
    ``//title`` match regardless of the namespace declared in the document.
    Tags that carry no namespace are left untouched, and attribute names
    are not modified.

    Args:
        doc: A parsed tree or element exposing ``iter()`` (for example the
            object returned by ``etree.parse``).

    Returns:
        None. ``doc`` is mutated in place.
    """
    # ``iter()`` replaces the deprecated ``getiterator()`` alias.
    for elem in doc.iter():
        tag = elem.tag
        # Comments and processing instructions are yielded as well; their
        # ``tag`` is a factory function rather than a string, so there is
        # no namespace to strip and trying to parse it would raise.
        if callable(tag):
            continue
        if tag.startswith("{"):
            # Keep only what follows the closing brace of the namespace URI.
            elem.tag = tag.rpartition("}")[2]