from lxml import etree


def unlize(text):
    """Return a placeholder UNL rendering of ``text``.

    No conversion is performed: the input is wrapped between ``###``
    markers after a fixed prefix.

    Args:
        text: The string to wrap.

    Returns:
        The prefixed and delimited string.
    """
    return 'I UNLized the following text : ###' + text + '###'


def nestedBody2Str(b):
    """Flatten the content of node ``b`` into a single string.

    Direct text children are stripped and appended as-is.  For every other
    child node, the non-empty stripped text fragments found anywhere below
    it are joined with ``', '``; that group is preceded by a space and
    followed by ``'. '`` and a line break.

    Args:
        b: A node exposing an lxml-style ``xpath`` method.

    Returns:
        The flattened text.
    """
    parts = []
    for child in b.xpath('./node()'):
        # lxml returns XPath text results as ``str`` subclasses, so an
        # isinstance check recognises them without relying on the private
        # ``etree._ElementUnicodeResult`` class.
        if isinstance(child, str):
            parts.append(str(child).strip())
            continue
        fragments = (str(piece).strip() for piece in child.xpath('.//text()'))
        nested_text = ', '.join(piece for piece in fragments if piece)
        # The line break is spelled as an escape so it stays visible in the
        # source instead of being an invisible raw character in the literal.
        parts.append(' ' + nested_text + '. \n')
    return ''.join(parts)


filename = 'exemple_2007-ertms.xml'

doc = etree.parse(filename)
bodies = doc.xpath('//text_body')
for b in bodies:
    # getparent() yields the real parent element.  An XPath such as
    # '../node()' only returns a Python list of nodes, and appending to that
    # list would never attach anything to the document tree.
    parent = b.getparent()
    if parent is None:
        # A text_body that is the document root has no parent to extend.
        continue
    unl_node = etree.SubElement(parent, 'unl_body')
    unl_node.text = unlize(nestedBody2Str(b))
# NOTE: the tree is modified in memory only; nothing here writes it back.