from rdflib import OWL, RDF, Graph, Literal
from lxml import html
#import pandoc
from markitdown import MarkItDown
from markdown import markdown
import tempfile
import re
from glob import glob
from bs4 import BeautifulSoup

from common import *

# Logger for this script (registered under the name "transform"); used by the
# functions below to report progress.
log = get_logger("transform")


def construct(g: Graph, query: str):
    """Run a SPARQL `CONSTRUCT` query and merge its result into `g`, in-place.

    Only the triples that `g` does not contain yet are added and counted.

    :param g: Graph that is both queried and extended
    :param query: Text of the SPARQL `CONSTRUCT` query
    :return: Number of *new* triples (i.e. not present in the initial graph);
        0 when the query result carries no graph
    """
    result_graph = g.query(query).graph
    if result_graph is None:
        return 0

    # Graph difference keeps only the triples missing from `g`
    fresh = result_graph - g
    count = len(fresh)
    g += fresh
    if count > 0:
        log.info(f"\tConstructed {count} triples")
    return count


def construct_while(g: Graph, query: str):
    """Apply a CONSTRUCT query to the graph again and again, stopping as soon
    as one pass adds no new triple.

    :param g: Graph extended in-place by each pass
    :param query: Text of the SPARQL `CONSTRUCT` query
    """
    added = construct(g, query)
    while added > 0:
        added = construct(g, query)

def markFileDown(filepath):
    """Convert an HTML page stored on disk to Markdown with MarkItDown.

    The page is read as ISO 8859-1. Each closing ``</p>`` is tagged with a
    temporary ``@BR@`` marker before conversion, and the marker is turned into
    a blank line afterwards. The ``| Imprimer  `` cell text found in the
    converted output is reduced to a bare ``|``.

    :param filepath: Path of the HTML file to convert
    :return: The Markdown text
    :raises FileNotFoundError: if `filepath` does not exist
    """
    with open(filepath, 'r', encoding='ISO 8859-1') as source:
        page_html = source.read().replace("</p>", "</p>@BR@")

    # MarkItDown is given a file path, so the patched HTML goes through a
    # temporary file. The context manager guarantees the file is closed and
    # deleted, and the explicit encoding keeps the bytes written independent
    # of the platform's default locale.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".html", encoding="utf-8") as tmp:
        tmp.write(page_html)
        tmp.flush()  # make the content visible to the reader opening tmp.name
        converted = MarkItDown().convert(tmp.name).text_content

    return converted.replace('| Imprimer  ', '|').replace('@BR@', '\n\n')


def prepareHTMLforMD(str):
    """Replace HTML constructs by `@...` text markers before the Markdown
    conversion, so that they can be found again in the converted text.

    Markers produced:

    * ``@AUDIOSTART<name>@AUDIOEND`` for lines calling ``PF_clipAV`` on a
      ``.swf`` clip
    * ``@IMAGESTART<file>@IMAGEEND`` for ``<img>`` tags pointing to
      ``../media/``
    * ``@ANCHORSTART@<id>@<text>@ANCHOREND`` for links that open a comment
      (``CRS_afficherDetail``) or a document (``ouvrirDoc``, in which case
      the id is ``commentfile_<doc>.<ext>``)
    * ``@COMMENTIDSTART@<id>@COMMENTIDEND`` for the opening ``divCmt`` tag of
      a comment

    :param str: HTML text (the name shadows the builtin; it is kept so that
        the signature stays unchanged)
    :return: The text with markers inserted
    """
    text = str.replace("\r", " ")

    # Ordered rewrite rules: (pattern, regex flags, replacement template).
    # The order matters: the last rule joins anchors produced by earlier ones.
    rewrite_rules = (
        # Audio clips
        (r".*?PF_clipAV\('.*?', '(.*?).swf',.*",
         0,
         r"@AUDIOSTART\1@AUDIOEND"),
        # Images
        (r"<img.*?src=\"\.\./media/(.*?)\".*?>",
         0,
         r"@IMAGESTART\1@IMAGEEND"),
        # Words carrying a comment
        (r"<a class=\"STY_lienComt\" href=\"javascript:CRS_afficherDetail\('(.)'\)\" onclick=\"enrPosClic\(event,'.'\)\">(.*?)</a>",
         0,
         r"@ANCHORSTART@\1@\2@ANCHOREND"),
        # Comment identifiers
        (r"<div id=\"divCmt(.)\" onclick=\"SPE_clicDansBulle\(event,'.*?'\)\">",
         0,
         r"@COMMENTIDSTART@\1@COMMENTIDEND"),
        # Words linking to a document
        (r"<a.*?ouvrirDoc\('(.*?)','(.*?)'\)\">(.*?\n?.*?)</a>",
         re.MULTILINE,
         r"@ANCHORSTART@commentfile_\1.\2@\3@ANCHOREND"),
        # An anchor split over two lines is joined back onto one line
        (r'@(.*)\n(.*)@ANCHOREND',
         0,
         r"@\1 \2@ANCHOREND"),
    )
    for pattern, flags, replacement in rewrite_rules:
        text = re.sub(pattern, replacement, text, flags=flags)

    text = text.replace(r"\_", "_")
    text = text.replace("PAGE", "html")
    return text


def postEditMD(str1, comment_pages_dir=None):
    """Clean up converted Markdown and expand the `@...` markers into HTML.

    Steps, in order:

    1. generic string clean-up (escaped underscores, table pipes, missing
       accents on a few French capitals);
    2. ``@AUDIOSTART..@AUDIOEND`` markers become one ``<audio>`` player per
       ``.mp3`` file found in the matching folder of ``HUGO_MEDIA_DIR``;
    3. ``@IMAGESTART..@IMAGEEND`` markers become ``<img>`` tags;
    4. anchors referencing a ``commentfile_<page>`` are numbered and the
       referenced page is converted to HTML;
    5. remaining anchors become ``<span>`` and comment ids become ``<div>``;
    6. Markdown images under ``../media/`` become ``<img>`` tags and a few
       fixed sentences are removed or reworded.

    :param str1: Markdown text to post-process
    :param comment_pages_dir: Directory containing the pages referenced by
        ``commentfile_`` anchors. Defaults to the location that used to be
        hard-coded here, so existing callers are unaffected.
    :return: Tuple ``(markdown, commentaireInfo_md)`` where the second item
        is the HTML of the referenced comment page, or ``''`` when there is
        none
    """
    if comment_pages_dir is None:
        comment_pages_dir = "/home/daxid/DEV/MACAO/macao-legacy/Basilisk/MACAO/macao_12/contenu/pages/"
    if not comment_pages_dir.endswith("/"):
        comment_pages_dir += "/"

    ###################################################
    # Various string cleaning and replacements
    ###################################################
    str1 = str1.replace(r"\_","_").replace(r" - ","\n- ").replace(r"| --- |", "").replace(r"|","")
    # Order matters: the longer spellings must be handled before "A l"
    accent_fixes = (
        ("Ecoutez", "Écoutez"),
        ("Ecouter", "Écouter"),
        ("Enoncés", "Énoncés"),
        (r"A l'oral", r"À l'oral"),
        ("Enoncé", "Énoncé"),
        ("A un bout", "À un bout"),
        ("A l", "À l"),
    )
    for plain, accented in accent_fixes:
        str1 = str1.replace(plain, accented)

    ###################################################
    # Add audio players for audio extracted from SWF
    ###################################################
    for audioElt in re.findall(r"@AUDIOSTART.*?@AUDIOEND", str1):
        audioFolder = audioElt.replace("@AUDIOSTART","").replace("@AUDIOEND","").replace(r"\_","_")
        # NOTE(review): glob() returns files in arbitrary order, so the order
        # of the players is not guaranteed - confirm whether it matters.
        audioPaths = glob(HUGO_MEDIA_DIR+"/"+audioFolder+"/*.mp3")
        players = []
        for audioPath in audioPaths:
            audioFile = audioPath.split('/')[-1]
            players.append(
                f'<div><audio id="{audioFile}" ><source src="/macao-hugo/media/{audioFolder}/{audioFile}" type="audio/mpeg"></audio>'
                f'<button onclick="document.getElementById(\'{audioFile}\').play()">Play</button></div>'
            )
        str1 = str1.replace(audioElt, "".join(players))

    ###################################################
    # Add html code to MD for images
    ###################################################
    for imageElt in re.findall(r"@IMAGESTART.*?@IMAGEEND", str1):
        imgFilename = imageElt.replace("@IMAGESTART","").replace("@IMAGEEND","").replace(r"\_","_")
        imgHtml = f'<img class="inlineImage" src="/macao-hugo/media/{imgFilename}" id="{imgFilename}"/>'
        str1 = str1.replace(imageElt, imgHtml)

    ###################################################
    # Add html code to MD for comment anchors
    ###################################################
    commentaireInfo_md = ''
    commentfile_refs = re.findall(r'@ANCHORSTART@(commentfile_.*?)@', str1)
    # The counter also advances for pages that cannot be found
    for i, match in enumerate(commentfile_refs, start=1):
        filepath = comment_pages_dir + match.replace("commentfile_","")
        try:
            # markFileDown opens the page itself, so a missing page surfaces
            # here without a separate existence probe
            simpleHtml = markdown(markFileDown(filepath))
        except FileNotFoundError:
            print(f"File {filepath} not found.")
            continue
        # NOTE(review): only the last converted page is kept (each iteration
        # overwrites the previous value) - confirm this is intended.
        commentaireInfo_md = '<div class="commentaireInfo" commentaireId="'+str(i)+'">'+simpleHtml+'</div>'
        str1 = str1.replace(match,str(i))

    # First for the words supporting the comments
    str1 = re.sub(r"@ANCHORSTART@(.*?)@(.*?)@ANCHOREND", r'<span spanId="\1">\2</span>', str1)

    # For the comments themselves: a single closing tag is appended at the
    # very end of the text
    if '@COMMENTIDSTART' in str1:
        str1 = str1.replace('@COMMENTIDSTART@','<div class="commentaireInfo" commentaireId="').replace('@COMMENTIDEND','">') + '</div>'

    ###################################################
    # Add html for images and fix media paths
    ###################################################
    str1 = re.sub(r"!\[\]\(\.\./media/(.*?)\)", r"<img src='/macao-hugo/media/\1'>", str1)

    ###################################################
    # Some global string replacement
    ###################################################
    recorder_instructions = (
        "Pour vous enregistrer ou interrompre\n"
        "l'enregistrement, cliquez sur le bouton rouge.\n"
        "Pour vous réécouter, cliquez sur la flèche\n"
        "noire."
    )
    str1 = str1.replace(recorder_instructions, "")
    str1 = str1.replace("hauts-parleurs","flèches")
    str1 = str1.replace('\n<img class="inlineImage"','<img class="inlineImage"')

    return (str1, commentaireInfo_md)


def _html_to_markdown(converter, html_text):
    """Convert an HTML string to Markdown by handing `converter` (a MarkItDown
    instance) a temporary ``.html`` file containing `html_text`."""
    # The context manager closes and deletes the file; the explicit encoding
    # keeps the bytes written independent of the platform's default locale.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".html", encoding="utf-8") as tmp:
        tmp.write(html_text)
        tmp.flush()  # make the content visible to the reader opening tmp.name
        return converter.convert(tmp.name).text_content


def transform_html(graph: Graph):
    """Add a Markdown version of every HTML literal of the graph, in-place.

    For each triple ``(s, :<prop>, html)`` with `<prop>` one of the HTML
    properties listed below:

    * the HTML is converted to Markdown (`prepareHTMLforMD`, MarkItDown, then
      `postEditMD`) and stored as ``(s, :<prop>_md, markdown)``;
    * if ``s`` has a ``:<prop>_md_manual_edition`` value, the first such value
      (post-processed the same way, without the MarkItDown step) is stored
      instead of the converted text;
    * if the conversion produced comment HTML, it is stored as
      ``(s, :commentaireInfo_md, html)``.

    History: HTML was previously converted with Pandoc, and an earlier attempt
    stripped audio ``<script>`` tags with lxml; both were replaced by the
    MarkItDown-based pipeline below.

    :param graph: Graph to read from and to extend
    """
    html_properties = ['commentaireInfo', 'commentaireSucces', 'commentaireSugg', 'html', 'description']
    converter = MarkItDown()

    # Process all html content through Markitdown
    for prop in html_properties:
        # Iterate over a snapshot: triples are added to the graph inside the
        # loop (none of them uses the predicate being iterated).
        for subj, _pred, obj in list(graph.triples((None, NS[prop], None))):
            desc_str = prepareHTMLforMD(obj)
            if desc_str == '':
                desc_md = ''
                # Reset explicitly: otherwise the name is unbound on the first
                # triple, or still holds the previous triple's comment.
                commentaireInfo_md = ''
            else:
                tmp_md = _html_to_markdown(converter, desc_str)
                desc_md, commentaireInfo_md = postEditMD(tmp_md)

            if commentaireInfo_md != "":
                graph.add((subj, NS["commentaireInfo_md"], Literal(commentaireInfo_md)))

            md_value = Literal(desc_md)
            ################################################
            # Add manually edited content to the main graph
            manual_editions = list(graph.triples((subj, NS[prop+"_md_manual_edition"], None)))
            if manual_editions:
                manual_literal = manual_editions[0][2]
                # Expand the audio/image/anchor markers of the manual text too
                md_value = Literal(postEditMD(prepareHTMLforMD(manual_literal))[0])
            graph.add((subj, NS[prop+'_md'], md_value))
            #################################################
            #################################################



def main():
    """Build and save the full graph.

    Loads the schema, manual-edition and content files, converts HTML
    literals to Markdown, materialises inherited subclasses, types and
    properties with repeated CONSTRUCT queries, then replaces the ontology
    declarations and serialises everything as Turtle.
    """
    # Load graph
    graph = Graph()
    graph.bind("", NS)
    for source_file in (RDF_SCHEMA_FILE, RDF_MANUAL_EDITION_FILE, RDF_CONTENT_FILE):
        graph.parse(source_file)

    transform_html(graph)

    # Apply property 'subClassOf' transitively, except on the "fake" class
    # hierarchy based on MacaoRoot
    q_transitive_subclass = """
        CONSTRUCT {
            ?a rdfs:subClassOf ?c
        } WHERE {
            ?a rdfs:subClassOf ?b .
            ?b rdfs:subClassOf ?c .
            ?b rdfs:subClassOf :MacaoObject
            FILTER(?a != owl:Nothing && ?c != owl:Thing)
        }
    """

    # Apply type inheritance (rdfs:subClassOf)
    q_supertypes = """
        CONSTRUCT {
            ?subj rdf:type ?supertype
        } WHERE {
            ?subj a ?type .
            ?type rdfs:subClassOf ?supertype
        }
        """

    # Apply property inheritance (rdfs:subPropertyOf)
    q_super_properties = """
        CONSTRUCT {
            ?subj ?superprop ?obj
        } WHERE {
            ?subj ?prop ?obj .
            ?prop rdfs:subPropertyOf ?superprop .
            FILTER(?superprop != owl:topDataProperty)
        }
        """

    # Each step is announced, then repeated until it adds nothing new
    inference_steps = (
        ("Adding transitive subclasses...", q_transitive_subclass),
        ("Adding supertypes...", q_supertypes),
        ("Adding super-properties...", q_super_properties),
    )
    for message, query in inference_steps:
        log.info(message)
        construct_while(graph, query)

    # ==> Save
    # Remove dependency on previous ontologies
    previous_ontologies = graph.subjects(RDF.type, OWL.Ontology)
    for ontology in previous_ontologies:
        graph.remove((ontology, None, None))

    # Declare new ontology
    onto_uri = URIRef("http://www.semanticweb.org/eliott/ontologies/2024/4/macao-full")
    declaration = (
        (RDF.type, OWL.Ontology),
        (RDFS.label, Literal("macao-full")),
    )
    for predicate, value in declaration:
        graph.add((onto_uri, predicate, value))

    graph.serialize(RDF_FULL_FILE, "turtle", base=NS)


# Run the whole transformation only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()