from pathlib import Path
from zipfile import ZipFile
from itertools import islice
import json

from rdflib import Graph, URIRef, Literal, BNode, Namespace
from rdflib.resource import Resource
from rdflib.namespace import RDF, DCAT, SDO
from requests.utils import requote_uri, unquote
from uuid6 import uuid7

from m4i import M4I, M4I_NAMESPACES


def to_absolute_file_uri(archive_path, file_path_in_archive):
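    """Build an absolute file:// uri for a path inside a (zip) archive.

    Illustrative example (hypothetical paths): for archive_path Path("export.eln")
    and file_path_in_archive Path("crate/"), this returns something like
    "file:///current/working/dir/export.eln/crate/".
    """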
    uri = (archive_path / file_path_in_archive).absolute().as_uri()
    uri = f"{uri}/"  # we need to force trailing forward slash so @base works correctly

    return uri


def to_paths(file_uri, file_extension):
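    """Invert to_absolute_file_uri: split a file uri into the archive path and
    the file path inside the archive.

    Assumes the archive extension (e.g. ".eln") occurs exactly once in the uri,
    at the archive boundary.
    """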
    parts = file_uri.split(f"{file_extension}/")
    if len(parts) != 2:
        raise ValueError(f"archive location and file path inside archive "
                         f"could not be identified correctly from uri '{file_uri}'")

    archive_path = f"{unquote(parts[0])}{file_extension}"
    file_path_in_archive = unquote(parts[1])

    return archive_path, file_path_in_archive


def fix_iri(dict_in):
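    """json.load object_hook that percent-encodes "@id" values so they parse as valid iris."""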
    if "@id" in dict_in:
        dict_in["@id"] = requote_uri(dict_in["@id"])
        return dict_in

    return dict_in


def map_dataset_agents(dataset):
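    """Type the dataset's schema:author resources and its "Group" mentions as m4i agents."""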
    for author in dataset[SCHEMA["author"]]:
        author.add(RDF.type, M4I["agent"])

    for group in filter_dataset_mentions(dataset, category="Group"):
        group.add(RDF.type, M4I["agent"])


def map_dataset_mentions(dataset, experiment_dict):
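    """Annotate each schema:mentions object of the dataset with its eLabFTW category title.

    experiment_dict is one entry of the parsed export-elabftw.json; its
    "items_links" list maps linked item elabids to category titles.
    """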
    g = dataset.graph
    item_categories = {item["elabid"]: item["category_title"] for item in experiment_dict["items_links"]}
    # probably use sparql query instead
    for mention in g.objects(subject=dataset.identifier,
                             predicate=SCHEMA["mentions"]):
        identifiers = g.objects(subject=mention,
                                predicate=SCHEMA["identifier"])
        for identifier in identifiers:
            identifier = identifier.toPython()
            if identifier in item_categories:
                g.add((mention, SCHEMA["category"], Literal(item_categories[identifier])))


def filter_dataset_mentions(dataset, category):
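    """Return all schema:mentions resources of the dataset whose schema:category matches category."""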
    # probably use sparql query instead
    mentions = [mention for mention in dataset[SCHEMA["mentions"]]
                if Literal(category) in mention[SCHEMA["category"]]]
    return mentions


ROCRATE_METADATA_FILENAME = "ro-crate-metadata.json"

QUDT = Namespace("http://qudt.org/schema/qudt/")
QUANTITYKIND = Namespace("http://qudt.org/vocab/quantitykind/")
UNIT = Namespace("http://qudt.org/vocab/unit/")
PMDCO = Namespace("https://w3id.org/pmd/co/")
PMDTTO = Namespace("http://w3id.org/pmd/tto/")
SCHEMA = Namespace("http://schema.org/")
UUID = Namespace("urn:uuid:")

namespaces = M4I_NAMESPACES | {"quantitykind": QUANTITYKIND,
                               "unit": UNIT,
                               "pmdco": PMDCO,
                               "pmdtto": PMDTTO,
                               "schema-http": SCHEMA,
                               "uuid": UUID}


rocrate_archive_path = Path("2024-08-23-114358-export.eln")

elabid = "20240628-f0b8d8e12e011daad357fd659715968bd15613f7"  # ID of Tensile Test

# get ro-crate-metadata.json content
# find (relative) iri of Metadata File Descriptor
# find (relative) iri of Root Data Entity
with ZipFile(rocrate_archive_path, 'r') as archive:
    rocrate_metadata_filepath = [filepath for filepath in archive.namelist()
                                 if filepath.endswith(ROCRATE_METADATA_FILENAME)][0]

    rocrate_root_path_in_archive = Path(rocrate_metadata_filepath.removesuffix(ROCRATE_METADATA_FILENAME))
    rocrate_root_uri = to_absolute_file_uri(rocrate_archive_path, rocrate_root_path_in_archive)

    with archive.open(rocrate_metadata_filepath) as rocrate_metadata_file:
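        # load with fix_iri as object_hook to sanitize every @id,
        # then re-serialize so rdflib parses the cleaned json-ld string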
        rocrate_json = json.load(rocrate_metadata_file, object_hook=fix_iri)
        rocrate_json = json.dumps(rocrate_json)

rocrate_graph = Graph()
rocrate_graph.parse(data=rocrate_json, format="json-ld", publicID=rocrate_root_uri)

# rocrate spec binds schema = http://schema.org/, dct = http://purl.org/dc/terms/
# we force re-binds that match the rdflib defaults and the M4I specification
# even though https://github.com/RDFLib/pySHACL/issues/118 suggests re-binding before parsing,
# we re-bind after parsing, because parsing seems to override the binds AGAIN...
for prefix, namespace in namespaces.items():
    rocrate_graph.bind(prefix, namespace, override=True, replace=True)

# get dataset where schema:identifier = "elabid"
# this is the "root experiment / processing step"
# get export-elabftw.json file iri
# arbitrary length match (hasPart+) and specifying ?elabid might be unnecessary?
query = """
SELECT ?file ?dset
WHERE {
    ?mfd dcterms:conformsTo <https://w3id.org/ro/crate/1.1> .
    ?mfd schema-http:about ?root .
    ?root schema-http:hasPart+ ?dset .
    ?dset schema-http:identifier ?elabid .
    ?dset schema-http:hasPart ?file .
    ?file rdf:type schema-http:MediaObject .
    FILTER(CONTAINS(str(?file), "export-elabftw.json")) .
}"""
results = rocrate_graph.query(query, initBindings={"elabid": Literal(elabid)})
if len(results) != 1:
    raise LookupError(f"exactly one file 'export-elabftw.json' expected, but found {len(results)}")
result = next(iter(results))
elab_export_iri = result.file
main_dset_iri = result.dset

# get export-elabftw.json content
# this should be identical to API response?
_, elab_export_filepath = to_paths(elab_export_iri, ".eln")
with ZipFile(rocrate_archive_path, 'r') as archive:
    with archive.open(elab_export_filepath) as elab_export_file:
        elab_json = json.load(elab_export_file)
        # get object "id" / "elabid" where "type" = "experiments"
        for experiment_dict in elab_json:
            if experiment_dict["type"] != "experiments":
                continue
            if experiment_dict["elabid"] != elabid:
                continue
            break

# duplicate keys are overridden, most-recently-defined wins
# therefore we sometimes need to use german, sometimes english key in M4I Context
# this is expected behaviour, but bad authoring practice regarding the context
# c.f. https://www.w3.org/TR/json-ld11/#advanced-context-usage
main_step = Resource(rocrate_graph, URIRef(experiment_dict["sharelink"]))  # use elabid instead?
main_dset = Resource(rocrate_graph, main_dset_iri)

map_dataset_mentions(main_dset, experiment_dict)
map_dataset_agents(main_dset)

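# describe the root experiment as an m4i processing step;
# its category comes from the experiment's "Type of experiment" metadata field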
main_step[RDF.type] = M4I["processing step"]
main_step[M4I["hat Output"]] = main_dset
main_step[M4I["hat Teilnehmer"]] = main_dset.value(SCHEMA["author"])
main_step[M4I["title"]] = main_dset.value(SCHEMA["name"])
main_step[SCHEMA["category"]] = Literal(experiment_dict["metadata"]["hidden_fields"]["Type of experiment"]["value"])

# find linked experiments
# for now find .csv files and assume one sub-experiment per file
# possibly reuse ?dset iri from previous query
query = """
SELECT ?file ?dset
WHERE {
    ?mfd dcterms:conformsTo <https://w3id.org/ro/crate/1.1> .
    ?mfd schema-http:about ?root .
    ?root schema-http:hasPart+ ?dset .
    ?dset schema-http:identifier ?elabid .
    ?dset schema-http:hasPart ?file .
    ?file rdf:type schema-http:MediaObject .
    FILTER(CONTAINS(str(?file), ".csv")) .
}"""
results = rocrate_graph.query(query, initBindings={"elabid": Literal(elabid)})

# per .csv file
# get table headers
csv_metadata = []
for result in results:
    _, csv_filepath = to_paths(result.file, ".eln")
    with ZipFile(rocrate_archive_path, 'r') as archive:
        with archive.open(csv_filepath) as csv_file:
            # header lines 4-5 (1-indexed) hold the column names and units;
            # islice is zero-indexed: take line index 3 up to but not including index 5
            lines = [line.decode("utf-8").rstrip().split(",") for line in islice(csv_file, 3, 5)]
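            # transpose the two header rows into [name, unit] pairs, one per column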
            columns = [list(line) for line in zip(*lines)]
            csv_metadata.append({"iri": result.file,
                                 "columns": columns})

for item in csv_metadata:
    # > elab specimen if we use linked experiments in the future
    sample = Resource(rocrate_graph, URIRef(f"urn:uuid:{str(uuid7())}"))
    sample_length = Resource(rocrate_graph, URIRef(f"{sample.identifier}#Length"))
    sample_diameter = Resource(rocrate_graph, URIRef(f"{sample.identifier}#Diameter"))
    # step ~ elab experiment
    # > elab dataset sharelink (elabid) if we use linked experiments in the future
    step = Resource(rocrate_graph, URIRef(f"urn:uuid:{uuid7()}"))
    # dataset represented by file(s)
    # > elab dataset (relative iri) if we use linked experiments in the future
    dset = Resource(rocrate_graph, URIRef(f"{step.identifier}#data"))
    # Blank node for now, append -<file extension> to iri (c.f. DCAT examples) instead?
    dist = Resource(rocrate_graph, BNode())

    for column_index, column in enumerate(item["columns"]):
        match column[0]:
            case "Time":
                continue  # only relevant if we aggregate time values to range
                # start time / end time of step
                # end time = creation date of csv?
                # => obj > "uploads" > filter by "real name" > "created_at"
            case "Extension":
                property_iri = URIRef(f"{sample.identifier}#Extension")
                # pmd class?
                substep_iri = URIRef(f"{step.identifier}#ExtensionMeasurement")
                subdset_iri = URIRef(f"{step.identifier}#ExtensionData")
            case "Load":
                property_iri = URIRef(f"{sample.identifier}#Load")
                # pmd class?
                substep_iri = URIRef(f"{step.identifier}#LoadMeasurement")
                subdset_iri = URIRef(f"{step.identifier}#LoadData")
            case "Strain 1":
                property_iri = URIRef(f"{sample.identifier}#Strain1")
                # pmd class?
                substep_iri = URIRef(f"{step.identifier}#Strain1Measurement")
                subdset_iri = URIRef(f"{step.identifier}#Strain1Data")
            case _:
                raise LookupError(f"unexpected column header {column[0]}")

        match column[1]:
            case "(s)":
                property_unit = UNIT["SEC"]
                property_quantitykind = QUANTITYKIND["Time"]
            case "(mm)":
                property_unit = UNIT["MilliM"]
                property_quantitykind = QUANTITYKIND["Length"]
            case "(N)":
                property_unit = UNIT["N"]
                property_quantitykind = QUANTITYKIND["Force"]
            case "(mm/mm)":
                property_unit = UNIT["UNITLESS"]
                property_quantitykind = QUANTITYKIND["Dimensionless"]
            case _:
                raise LookupError(f"unexpected column header {column[0]}")

        sample_property = Resource(rocrate_graph, property_iri)
        substep = Resource(rocrate_graph, substep_iri)
        subdset = Resource(rocrate_graph, subdset_iri)
        subdist = Resource(rocrate_graph, BNode())
        coldesc = Resource(rocrate_graph, BNode())

        # use .add() to not overwrite rdf:type
        sample_property.add(RDF.type, M4I["variable"])
        sample_property.add(RDF.type, QUDT["Quantity"])
        sample_property[M4I["ist Eigenschaft von"]] = sample
        sample_property[M4I["hat Größenart"]] = property_quantitykind
        sample_property[M4I["hat Einheit"]] = property_unit

        substep[RDF.type] = M4I["processing step"]
        substep[M4I["untersucht"]] = sample_property
        substep[M4I["part of"]] = step
        substep[M4I["hat Output"]] = subdset

        subdset[RDF.type] = M4I["dataset"]
        subdset[M4I["hat Verteilung"]] = subdist

        subdist[RDF.type] = M4I["distribution"]
        subdist[DCAT["accessURL"]] = item["iri"] + f"#col={column_index + 1}"

    sample_length_dict = experiment_dict["metadata"]["extra_fields"]["Sample Length"]
    sample_length[M4I["ist Eigenschaft von"]] = sample
    sample_length.add(RDF.type, M4I["variable"])
    sample_length.add(RDF.type, QUDT["Quantity"])
    sample_length.add(RDF.type, URIRef(sample_length_dict["ID"]))
    sample_length[M4I["hat Größenart"]] = QUANTITYKIND["Length"]
    sample_length[M4I["hat Einheit"]] = UNIT[sample_length_dict["unit"]]
    sample_length[M4I["hat Zahlenwert"]] = Literal(sample_length_dict["value"])
    sample_length[M4I["hat Variablenbeschreibung"]] = Literal(sample_length_dict["description"])

    sample_diameter_dict = experiment_dict["metadata"]["extra_fields"]["Sample Diameter"]
    sample_diameter[M4I["ist Eigenschaft von"]] = sample
    sample_diameter.add(RDF.type, M4I["variable"])
    sample_diameter.add(RDF.type, QUDT["Quantity"])
    sample_diameter.add(RDF.type, URIRef(sample_diameter_dict["ID"]))
    sample_diameter[M4I["hat Größenart"]] = QUANTITYKIND["Length"]
    sample_diameter[M4I["hat Einheit"]] = UNIT[sample_diameter_dict["unit"]]
    sample_diameter[M4I["hat Zahlenwert"]] = Literal(sample_diameter_dict["value"])
    sample_diameter[M4I["hat Variablenbeschreibung"]] = Literal(sample_diameter_dict["description"])

    sample[RDF.type] = PMDCO["Sample"]
    sample.add(SDO["material"], *filter_dataset_mentions(main_dset, category="Material"))

    step[RDF.type] = M4I["processing step"]
    step[M4I["part of"]] = main_step
    step[M4I["untersucht"]] = sample
    step[M4I["hat Output"]] = dset
    step.add(M4I["hat Teilnehmer"], *filter_dataset_mentions(main_dset, category="Group"))
    step.add(M4I["hat eingesetztes Werkzeug"], *filter_dataset_mentions(main_dset, category="Equipment"))

    dset[RDF.type] = M4I["dataset"]
    dset[M4I["hat Verteilung"]] = dist

    dist[RDF.type] = M4I["distribution"]
    dist[DCAT["accessURL"]] = item["iri"]

# make a "config" mapping table header to property / respective processing step
# template for processing step = measurement of property?
# for now let investigates point to material

# find the pdf, assume it's a visualization of all .csv files

rocrate_graph.serialize(destination=rocrate_archive_path.with_suffix(".ttl"),
                        format="longturtle",
                        base=rocrate_root_uri)
