import json
import logging
import os
import pathlib
import pprint
import re
import shutil
import tempfile
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union

import deepdiff
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.sink.file import write_metadata_file
from datahub.ingestion.source.file import read_metadata_file
from datahub.metadata.schema_classes import MetadataChangeEventClass
from datahub.utilities.urns.urn import Urn
from tests.test_helpers.type_helpers import PytestConfig

logger = logging.getLogger(__name__)

IGNORE_PATH_TIMESTAMPS = [
    # Ignore timestamps from the ETL pipeline. A couple examples:
    # root[0]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.common.Ownership']['lastModified']['time']
    # root[69]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][0]['com.linkedin.pegasus2avro.schema.SchemaMetadata']['lastModified']['time']"
    # root[0]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][1]['com.linkedin.pegasus2avro.dataset.UpstreamLineage']['upstreams'][0]['auditStamp']['time']
    r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['created'\]\['time'\]",
    r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['lastModified'\]\['time'\]",
    r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['createStamp'\]\['time'\]",
    r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['auditStamp'\]\['time'\]",
]


class MCEConstants:
    # JSON keys/values used to recognize MetadataChangeEvent payloads.
    PROPOSED_SNAPSHOT = "proposedSnapshot"
    DATASET_SNAPSHOT_CLASS = (
        "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot"
    )


class MCPConstants:
    # JSON keys used to recognize MetadataChangeProposal payloads.
    CHANGE_TYPE = "changeType"
    ENTITY_URN = "entityUrn"
    ENTITY_TYPE = "entityType"
    ASPECT_NAME = "aspectName"
    ASPECT_VALUE = "aspect"


class EntityType:
    # Friendly aliases for DataHub entity type names.
    DATASET = "dataset"
    PIPELINE = "dataFlow"
    FLOW = "dataFlow"
    TASK = "dataJob"
    JOB = "dataJob"
    USER = "corpuser"
    GROUP = "corpGroup"


def load_json_file(filename: Union[str, os.PathLike]) -> object:
    """Load and return the parsed JSON content of *filename*."""
    with open(str(filename)) as f:
        return json.load(f)


def clean_nones(value):
    """
    Recursively remove all None values from dictionaries and lists,
    and returns the result as a new dictionary or list.
    """
    if isinstance(value, list):
        return [clean_nones(x) for x in value if x is not None]
    elif isinstance(value, dict):
        return {key: clean_nones(val) for key, val in value.items() if val is not None}
    else:
        return value


def assert_mces_equal(
    output: object, golden: object, ignore_paths: Optional[List[str]] = None
) -> None:
    """Assert that two lists of MCE json objects are deep-equal.

    Paths matching any regex in *ignore_paths* are excluded from the diff.
    If a raw diff exists, a second diff is attempted with None values
    stripped from both sides, since a None-only difference is harmless.
    """
    # This method assumes we're given a list of MCE json objects.
    diff = deepdiff.DeepDiff(
        golden, output, exclude_regex_paths=ignore_paths, ignore_order=True
    )
    if diff:
        # Attempt a clean diff (removing None-s)
        assert isinstance(output, list)
        assert isinstance(golden, list)
        clean_output = [clean_nones(o) for o in output]
        clean_golden = [clean_nones(g) for g in golden]
        clean_diff = deepdiff.DeepDiff(
            clean_golden,
            clean_output,
            exclude_regex_paths=ignore_paths,
            ignore_order=True,
        )
        if not clean_diff:
            logger.debug(f"MCE-s differ, clean MCE-s are fine\n{pprint.pformat(diff)}")
        diff = clean_diff
        if diff:
            # do some additional processing to emit helpful messages
            output_urns = _get_entity_urns(output)
            golden_urns = _get_entity_urns(golden)
            in_golden_but_not_in_output = golden_urns - output_urns
            in_output_but_not_in_golden = output_urns - golden_urns
            if in_golden_but_not_in_output:
                logger.info(
                    f"Golden file has {len(in_golden_but_not_in_output)} more urns: {in_golden_but_not_in_output}"
                )
            if in_output_but_not_in_golden:
                # BUGFIX: this message previously said "Golden file" for urns
                # that are only present in the output.
                logger.info(
                    f"Output file has {len(in_output_but_not_in_golden)} more urns: {in_output_but_not_in_golden}"
                )

    assert (
        not diff
    ), f"MCEs differ\n{pprint.pformat(diff)} \n output was: {json.dumps(output)}"


def check_golden_file(
    pytestconfig: PytestConfig,
    output_path: Union[str, os.PathLike],
    golden_path: Union[str, os.PathLike],
    ignore_paths: Optional[List[str]] = None,
) -> None:
    """Compare a test's output file against its golden file.

    Honors the pytest options --update-golden-files (rewrite the golden
    file on mismatch or when missing) and --copy-output-files (save a
    copy of the output next to the golden file).

    Raises:
        FileNotFoundError: if the golden file is missing and we are not
            updating golden files.
        AssertionError: if the files differ and we are not updating.
    """
    update_golden = pytestconfig.getoption("--update-golden-files")
    copy_output = pytestconfig.getoption("--copy-output-files")
    golden_exists = os.path.isfile(golden_path)

    if copy_output:
        shutil.copyfile(str(output_path), str(golden_path) + ".output")
        print(f"Copied output file to {golden_path}.output")

    if not update_golden and not golden_exists:
        raise FileNotFoundError(
            "Golden file does not exist. Please run with the --update-golden-files option to create."
        )

    output = load_json_file(output_path)

    # if updating a golden file that doesn't exist yet, load the output again
    if update_golden and not golden_exists:
        golden = load_json_file(output_path)
        shutil.copyfile(str(output_path), str(golden_path))
    else:
        # We have to "normalize" the golden file by reading and writing it back out.
        # This will clean up nulls, double serialization, and other formatting issues.
        with tempfile.NamedTemporaryFile() as temp:
            golden_metadata = read_metadata_file(pathlib.Path(golden_path))
            write_metadata_file(pathlib.Path(temp.name), golden_metadata)
            golden = load_json_file(temp.name)

    try:
        assert_mces_equal(output, golden, ignore_paths)
    except AssertionError as e:
        # only update golden files if the diffs are not empty
        if update_golden:
            shutil.copyfile(str(output_path), str(golden_path))
        # raise the error if we're just running the test
        else:
            raise e


def _get_field_for_entity_type_in_mce(entity_type: str) -> str:
    """Returns the field to look for depending on the type of entity in the MCE"""
    if entity_type == EntityType.DATASET:
        return MCEConstants.DATASET_SNAPSHOT_CLASS
    raise Exception(f"Not implemented for entity_type {entity_type}")


def _get_filter(
    mce: bool = False, mcp: bool = False, entity_type: Optional[str] = None
) -> Callable[[Dict], bool]:
    """Build a predicate that recognizes MCE or MCP dicts, optionally
    restricted to a single entity type."""
    if mce:
        # cheap way to determine if we are working with an MCE for the
        # appropriate entity_type
        if entity_type:
            return (
                lambda x: MCEConstants.PROPOSED_SNAPSHOT in x
                and _get_field_for_entity_type_in_mce(str(entity_type))
                in x[MCEConstants.PROPOSED_SNAPSHOT]
            )
        else:
            return lambda x: MCEConstants.PROPOSED_SNAPSHOT in x
    if mcp:
        # cheap way to determine if we are working with an MCP
        return lambda x: MCPConstants.CHANGE_TYPE in x and (
            x[MCPConstants.ENTITY_TYPE] == entity_type if entity_type else True
        )
    return lambda _: False


def _get_element(event: Dict[str, Any], path_spec: List[str]) -> Any:
    """Walk *path_spec* keys into *event*; return None if any key is absent."""
    try:
        for p in path_spec:
            if p not in event:
                return None
            else:
                event = event.get(p, {})
        return event
    except Exception as e:
        print(event)
        raise e


def _element_matches_pattern(
    event: Dict[str, Any], path_spec: List[str], pattern: str
) -> Tuple[bool, bool]:
    """Return (element_exists, element_matches_pattern) for the element at
    *path_spec* inside *event*."""
    element = _get_element(event, path_spec)
    if element is None:
        return (False, False)
    else:
        return (True, re.search(pattern, str(element)) is not None)


def get_entity_urns(events_file: str) -> Set[str]:
    """Return the set of dataset urns found in the given events file."""
    events = load_json_file(events_file)
    assert isinstance(events, list)
    return _get_entity_urns(events)


def _get_urns_from_events(events_list: List[Dict], entity_type: str) -> Set[str]:
    """Collect the urns for *entity_type* from both MCE and MCP events.

    Shared by the urn-oriented assertions below, which previously
    duplicated this set-building logic.
    """
    mce_urns = {
        _get_element(x, _get_mce_urn_path_spec(entity_type))
        for x in events_list
        if _get_filter(mce=True, entity_type=entity_type)(x)
    }
    mcp_urns = {
        _get_element(x, _get_mcp_urn_path_spec())
        for x in events_list
        if _get_filter(mcp=True, entity_type=entity_type)(x)
    }
    return mce_urns.union(mcp_urns)


def _get_entity_urns(events_list: List[Dict]) -> Set[str]:
    """Return all dataset urns present in *events_list*."""
    return _get_urns_from_events(events_list, EntityType.DATASET)


def assert_mcp_entity_urn(
    filter: str, entity_type: str, regex_pattern: str, file: str
) -> int:
    """Assert that all MCP entity urns in *file* match *regex_pattern*.

    Returns the number of events matched.
    """

    def get_path_spec_for_urn() -> List[str]:
        return [MCPConstants.ENTITY_URN]

    test_output = load_json_file(file)
    if isinstance(test_output, list):
        path_spec = get_path_spec_for_urn()
        filter_operator = _get_filter(mcp=True, entity_type=entity_type)
        filtered_events = [
            (x, _element_matches_pattern(x, path_spec, regex_pattern))
            for x in test_output
            if filter_operator(x)
        ]
        failed_events = [y for y in filtered_events if not y[1][0] or not y[1][1]]
        if failed_events:
            raise Exception("Failed to match events", failed_events)
        return len(filtered_events)
    else:
        raise Exception(f"Expected the file {file} to contain a list of items")


def _get_mce_urn_path_spec(entity_type: str) -> List[str]:
    """Return the key path to an entity urn inside an MCE dict."""
    if entity_type == EntityType.DATASET:
        return [
            MCEConstants.PROPOSED_SNAPSHOT,
            MCEConstants.DATASET_SNAPSHOT_CLASS,
            "urn",
        ]
    raise Exception(f"Not implemented for entity_type: {entity_type}")


def _get_mcp_urn_path_spec() -> List[str]:
    """Return the key path to an entity urn inside an MCP dict."""
    return [MCPConstants.ENTITY_URN]


def assert_mce_entity_urn(
    filter: str, entity_type: str, regex_pattern: str, file: str
) -> int:
    """Assert that all mce entity urns must match the regex pattern passed in.
    Return the number of events matched"""
    test_output = load_json_file(file)
    if isinstance(test_output, list):
        path_spec = _get_mce_urn_path_spec(entity_type)
        filter_operator = _get_filter(mce=True)
        filtered_events = [
            (x, _element_matches_pattern(x, path_spec, regex_pattern))
            for x in test_output
            if filter_operator(x)
        ]
        failed_events = [y for y in filtered_events if not y[1][0] or not y[1][1]]
        if failed_events:
            # BUGFIX: the f prefix was missing, so the message printed the
            # literal "{json.dumps(...)}" instead of the failed events.
            raise Exception(
                f"Failed to match events: {json.dumps(failed_events, indent=2)}"
            )
        return len(filtered_events)
    else:
        raise Exception(f"Expected the file {file} to contain a list of items")


def assert_for_each_entity(
    entity_type: str,
    aspect_name: str,
    aspect_field_matcher: Dict[str, Any],
    file: str,
    exception_urns: Optional[List[str]] = None,
) -> int:
    """Assert that an aspect name with the desired fields exists for each entity urn"""
    # BUGFIX: exception_urns previously used a mutable default argument ([]).
    exceptions = exception_urns if exception_urns is not None else []

    test_output = load_json_file(file)
    assert isinstance(test_output, list)

    all_urns = _get_urns_from_events(test_output, entity_type)
    # there should not be any None urns
    assert None not in all_urns

    aspect_map: Dict[str, Any] = {urn: None for urn in all_urns}

    # iterate over all mcps
    for o in [
        mcp for mcp in test_output if _get_filter(mcp=True, entity_type=entity_type)(mcp)
    ]:
        if o.get(MCPConstants.ASPECT_NAME) == aspect_name:
            # load the inner aspect payload and assign to this urn
            aspect_map[o[MCPConstants.ENTITY_URN]] = o.get(
                MCPConstants.ASPECT_VALUE, {}
            ).get("json")

    success: List[str] = []
    failures: List[str] = []
    for urn, aspect_val in aspect_map.items():
        if aspect_val is not None:
            for f in aspect_field_matcher:
                assert aspect_field_matcher[f] == _get_element(
                    aspect_val, [f]
                ), f"urn: {urn} -> Field {f} must match value {aspect_field_matcher[f]}, found {_get_element(aspect_val, [f])}"
            success.append(urn)
        elif urn not in exceptions:
            print(f"Adding {urn} to failures")
            failures.append(urn)

    if success:
        print(f"Succeeded on assertion for urns {success}")
    if failures:
        raise AssertionError(
            f"Failed to find aspect_name {aspect_name} for urns {json.dumps(failures, indent=2)}"
        )

    return len(success)


def assert_entity_mce_aspect(
    entity_urn: str, aspect: Any, aspect_type: Type, file: str
) -> int:
    """Assert that every *aspect_type* aspect on MCEs for *entity_urn*
    equals *aspect*; return the number of matches."""
    # TODO: Replace with read_metadata_file()
    test_output = load_json_file(file)
    entity_type = Urn.create_from_string(entity_urn).get_type()
    assert isinstance(test_output, list)
    # mce urns
    mces: List[MetadataChangeEventClass] = [
        MetadataChangeEventClass.from_obj(x)
        for x in test_output
        if _get_filter(mce=True, entity_type=entity_type)(x)
        and _get_element(x, _get_mce_urn_path_spec(entity_type)) == entity_urn
    ]
    matches = 0
    for mce in mces:
        for a in mce.proposedSnapshot.aspects:
            if isinstance(a, aspect_type):
                assert a == aspect
                matches = matches + 1
    return matches


def assert_entity_mcp_aspect(
    entity_urn: str, aspect_field_matcher: Dict[str, Any], aspect_name: str, file: str
) -> int:
    """Assert that every *aspect_name* MCP for *entity_urn* has fields
    matching *aspect_field_matcher*; return the number of matches."""
    # TODO: Replace with read_metadata_file()
    test_output = load_json_file(file)
    entity_type = Urn.create_from_string(entity_urn).get_type()
    assert isinstance(test_output, list)
    # mcps that match entity_urn
    mcps: List[MetadataChangeProposalWrapper] = [
        MetadataChangeProposalWrapper.from_obj_require_wrapper(x)
        for x in test_output
        if _get_filter(mcp=True, entity_type=entity_type)(x)
        and _get_element(x, _get_mcp_urn_path_spec()) == entity_urn
    ]
    matches = 0
    for mcp in mcps:
        if mcp.aspectName == aspect_name:
            assert mcp.aspect
            aspect_val = mcp.aspect.to_obj()
            for f in aspect_field_matcher:
                assert aspect_field_matcher[f] == _get_element(
                    aspect_val, [f]
                ), f"urn: {mcp.entityUrn} -> Field {f} must match value {aspect_field_matcher[f]}, found {_get_element(aspect_val, [f])}"
                matches = matches + 1
    return matches


def assert_entity_urn_not_like(entity_type: str, regex_pattern: str, file: str) -> int:
    """Assert that there are no entity urns that match the regex pattern passed in.
    Returns the total number of events in the file"""
    test_output = load_json_file(file)
    assert isinstance(test_output, list)
    all_urns = _get_urns_from_events(test_output, entity_type)
    print(all_urns)
    matched_urns = [u for u in all_urns if re.match(regex_pattern, u)]
    if matched_urns:
        raise AssertionError(f"urns found that match the deny list {matched_urns}")
    return len(test_output)


def assert_entity_urn_like(entity_type: str, regex_pattern: str, file: str) -> int:
    """Assert that there exist entity urns that match the regex pattern passed in.
    Returns the total number of events in the file"""
    test_output = load_json_file(file)
    assert isinstance(test_output, list)
    all_urns = _get_urns_from_events(test_output, entity_type)
    print(all_urns)
    matched_urns = [u for u in all_urns if re.match(regex_pattern, u)]
    if matched_urns:
        return len(matched_urns)
    else:
        raise AssertionError(
            f"No urns found that match the pattern {regex_pattern}. Full list is {all_urns}"
        )