# datahub/metadata-ingestion/tests/integration/preset/test_preset.py

from typing import Any, Dict, Optional
from unittest.mock import patch

import pytest
from freezegun import freeze_time

from datahub.ingestion.run.pipeline import Pipeline
from tests.test_helpers import mce_helpers
from tests.test_helpers.state_helpers import (
    get_current_checkpoint_from_pipeline,
    run_and_get_pipeline,
    validate_all_providers_have_committed_successfully,
)

# Timestamp handed to ``freeze_time`` on the tests in this module.
FROZEN_TIME = "2024-07-10 07:00:00"
# Port and base URL used as the ``datahub_api`` server of the stateful
# ingestion state provider config.
GMS_PORT = 8080
GMS_SERVER = f"http://localhost:{GMS_PORT}"


def register_mock_api(request_mock: Any, override_data: Optional[dict] = None) -> None:
    """Register canned Preset API responses on ``request_mock``.

    A default table of endpoint responses (auth, version, dashboards, charts,
    one dataset, one database and the three ``related/owners`` lookups) is
    built first. Entries of ``override_data`` are then merged in with a shallow
    ``dict.update``, so an override replaces the whole default entry that
    shares its URL. Finally every entry is registered through
    ``request_mock.register_uri``.

    Args:
        request_mock: Object exposing
            ``register_uri(method, url, json=..., status_code=...)``.
        override_data: Optional mapping of URL to a dict carrying the
            ``"method"``, ``"status_code"`` and ``"json"`` keys.
    """
    modified_at = "2024-07-10T07:00:00.000000+0000"

    def person(number: int) -> Dict[str, Any]:
        # User record as embedded in "changed_by" and "owners".
        return {"first_name": "Test", "id": number, "last_name": f"Owner{number}"}

    def ok(method: str, payload: Any) -> Dict[str, Any]:
        # One response-table entry answering with HTTP 200.
        return {"method": method, "status_code": 200, "json": payload}

    def listing(items: list) -> Dict[str, Any]:
        # GET response for a collection endpoint; "count" mirrors the result size.
        return ok("GET", {"count": len(items), "result": items})

    def dashboard(
        number: int,
        position_json: str,
        status: str,
        published: bool,
        owner_numbers: tuple,
        certified_by: str,
        certification_details: str,
    ) -> Dict[str, Any]:
        # Dashboard ``number`` was last changed by the user with the same number.
        return {
            "id": str(number),
            "changed_by": person(number),
            "changed_on_utc": modified_at,
            "dashboard_title": f"test_dashboard_title_{number}",
            "url": f"/dashboard/test_dashboard_url_{number}",
            "position_json": position_json,
            "status": status,
            "published": published,
            "owners": [person(owner) for owner in owner_numbers],
            "certified_by": certified_by,
            "certification_details": certification_details,
        }

    def chart(
        number: int, title_number: int, editor: int, viz_type: str
    ) -> Dict[str, Any]:
        # Every chart points at datasource "20".
        return {
            "id": str(number),
            "changed_by": person(editor),
            "changed_on_utc": modified_at,
            "slice_name": f"test_chart_title_{title_number}",
            "viz_type": viz_type,
            "url": f"/explore/test_chart_url_{number}",
            "datasource_id": "20",
            "params": '{"metrics": [], "adhoc_filters": []}',
        }

    def related_owners(*numbers: int) -> Dict[str, Any]:
        # Response of a ``.../related/owners`` lookup for the given owner numbers.
        return listing(
            [
                {
                    "extra": {
                        "active": True,
                        "email": f"test_owner{number}@example.com",
                    },
                    "text": f"test_owner{number}",
                    "value": number,
                }
                for number in numbers
            ]
        )

    version_payload = {
        "ci": {
            "built_at": "Tue Jul  10 00:00:00 UTC 2024",
            "build_num": "1",
            "triggered_by": "Not triggered by a user",
        },
        "git": {
            "branch": "4.0.1.6",
            "sha": "test_sha",
            "sha_superset": "test_sha_superset",
            "release_name": "test_release_name",
        },
        "chart_version": "1.16.1",
        "start_time": "2024-07-10 00:00:00",
        "mt_deployment": True,
    }

    dashboards = [
        dashboard(
            1,
            '{"CHART-test-1": {"meta": { "chartId": "10" }}, "CHART-test-2": {"meta": { "chartId": "11" }}}',
            "published",
            True,
            (1, 2),
            "Certification team",
            "Approved",
        ),
        dashboard(
            2,
            '{"CHART-test-3": {"meta": { "chartId": "12" }}, "CHART-test-4": {"meta": { "chartId": "13" }}}',
            "draft",
            False,
            (2,),
            "",
            "",
        ),
    ]

    charts = [
        chart(10, 1, 1, "box_plot"),
        chart(11, 2, 1, "pie"),
        chart(12, 3, 2, "treemap"),
        chart(13, 4, 2, "histogram"),
    ]

    dataset_payload = {
        "result": {
            "schema": "test_schema_name",
            "table_name": "test_table_name",
            "database": {
                "id": "30",
                "database_name": "test_database_name",
            },
        },
    }

    database_payload = {
        "result": {
            "sqlalchemy_uri": "test_sqlalchemy_uri",
        },
    }

    # Insertion order is the registration order.
    responses_by_url = {
        "mock://mock-domain.preset.io/v1/auth/": ok(
            "POST", {"payload": {"access_token": "test_token"}}
        ),
        "mock://mock-domain.preset.io/version": ok("GET", version_payload),
        "mock://mock-domain.preset.io/api/v1/dashboard/": listing(dashboards),
        "mock://mock-domain.preset.io/api/v1/chart/": listing(charts),
        "mock://mock-domain.preset.io/api/v1/dataset/20": ok("GET", dataset_payload),
        "mock://mock-domain.preset.io/api/v1/database/30": ok("GET", database_payload),
        "mock://mock-domain.preset.io/api/v1/dashboard/related/owners": related_owners(
            1, 2
        ),
        "mock://mock-domain.preset.io/api/v1/dataset/related/owners": related_owners(
            3, 4
        ),
        "mock://mock-domain.preset.io/api/v1/chart/related/owners": related_owners(
            5, 6
        ),
    }

    if override_data is not None:
        responses_by_url.update(override_data)

    for url, spec in responses_by_url.items():
        request_mock.register_uri(
            spec["method"],
            url,
            json=spec["json"],
            status_code=spec["status_code"],
        )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_preset_ingest(pytestconfig, tmp_path, mock_time, requests_mock):
    """Ingest from the mocked Preset API and compare the output to a golden file.

    The pipeline writes to ``preset_mces.json`` under ``tmp_path`` through the
    ``file`` sink; that file is then checked against
    ``golden_test_ingest.json`` from the preset integration test directory.
    """
    register_mock_api(request_mock=requests_mock)

    source_settings = {
        "connect_uri": "mock://mock-domain.preset.io/",
        "manager_uri": "mock://mock-domain.preset.io",
        "api_key": "test_key",
        "api_secret": "test_secret",
        "provider": "db",
    }
    sink_settings = {
        "filename": f"{tmp_path}/preset_mces.json",
    }
    recipe = {
        "run_id": "preset-test",
        "source": {"type": "preset", "config": source_settings},
        "sink": {"type": "file", "config": sink_settings},
    }

    ingestion = Pipeline.create(recipe)
    ingestion.run()
    # Fail the test if the run reported a failure status.
    ingestion.raise_from_status()

    resources = pytestconfig.rootpath / "tests/integration/preset"
    mce_helpers.check_golden_file(
        pytestconfig,
        output_path=tmp_path / "preset_mces.json",
        golden_path=f"{resources}/golden_test_ingest.json",
    )


@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_preset_stateful_ingest(
    pytestconfig, tmp_path, mock_time, requests_mock, mock_datahub_graph
):
    """Run the pipeline twice with stateful ingestion and check stale removal.

    The first run sees the default mocked API (two dashboards). Before the
    second run the dashboard endpoint is re-registered with a single
    dashboard and the sink is switched to a file. The test then asserts that
    dashboard ``2`` is the only dashboard URN in the first checkpoint state
    that is missing from the second, that the state provider committed on
    both runs, and that the second run's output matches
    ``golden_test_stateful_ingest.json``.
    """
    resources = pytestconfig.rootpath / "tests/integration/preset"

    register_mock_api(request_mock=requests_mock)

    stateful_settings = {
        "enabled": True,
        "remove_stale_metadata": True,
        "fail_safe_threshold": 100.0,
        "state_provider": {
            "type": "datahub",
            "config": {"datahub_api": {"server": GMS_SERVER}},
        },
    }
    source_settings = {
        "connect_uri": "mock://mock-domain.preset.io/",
        "manager_uri": "mock://mock-domain.preset.io",
        "api_key": "test_key",
        "api_secret": "test_secret",
        "provider": "db",
        "stateful_ingestion": stateful_settings,
    }
    recipe: Dict[str, Any] = {
        "source": {"type": "preset", "config": source_settings},
        # The events of the first run are not inspected, so print them.
        "sink": {"type": "console"},
        "pipeline_name": "test_pipeline",
    }

    # Dashboard listing for the second run: only dashboard "1" is left.
    remaining_dashboard = {
        "id": "1",
        "changed_by": {
            "first_name": "Test",
            "id": 1,
            "last_name": "Owner1",
        },
        "changed_on_utc": "2024-07-10T07:00:00.000000+0000",
        "dashboard_title": "test_dashboard_title_1",
        "url": "/dashboard/test_dashboard_url_1",
        "position_json": '{"CHART-test-1": {"meta": { "chartId": "10" }}, "CHART-test-2": {"meta": { "chartId": "11" }}}',
        "status": "published",
        "published": True,
        "owners": [
            {
                "first_name": "Test",
                "id": 1,
                "last_name": "Owners1",
            },
            {
                "first_name": "Test",
                "id": 2,
                "last_name": "Owners2",
            },
        ],
        "certified_by": "Certification team",
        "certification_details": "Approved",
    }
    single_dashboard_override = {
        "mock://mock-domain.preset.io/api/v1/dashboard/": {
            "method": "GET",
            "status_code": 200,
            "json": {
                "count": 1,
                "result": [remaining_dashboard],
            },
        },
    }

    graph_target = "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph"
    with patch(graph_target, mock_datahub_graph) as patched_graph:
        # Constructing the patched class hands back the same mocked graph.
        patched_graph.return_value = mock_datahub_graph

        # First run: fetch the checkpoint it produced.
        first_run = run_and_get_pipeline(recipe)
        first_checkpoint = get_current_checkpoint_from_pipeline(first_run)

        assert first_checkpoint
        assert first_checkpoint.state

        # Drop dashboard "2" from the mocked API.
        register_mock_api(
            request_mock=requests_mock, override_data=single_dashboard_override
        )

        # Write the second run to a file so its output can be compared with
        # the golden file below.
        deleted_mces_path = f"{tmp_path}/preset_deleted_mces.json"
        recipe["sink"].update(
            {"type": "file", "config": {"filename": deleted_mces_path}}
        )

        # Second run against the reduced dashboard listing.
        second_run = run_and_get_pipeline(recipe)
        second_checkpoint = get_current_checkpoint_from_pipeline(second_run)

        assert second_checkpoint
        assert second_checkpoint.state

        # The removed dashboard must be the only dashboard URN present in the
        # first state but absent from the second.
        removed_urns = list(
            first_checkpoint.state.get_urns_not_in(
                type="dashboard", other_checkpoint_state=second_checkpoint.state
            )
        )

        assert removed_urns == ["urn:li:dashboard:(preset,2)"]

        # Each run is expected to report one successfully committed provider.
        for finished_run in (first_run, second_run):
            validate_all_providers_have_committed_successfully(
                pipeline=finished_run, expected_providers=1
            )

        # Compare the second run's output with the golden file.
        mce_helpers.check_golden_file(
            pytestconfig,
            output_path=deleted_mces_path,
            golden_path=resources / "golden_test_stateful_ingest.json",
        )