import json import time from datetime import datetime from typing import List, Optional, cast from unittest import mock from freezegun import freeze_time from looker_sdk.rtl import transport from looker_sdk.rtl.transport import TransportOptions from looker_sdk.sdk.api40.models import ( Dashboard, DashboardElement, LookmlModelExplore, LookmlModelExploreField, LookmlModelExploreFieldset, LookmlModelExploreJoins, LookWithQuery, Query, User, WriteQuery, ) from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.looker import looker_usage from datahub.ingestion.source.looker.looker_query_model import ( HistoryViewField, LookViewField, UserViewField, ) from datahub.ingestion.source.looker.looker_source import LookerDashboardSource from datahub.ingestion.source.state.checkpoint import Checkpoint from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from tests.test_helpers import mce_helpers from tests.test_helpers.state_helpers import ( validate_all_providers_have_committed_successfully, ) FROZEN_TIME = "2020-04-14 07:00:00" GMS_PORT = 8080 GMS_SERVER = f"http://localhost:{GMS_PORT}" @freeze_time(FROZEN_TIME) def test_looker_ingest(pytestconfig, tmp_path, mock_time): mocked_client = mock.MagicMock() with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) setup_mock_explore(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" pipeline = Pipeline.create( { "run_id": "looker-test", "source": { "type": "looker", "config": { "base_url": "https://looker.company.com", "client_id": "foo", "client_secret": "bar", "extract_usage_history": False, }, }, "sink": { "type": "file", "config": { "filename": f"{tmp_path}/looker_mces.json", }, }, } ) pipeline.run() pipeline.raise_from_status() mce_out_file = "golden_test_ingest.json" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "looker_mces.json", golden_path=f"{test_resources_dir}/{mce_out_file}", ) @freeze_time(FROZEN_TIME) def test_looker_ingest_joins(pytestconfig, tmp_path, mock_time): mocked_client = mock.MagicMock() with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) setup_mock_explore_with_joins(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" pipeline = Pipeline.create( { "run_id": "looker-test", "source": { "type": "looker", "config": { "base_url": "https://looker.company.com", "client_id": "foo", "client_secret": "bar", "extract_usage_history": False, }, }, "sink": { "type": "file", "config": { "filename": f"{tmp_path}/looker_mces_joins.json", }, }, } ) pipeline.run() pipeline.raise_from_status() mce_out_file = "golden_test_ingest_joins.json" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "looker_mces_joins.json", golden_path=f"{test_resources_dir}/{mce_out_file}", ) @freeze_time(FROZEN_TIME) def test_looker_ingest_unaliased_joins(pytestconfig, tmp_path, mock_time): mocked_client = mock.MagicMock() with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client mocked_client.all_dashboards.return_value = [Dashboard(id="1")] mocked_client.dashboard.return_value = Dashboard( id="1", title="foo", created_at=datetime.utcfromtimestamp(time.time()), updated_at=datetime.utcfromtimestamp(time.time()), description="lorem ipsum", dashboard_elements=[ DashboardElement( id="2", type="", subtitle_text="Some text", query=Query( model="data", view="my_view", fields=["dim1"], dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]', ), ) ], ) setup_mock_explore_unaliased_with_joins(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" pipeline = Pipeline.create( { "run_id": "looker-test", "source": { "type": "looker", "config": { "base_url": "https://looker.company.com", "client_id": "foo", "client_secret": "bar", "extract_usage_history": False, }, }, "sink": { "type": "file", "config": { "filename": f"{tmp_path}/looker_mces_unaliased_joins.json", }, }, } ) pipeline.run() pipeline.raise_from_status() mce_out_file = "golden_test_ingest_unaliased_joins.json" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "looker_mces_unaliased_joins.json", golden_path=f"{test_resources_dir}/{mce_out_file}", ) def setup_mock_dashboard(mocked_client): mocked_client.all_dashboards.return_value = [Dashboard(id="1")] mocked_client.dashboard.return_value = Dashboard( id="1", title="foo", created_at=datetime.utcfromtimestamp(time.time()), updated_at=datetime.utcfromtimestamp(time.time()), description="lorem ipsum", dashboard_elements=[ DashboardElement( id="2", type="", subtitle_text="Some text", query=Query( model="data", fields=["dim1"], view="my_view", dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]', ), ) ], ) def setup_mock_dashboard_multiple_charts(mocked_client): mocked_client.all_dashboards.return_value = [Dashboard(id="1")] mocked_client.dashboard.return_value = Dashboard( id="11", title="foo", created_at=datetime.utcfromtimestamp(time.time()), updated_at=datetime.utcfromtimestamp(time.time()), description="lorem ipsum", dashboard_elements=[ DashboardElement( id="2", type="", subtitle_text="Some text", query=Query( model="data", fields=["dim1"], view="my_view", dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]', ), ), DashboardElement( id="10", type="", subtitle_text="Some other text", query=Query( model="bogus data", view="my_view", dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]', ), ), ], ) def setup_mock_dashboard_with_usage(mocked_client): mocked_client.all_dashboards.return_value = [Dashboard(id="1")] mocked_client.dashboard.return_value = Dashboard( id="1", title="foo", created_at=datetime.utcfromtimestamp(time.time()), updated_at=datetime.utcfromtimestamp(time.time()), description="lorem ipsum", favorite_count=5, view_count=25, last_viewed_at=datetime.utcfromtimestamp(time.time()), dashboard_elements=[ DashboardElement( id="2", type="", subtitle_text="Some text", query=Query( model="data", view="my_view", dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]', ), ), DashboardElement( id="3", type="", look=LookWithQuery(id="3", view_count=30) ), ], ) def setup_mock_explore_with_joins(mocked_client): mock_model = mock.MagicMock(project_name="lkml_samples") mocked_client.lookml_model.return_value = mock_model mocked_client.lookml_model_explore.return_value = LookmlModelExplore( id="1", name="my_explore_name", label="My Explore View", description="lorem ipsum", view_name="underlying_view", project_name="lkml_samples", fields=LookmlModelExploreFieldset( dimensions=[ LookmlModelExploreField( name="dim1", type="string", description="dimension one description", label_short="Dimensions One Label", ) ] ), source_file="test_source_file.lkml", joins=[ LookmlModelExploreJoins( name="my_joined_view", dependent_fields=["my_joined_view.field", "bare_field"], ), LookmlModelExploreJoins( name="my_view_has_no_fields", view_label="My Labeled View", relationship="one_to_one", sql_on="1=1", ), LookmlModelExploreJoins( name="my_joined_view_join_name", from_="my_joined_view_original_name", ), ], ) def setup_mock_explore_unaliased_with_joins(mocked_client): mock_model = mock.MagicMock(project_name="lkml_samples") mocked_client.lookml_model.return_value = mock_model mocked_client.lookml_model_explore.return_value = LookmlModelExplore( id="1", name="my_view", label="My Explore View", description="lorem ipsum", project_name="lkml_samples", fields=LookmlModelExploreFieldset( dimensions=[ LookmlModelExploreField( name="dim1", type="string", dimension_group=None, description="dimension one description", label_short="Dimensions One Label", ) ] ), source_file="test_source_file.lkml", joins=[ LookmlModelExploreJoins( name="my_view_has_no_fields", view_label="My Labeled View", relationship="one_to_one", sql_on="1=1", ), LookmlModelExploreJoins( name="my_joined_view_join_name", from_="my_joined_view_original_name", ), ], ) def setup_mock_explore(mocked_client): mock_model = mock.MagicMock(project_name="lkml_samples") mocked_client.lookml_model.return_value = mock_model mocked_client.lookml_model_explore.return_value = LookmlModelExplore( id="1", name="my_explore_name", label="My Explore View", description="lorem ipsum", view_name="underlying_view", project_name="lkml_samples", fields=LookmlModelExploreFieldset( dimensions=[ LookmlModelExploreField( name="dim1", type="string", dimension_group=None, description="dimension one description", label_short="Dimensions One Label", ) ] ), source_file="test_source_file.lkml", ) def setup_mock_user(mocked_client): def get_user( id_: str, fields: Optional[str] = None, transport_options: Optional[transport.TransportOptions] = None, ) -> User: return User(id=id_, email=f"test-{id_}@looker.com") mocked_client.user.side_effect = get_user def side_effect_query_inline( result_format: str, body: WriteQuery, transport_options: Optional[TransportOptions] ) -> str: query_type: looker_usage.QueryId if result_format == "sql": return "" # Placeholder for sql text for query_id, query_template in looker_usage.query_collection.items(): if body.fields == query_template.to_write_query().fields: query_type = query_id break query_id_vs_response = { looker_usage.QueryId.DASHBOARD_PER_DAY_USAGE_STAT: json.dumps( [ { HistoryViewField.HISTORY_DASHBOARD_ID: "1", HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05", HistoryViewField.HISTORY_DASHBOARD_USER: 1, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 14, }, { HistoryViewField.HISTORY_DASHBOARD_ID: "1", HistoryViewField.HISTORY_CREATED_DATE: "2022-07-06", HistoryViewField.HISTORY_DASHBOARD_USER: 1, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 14, }, { HistoryViewField.HISTORY_DASHBOARD_ID: "1", HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07", HistoryViewField.HISTORY_DASHBOARD_USER: 1, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, }, ] ), looker_usage.QueryId.DASHBOARD_PER_USER_PER_DAY_USAGE_STAT: json.dumps( [ { HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05", HistoryViewField.HISTORY_DASHBOARD_ID: "1", UserViewField.USER_ID: 1, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 16, }, { HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05", HistoryViewField.HISTORY_DASHBOARD_ID: "1", UserViewField.USER_ID: 2, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 14, }, { HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07", HistoryViewField.HISTORY_DASHBOARD_ID: "1", UserViewField.USER_ID: 1, HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5, }, ] ), looker_usage.QueryId.LOOK_PER_DAY_USAGE_STAT: json.dumps( [ { HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05", HistoryViewField.HISTORY_COUNT: 10, LookViewField.LOOK_ID: 3, }, { HistoryViewField.HISTORY_CREATED_DATE: "2022-07-06", HistoryViewField.HISTORY_COUNT: 20, LookViewField.LOOK_ID: 3, }, { HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07", HistoryViewField.HISTORY_COUNT: 35, LookViewField.LOOK_ID: 3, }, ] ), looker_usage.QueryId.LOOK_PER_USER_PER_DAY_USAGE_STAT: json.dumps( [ { HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05", HistoryViewField.HISTORY_COUNT: 10, LookViewField.LOOK_ID: 3, UserViewField.USER_ID: 1, }, { HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05", HistoryViewField.HISTORY_COUNT: 20, LookViewField.LOOK_ID: 3, UserViewField.USER_ID: 2, }, ] ), } if query_id_vs_response.get(query_type) is None: raise Exception("Unknown Query") return query_id_vs_response[query_type] @freeze_time(FROZEN_TIME) def test_looker_ingest_allow_pattern(pytestconfig, tmp_path, mock_time): mocked_client = mock.MagicMock() with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client setup_mock_dashboard_multiple_charts(mocked_client) setup_mock_explore(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" pipeline = Pipeline.create( { "run_id": "looker-test", "source": { "type": "looker", "config": { "base_url": "https://looker.company.com", "client_id": "foo", "client_secret": "bar", "chart_pattern": {"allow": ["2"]}, "extract_usage_history": False, }, }, "sink": { "type": "file", "config": { "filename": f"{tmp_path}/looker_mces.json", }, }, } ) pipeline.run() pipeline.pretty_print_summary() pipeline.raise_from_status() mce_out_file = "golden_test_allow_ingest.json" mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / "looker_mces.json", golden_path=f"{test_resources_dir}/{mce_out_file}", ) @freeze_time(FROZEN_TIME) def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time): mocked_client = mock.MagicMock() with mock.patch("looker_sdk.init40") as mock_sdk: mock_sdk.return_value = mocked_client setup_mock_dashboard_with_usage(mocked_client) mocked_client.run_inline_query.side_effect = side_effect_query_inline setup_mock_explore(mocked_client) setup_mock_user(mocked_client) test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" temp_output_file = f"{tmp_path}/looker_mces.json" pipeline = Pipeline.create( { "run_id": "looker-test", "source": { "type": "looker", "config": { "base_url": "https://looker.company.com", "client_id": "foo", "client_secret": "bar", "extract_usage_history": True, "max_threads": 1, }, }, "sink": { "type": "file", "config": { "filename": temp_output_file, }, }, } ) pipeline.run() pipeline.pretty_print_summary() pipeline.raise_from_status() mce_out_file = "looker_mces_usage_history.json" # There should be 4 dashboardUsageStatistics aspects (one absolute and 3 timeseries) dashboard_usage_aspect_count = 0 # There should be 4 chartUsageStatistics (one absolute and 3 timeseries) chart_usage_aspect_count = 0 with open(temp_output_file) as f: temp_output_dict = json.load(f) for element in temp_output_dict: if ( element.get("entityType") == "dashboard" and element.get("aspectName") == "dashboardUsageStatistics" ): dashboard_usage_aspect_count = dashboard_usage_aspect_count + 1 if ( element.get("entityType") == "chart" and element.get("aspectName") == "chartUsageStatistics" ): chart_usage_aspect_count = chart_usage_aspect_count + 1 assert dashboard_usage_aspect_count == 4 assert chart_usage_aspect_count == 4 mce_helpers.check_golden_file( pytestconfig, output_path=temp_output_file, golden_path=f"{test_resources_dir}/{mce_out_file}", ) @freeze_time(FROZEN_TIME) def test_looker_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): output_file_name: str = "looker_mces.json" golden_file_name: str = "golden_looker_mces.json" output_file_deleted_name: str = "looker_mces_deleted_stateful.json" golden_file_deleted_name: str = "looker_mces_golden_deleted_stateful.json" test_resources_dir = pytestconfig.rootpath / "tests/integration/looker" def looker_source_config(sink_file_name): return { "run_id": "looker-test", "pipeline_name": "stateful-looker-pipeline", "source": { "type": "looker", "config": { "base_url": "https://looker.company.com", "client_id": "foo", "client_secret": "bar", "extract_usage_history": False, "stateful_ingestion": { "enabled": True, "remove_stale_metadata": True, "fail_safe_threshold": 100.0, "state_provider": { "type": "datahub", "config": {"datahub_api": {"server": GMS_SERVER}}, }, }, }, }, "sink": { "type": "file", "config": { "filename": f"{tmp_path}/{sink_file_name}", }, }, } mocked_client = mock.MagicMock() pipeline_run1 = None with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint, mock.patch("looker_sdk.init40") as mock_sdk: mock_checkpoint.return_value = mock_datahub_graph mock_sdk.return_value = mocked_client setup_mock_dashboard_multiple_charts(mocked_client) setup_mock_explore(mocked_client) pipeline_run1 = Pipeline.create(looker_source_config(output_file_name)) pipeline_run1.run() pipeline_run1.raise_from_status() pipeline_run1.pretty_print_summary() mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / output_file_name, golden_path=f"{test_resources_dir}/{golden_file_name}", ) checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1) assert checkpoint1 assert checkpoint1.state pipeline_run2 = None mocked_client = mock.MagicMock() with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, ) as mock_checkpoint, mock.patch("looker_sdk.init40") as mock_sdk: mock_checkpoint.return_value = mock_datahub_graph mock_sdk.return_value = mocked_client setup_mock_dashboard(mocked_client) setup_mock_explore(mocked_client) pipeline_run2 = Pipeline.create(looker_source_config(output_file_deleted_name)) pipeline_run2.run() pipeline_run2.raise_from_status() pipeline_run2.pretty_print_summary() mce_helpers.check_golden_file( pytestconfig, output_path=tmp_path / output_file_deleted_name, golden_path=f"{test_resources_dir}/{golden_file_deleted_name}", ) checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2) assert checkpoint2 assert checkpoint2.state # Validate that all providers have committed successfully. validate_all_providers_have_committed_successfully( pipeline=pipeline_run1, expected_providers=1 ) validate_all_providers_have_committed_successfully( pipeline=pipeline_run2, expected_providers=1 ) # Perform all assertions on the states. The deleted table should not be # part of the second state state1 = cast(GenericCheckpointState, checkpoint1.state) state2 = cast(GenericCheckpointState, checkpoint2.state) difference_dataset_urns = list( state1.get_urns_not_in(type="dataset", other_checkpoint_state=state2) ) assert len(difference_dataset_urns) == 1 deleted_dataset_urns: List[str] = [ "urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)" ] assert sorted(deleted_dataset_urns) == sorted(difference_dataset_urns) difference_chart_urns = list( state1.get_urns_not_in(type="chart", other_checkpoint_state=state2) ) assert len(difference_chart_urns) == 1 deleted_chart_urns = ["urn:li:chart:(looker,dashboard_elements.10)"] assert sorted(deleted_chart_urns) == sorted(difference_chart_urns) difference_dashboard_urns = list( state1.get_urns_not_in(type="dashboard", other_checkpoint_state=state2) ) assert len(difference_dashboard_urns) == 1 deleted_dashboard_urns = ["urn:li:dashboard:(looker,dashboards.11)"] assert sorted(deleted_dashboard_urns) == sorted(difference_dashboard_urns) def get_current_checkpoint_from_pipeline( pipeline: Pipeline, ) -> Optional[Checkpoint]: dbt_source = cast(LookerDashboardSource, pipeline.source) return dbt_source.get_current_checkpoint( dbt_source.stale_entity_removal_handler.job_id )