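"""Integration tests for the DataHub Looker ingestion source.

The Looker SDK client is mocked throughout; each test runs a Pipeline with a
file sink and compares the emitted metadata against a golden JSON file.
"""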

import json
import time
from datetime import datetime
from typing import List, Optional, cast
from unittest import mock
from freezegun import freeze_time
from looker_sdk.rtl import transport
from looker_sdk.rtl.transport import TransportOptions
from looker_sdk.sdk.api40.models import (
Dashboard,
DashboardElement,
LookmlModelExplore,
LookmlModelExploreField,
LookmlModelExploreFieldset,
LookmlModelExploreJoins,
LookWithQuery,
Query,
User,
WriteQuery,
)
from datahub.ingestion.run.pipeline import Pipeline
from datahub.ingestion.source.looker import looker_usage
from datahub.ingestion.source.looker.looker_query_model import (
HistoryViewField,
LookViewField,
UserViewField,
)
from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState
from tests.test_helpers import mce_helpers
from tests.test_helpers.state_helpers import (
get_current_checkpoint_from_pipeline,
validate_all_providers_have_committed_successfully,
)

FROZEN_TIME = "2020-04-14 07:00:00"
GMS_PORT = 8080
GMS_SERVER = f"http://localhost:{GMS_PORT}"


@freeze_time(FROZEN_TIME)
def test_looker_ingest(pytestconfig, tmp_path, mock_time):
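    """Basic dashboard + explore ingestion, checked against a golden MCE file."""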
mocked_client = mock.MagicMock()
with mock.patch("looker_sdk.init40") as mock_sdk:
mock_sdk.return_value = mocked_client
setup_mock_dashboard(mocked_client)
setup_mock_explore(mocked_client)
test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"
pipeline = Pipeline.create(
{
"run_id": "looker-test",
"source": {
"type": "looker",
"config": {
"base_url": "https://looker.company.com",
"client_id": "foo",
"client_secret": "bar",
"extract_usage_history": False,
},
},
"sink": {
"type": "file",
"config": {
"filename": f"{tmp_path}/looker_mces.json",
},
},
}
)
pipeline.run()
pipeline.raise_from_status()
mce_out_file = "golden_test_ingest.json"
mce_helpers.check_golden_file(
pytestconfig,
output_path=tmp_path / "looker_mces.json",
golden_path=f"{test_resources_dir}/{mce_out_file}",
)


@freeze_time(FROZEN_TIME)
def test_looker_ingest_joins(pytestconfig, tmp_path, mock_time):
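    """Ingestion of an explore that declares several joins."""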
mocked_client = mock.MagicMock()
with mock.patch("looker_sdk.init40") as mock_sdk:
mock_sdk.return_value = mocked_client
setup_mock_dashboard(mocked_client)
setup_mock_explore_with_joins(mocked_client)
test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"
pipeline = Pipeline.create(
{
"run_id": "looker-test",
"source": {
"type": "looker",
"config": {
"base_url": "https://looker.company.com",
"client_id": "foo",
"client_secret": "bar",
"extract_usage_history": False,
},
},
"sink": {
"type": "file",
"config": {
"filename": f"{tmp_path}/looker_mces_joins.json",
},
},
}
)
pipeline.run()
pipeline.raise_from_status()
mce_out_file = "golden_test_ingest_joins.json"
mce_helpers.check_golden_file(
pytestconfig,
output_path=tmp_path / "looker_mces_joins.json",
golden_path=f"{test_resources_dir}/{mce_out_file}",
)


@freeze_time(FROZEN_TIME)
def test_looker_ingest_unaliased_joins(pytestconfig, tmp_path, mock_time):
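    """Explore whose name matches the underlying view name, i.e. no alias."""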
mocked_client = mock.MagicMock()
with mock.patch("looker_sdk.init40") as mock_sdk:
mock_sdk.return_value = mocked_client
mocked_client.all_dashboards.return_value = [Dashboard(id="1")]
mocked_client.dashboard.return_value = Dashboard(
id="1",
title="foo",
created_at=datetime.utcfromtimestamp(time.time()),
updated_at=datetime.utcfromtimestamp(time.time()),
description="lorem ipsum",
dashboard_elements=[
DashboardElement(
id="2",
type="",
subtitle_text="Some text",
query=Query(
model="data",
view="my_view",
fields=["dim1"],
dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]',
),
)
],
)
setup_mock_explore_unaliased_with_joins(mocked_client)
test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"
pipeline = Pipeline.create(
{
"run_id": "looker-test",
"source": {
"type": "looker",
"config": {
"base_url": "https://looker.company.com",
"client_id": "foo",
"client_secret": "bar",
"extract_usage_history": False,
},
},
"sink": {
"type": "file",
"config": {
"filename": f"{tmp_path}/looker_mces_unaliased_joins.json",
},
},
}
)
pipeline.run()
pipeline.raise_from_status()
mce_out_file = "golden_test_ingest_unaliased_joins.json"
mce_helpers.check_golden_file(
pytestconfig,
output_path=tmp_path / "looker_mces_unaliased_joins.json",
golden_path=f"{test_resources_dir}/{mce_out_file}",
)


def setup_mock_dashboard(mocked_client):
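    """Point the mocked client at a single dashboard containing one chart."""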
mocked_client.all_dashboards.return_value = [Dashboard(id="1")]
mocked_client.dashboard.return_value = Dashboard(
id="1",
title="foo",
created_at=datetime.utcfromtimestamp(time.time()),
updated_at=datetime.utcfromtimestamp(time.time()),
description="lorem ipsum",
dashboard_elements=[
DashboardElement(
id="2",
type="",
subtitle_text="Some text",
query=Query(
model="data",
fields=["dim1"],
view="my_view",
dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]',
),
)
],
)


def setup_mock_dashboard_multiple_charts(mocked_client):
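    """Dashboard fixture with two charts; the detail call returns id "11" and
    element 10 queries the "bogus data" model (used by the allow-pattern and
    stateful tests)."""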
mocked_client.all_dashboards.return_value = [Dashboard(id="1")]
mocked_client.dashboard.return_value = Dashboard(
id="11",
title="foo",
created_at=datetime.utcfromtimestamp(time.time()),
updated_at=datetime.utcfromtimestamp(time.time()),
description="lorem ipsum",
dashboard_elements=[
DashboardElement(
id="2",
type="",
subtitle_text="Some text",
query=Query(
model="data",
fields=["dim1"],
view="my_view",
dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]',
),
),
DashboardElement(
id="10",
type="",
subtitle_text="Some other text",
query=Query(
model="bogus data",
view="my_view",
dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]',
),
),
],
)


def setup_mock_dashboard_with_usage(mocked_client):
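    """Dashboard fixture carrying view/favorite counts plus a Look-backed tile,
    used by the usage-history test."""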
mocked_client.all_dashboards.return_value = [Dashboard(id="1")]
mocked_client.dashboard.return_value = Dashboard(
id="1",
title="foo",
created_at=datetime.utcfromtimestamp(time.time()),
updated_at=datetime.utcfromtimestamp(time.time()),
description="lorem ipsum",
favorite_count=5,
view_count=25,
last_viewed_at=datetime.utcfromtimestamp(time.time()),
dashboard_elements=[
DashboardElement(
id="2",
type="",
subtitle_text="Some text",
query=Query(
model="data",
view="my_view",
dynamic_fields='[{"table_calculation":"calc","label":"foobar","expression":"offset(${my_table.value},1)","value_format":null,"value_format_name":"eur","_kind_hint":"measure","_type_hint":"number"}]',
),
),
DashboardElement(
id="3", type="", look=LookWithQuery(id="3", view_count=30)
),
],
)


def setup_mock_explore_with_joins(mocked_client):
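    """Explore with three joins: one with dependent fields, one field-less join
    defined via sql_on, and one renamed through from_."""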
mock_model = mock.MagicMock(project_name="lkml_samples")
mocked_client.lookml_model.return_value = mock_model
mocked_client.lookml_model_explore.return_value = LookmlModelExplore(
id="1",
name="my_explore_name",
label="My Explore View",
description="lorem ipsum",
view_name="underlying_view",
project_name="lkml_samples",
fields=LookmlModelExploreFieldset(
dimensions=[
LookmlModelExploreField(
name="dim1",
type="string",
description="dimension one description",
label_short="Dimensions One Label",
)
]
),
source_file="test_source_file.lkml",
joins=[
LookmlModelExploreJoins(
name="my_joined_view",
dependent_fields=["my_joined_view.field", "bare_field"],
),
LookmlModelExploreJoins(
name="my_view_has_no_fields",
view_label="My Labeled View",
relationship="one_to_one",
sql_on="1=1",
),
LookmlModelExploreJoins(
name="my_joined_view_join_name",
from_="my_joined_view_original_name",
),
],
)


def setup_mock_explore_unaliased_with_joins(mocked_client):
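    """Like setup_mock_explore_with_joins, but the explore is named "my_view"
    after the view itself rather than through an alias."""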
mock_model = mock.MagicMock(project_name="lkml_samples")
mocked_client.lookml_model.return_value = mock_model
mocked_client.lookml_model_explore.return_value = LookmlModelExplore(
id="1",
name="my_view",
label="My Explore View",
description="lorem ipsum",
project_name="lkml_samples",
fields=LookmlModelExploreFieldset(
dimensions=[
LookmlModelExploreField(
name="dim1",
type="string",
dimension_group=None,
description="dimension one description",
label_short="Dimensions One Label",
)
]
),
source_file="test_source_file.lkml",
joins=[
LookmlModelExploreJoins(
name="my_view_has_no_fields",
view_label="My Labeled View",
relationship="one_to_one",
sql_on="1=1",
),
LookmlModelExploreJoins(
name="my_joined_view_join_name",
from_="my_joined_view_original_name",
),
],
)


def setup_mock_explore(mocked_client):
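    """Minimal explore fixture: one string dimension, no joins."""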
mock_model = mock.MagicMock(project_name="lkml_samples")
mocked_client.lookml_model.return_value = mock_model
mocked_client.lookml_model_explore.return_value = LookmlModelExplore(
id="1",
name="my_explore_name",
label="My Explore View",
description="lorem ipsum",
view_name="underlying_view",
project_name="lkml_samples",
fields=LookmlModelExploreFieldset(
dimensions=[
LookmlModelExploreField(
name="dim1",
type="string",
dimension_group=None,
description="dimension one description",
label_short="Dimensions One Label",
)
]
),
source_file="test_source_file.lkml",
)


def setup_mock_user(mocked_client):
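    """Return a synthetic User with a predictable email for any user id."""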
def get_user(
id_: str,
fields: Optional[str] = None,
transport_options: Optional[transport.TransportOptions] = None,
) -> User:
return User(id=id_, email=f"test-{id_}@looker.com")

    mocked_client.user.side_effect = get_user


def side_effect_query_inline(
result_format: str, body: WriteQuery, transport_options: Optional[TransportOptions]
) -> str:
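    """Mock for run_inline_query: match body.fields against the templates in
    looker_usage.query_collection and return the canned JSON rows for that query."""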
    query_type: Optional[looker_usage.QueryId] = None  # set when a template matches
if result_format == "sql":
return "" # Placeholder for sql text
for query_id, query_template in looker_usage.query_collection.items():
if body.fields == query_template.to_write_query().fields:
query_type = query_id
break
query_id_vs_response = {
looker_usage.QueryId.DASHBOARD_PER_DAY_USAGE_STAT: json.dumps(
[
{
HistoryViewField.HISTORY_DASHBOARD_ID: "1",
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05",
HistoryViewField.HISTORY_DASHBOARD_USER: 1,
HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 14,
},
{
HistoryViewField.HISTORY_DASHBOARD_ID: "1",
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-06",
HistoryViewField.HISTORY_DASHBOARD_USER: 1,
HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 14,
},
{
HistoryViewField.HISTORY_DASHBOARD_ID: "1",
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07",
HistoryViewField.HISTORY_DASHBOARD_USER: 1,
HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5,
},
]
),
looker_usage.QueryId.DASHBOARD_PER_USER_PER_DAY_USAGE_STAT: json.dumps(
[
{
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05",
HistoryViewField.HISTORY_DASHBOARD_ID: "1",
UserViewField.USER_ID: 1,
HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 16,
},
{
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05",
HistoryViewField.HISTORY_DASHBOARD_ID: "1",
UserViewField.USER_ID: 2,
HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 14,
},
{
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07",
HistoryViewField.HISTORY_DASHBOARD_ID: "1",
UserViewField.USER_ID: 1,
HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5,
},
]
),
looker_usage.QueryId.LOOK_PER_DAY_USAGE_STAT: json.dumps(
[
{
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05",
HistoryViewField.HISTORY_COUNT: 10,
LookViewField.LOOK_ID: 3,
},
{
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-06",
HistoryViewField.HISTORY_COUNT: 20,
LookViewField.LOOK_ID: 3,
},
{
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07",
HistoryViewField.HISTORY_COUNT: 35,
LookViewField.LOOK_ID: 3,
},
]
),
looker_usage.QueryId.LOOK_PER_USER_PER_DAY_USAGE_STAT: json.dumps(
[
{
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05",
HistoryViewField.HISTORY_COUNT: 10,
LookViewField.LOOK_ID: 3,
UserViewField.USER_ID: 1,
},
{
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-05",
HistoryViewField.HISTORY_COUNT: 20,
LookViewField.LOOK_ID: 3,
UserViewField.USER_ID: 2,
},
]
),
}
    if query_type is None or query_type not in query_id_vs_response:
        raise Exception("Unknown Query")
return query_id_vs_response[query_type]


@freeze_time(FROZEN_TIME)
def test_looker_ingest_allow_pattern(pytestconfig, tmp_path, mock_time):
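    """chart_pattern only allows chart "2"; chart "10" should be filtered out."""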
mocked_client = mock.MagicMock()
with mock.patch("looker_sdk.init40") as mock_sdk:
mock_sdk.return_value = mocked_client
setup_mock_dashboard_multiple_charts(mocked_client)
setup_mock_explore(mocked_client)
test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"
pipeline = Pipeline.create(
{
"run_id": "looker-test",
"source": {
"type": "looker",
"config": {
"base_url": "https://looker.company.com",
"client_id": "foo",
"client_secret": "bar",
"chart_pattern": {"allow": ["2"]},
"extract_usage_history": False,
},
},
"sink": {
"type": "file",
"config": {
"filename": f"{tmp_path}/looker_mces.json",
},
},
}
)
pipeline.run()
pipeline.pretty_print_summary()
pipeline.raise_from_status()
mce_out_file = "golden_test_allow_ingest.json"
mce_helpers.check_golden_file(
pytestconfig,
output_path=tmp_path / "looker_mces.json",
golden_path=f"{test_resources_dir}/{mce_out_file}",
)


@freeze_time(FROZEN_TIME)
def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time):
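    """Ingest with extract_usage_history=True and count the emitted usage aspects
    before comparing against the golden file."""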
mocked_client = mock.MagicMock()
with mock.patch("looker_sdk.init40") as mock_sdk:
mock_sdk.return_value = mocked_client
setup_mock_dashboard_with_usage(mocked_client)
mocked_client.run_inline_query.side_effect = side_effect_query_inline
setup_mock_explore(mocked_client)
setup_mock_user(mocked_client)
test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"
temp_output_file = f"{tmp_path}/looker_mces.json"
pipeline = Pipeline.create(
{
"run_id": "looker-test",
"source": {
"type": "looker",
"config": {
"base_url": "https://looker.company.com",
"client_id": "foo",
"client_secret": "bar",
"extract_usage_history": True,
"max_threads": 1,
},
},
"sink": {
"type": "file",
"config": {
"filename": temp_output_file,
},
},
}
)
pipeline.run()
pipeline.pretty_print_summary()
pipeline.raise_from_status()
mce_out_file = "looker_mces_usage_history.json"
# There should be 4 dashboardUsageStatistics aspects (one absolute and 3 timeseries)
dashboard_usage_aspect_count = 0
# There should be 4 chartUsageStatistics (one absolute and 3 timeseries)
chart_usage_aspect_count = 0
with open(temp_output_file) as f:
temp_output_dict = json.load(f)
for element in temp_output_dict:
if (
element.get("entityType") == "dashboard"
and element.get("aspectName") == "dashboardUsageStatistics"
):
                dashboard_usage_aspect_count += 1
if (
element.get("entityType") == "chart"
and element.get("aspectName") == "chartUsageStatistics"
):
                chart_usage_aspect_count += 1
assert dashboard_usage_aspect_count == 4
assert chart_usage_aspect_count == 4
mce_helpers.check_golden_file(
pytestconfig,
output_path=temp_output_file,
golden_path=f"{test_resources_dir}/{mce_out_file}",
)


@freeze_time(FROZEN_TIME)
def test_looker_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
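    """Run the source twice with stateful ingestion enabled; the second run emits
    fewer entities, and the checkpoint diff should show exactly the removed ones."""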
output_file_name: str = "looker_mces.json"
golden_file_name: str = "golden_looker_mces.json"
output_file_deleted_name: str = "looker_mces_deleted_stateful.json"
golden_file_deleted_name: str = "looker_mces_golden_deleted_stateful.json"
test_resources_dir = pytestconfig.rootpath / "tests/integration/looker"
def looker_source_config(sink_file_name):
return {
"run_id": "looker-test",
"pipeline_name": "stateful-looker-pipeline",
"source": {
"type": "looker",
"config": {
"base_url": "https://looker.company.com",
"client_id": "foo",
"client_secret": "bar",
"extract_usage_history": False,
"stateful_ingestion": {
"enabled": True,
"remove_stale_metadata": True,
"fail_safe_threshold": 100.0,
"state_provider": {
"type": "datahub",
"config": {"datahub_api": {"server": GMS_SERVER}},
},
},
},
},
"sink": {
"type": "file",
"config": {
"filename": f"{tmp_path}/{sink_file_name}",
},
},
}
mocked_client = mock.MagicMock()
pipeline_run1 = None
with mock.patch(
"datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph",
mock_datahub_graph,
) as mock_checkpoint, mock.patch("looker_sdk.init40") as mock_sdk:
mock_checkpoint.return_value = mock_datahub_graph
mock_sdk.return_value = mocked_client
setup_mock_dashboard_multiple_charts(mocked_client)
setup_mock_explore(mocked_client)
pipeline_run1 = Pipeline.create(looker_source_config(output_file_name))
pipeline_run1.run()
pipeline_run1.raise_from_status()
pipeline_run1.pretty_print_summary()
mce_helpers.check_golden_file(
pytestconfig,
output_path=tmp_path / output_file_name,
golden_path=f"{test_resources_dir}/{golden_file_name}",
)
checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1)
assert checkpoint1
assert checkpoint1.state
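
    # Second run: setup_mock_dashboard returns a single-chart dashboard, so chart
    # 10, the "bogus data" explore, and dashboard 11 disappear relative to run 1.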
pipeline_run2 = None
mocked_client = mock.MagicMock()
with mock.patch(
"datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph",
mock_datahub_graph,
) as mock_checkpoint, mock.patch("looker_sdk.init40") as mock_sdk:
mock_checkpoint.return_value = mock_datahub_graph
mock_sdk.return_value = mocked_client
setup_mock_dashboard(mocked_client)
setup_mock_explore(mocked_client)
pipeline_run2 = Pipeline.create(looker_source_config(output_file_deleted_name))
pipeline_run2.run()
pipeline_run2.raise_from_status()
pipeline_run2.pretty_print_summary()
mce_helpers.check_golden_file(
pytestconfig,
output_path=tmp_path / output_file_deleted_name,
golden_path=f"{test_resources_dir}/{golden_file_deleted_name}",
)
checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2)
assert checkpoint2
assert checkpoint2.state
# Validate that all providers have committed successfully.
validate_all_providers_have_committed_successfully(
pipeline=pipeline_run1, expected_providers=1
)
validate_all_providers_have_committed_successfully(
pipeline=pipeline_run2, expected_providers=1
)
    # Perform all assertions on the states. The entities missing from the second
    # run (the "bogus data" explore dataset, chart 10, and dashboard 11) should
    # appear only in the first state.
state1 = cast(GenericCheckpointState, checkpoint1.state)
state2 = cast(GenericCheckpointState, checkpoint2.state)
difference_dataset_urns = list(
state1.get_urns_not_in(type="dataset", other_checkpoint_state=state2)
)
assert len(difference_dataset_urns) == 1
deleted_dataset_urns: List[str] = [
"urn:li:dataset:(urn:li:dataPlatform:looker,bogus data.explore.my_view,PROD)"
]
assert sorted(deleted_dataset_urns) == sorted(difference_dataset_urns)
difference_chart_urns = list(
state1.get_urns_not_in(type="chart", other_checkpoint_state=state2)
)
assert len(difference_chart_urns) == 1
deleted_chart_urns = ["urn:li:chart:(looker,dashboard_elements.10)"]
assert sorted(deleted_chart_urns) == sorted(difference_chart_urns)
difference_dashboard_urns = list(
state1.get_urns_not_in(type="dashboard", other_checkpoint_state=state2)
)
assert len(difference_dashboard_urns) == 1
deleted_dashboard_urns = ["urn:li:dashboard:(looker,dashboards.11)"]
assert sorted(deleted_dashboard_urns) == sorted(difference_dashboard_urns)