feat(ingest/looker): Do not emit usage for non-ingested dashboards and charts (#11647)

Andrew Sikowitz 2024-12-23 22:39:57 -08:00 committed by GitHub
parent 047644b888
commit 09a9b6eef9
5 changed files with 482 additions and 40 deletions
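In outline, the change records the chart URNs that are actually emitted, passes that set into usage extraction, drops usage for looks whose charts were never ingested, and notes the skipped ids on the source report. Below is a simplified, standalone sketch of that flow; the helper names and the URN format for looks are illustrative stand-ins, not the source's real implementation:

```python
from typing import List, Set


def make_chart_urn(element_id: str) -> str:
    # Illustrative stand-in for the source's _make_chart_urn helper.
    return f"urn:li:chart:(looker,{element_id})"


def filter_looks_for_usage(
    look_ids: List[str],
    ingested_chart_urns: Set[str],
    skipped_report: Set[str],
) -> List[str]:
    """Keep usage only for looks whose chart URN was actually emitted."""
    kept: List[str] = []
    for look_id in look_ids:
        if make_chart_urn(f"looks.{look_id}") in ingested_chart_urns:
            kept.append(look_id)
        else:
            skipped_report.add(look_id)  # surfaced as charts_skipped_for_usage
    return kept


# The chart for look "2" was ingested; look "3" was not, so its usage is dropped.
ingested = {make_chart_urn("looks.2")}
skipped: Set[str] = set()
assert filter_looks_for_usage(["2", "3"], ingested, skipped) == ["2"]
assert skipped == {"3"}
```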

View File

@ -1408,6 +1408,15 @@ class LookerDashboardSourceReport(StaleEntityRemovalSourceReport):
dashboards_with_activity: LossySet[str] = dataclasses_field(
default_factory=LossySet
)
# Entities that don't seem to exist, so we don't emit usage aspects for them despite having usage data
dashboards_skipped_for_usage: LossySet[str] = dataclasses_field(
default_factory=LossySet
)
charts_skipped_for_usage: LossySet[str] = dataclasses_field(
default_factory=LossySet
)
stage_latency: List[StageLatency] = dataclasses_field(default_factory=list)
_looker_explore_registry: Optional[LookerExploreRegistry] = None
total_explores: int = 0
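The two new fields use DataHub's LossySet, a set-like report collection that may drop elements once it grows large so reports stay compact; later hunks populate them with `.add(...)` and the tests compare their string form against a plain list. A minimal sketch of that usage, assuming a dev environment where `datahub.utilities.lossy_collections` is importable:

```python
from dataclasses import dataclass, field

from datahub.utilities.lossy_collections import LossySet


@dataclass
class MiniReport:
    # Mirrors the new fields: ids we saw usage data for but never ingested.
    dashboards_skipped_for_usage: LossySet[str] = field(default_factory=LossySet)
    charts_skipped_for_usage: LossySet[str] = field(default_factory=LossySet)


report = MiniReport()
report.charts_skipped_for_usage.add("3")
# The tests at the end of this diff compare the string form to a plain list:
assert str(report.charts_skipped_for_usage) == str(["3"])
```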

View File

@ -68,6 +68,7 @@ from datahub.ingestion.source.looker.looker_common import (
ViewField,
ViewFieldType,
gen_model_key,
get_urn_looker_element_id,
)
from datahub.ingestion.source.looker.looker_config import LookerDashboardSourceConfig
from datahub.ingestion.source.looker.looker_lib_wrapper import LookerAPI
@ -165,6 +166,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
# Required, as we do not ingest all folders but only those that have dashboards/looks
self.processed_folders: List[str] = []
# Keep track of ingested chart urns, to omit usage for non-ingested entities
self.chart_urns: Set[str] = set()
@staticmethod
def test_connection(config_dict: dict) -> TestConnectionReport:
test_report = TestConnectionReport()
@ -642,6 +646,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
chart_urn = self._make_chart_urn(
element_id=dashboard_element.get_urn_element_id()
)
self.chart_urns.add(chart_urn)
chart_snapshot = ChartSnapshot(
urn=chart_urn,
aspects=[Status(removed=False)],
@ -1380,7 +1385,9 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
yield from self._emit_folder_as_container(folder)
def extract_usage_stat(
self, looker_dashboards: List[looker_usage.LookerDashboardForUsage]
self,
looker_dashboards: List[looker_usage.LookerDashboardForUsage],
ingested_chart_urns: Set[str],
) -> List[MetadataChangeProposalWrapper]:
looks: List[looker_usage.LookerChartForUsage] = []
# filter out looks from all dashboards
@ -1391,6 +1398,15 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
# dedup looks
looks = list({str(look.id): look for look in looks}.values())
filtered_looks = []
for look in looks:
if not look.id:
continue
chart_urn = self._make_chart_urn(get_urn_looker_element_id(look.id))
if chart_urn in ingested_chart_urns:
filtered_looks.append(look)
else:
self.reporter.charts_skipped_for_usage.add(look.id)
# Keep stat generators to generate entity stat aspects later
stat_generator_config: looker_usage.StatGeneratorConfig = (
@ -1414,7 +1430,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
stat_generator_config,
self.reporter,
self._make_chart_urn,
looks,
filtered_looks,
)
mcps: List[MetadataChangeProposalWrapper] = []
@ -1669,7 +1685,7 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase):
if self.source_config.extract_usage_history:
self.reporter.report_stage_start("usage_extraction")
usage_mcps: List[MetadataChangeProposalWrapper] = self.extract_usage_stat(
looker_dashboards_for_usage
looker_dashboards_for_usage, self.chart_urns
)
for usage_mcp in usage_mcps:
yield usage_mcp.as_workunit()

View File

@ -42,6 +42,7 @@ from datahub.metadata.schema_classes import (
TimeWindowSizeClass,
_Aspect as AspectAbstract,
)
from datahub.utilities.lossy_collections import LossySet
logger = logging.getLogger(__name__)
@ -170,7 +171,7 @@ class BaseStatGenerator(ABC):
self.config = config
self.looker_models = looker_models
# Used later to map ids from query results back to their Looker entities
self.id_vs_model: Dict[str, ModelForUsage] = {
self.id_to_model: Dict[str, ModelForUsage] = {
self.get_id(looker_object): looker_object for looker_object in looker_models
}
self.post_filter = len(self.looker_models) > 100
@ -225,6 +226,10 @@ class BaseStatGenerator(ABC):
def get_id_from_row(self, row: dict) -> str:
pass
@abstractmethod
def report_skip_set(self) -> LossySet[str]:
pass
def create_mcp(
self, model: ModelForUsage, aspect: Aspect
) -> MetadataChangeProposalWrapper:
@ -258,20 +263,11 @@ class BaseStatGenerator(ABC):
return entity_stat_aspect
def _process_absolute_aspect(self) -> List[Tuple[ModelForUsage, AspectAbstract]]:
aspects: List[Tuple[ModelForUsage, AspectAbstract]] = []
for looker_object in self.looker_models:
aspects.append(
(looker_object, self.to_entity_absolute_stat_aspect(looker_object))
)
return aspects
def _fill_user_stat_aspect(
self,
entity_usage_stat: Dict[Tuple[str, str], Aspect],
user_wise_rows: List[Dict],
) -> Iterable[Tuple[ModelForUsage, Aspect]]:
) -> Iterable[Tuple[str, Aspect]]:
logger.debug("Entering fill user stat aspect")
# We first resolve all the users using a threadpool to warm up the cache
@ -300,7 +296,7 @@ class BaseStatGenerator(ABC):
for row in user_wise_rows:
# Confirm looker object was given for stat generation
looker_object = self.id_vs_model.get(self.get_id_from_row(row))
looker_object = self.id_to_model.get(self.get_id_from_row(row))
if looker_object is None:
logger.warning(
    "Looker object with id({}) was not registered with the stat generator".format(
@ -338,7 +334,7 @@ class BaseStatGenerator(ABC):
logger.debug("Starting to yield answers for user-wise counts")
for (id, _), aspect in entity_usage_stat.items():
yield self.id_vs_model[id], aspect
yield id, aspect
def _execute_query(self, query: LookerQuery, query_name: str) -> List[Dict]:
rows = []
@ -357,7 +353,7 @@ class BaseStatGenerator(ABC):
)
if self.post_filter:
logger.debug("post filtering")
rows = [r for r in rows if self.get_id_from_row(r) in self.id_vs_model]
rows = [r for r in rows if self.get_id_from_row(r) in self.id_to_model]
logger.debug("Filtered down to %d rows", len(rows))
except Exception as e:
logger.warning(f"Failed to execute {query_name} query: {e}")
@ -378,7 +374,8 @@ class BaseStatGenerator(ABC):
return
# yield absolute stat for looker entities
for looker_object, aspect in self._process_absolute_aspect(): # type: ignore
for looker_object in self.looker_models:
aspect = self.to_entity_absolute_stat_aspect(looker_object)
yield self.create_mcp(looker_object, aspect)
# Execute query and process the raw json which contains stat information
@ -399,10 +396,13 @@ class BaseStatGenerator(ABC):
)
user_wise_rows = self._execute_query(user_wise_query_with_filters, "user_query")
# yield user-wise stat aspects for each entity
for looker_object, aspect in self._fill_user_stat_aspect(
for object_id, aspect in self._fill_user_stat_aspect(
entity_usage_stat, user_wise_rows
):
yield self.create_mcp(looker_object, aspect)
if object_id in self.id_to_model:
yield self.create_mcp(self.id_to_model[object_id], aspect)
else:
self.report_skip_set().add(object_id)
class DashboardStatGenerator(BaseStatGenerator):
@ -425,6 +425,9 @@ class DashboardStatGenerator(BaseStatGenerator):
def get_stats_generator_name(self) -> str:
return "DashboardStats"
def report_skip_set(self) -> LossySet[str]:
return self.report.dashboards_skipped_for_usage
def get_filter(self) -> Dict[ViewField, str]:
return {
HistoryViewField.HISTORY_DASHBOARD_ID: ",".join(
@ -541,6 +544,9 @@ class LookStatGenerator(BaseStatGenerator):
def get_stats_generator_name(self) -> str:
return "ChartStats"
def report_skip_set(self) -> LossySet[str]:
return self.report.charts_skipped_for_usage
def get_filter(self) -> Dict[ViewField, str]:
return {
LookViewField.LOOK_ID: ",".join(

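The base generator now keys user-wise usage rows by object id and leaves only the question of where to record an unresolvable id to its subclasses, via the new abstract `report_skip_set()`. A stripped-down sketch of that dispatch, using simplified stand-in types in place of the real report and aspect classes:

```python
from abc import ABC, abstractmethod
from typing import Dict, Iterable, Set, Tuple


class MiniStatGenerator(ABC):
    def __init__(self, id_to_model: Dict[str, str]):
        # Maps known (ingested) ids to their Looker entities.
        self.id_to_model = id_to_model

    @abstractmethod
    def report_skip_set(self) -> Set[str]:
        """Where ids with usage data but no ingested entity are recorded."""

    def generate(self, usage_by_id: Dict[str, int]) -> Iterable[Tuple[str, int]]:
        for object_id, count in usage_by_id.items():
            if object_id in self.id_to_model:
                yield self.id_to_model[object_id], count  # would become an MCP
            else:
                self.report_skip_set().add(object_id)  # skipped, but reported


class MiniDashboardStatGenerator(MiniStatGenerator):
    def __init__(self, id_to_model: Dict[str, str], skipped: Set[str]):
        super().__init__(id_to_model)
        self._skipped = skipped

    def report_skip_set(self) -> Set[str]:
        return self._skipped


skipped: Set[str] = set()
gen = MiniDashboardStatGenerator({"1": "dashboards.1"}, skipped)
assert list(gen.generate({"1": 5, "5": 5})) == [("dashboards.1", 5)]
assert skipped == {"5"}
```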
View File

@ -1,4 +1,66 @@
[
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": {
"urn": "urn:li:chart:(looker,dashboard_elements.3)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.chart.ChartInfo": {
"customProperties": {
"upstream_fields": ""
},
"title": "",
"description": "",
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
}
},
"chartUrl": "https://looker.company.com/x/",
"inputs": [
{
"string": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)"
}
]
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "chart",
"entityUrn": "urn:li:chart:(looker,dashboard_elements.3)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Look"
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": {
@ -9,7 +71,9 @@
"customProperties": {},
"title": "foo",
"description": "lorem ipsum",
"charts": [],
"charts": [
"urn:li:chart:(looker,dashboard_elements.3)"
],
"datasets": [],
"dashboards": [],
"lastModified": {
@ -89,6 +153,22 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "chart",
"entityUrn": "urn:li:chart:(looker,dashboard_elements.3)",
"changeType": "UPSERT",
"aspectName": "inputFields",
"aspect": {
"json": {
"fields": []
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dashboard",
"entityUrn": "urn:li:dashboard:(looker,dashboards.1)",
@ -215,6 +295,98 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "looker",
"env": "PROD",
"model_name": "look_data"
},
"name": "look_data",
"env": "PROD"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:looker"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"LookML Model"
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "Explore"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
@ -389,6 +561,180 @@
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": {
"urn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.BrowsePaths": {
"paths": [
"/Explore/look_data"
]
}
},
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.dataset.DatasetProperties": {
"customProperties": {
"project": "lkml_samples",
"model": "look_data",
"looker.explore.label": "My Explore View",
"looker.explore.name": "look_view",
"looker.explore.file": "test_source_file.lkml"
},
"externalUrl": "https://looker.company.com/explore/look_data/look_view",
"name": "My Explore View",
"description": "lorem ipsum",
"tags": []
}
},
{
"com.linkedin.pegasus2avro.dataset.UpstreamLineage": {
"upstreams": [
{
"auditStamp": {
"time": 1586847600000,
"actor": "urn:li:corpuser:datahub"
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.underlying_view,PROD)",
"type": "VIEW"
}
]
}
},
{
"com.linkedin.pegasus2avro.schema.SchemaMetadata": {
"schemaName": "look_view",
"platform": "urn:li:dataPlatform:looker",
"version": 0,
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"hash": "",
"platformSchema": {
"com.linkedin.pegasus2avro.schema.OtherSchema": {
"rawSchema": ""
}
},
"fields": [
{
"fieldPath": "dim1",
"nullable": false,
"description": "dimension one description",
"label": "Dimensions One Label",
"type": {
"type": {
"com.linkedin.pegasus2avro.schema.StringType": {}
}
},
"nativeDataType": "string",
"recursive": false,
"globalTags": {
"tags": [
{
"tag": "urn:li:tag:Dimension"
}
]
},
"isPartOfKey": false
}
],
"primaryKeys": []
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Explore"
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)",
"changeType": "UPSERT",
"aspectName": "embed",
"aspect": {
"json": {
"renderUrl": "https://looker.company.com/embed/explore/look_data/look_view"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb"
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataset",
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,look_data.explore.look_view,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "Explore"
},
{
"id": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb",
"urn": "urn:li:container:a2a7aa63752695f9a1705faed9d03ffb"
}
]
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": {
@ -747,22 +1093,6 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "chart",
"entityUrn": "urn:li:chart:(looker,dashboard_elements.3)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1586847600000,
"runId": "looker-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "tag",
"entityUrn": "urn:li:tag:Dimension",

View File

@ -31,7 +31,10 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.run.pipeline import Pipeline, PipelineInitError
from datahub.ingestion.source.looker import looker_common, looker_usage
from datahub.ingestion.source.looker.looker_common import LookerExplore
from datahub.ingestion.source.looker.looker_common import (
LookerDashboardSourceReport,
LookerExplore,
)
from datahub.ingestion.source.looker.looker_config import LookerCommonConfig
from datahub.ingestion.source.looker.looker_lib_wrapper import (
LookerAPI,
@ -414,7 +417,9 @@ def setup_mock_dashboard_multiple_charts(mocked_client):
)
def setup_mock_dashboard_with_usage(mocked_client):
def setup_mock_dashboard_with_usage(
mocked_client: mock.MagicMock, skip_look: bool = False
) -> None:
mocked_client.all_dashboards.return_value = [Dashboard(id="1")]
mocked_client.dashboard.return_value = Dashboard(
id="1",
@ -437,7 +442,13 @@ def setup_mock_dashboard_with_usage(mocked_client):
),
),
DashboardElement(
id="3", type="", look=LookWithQuery(id="3", view_count=30)
id="3",
type="" if skip_look else "vis", # Looks only ingested if type == `vis`
look=LookWithQuery(
id="3",
view_count=30,
query=Query(model="look_data", view="look_view"),
),
),
],
)
@ -611,6 +622,12 @@ def side_effect_query_inline(
HistoryViewField.HISTORY_DASHBOARD_USER: 1,
HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5,
},
{
HistoryViewField.HISTORY_DASHBOARD_ID: "5",
HistoryViewField.HISTORY_CREATED_DATE: "2022-07-07",
HistoryViewField.HISTORY_DASHBOARD_USER: 1,
HistoryViewField.HISTORY_DASHBOARD_RUN_COUNT: 5,
},
]
),
looker_usage.QueryId.DASHBOARD_PER_USER_PER_DAY_USAGE_STAT: json.dumps(
@ -790,6 +807,70 @@ def test_looker_ingest_usage_history(pytestconfig, tmp_path, mock_time):
)
@freeze_time(FROZEN_TIME)
def test_looker_filter_usage_history(pytestconfig, tmp_path, mock_time):
mocked_client = mock.MagicMock()
with mock.patch("looker_sdk.init40") as mock_sdk:
mock_sdk.return_value = mocked_client
setup_mock_dashboard_with_usage(mocked_client, skip_look=True)
mocked_client.run_inline_query.side_effect = side_effect_query_inline
setup_mock_explore(mocked_client)
setup_mock_user(mocked_client)
temp_output_file = f"{tmp_path}/looker_mces.json"
pipeline = Pipeline.create(
{
"run_id": "looker-test",
"source": {
"type": "looker",
"config": {
"base_url": "https://looker.company.com",
"client_id": "foo",
"client_secret": "bar",
"extract_usage_history": True,
"max_threads": 1,
},
},
"sink": {
"type": "file",
"config": {
"filename": temp_output_file,
},
},
}
)
pipeline.run()
pipeline.pretty_print_summary()
pipeline.raise_from_status()
# There should be 4 dashboardUsageStatistics aspects (one absolute and 3 timeseries)
dashboard_usage_aspect_count = 0
# There should be 0 chartUsageStatistics aspects -- filtered out by the set of ingested charts
chart_usage_aspect_count = 0
with open(temp_output_file) as f:
temp_output_dict = json.load(f)
for element in temp_output_dict:
if (
element.get("entityType") == "dashboard"
and element.get("aspectName") == "dashboardUsageStatistics"
):
dashboard_usage_aspect_count = dashboard_usage_aspect_count + 1
if (
element.get("entityType") == "chart"
and element.get("aspectName") == "chartUsageStatistics"
):
chart_usage_aspect_count = chart_usage_aspect_count + 1
assert dashboard_usage_aspect_count == 4
assert chart_usage_aspect_count == 0
source_report = cast(LookerDashboardSourceReport, pipeline.source.get_report())
# From timeseries query
assert str(source_report.dashboards_skipped_for_usage) == str(["5"])
# From dashboard element
assert str(source_report.charts_skipped_for_usage) == str(["3"])
@freeze_time(FROZEN_TIME)
def test_looker_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph):
output_file_name: str = "looker_mces.json"