2023-06-22 17:07:50 -04:00
|
|
|
import logging
|
|
|
|
from datetime import datetime
|
2023-05-24 02:46:46 -04:00
|
|
|
from typing import Any, Dict, Iterable, List, Union
|
2023-06-09 13:35:54 -04:00
|
|
|
from unittest.mock import patch
|
2022-12-09 12:24:39 -05:00
|
|
|
|
2023-12-04 20:00:11 -05:00
|
|
|
import pytest
|
2023-06-22 17:07:50 -04:00
|
|
|
from freezegun import freeze_time
|
|
|
|
|
2022-12-09 12:24:39 -05:00
|
|
|
import datahub.metadata.schema_classes as models
|
2023-06-22 17:07:50 -04:00
|
|
|
from datahub.configuration.time_window_config import BaseTimeWindowConfig
|
2023-06-09 13:35:54 -04:00
|
|
|
from datahub.emitter.mce_builder import (
|
|
|
|
make_container_urn,
|
|
|
|
make_dataplatform_instance_urn,
|
|
|
|
make_dataset_urn,
|
|
|
|
)
|
2022-12-09 12:24:39 -05:00
|
|
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
2024-09-23 22:36:05 +05:30
|
|
|
from datahub.ingestion.api.auto_work_units.auto_dataset_properties_aspect import (
|
|
|
|
auto_patch_last_modified,
|
|
|
|
)
|
2023-05-24 02:46:46 -04:00
|
|
|
from datahub.ingestion.api.source_helpers import (
|
2024-04-12 22:51:06 +05:30
|
|
|
_prepend_platform_instance,
|
2023-05-24 02:46:46 -04:00
|
|
|
auto_browse_path_v2,
|
2023-06-22 17:07:50 -04:00
|
|
|
auto_empty_dataset_usage_statistics,
|
2023-10-12 13:56:30 +02:00
|
|
|
auto_lowercase_urns,
|
2023-05-24 02:46:46 -04:00
|
|
|
auto_status_aspect,
|
|
|
|
auto_workunit,
|
2024-09-23 22:36:05 +05:30
|
|
|
create_dataset_props_patch_builder,
|
2023-05-24 02:46:46 -04:00
|
|
|
)
|
2022-12-09 12:24:39 -05:00
|
|
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
2024-09-23 22:36:05 +05:30
|
|
|
from datahub.metadata.schema_classes import (
|
|
|
|
DatasetPropertiesClass,
|
|
|
|
OperationTypeClass,
|
|
|
|
TimeStampClass,
|
|
|
|
)
|
|
|
|
from datahub.specific.dataset import DatasetPatchBuilder
|
2022-12-09 12:24:39 -05:00
|
|
|
|
|
|
|
# Shared fixture: two MCP wrappers for containers followed by two legacy MCE
# dataset snapshots. In each pair, the second item already carries a Status
# aspect with removed=True while the first item has no Status aspect at all.
_base_metadata: List[
    Union[MetadataChangeProposalWrapper, models.MetadataChangeEventClass]
] = [
    # Container described only by its properties.
    MetadataChangeProposalWrapper(
        entityUrn="urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a",
        aspect=models.ContainerPropertiesClass(name="test"),
    ),
    # Container that is explicitly marked as removed.
    MetadataChangeProposalWrapper(
        entityUrn="urn:li:container:108e111aa1d250dd52e0fd5d4b307b12",
        aspect=models.StatusClass(removed=True),
    ),
    # Dataset snapshot holding only dataset properties.
    models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-public-data.covid19_aha.staffing,PROD)",
            aspects=[
                models.DatasetPropertiesClass(customProperties={"key": "value"}),
            ],
        ),
    ),
    # Dataset snapshot that is explicitly marked as removed.
    models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-public-data.covid19_aha.hospital_beds,PROD)",
            aspects=[models.StatusClass(removed=True)],
        ),
    ),
]
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_workunit():
    """auto_workunit wraps each base metadata item in a work unit with the expected id."""
    expected_ids = [
        "urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a-containerProperties",
        "urn:li:container:108e111aa1d250dd52e0fd5d4b307b12-status",
        "urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-public-data.covid19_aha.staffing,PROD)/mce",
        "urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-public-data.covid19_aha.hospital_beds,PROD)/mce",
    ]

    work_units = list(auto_workunit(_base_metadata))

    # Every produced item must be a work unit, regardless of the input flavour.
    for work_unit in work_units:
        assert isinstance(work_unit, MetadataWorkUnit)

    assert [work_unit.id for work_unit in work_units] == expected_ids
|
|
|
|
|
|
|
|
|
|
|
|
def test_auto_status_aspect():
    """auto_status_aspect keeps the input and appends Status(removed=False) where expected.

    The expected output is the original work units followed by a Status aspect
    for the container and the dataset in ``_base_metadata`` that do not carry
    a Status aspect themselves.
    """
    initial_wu = list(auto_workunit(_base_metadata))

    urns_missing_status = [
        "urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a",
        "urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-public-data.covid19_aha.staffing,PROD)",
    ]
    generated_status = [
        MetadataChangeProposalWrapper(
            entityUrn=urn,
            aspect=models.StatusClass(removed=False),
        )
        for urn in urns_missing_status
    ]
    expected = initial_wu + list(auto_workunit(generated_status))

    assert list(auto_status_aspect(initial_wu)) == expected
|
2023-05-24 02:46:46 -04:00
|
|
|
|
|
|
|
|
2023-06-09 13:35:54 -04:00
|
|
|
def _create_container_aspects(
    d: Dict[str, Any],
    other_aspects: Union[Dict[str, List[models._Aspect]], None] = None,
    root: bool = True,
) -> Iterable[MetadataWorkUnit]:
    """Yield work units describing a nested container hierarchy.

    For every key of ``d`` this yields, in order: a ``Status(removed=False)``
    work unit for the key's container urn, any extra aspects registered for
    that key in ``other_aspects``, and one ``Container`` aspect per child
    linking the child to the key's container. Dict values are then recursed
    into; other values only produce the child links.

    Args:
        d: Mapping of container name to its children, either a nested mapping
            of the same shape or an iterable of child names.
        other_aspects: Extra aspects to emit, keyed by container name. Names
            never encountered as a key while walking ``d`` are emitted at the
            end of the top-level call. The caller's mapping is not modified.
        root: True for the top-level call; recursive calls pass False so the
            leftover aspects are flushed only once.

    Yields:
        One MetadataWorkUnit per emitted aspect.
    """
    if other_aspects is None:
        other_aspects = {}
    elif root:
        # Entries are popped as they are consumed. Work on a shallow copy so
        # the caller's dict stays intact; recursive calls share this copy.
        other_aspects = dict(other_aspects)

    for name, children in d.items():
        urn = make_container_urn(name)
        yield MetadataChangeProposalWrapper(
            entityUrn=urn, aspect=models.StatusClass(removed=False)
        ).as_workunit()

        # Aspects registered for this container are emitted right after its status.
        for aspect in other_aspects.pop(name, []):
            yield MetadataChangeProposalWrapper(
                entityUrn=urn, aspect=aspect
            ).as_workunit()

        # Link every direct child to this container.
        for child in children:
            yield MetadataChangeProposalWrapper(
                entityUrn=make_container_urn(child),
                aspect=models.ContainerClass(container=urn),
            ).as_workunit()

        if isinstance(children, dict):
            yield from _create_container_aspects(
                children, other_aspects=other_aspects, root=False
            )

    if root:
        # Whatever is left belongs to names that never appeared as a key in `d`.
        for name, aspects in other_aspects.items():
            for aspect in aspects:
                yield MetadataChangeProposalWrapper(
                    entityUrn=make_container_urn(name), aspect=aspect
                ).as_workunit()
|
2023-05-24 02:46:46 -04:00
|
|
|
|
|
|
|
|
2023-06-02 15:50:38 -04:00
|
|
|
def _make_container_browse_path_entries(
    path: List[str],
) -> List[models.BrowsePathEntryClass]:
    """Build browse path entries whose id and urn are both the container urn of each name."""
    entries: List[models.BrowsePathEntryClass] = []
    for segment in path:
        container_urn = make_container_urn(segment)
        entries.append(
            models.BrowsePathEntryClass(id=container_urn, urn=container_urn)
        )
    return entries
|
|
|
|
|
|
|
|
|
2023-06-02 15:50:38 -04:00
|
|
|
def _make_browse_path_entries(path: List[str]) -> List[models.BrowsePathEntryClass]:
    """Build browse path entries that use each string as the id and carry no urn."""
    entries: List[models.BrowsePathEntryClass] = []
    for segment in path:
        entries.append(models.BrowsePathEntryClass(id=segment, urn=None))
    return entries
|
|
|
|
|
|
|
|
|
2024-04-12 22:51:06 +05:30
|
|
|
def prepend_platform_instance(
    path: List[models.BrowsePathEntryClass],
) -> List[models.BrowsePathEntryClass]:
    """Call ``_prepend_platform_instance`` with the fixed platform "platform" and instance "instance"."""
    return _prepend_platform_instance(path, "platform", "instance")
|
|
|
|
|
|
|
|
|
2023-05-24 02:46:46 -04:00
|
|
|
def _get_browse_paths_from_wu(
    stream: Iterable[MetadataWorkUnit],
) -> Dict[str, List[models.BrowsePathEntryClass]]:
    """Collect BrowsePathsV2 paths from a work unit stream.

    The result is keyed by the text after the last ":" of each work unit's
    urn. Work units without a BrowsePathsV2 aspect are skipped; a later work
    unit with the same key overwrites an earlier one.
    """
    collected = {}
    for work_unit in stream:
        aspect = work_unit.get_aspect_of_type(models.BrowsePathsV2Class)
        if not aspect:
            continue
        key = work_unit.get_urn().rsplit(":", 1)[-1]
        collected[key] = aspect.path
    return collected
|
|
|
|
|
|
|
|
|
2023-06-09 13:35:54 -04:00
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_by_container_hierarchy(telemetry_ping_mock):
    """Browse paths are derived from the container hierarchy and emitted close to each urn."""
    structure = {
        "one": {
            "a": {"i": ["1", "2", "3"], "ii": ["4"]},
            "b": {"iii": ["5", "6"]},
        },
        "two": {
            "c": {"iv": [], "v": ["7", "8"]},
        },
        "three": {"d": {}},
        "four": {},
    }

    wus = list(auto_status_aspect(_create_container_aspects(structure)))
    # Sanity check
    status_count = sum(1 for wu in wus if wu.get_aspect_of_type(models.StatusClass))
    assert status_count == 21

    new_wus = list(auto_browse_path_v2(wus))
    assert not telemetry_ping_mock.call_count, telemetry_ping_mock.call_args_list
    browse_path_count = sum(
        1 for wu in new_wus if wu.get_aspect_of_type(models.BrowsePathsV2Class)
    )
    assert browse_path_count == 21

    paths = _get_browse_paths_from_wu(new_wus)
    assert paths["one"] == []
    assert (
        paths["7"]
        == paths["8"]
        == _make_container_browse_path_entries(["two", "c", "v"])
    )
    assert paths["d"] == _make_container_browse_path_entries(["three"])
    assert paths["i"] == _make_container_browse_path_entries(["one", "a"])

    # Check urns emitted on demand -- not all at end
    for urn in {wu.get_urn() for wu in new_wus}:
        # Prefer the position of the urn's Container aspect ...
        idx = next(
            (
                pos
                for pos, wu in enumerate(new_wus)
                if wu.get_aspect_of_type(models.ContainerClass) and wu.get_urn() == urn
            ),
            None,
        )
        if idx is None:
            # ... and fall back to its Status aspect when it has no Container aspect.
            idx = next(
                pos
                for pos, wu in enumerate(new_wus)
                if wu.get_aspect_of_type(models.StatusClass) and wu.get_urn() == urn
            )
        # A browse path must follow within the next two work units.
        assert any(
            new_wus[idx + offset].get_aspect_of_type(models.BrowsePathsV2Class)
            for offset in (1, 2)
        )
|
|
|
|
|
2023-05-24 02:46:46 -04:00
|
|
|
|
2023-06-09 13:35:54 -04:00
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_ignores_urns_already_with(telemetry_ping_mock):
    """Urns that already carry a BrowsePathsV2 aspect keep it, and descendants build on it."""
    structure = {"a": {"b": {"c": {"d": ["e"]}}}}

    # "f" is outside the hierarchy; "c" sits in the middle of it.
    preset_aspects = {
        "f": [
            models.BrowsePathsClass(paths=["/one/two"]),
            models.BrowsePathsV2Class(
                path=_make_browse_path_entries(["my", "path"])
            ),
        ],
        "c": [
            models.BrowsePathsV2Class(
                path=_make_container_browse_path_entries(["custom", "path"])
            )
        ],
    }
    source_stream = _create_container_aspects(
        structure,
        other_aspects=preset_aspects,
    )
    wus = list(auto_status_aspect(source_stream))

    new_wus = list(auto_browse_path_v2(wus))
    assert not telemetry_ping_mock.call_count, telemetry_ping_mock.call_args_list
    browse_path_count = sum(
        1 for wu in new_wus if wu.get_aspect_of_type(models.BrowsePathsV2Class)
    )
    assert browse_path_count == 6

    paths = _get_browse_paths_from_wu(new_wus)
    assert paths["a"] == []
    assert paths["c"] == _make_container_browse_path_entries(["custom", "path"])
    assert paths["f"] == _make_browse_path_entries(["my", "path"])
    assert paths["d"] == _make_container_browse_path_entries(["custom", "path", "c"])
    assert paths["e"] == _make_container_browse_path_entries(
        ["custom", "path", "c", "d"]
    )
|
|
|
|
|
|
|
|
|
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_with_platform_instance_and_source_browse_path_v2(
    telemetry_ping_mock,
):
    """A source-provided path on the root plus a platform instance prefixes every descendant path."""
    platform = "platform"
    instance = "instance"
    structure = {"a": {"b": {"c": {"d": ["e"]}}}}

    root_aspects = {
        "a": [
            models.BrowsePathsV2Class(
                path=_make_browse_path_entries(["my", "path"]),
            ),
        ],
    }
    source_stream = _create_container_aspects(
        structure,
        other_aspects=root_aspects,
    )
    wus = list(auto_status_aspect(source_stream))

    new_wus = list(
        auto_browse_path_v2(wus, platform=platform, platform_instance=instance)
    )
    assert not telemetry_ping_mock.call_count, telemetry_ping_mock.call_args_list
    browse_path_count = sum(
        1 for wu in new_wus if wu.get_aspect_of_type(models.BrowsePathsV2Class)
    )
    assert browse_path_count == 5

    paths = _get_browse_paths_from_wu(new_wus)
    # Container name -> chain of ancestor containers expected after "my/path".
    ancestors_by_name = {
        "a": [],
        "b": ["a"],
        "c": ["a", "b"],
        "d": ["a", "b", "c"],
        "e": ["a", "b", "c", "d"],
    }
    for name, ancestors in ancestors_by_name.items():
        assert paths[name] == prepend_platform_instance(
            [
                *_make_browse_path_entries(["my", "path"]),
                *_make_container_browse_path_entries(ancestors),
            ],
        )
|
2023-06-02 15:50:38 -04:00
|
|
|
|
|
|
|
|
2023-06-09 13:35:54 -04:00
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_legacy_browse_path(telemetry_ping_mock):
    """Legacy browse paths are converted to v2 paths with the ``drop_dirs`` entries removed."""
    platform = "platform"
    env = "PROD"
    legacy_path_by_dataset = {
        "dataset-1": "/one/two",
        "dataset-2": f"/{platform}/{env}/something",
        "dataset-3": f"/{platform}/one/two",
    }
    wus = [
        MetadataChangeProposalWrapper(
            entityUrn=make_dataset_urn(platform, dataset_name, env),
            aspect=models.BrowsePathsClass([legacy_path]),
        ).as_workunit()
        for dataset_name, legacy_path in legacy_path_by_dataset.items()
    ]

    new_wus = list(auto_browse_path_v2(wus, drop_dirs=["platform", "PROD", "unused"]))
    assert not telemetry_ping_mock.call_count, telemetry_ping_mock.call_args_list
    assert len(new_wus) == 6

    paths = _get_browse_paths_from_wu(new_wus)
    assert (
        paths["platform,dataset-1,PROD)"]
        == paths["platform,dataset-3,PROD)"]
        == _make_browse_path_entries(["one", "two"])
    )
    assert paths["platform,dataset-2,PROD)"] == _make_browse_path_entries(["something"])
|
|
|
|
|
|
|
|
|
2023-10-12 13:56:30 +02:00
|
|
|
def test_auto_lowercase_aspects():
    """auto_lowercase_urns lowercases the dataset urns of MCPs and MCE snapshots.

    The expected output keeps the DatasetKey aspect contents and the container
    work unit exactly as they were in the input.
    """

    def build_items(
        dataset_urn: str, snapshot_urn: str
    ) -> List[Union[MetadataChangeProposalWrapper, models.MetadataChangeEventClass]]:
        # Fresh objects on every call so input and expectation never share state.
        return [
            MetadataChangeProposalWrapper(
                entityUrn=dataset_urn,
                aspect=models.DatasetKeyClass(
                    "urn:li:dataPlatform:bigquery",
                    "myProject.mySchema.myTable",
                    "PROD",
                ),
            ),
            MetadataChangeProposalWrapper(
                entityUrn="urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a",
                aspect=models.ContainerPropertiesClass(name="test"),
            ),
            models.MetadataChangeEventClass(
                proposedSnapshot=models.DatasetSnapshotClass(
                    urn=snapshot_urn,
                    aspects=[
                        models.DatasetPropertiesClass(
                            customProperties={"key": "value"},
                        ),
                    ],
                ),
            ),
        ]

    mcws = auto_workunit(
        build_items(
            make_dataset_urn("bigquery", "myProject.mySchema.myTable", "PROD"),
            "urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-Public-Data.Covid19_Aha.staffing,PROD)",
        )
    )

    expected = list(
        auto_workunit(
            build_items(
                "urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.myschema.mytable,PROD)",
                "urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-public-data.covid19_aha.staffing,PROD)",
            )
        )
    )

    assert list(auto_lowercase_urns(mcws)) == expected
|
|
|
|
|
|
|
|
|
2023-06-09 13:35:54 -04:00
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_container_over_legacy_browse_path(telemetry_ping_mock):
    """The container hierarchy determines the v2 path even when a legacy browse path exists."""
    structure = {"a": {"b": ["c"]}}
    source_stream = _create_container_aspects(
        structure,
        other_aspects={"b": [models.BrowsePathsClass(paths=["/one/two"])]},
    )
    wus = list(auto_status_aspect(source_stream))

    new_wus = list(auto_browse_path_v2(wus))
    assert not telemetry_ping_mock.call_count, telemetry_ping_mock.call_args_list
    browse_path_count = sum(
        1 for wu in new_wus if wu.get_aspect_of_type(models.BrowsePathsV2Class)
    )
    assert browse_path_count == 3

    paths = _get_browse_paths_from_wu(new_wus)
    assert paths["a"] == []
    # Container name -> expected chain of ancestor containers.
    for name, ancestors in {"b": ["a"], "c": ["a", "b"]}.items():
        assert paths[name] == _make_container_browse_path_entries(ancestors)
|
2023-06-09 13:35:54 -04:00
|
|
|
|
|
|
|
|
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_with_platform_instance(telemetry_ping_mock):
    """With a platform instance configured, every path starts with the platform instance entry."""
    platform = "my_platform"
    platform_instance = "my_instance"
    platform_instance_urn = make_dataplatform_instance_urn(platform, platform_instance)
    platform_instance_entry = models.BrowsePathEntryClass(
        platform_instance_urn, platform_instance_urn
    )

    structure = {"a": {"b": ["c"]}}
    wus = list(auto_status_aspect(_create_container_aspects(structure)))

    new_wus = list(
        auto_browse_path_v2(
            wus,
            platform=platform,
            platform_instance=platform_instance,
        )
    )
    assert telemetry_ping_mock.call_count == 0

    browse_path_count = sum(
        1 for wu in new_wus if wu.get_aspect_of_type(models.BrowsePathsV2Class)
    )
    assert browse_path_count == 3

    paths = _get_browse_paths_from_wu(new_wus)
    # Container name -> chain of ancestor containers expected after the instance entry.
    ancestors_by_name = {"a": [], "b": ["a"], "c": ["a", "b"]}
    for name, ancestors in ancestors_by_name.items():
        assert paths[name] == [
            platform_instance_entry,
            *_make_container_browse_path_entries(ancestors),
        ]
|
|
|
|
|
|
|
|
|
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_invalid_batch_telemetry(telemetry_ping_mock):
    """A browse path emitted apart from its urn's Container aspect is reported as out of batch."""
    structure = {"a": {"b": ["c"]}}
    b_urn = make_container_urn("b")

    # Browse path for b arrives after the whole hierarchy, away from its Container aspect.
    late_browse_path = MetadataChangeProposalWrapper(
        entityUrn=b_urn,
        aspect=models.BrowsePathsClass(paths=["/one/two"]),
    ).as_workunit()
    wus = list(_create_container_aspects(structure))
    wus.append(late_browse_path)
    wus = list(auto_status_aspect(wus))

    assert telemetry_ping_mock.call_count == 0
    _ = list(auto_browse_path_v2(wus))
    assert telemetry_ping_mock.call_count == 1

    ping_args = telemetry_ping_mock.call_args_list[0][0]
    assert ping_args[0] == "incorrect_browse_path_v2"
    assert ping_args[1]["num_out_of_order"] == 0
    assert ping_args[1]["num_out_of_batch"] == 1
|
|
|
|
|
|
|
|
|
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_no_invalid_batch_telemetry_for_unrelated_aspects(
    telemetry_ping_mock,
):
    """An unrelated aspect emitted apart from its urn's Container aspect triggers no telemetry."""
    structure = {"a": {"b": ["c"]}}
    b_urn = make_container_urn("b")

    # Container properties for b arrive after the whole hierarchy, away from its Container aspect.
    late_properties = MetadataChangeProposalWrapper(
        entityUrn=b_urn,
        aspect=models.ContainerPropertiesClass("container name"),
    ).as_workunit()
    wus = list(_create_container_aspects(structure))
    wus.append(late_properties)
    wus = list(auto_status_aspect(wus))

    assert telemetry_ping_mock.call_count == 0
    _ = list(auto_browse_path_v2(wus))
    assert telemetry_ping_mock.call_count == 0
|
|
|
|
|
|
|
|
|
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_invalid_order_telemetry(telemetry_ping_mock):
    """A reversed work unit stream still yields browse paths and is reported as out of order."""
    structure = {"a": {"b": ["c"]}}
    reversed_stream = list(_create_container_aspects(structure))
    reversed_stream.reverse()
    wus = list(auto_status_aspect(reversed_stream))

    assert telemetry_ping_mock.call_count == 0
    new_wus = list(auto_browse_path_v2(wus))
    browse_path_count = sum(
        1 for wu in new_wus if wu.get_aspect_of_type(models.BrowsePathsV2Class)
    )
    assert browse_path_count > 0
    assert telemetry_ping_mock.call_count == 1

    ping_args = telemetry_ping_mock.call_args_list[0][0]
    assert ping_args[0] == "incorrect_browse_path_v2"
    assert ping_args[1]["num_out_of_order"] == 1
    assert ping_args[1]["num_out_of_batch"] == 0
|
|
|
|
|
|
|
|
|
|
|
|
@patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping")
def test_auto_browse_path_v2_dry_run(telemetry_ping_mock):
    """With dry_run=True the stream is returned unchanged while telemetry is still pinged once."""
    structure = {"a": {"b": ["c"]}}
    reversed_stream = list(_create_container_aspects(structure))
    reversed_stream.reverse()
    wus = list(auto_status_aspect(reversed_stream))

    assert telemetry_ping_mock.call_count == 0
    new_wus = list(auto_browse_path_v2(wus, dry_run=True))
    assert wus == new_wus

    browse_path_count = sum(
        1 for wu in new_wus if wu.get_aspect_of_type(models.BrowsePathsV2Class)
    )
    assert browse_path_count == 0
    assert telemetry_ping_mock.call_count == 1
|
2023-06-22 17:07:50 -04:00
|
|
|
|
|
|
|
|
|
|
|
@freeze_time("2023-01-02 00:00:00")
def test_auto_empty_dataset_usage_statistics(caplog: pytest.LogCaptureFixture) -> None:
    """A dataset without usage statistics gets one zeroed aspect appended, with no warnings logged."""
    has_urn = make_dataset_urn("my_platform", "has_aspect")
    empty_urn = make_dataset_urn("my_platform", "no_aspect")
    config = BaseTimeWindowConfig()

    existing_stats = models.DatasetUsageStatisticsClass(
        timestampMillis=int(config.start_time.timestamp() * 1000),
        eventGranularity=models.TimeWindowSizeClass(
            models.CalendarIntervalClass.DAY
        ),
        uniqueUserCount=1,
        totalSqlQueries=1,
    )
    wus = [
        MetadataChangeProposalWrapper(
            entityUrn=has_urn,
            aspect=existing_stats,
        ).as_workunit()
    ]

    caplog.clear()
    with caplog.at_level(logging.WARNING):
        new_wus = list(
            auto_empty_dataset_usage_statistics(
                wus,
                dataset_urns={has_urn, empty_urn},
                config=config,
                all_buckets=False,
            )
        )
        assert not caplog.records

    # Zeroed statistics expected for the dataset that had no aspect of its own.
    expected_empty = MetadataChangeProposalWrapper(
        entityUrn=empty_urn,
        aspect=models.DatasetUsageStatisticsClass(
            timestampMillis=int(datetime(2023, 1, 1).timestamp() * 1000),
            eventGranularity=models.TimeWindowSizeClass(
                models.CalendarIntervalClass.DAY
            ),
            uniqueUserCount=0,
            totalSqlQueries=0,
            topSqlQueries=[],
            userCounts=[],
            fieldCounts=[],
        ),
    ).as_workunit()

    assert new_wus == [*wus, expected_empty]
|
|
|
|
|
|
|
|
|
|
|
|
@freeze_time("2023-01-02 00:00:00")
def test_auto_empty_dataset_usage_statistics_invalid_timestamp(
    caplog: pytest.LogCaptureFixture,
) -> None:
    """A usage aspect stamped at epoch 0 logs one warning and a zeroed aspect is still appended."""
    urn = make_dataset_urn("my_platform", "my_dataset")
    config = BaseTimeWindowConfig()

    # timestampMillis=0 is the "invalid" timestamp under test.
    epoch_stats = models.DatasetUsageStatisticsClass(
        timestampMillis=0,
        eventGranularity=models.TimeWindowSizeClass(
            models.CalendarIntervalClass.DAY
        ),
        uniqueUserCount=1,
        totalSqlQueries=1,
    )
    wus = [
        MetadataChangeProposalWrapper(
            entityUrn=urn,
            aspect=epoch_stats,
        ).as_workunit()
    ]

    caplog.clear()
    with caplog.at_level(logging.WARNING):
        new_wus = list(
            auto_empty_dataset_usage_statistics(
                wus,
                dataset_urns={urn},
                config=config,
                all_buckets=True,
            )
        )
        assert len(caplog.records) == 1
        assert "1970-01-01 00:00:00+00:00" in caplog.records[0].msg

    # Zeroed statistics expected at the configured start time.
    expected_empty = MetadataChangeProposalWrapper(
        entityUrn=urn,
        aspect=models.DatasetUsageStatisticsClass(
            timestampMillis=int(config.start_time.timestamp() * 1000),
            eventGranularity=models.TimeWindowSizeClass(
                models.CalendarIntervalClass.DAY
            ),
            uniqueUserCount=0,
            totalSqlQueries=0,
            topSqlQueries=[],
            userCounts=[],
            fieldCounts=[],
        ),
        changeType=models.ChangeTypeClass.CREATE,
    ).as_workunit()

    assert new_wus == [*wus, expected_empty]
|
2024-09-23 22:36:05 +05:30
|
|
|
|
|
|
|
|
|
|
|
def get_sample_mcps(
    mcps_to_append: Iterable[MetadataChangeProposalWrapper] = (),
) -> List[MetadataChangeProposalWrapper]:
    """Return two Operation MCPs for the same dbt dataset, plus any extra MCPs.

    The two operations carry ``lastUpdatedTimestamp`` values of 12 and 20.

    Args:
        mcps_to_append: Extra MCPs appended after the two sample operations.
            Defaults to an immutable empty tuple rather than a shared list.

    Returns:
        A new list on every call; the argument itself is never modified.
    """
    dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:dbt,abc.foo.bar,PROD)"
    mcps = [
        MetadataChangeProposalWrapper(
            entityUrn=dataset_urn,
            aspect=models.OperationClass(
                timestampMillis=10,
                lastUpdatedTimestamp=12,
                operationType=OperationTypeClass.CREATE,
            ),
        ),
        MetadataChangeProposalWrapper(
            entityUrn=dataset_urn,
            aspect=models.OperationClass(
                timestampMillis=11,
                lastUpdatedTimestamp=20,
                operationType=OperationTypeClass.CREATE,
            ),
        ),
    ]
    mcps.extend(mcps_to_append)
    return mcps
|
|
|
|
|
|
|
|
|
|
|
|
def to_patch_work_units(patch_builder: DatasetPatchBuilder) -> List[MetadataWorkUnit]:
    """Wrap every MCP produced by ``patch_builder.build()`` in a MetadataWorkUnit.

    Each work unit id comes from ``MetadataWorkUnit.generate_workunit_id``.
    """
    work_units: List[MetadataWorkUnit] = []
    for patch_mcp in patch_builder.build():
        work_unit_id = MetadataWorkUnit.generate_workunit_id(patch_mcp)
        work_units.append(MetadataWorkUnit(id=work_unit_id, mcp_raw=patch_mcp))
    return work_units
|
|
|
|
|
|
|
|
|
|
|
|
def get_auto_generated_wu() -> List[MetadataWorkUnit]:
    """Return the patch work units that set lastModified to time=20 on the sample dbt dataset."""
    builder = DatasetPatchBuilder(
        urn="urn:li:dataset:(urn:li:dataPlatform:dbt,abc.foo.bar,PROD)"
    )
    # Use whatever set_last_modified returns, exactly as a chained call would.
    builder_with_timestamp = builder.set_last_modified(TimeStampClass(time=20))
    return to_patch_work_units(builder_with_timestamp)
|
|
|
|
|
|
|
|
|
|
|
|
@freeze_time("2023-01-02 00:00:00")
def test_auto_patch_last_modified_no_change():
    """A stream holding only a container Status aspect passes through unchanged."""
    status_mcp = MetadataChangeProposalWrapper(
        entityUrn="urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a",
        aspect=models.StatusClass(removed=False),
    )
    initial_wu = list(auto_workunit([status_mcp]))

    # There should be no change
    processed = list(auto_patch_last_modified(initial_wu))
    assert processed == initial_wu
|
|
|
|
|
|
|
|
|
|
|
|
@freeze_time("2023-01-02 00:00:00")
def test_auto_patch_last_modified_max_last_updated_timestamp():
    """Operation aspects lead to one extra lastModified patch carrying the largest timestamp."""
    mcps = get_sample_mcps()

    # The expected output is the source work units followed by a datasetProperties
    # patch whose lastModified is the max of operation.lastUpdatedTimestamp, i.e. 20.
    expected = [*auto_workunit(mcps), *get_auto_generated_wu()]

    assert list(auto_patch_last_modified(auto_workunit(mcps))) == expected
|
|
|
|
|
|
|
|
|
|
|
|
@freeze_time("2023-01-02 00:00:00")
def test_auto_patch_last_modified_multi_patch():
    """A source patch without lastModified is kept and a separate lastModified patch is appended."""
    mcps = get_sample_mcps()

    # Source-side patch that sets only the display name and the description.
    source_patch_builder = DatasetPatchBuilder(
        urn="urn:li:dataset:(urn:li:dataPlatform:dbt,abc.foo.bar,PROD)"
    )
    source_patch_builder.set_display_name("foo")
    source_patch_builder.set_description("it is fake")
    source_patches = to_patch_work_units(source_patch_builder)

    work_units = list(auto_workunit(mcps)) + source_patches

    # The final stream holds two patch units: the one coming from the source
    # and the one generated by auto_patch_last_modified.
    expected = work_units + get_auto_generated_wu()

    assert list(auto_patch_last_modified(work_units)) == expected
|
|
|
|
|
|
|
|
|
|
|
|
@freeze_time("2023-01-02 00:00:00")
def test_auto_patch_last_modified_last_modified_patch_exist():
    """No extra patch is generated when the source patch already sets lastModified."""
    mcps = get_sample_mcps()

    properties_with_last_modified = DatasetPropertiesClass(
        name="foo",
        description="dataset for collection of foo",
        lastModified=TimeStampClass(time=20),
    )
    patch_builder = create_dataset_props_patch_builder(
        dataset_urn="urn:li:dataset:(urn:li:dataPlatform:dbt,abc.foo.bar,PROD)",
        dataset_properties=properties_with_last_modified,
    )

    work_units = list(auto_workunit(mcps)) + to_patch_work_units(patch_builder)

    # The source already patches datasetProperties.lastModified, so input and
    # output must be identical: `auto_patch_last_modified` adds nothing.
    assert list(auto_patch_last_modified(work_units)) == work_units
|
|
|
|
|
|
|
|
|
|
|
|
@freeze_time("2023-01-02 00:00:00")
def test_auto_patch_last_modified_last_modified_patch_not_exist():
    """An extra lastModified patch is appended when the source patch does not set it."""
    mcps = get_sample_mcps()

    properties_without_last_modified = DatasetPropertiesClass(
        name="foo",
        description="dataset for collection of foo",
    )
    patch_builder = create_dataset_props_patch_builder(
        dataset_urn="urn:li:dataset:(urn:li:dataPlatform:dbt,abc.foo.bar,PROD)",
        dataset_properties=properties_without_last_modified,
    )

    work_units = list(auto_workunit(mcps)) + to_patch_work_units(patch_builder)

    # The output should include an additional patch for the `lastModified` attribute.
    expected = work_units + get_auto_generated_wu()

    assert list(auto_patch_last_modified(work_units)) == expected
|