feat(ingest): Create Browse Paths V2 under flag (#8120)

Authored by Andrew Sikowitz on 2023-06-02 15:50:38 -04:00; committed by GitHub.
parent fe1ff71318
commit 802c91a0a7
12 changed files with 231 additions and 79 deletions

View File

@@ -8,7 +8,7 @@ import unittest.mock
 from dataclasses import Field, dataclass, field
 from enum import auto
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Iterable

 import avro.schema
 import click
@@ -27,7 +27,6 @@ from datahub.metadata.schema_classes import (
     DatasetSnapshotClass,
     ForeignKeyConstraintClass,
     GlobalTagsClass,
-    MetadataChangeEventClass,
     OtherSchemaClass,
     SchemaFieldClass as SchemaField,
     SchemaFieldDataTypeClass,
@@ -35,6 +34,8 @@ from datahub.metadata.schema_classes import (
     StringTypeClass,
     SubTypesClass,
     TagAssociationClass,
+    BrowsePathsV2Class,
+    BrowsePathEntryClass,
 )

 logger = logging.getLogger(__name__)
@@ -316,9 +317,10 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
     raise Exception(f"Failed to find information for entity: {entity_name}")


-def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
+def generate_stitched_record(
+    relnships_graph: RelationshipGraph,
+) -> Iterable[MetadataChangeProposalWrapper]:
     def strip_types(field_path: str) -> str:
         final_path = field_path
         final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
         final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
@@ -455,52 +457,41 @@ def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
                 edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
             )

-        schemaMetadata = SchemaMetadataClass(
-            schemaName=f"{entity_name}",
-            platform=make_data_platform_urn("datahub"),
-            platformSchema=OtherSchemaClass(rawSchema=rawSchema),
-            fields=schema_fields,
-            version=0,
-            hash="",
-            foreignKeys=foreign_keys if foreign_keys else None,
-        )
-        dataset = DatasetSnapshotClass(
-            urn=make_dataset_urn(
-                platform="datahub",
-                name=f"{entity_display_name}",
-            ),
-            aspects=[
-                schemaMetadata,
-                GlobalTagsClass(
-                    tags=[TagAssociationClass(tag="urn:li:tag:Entity")]
-                ),
-                BrowsePathsClass([f"/prod/datahub/entities/{entity_display_name}"]),
-            ],
-        )
-        datasets.append(dataset)
-
-    events: List[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] = []
-    for d in datasets:
-        entity_name = d.urn.split(":")[-1].split(",")[1]
-        d.aspects.append(
-            DatasetPropertiesClass(
-                description=make_entity_docs(entity_name, relnships_graph)
-            )
-        )
-        mce = MetadataChangeEventClass(
-            proposedSnapshot=d,
-        )
-        events.append(mce)
-
-        mcp = MetadataChangeProposalWrapper(
-            entityUrn=d.urn,
-            aspect=SubTypesClass(typeNames=["entity"]),
-        )
-        events.append(mcp)
-    return events
+        dataset_urn = make_dataset_urn(
+            platform="datahub",
+            name=entity_display_name,
+        )
+        yield from MetadataChangeProposalWrapper.construct_many(
+            entityUrn=dataset_urn,
+            aspects=[
+                SchemaMetadataClass(
+                    schemaName=str(entity_name),
+                    platform=make_data_platform_urn("datahub"),
+                    platformSchema=OtherSchemaClass(rawSchema=rawSchema),
+                    fields=schema_fields,
+                    version=0,
+                    hash="",
+                    foreignKeys=foreign_keys if foreign_keys else None,
+                ),
+                GlobalTagsClass(
+                    tags=[TagAssociationClass(tag="urn:li:tag:Entity")]
+                ),
+                BrowsePathsClass([f"/prod/datahub/entities/{entity_display_name}"]),
+                BrowsePathsV2Class(
+                    [
+                        BrowsePathEntryClass(id="entities"),
+                        BrowsePathEntryClass(id=entity_display_name),
+                    ]
+                ),
+                DatasetPropertiesClass(
+                    description=make_entity_docs(
+                        dataset_urn.split(":")[-1].split(",")[1], relnships_graph
+                    )
+                ),
+                SubTypesClass(typeNames=["entity"]),
+            ],
+        )
 class EntityRegistry(ConfigModel):

@@ -614,7 +605,7 @@ def generate(
     ]

     relationship_graph = RelationshipGraph()
-    events = generate_stitched_record(relationship_graph)
+    mcps = generate_stitched_record(relationship_graph)

     shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
     entity_names = [(x, entity_registry[x]) for x in generated_documentation]
@@ -645,7 +636,7 @@ def generate(
         PipelineContext(run_id="generated-metaModel"),
         FileSinkConfig(filename=file),
     )
-    for e in events:
+    for e in mcps:
         fileSink.write_record_async(
             RecordEnvelope(e, metadata={}), write_callback=NoopWriteCallback()
         )
@@ -674,7 +665,7 @@ def generate(
     assert server.startswith("http://"), "server address must start with http://"
     emitter = DatahubRestEmitter(gms_server=server)
     emitter.test_connection()
-    for e in events:
+    for e in mcps:
         emitter.emit(e)

     if dot:
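Note on the rewrite above: instead of assembling a DatasetSnapshot, wrapping it in an MCE, and appending a trailing MCP per dataset, the docgen script now yields one MetadataChangeProposalWrapper per aspect. A minimal sketch of that construct_many pattern; the urn and aspects below are illustrative, not from this commit:

    from datahub.emitter.mce_builder import make_dataset_urn
    from datahub.emitter.mcp import MetadataChangeProposalWrapper
    from datahub.metadata.schema_classes import StatusClass, SubTypesClass

    # One entity urn, several aspects -> one MCP per aspect.
    urn = make_dataset_urn(platform="datahub", name="example")  # hypothetical dataset
    for mcp in MetadataChangeProposalWrapper.construct_many(
        entityUrn=urn,
        aspects=[StatusClass(removed=False), SubTypesClass(typeNames=["entity"])],
    ):
        print(mcp.entityUrn, type(mcp.aspect).__name__)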

View File

@@ -27,6 +27,7 @@ from datahub.ingestion.api.closeable import Closeable
 from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
 from datahub.ingestion.api.report import Report
 from datahub.ingestion.api.source_helpers import (
+    auto_browse_path_v2,
     auto_materialize_referenced_tags,
     auto_status_aspect,
     auto_workunit_reporter,
@@ -181,10 +182,31 @@ class Source(Closeable, metaclass=ABCMeta):
         """A list of functions that transforms the workunits produced by this source.
         Run in order, first in list is applied first. Be careful with order when overriding.
         """
+        browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
+        if (
+            self.ctx.pipeline_config
+            and self.ctx.pipeline_config.flags.generate_browse_path_v2
+        ):
+            platform = getattr(self, "platform", None) or getattr(
+                self.get_config(), "platform", None
+            )
+            env = getattr(self.get_config(), "env", None)
+            browse_path_drop_dirs = [
+                platform,
+                platform and platform.lower(),
+                env,
+                env and env.lower(),
+            ]
+            browse_path_processor = partial(
+                auto_browse_path_v2,
+                [s for s in browse_path_drop_dirs if s is not None],
+            )
+
         return [
             auto_status_aspect,
             auto_materialize_referenced_tags,
             partial(auto_workunit_reporter, self.get_report()),
+            browse_path_processor,
         ]

     @staticmethod
@@ -207,6 +229,18 @@
             "get_workunits_internal must be implemented if get_workunits is not overriden."
         )

+    def get_config(self) -> Optional[ConfigModel]:
+        """Overridable method to return the config object for this source.
+
+        Enables defining workunit processors in this class, rather than per source.
+        More broadly, this method contributes to the standardization of sources,
+        to promote more source-generic functionality.
+
+        Eventually, we would like to replace this call with a Protocol that requires
+        a config object to be defined on each source.
+        """
+        return getattr(self, "config", None) or getattr(self, "source_config", None)
+
     @abstractmethod
     def get_report(self) -> SourceReport:
         pass
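For context, the processor list built above is applied to the workunit stream in order; browse_path_processor stays None unless the flag is set, and None entries are presumably skipped when the processors are applied. A rough sketch of how the partial binds drop_dirs, assuming a source whose platform is "mode" running in a PROD env (values illustrative):

    from functools import partial
    from datahub.ingestion.api.source_helpers import auto_browse_path_v2

    # What a source would contribute via its class-level platform attribute and
    # config.env; both original and lowercased forms are dropped, mirroring
    # browse_path_drop_dirs above.
    drop_dirs = ["mode", "mode", "PROD", "prod"]
    browse_path_processor = partial(
        auto_browse_path_v2, [s for s in drop_dirs if s is not None]
    )
    # Processors run first-to-last: status -> tags -> reporter -> browse paths v2.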

View File

@@ -7,6 +7,7 @@ from typing import (
     Iterable,
     List,
     Optional,
+    Sequence,
     Set,
     TypeVar,
     Union,
@@ -16,6 +17,7 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     BrowsePathEntryClass,
+    BrowsePathsClass,
     BrowsePathsV2Class,
     ContainerClass,
     MetadataChangeEventClass,
@@ -164,12 +166,13 @@ def auto_materialize_referenced_tags(


 def auto_browse_path_v2(
+    drop_dirs: Sequence[str],
     stream: Iterable[MetadataWorkUnit],
 ) -> Iterable[MetadataWorkUnit]:
-    """Generate BrowsePathsV2 from Container aspects."""
-    # TODO: Generate BrowsePathsV2 from BrowsePaths as well
+    """Generate BrowsePathsV2 from Container and BrowsePaths aspects."""

     ignore_urns: Set[str] = set()
+    legacy_browse_paths: Dict[str, List[str]] = defaultdict(list)
     container_urns: Set[str] = set()
     parent_container_map: Dict[str, str] = {}
     children: Dict[str, List[str]] = defaultdict(list)
@@ -181,16 +184,25 @@
             container_urns.add(urn)

         container_aspects = wu.get_aspects_of_type(ContainerClass)
-        for aspect in container_aspects:
-            parent = aspect.container
+        for c_aspect in container_aspects:
+            parent = c_aspect.container
             parent_container_map[urn] = parent
             children[parent].append(urn)

+        browse_path_aspects = wu.get_aspects_of_type(BrowsePathsClass)
+        for b_aspect in browse_path_aspects:
+            if b_aspect.paths:
+                path = b_aspect.paths[0]  # Only take first path
+                legacy_browse_paths[urn] = [
+                    p for p in path.strip("/").split("/") if p.strip() not in drop_dirs
+                ]
+
         if wu.get_aspects_of_type(BrowsePathsV2Class):
             ignore_urns.add(urn)

     paths: Dict[str, List[str]] = {}  # Maps urn -> list of urns in path
     # Yield browse paths v2 in topological order, starting with root containers
+    processed_urns = set()
     nodes = container_urns - parent_container_map.keys()
     while nodes:
         node = nodes.pop()
@@ -208,3 +220,14 @@
                 path=[BrowsePathEntryClass(id=urn, urn=urn) for urn in paths[node]]
             ),
         ).as_workunit()
+        processed_urns.add(node)
+
+    # Yield browse paths v2 based on browse paths v1 (legacy)
+    # Only done if the entity is not part of a container hierarchy
+    for urn in legacy_browse_paths.keys() - processed_urns - ignore_urns:
+        yield MetadataChangeProposalWrapper(
+            entityUrn=urn,
+            aspect=BrowsePathsV2Class(
+                path=[BrowsePathEntryClass(id=p) for p in legacy_browse_paths[urn]]
+            ),
+        ).as_workunit()
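The legacy branch above is plain string manipulation; a self-contained sketch of the conversion it performs, with illustrative values:

    # Split a v1 browse path on "/" and drop platform/env segments; each
    # remaining segment becomes a BrowsePathEntryClass with no urn.
    drop_dirs = {"mode", "prod"}
    path = "/prod/mode/marketing/kpis"
    entries = [p for p in path.strip("/").split("/") if p.strip() not in drop_dirs]
    assert entries == ["marketing", "kpis"]

Only the first v1 path is considered, and entities that already carry a BrowsePathsV2 aspect or sit in a container hierarchy are left alone.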

View File

@@ -37,6 +37,19 @@ class FailureLoggingConfig(ConfigModel):
     log_config: Optional[FileSinkConfig] = None


+class FlagsConfig(ConfigModel):
+    """Experimental flags for the ingestion pipeline.
+
+    As ingestion flags are an experimental feature, we do not guarantee backwards compatibility.
+    Use at your own risk!
+    """
+
+    generate_browse_path_v2: bool = Field(
+        default=False,
+        description="Generate BrowsePathsV2 aspects from container hierarchy and existing BrowsePaths aspects.",
+    )
+
+
 class PipelineConfig(ConfigModel):
     # Once support for discriminated unions gets merged into Pydantic, we can
     # simplify this configuration and validation.
@@ -45,6 +58,7 @@ class PipelineConfig(ConfigModel):
     source: SourceConfig
     sink: DynamicTypedConfig
     transformers: Optional[List[DynamicTypedConfig]]
+    flags: FlagsConfig = Field(default=FlagsConfig())
     reporting: List[ReporterConfig] = []
     run_id: str = DEFAULT_RUN_ID
     datahub_api: Optional[DatahubClientConfig] = None
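Because flags defaults to FlagsConfig(), existing recipes are unaffected; opting in is a one-line addition. A hedged sketch of a recipe dict handed to Pipeline.create — the demo-data source and console sink are stand-ins, not part of this commit:

    from datahub.ingestion.run.pipeline import Pipeline

    pipeline = Pipeline.create(
        {
            "run_id": "browse-paths-v2-demo",
            "source": {"type": "demo-data", "config": {}},
            # Experimental: emit BrowsePathsV2 alongside existing aspects.
            "flags": {"generate_browse_path_v2": True},
            "sink": {"type": "console"},
        }
    )
    pipeline.run()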

View File

@@ -41,6 +41,7 @@ class SagemakerSource(Source):
     - Models, jobs, and lineage between the two (e.g. when jobs output a model or a model is used by a job)
     """

+    platform = "sagemaker"
     source_config: SagemakerSourceConfig
     report = SagemakerSourceReport()

View File

@@ -116,6 +116,7 @@ class FeastRepositorySource(Source):
     - Column types associated with each entity and feature
     """

+    platform = "feast"
     source_config: FeastRepositorySourceConfig
     report: SourceReport
     feature_store: FeatureStore

View File

@@ -13,7 +13,7 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
 from pydantic import validator
 from pydantic.fields import Field

-from datahub.configuration.common import ConfigurationError
+from datahub.configuration.common import ConfigModel, ConfigurationError
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -53,7 +53,7 @@ from datahub.metadata.schema_classes import (
 logger = logging.getLogger(__name__)


-class OktaConfig(StatefulIngestionConfigBase):
+class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
     # Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
     okta_domain: str = Field(
         description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",

View File

@@ -160,7 +160,7 @@ class ModeSource(Source):
     config: ModeConfig
     report: SourceReport

-    tool = "mode"
+    platform = "mode"

     def __hash__(self):
         return id(self)
@@ -200,7 +200,9 @@
         self, space_name: str, report_info: dict
     ) -> DashboardSnapshot:
         report_token = report_info.get("token", "")
-        dashboard_urn = builder.make_dashboard_urn(self.tool, report_info.get("id", ""))
+        dashboard_urn = builder.make_dashboard_urn(
+            self.platform, report_info.get("id", "")
+        )
         dashboard_snapshot = DashboardSnapshot(
             urn=dashboard_urn,
             aspects=[],
@@ -304,7 +306,9 @@
         charts = self._get_charts(report_token, query.get("token", ""))
         # build chart urns
         for chart in charts:
-            chart_urn = builder.make_chart_urn(self.tool, chart.get("token", ""))
+            chart_urn = builder.make_chart_urn(
+                self.platform, chart.get("token", "")
+            )
             chart_urns.append(chart_urn)

         return chart_urns
@@ -580,7 +584,7 @@
     def construct_chart_from_api_data(
         self, chart_data: dict, query: dict, path: str
     ) -> ChartSnapshot:
-        chart_urn = builder.make_chart_urn(self.tool, chart_data.get("token", ""))
+        chart_urn = builder.make_chart_urn(self.platform, chart_data.get("token", ""))
         chart_snapshot = ChartSnapshot(
             urn=chart_urn,
             aspects=[],
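The tool -> platform rename is more than style: the class-level attribute is what Source.get_workunit_processors reads to seed drop_dirs, and the same string is baked into every urn the source mints. A quick sketch with datahub's builder helpers; the token values are made up:

    import datahub.emitter.mce_builder as builder

    chart_urn = builder.make_chart_urn("mode", "abc123")
    dashboard_urn = builder.make_dashboard_urn("mode", "report-1")
    assert chart_urn == "urn:li:chart:(mode,abc123)"
    assert dashboard_urn == "urn:li:dashboard:(mode,report-1)"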

View File

@@ -8,6 +8,7 @@ import requests
 from pydantic.class_validators import root_validator, validator
 from pydantic.fields import Field

+from datahub.configuration import ConfigModel
 from datahub.emitter.mce_builder import DEFAULT_ENV
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -69,7 +70,7 @@ chart_type_from_viz_type = {
 }


-class SupersetConfig(StatefulIngestionConfigBase):
+class SupersetConfig(StatefulIngestionConfigBase, ConfigModel):
     # See the Superset /security/login endpoint for details
     # https://superset.apache.org/docs/rest-api
     connect_uri: str = Field(

View File

@@ -332,7 +332,8 @@ class AddStatusRemovedTransformer(Transformer):

 class FakeSource(Source):
-    def __init__(self):
+    def __init__(self, ctx: PipelineContext):
+        super().__init__(ctx)
         self.source_report = SourceReport()
         self.work_units: List[MetadataWorkUnit] = [
             MetadataWorkUnit(id="workunit-1", mce=get_initial_mce())
@@ -341,7 +342,7 @@ class FakeSource(Source):
     @classmethod
     def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
         assert not config_dict
-        return cls()
+        return cls(ctx)

     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
         return self.work_units
@@ -354,8 +355,8 @@

 class FakeSourceWithWarnings(FakeSource):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, ctx: PipelineContext):
+        super().__init__(ctx)
         self.source_report.report_warning("test_warning", "warning_text")

     def get_report(self) -> SourceReport:
@@ -363,8 +364,8 @@ class FakeSourceWithWarnings(FakeSource):

 class FakeSourceWithFailures(FakeSource):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, ctx: PipelineContext):
+        super().__init__(ctx)
         self.source_report.report_failure("test_failure", "failure_text")

     def get_report(self) -> SourceReport:

View File

@@ -27,6 +27,7 @@ class FakeSource(Source):
     ]

     def __init__(self, ctx: PipelineContext):
+        super().__init__(ctx)
         self.source_report = SourceReport()

     @classmethod
@classmethod @classmethod

View File

@@ -1,7 +1,7 @@
 from typing import Any, Dict, Iterable, List, Union

 import datahub.metadata.schema_classes as models
-from datahub.emitter.mce_builder import make_container_urn
+from datahub.emitter.mce_builder import make_container_urn, make_dataset_urn
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.source_helpers import (
     auto_browse_path_v2,
@@ -100,13 +100,19 @@ def _create_container_aspects(d: Dict[str, Any]) -> Iterable[MetadataWorkUnit]:
         yield from _create_container_aspects(v)


-def _make_browse_path_entries(path: List[str]) -> List[models.BrowsePathEntryClass]:
+def _make_container_browse_path_entries(
+    path: List[str],
+) -> List[models.BrowsePathEntryClass]:
     return [
         models.BrowsePathEntryClass(id=make_container_urn(s), urn=make_container_urn(s))
         for s in path
     ]


+def _make_browse_path_entries(path: List[str]) -> List[models.BrowsePathEntryClass]:
+    return [models.BrowsePathEntryClass(id=s, urn=None) for s in path]
+
+
 def _get_browse_paths_from_wu(
     stream: Iterable[MetadataWorkUnit],
 ) -> Dict[str, List[models.BrowsePathEntryClass]]:
@@ -119,7 +125,7 @@ def _get_browse_paths_from_wu(
     return paths


-def test_auto_browse_path_v2():
+def test_auto_browse_path_v2_by_container_hierarchy():
     structure = {
         "one": {
             "a": {"i": ["1", "2", "3"], "ii": ["4"]},
@@ -137,7 +143,7 @@ def test_auto_browse_path_v2():
         sum(len(wu.get_aspects_of_type(models.StatusClass)) for wu in wus) == 21
     )

-    new_wus = list(auto_browse_path_v2(wus))
+    new_wus = list(auto_browse_path_v2([], wus))
     assert (
         sum(len(wu.get_aspects_of_type(models.BrowsePathsV2Class)) for wu in new_wus)
         == 21
@@ -145,29 +151,104 @@ def test_auto_browse_path_v2():
     paths = _get_browse_paths_from_wu(new_wus)
     assert paths["one"] == []
-    assert paths["7"] == paths["8"] == _make_browse_path_entries(["two", "c", "v"])
-    assert paths["d"] == _make_browse_path_entries(["three"])
-    assert paths["i"] == _make_browse_path_entries(["one", "a"])
+    assert (
+        paths["7"]
+        == paths["8"]
+        == _make_container_browse_path_entries(["two", "c", "v"])
+    )
+    assert paths["d"] == _make_container_browse_path_entries(["three"])
+    assert paths["i"] == _make_container_browse_path_entries(["one", "a"])


 def test_auto_browse_path_v2_ignores_urns_already_with():
     structure = {"a": {"b": {"c": {"d": ["e"]}}}}

-    mcp = MetadataChangeProposalWrapper(
-        entityUrn=make_container_urn("c"),
-        aspect=models.BrowsePathsV2Class(
-            path=_make_browse_path_entries(["custom", "path"])
-        ),
-    )
-    wus = [*auto_status_aspect(_create_container_aspects(structure)), mcp.as_workunit()]
-
-    new_wus = list(auto_browse_path_v2(wus))
+    mcps = [
+        *MetadataChangeProposalWrapper.construct_many(
+            entityUrn=make_container_urn("f"),
+            aspects=[
+                models.BrowsePathsClass(paths=["/one/two"]),
+                models.BrowsePathsV2Class(
+                    path=_make_browse_path_entries(["my", "path"])
+                ),
+            ],
+        ),
+        MetadataChangeProposalWrapper(
+            entityUrn=make_container_urn("c"),
+            aspect=models.BrowsePathsV2Class(
+                path=_make_container_browse_path_entries(["custom", "path"])
+            ),
+        ),
+    ]
+    wus = [
+        *auto_status_aspect(
+            [
+                *_create_container_aspects(structure),
+                *(mcp.as_workunit() for mcp in mcps),
+            ]
+        )
+    ]
+
+    new_wus = list(auto_browse_path_v2([], wus))
     assert (
         sum(len(wu.get_aspects_of_type(models.BrowsePathsV2Class)) for wu in new_wus)
-        == 5
+        == 6
     )

     paths = _get_browse_paths_from_wu(new_wus)
     assert paths["a"] == []
-    assert paths["c"] == _make_browse_path_entries(["custom", "path"])
-    assert paths["e"] == _make_browse_path_entries(["a", "b", "c", "d"])
+    assert paths["c"] == _make_container_browse_path_entries(["custom", "path"])
+    assert paths["e"] == _make_container_browse_path_entries(["a", "b", "c", "d"])
+    assert paths["f"] == _make_browse_path_entries(["my", "path"])
+
+
+def test_auto_browse_path_v2_legacy_browse_path():
+    platform = "platform"
+    env = "PROD"
+    wus = [
+        MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn(platform, "dataset-1", env),
+            aspect=models.BrowsePathsClass(["/one/two"]),
+        ).as_workunit(),
+        MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn(platform, "dataset-2", env),
+            aspect=models.BrowsePathsClass([f"/{platform}/{env}/something"]),
+        ).as_workunit(),
+        MetadataChangeProposalWrapper(
+            entityUrn=make_dataset_urn(platform, "dataset-3", env),
+            aspect=models.BrowsePathsClass([f"/{platform}/one/two"]),
+        ).as_workunit(),
+    ]
+    new_wus = list(auto_browse_path_v2(["platform", "PROD", "unused"], wus))
+    assert len(new_wus) == 6
+    paths = _get_browse_paths_from_wu(new_wus)
+    assert (
+        paths["platform,dataset-1,PROD)"]
+        == paths["platform,dataset-3,PROD)"]
+        == _make_browse_path_entries(["one", "two"])
+    )
+    assert paths["platform,dataset-2,PROD)"] == _make_browse_path_entries(["something"])
+
+
+def test_auto_browse_path_v2_container_over_legacy_browse_path():
+    structure = {"a": {"b": ["c"]}}
+    wus = list(
+        auto_status_aspect(
+            [
+                *_create_container_aspects(structure),
+                MetadataChangeProposalWrapper(
+                    entityUrn=make_container_urn("b"),
+                    aspect=models.BrowsePathsClass(paths=["/one/two"]),
+                ).as_workunit(),
+            ]
+        )
+    )
+    new_wus = list(auto_browse_path_v2([], wus))
+    assert (
+        sum(len(wu.get_aspects_of_type(models.BrowsePathsV2Class)) for wu in new_wus)
+        == 3
+    )
+    paths = _get_browse_paths_from_wu(new_wus)
+    assert paths["a"] == []
+    assert paths["b"] == _make_container_browse_path_entries(["a"])
+    assert paths["c"] == _make_container_browse_path_entries(["a", "b"])