mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-08 15:30:55 +00:00
feat(ingest): Create Browse Paths V2 under flag (#8120)
This commit is contained in:
parent
fe1ff71318
commit
802c91a0a7
@ -8,7 +8,7 @@ import unittest.mock
|
|||||||
from dataclasses import Field, dataclass, field
|
from dataclasses import Field, dataclass, field
|
||||||
from enum import auto
|
from enum import auto
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
from typing import Any, Dict, List, Optional, Tuple, Iterable
|
||||||
|
|
||||||
import avro.schema
|
import avro.schema
|
||||||
import click
|
import click
|
||||||
@ -27,7 +27,6 @@ from datahub.metadata.schema_classes import (
|
|||||||
DatasetSnapshotClass,
|
DatasetSnapshotClass,
|
||||||
ForeignKeyConstraintClass,
|
ForeignKeyConstraintClass,
|
||||||
GlobalTagsClass,
|
GlobalTagsClass,
|
||||||
MetadataChangeEventClass,
|
|
||||||
OtherSchemaClass,
|
OtherSchemaClass,
|
||||||
SchemaFieldClass as SchemaField,
|
SchemaFieldClass as SchemaField,
|
||||||
SchemaFieldDataTypeClass,
|
SchemaFieldDataTypeClass,
|
||||||
@ -35,6 +34,8 @@ from datahub.metadata.schema_classes import (
|
|||||||
StringTypeClass,
|
StringTypeClass,
|
||||||
SubTypesClass,
|
SubTypesClass,
|
||||||
TagAssociationClass,
|
TagAssociationClass,
|
||||||
|
BrowsePathsV2Class,
|
||||||
|
BrowsePathEntryClass,
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -316,9 +317,10 @@ def make_entity_docs(entity_display_name: str, graph: RelationshipGraph) -> str:
|
|||||||
raise Exception(f"Failed to find information for entity: {entity_name}")
|
raise Exception(f"Failed to find information for entity: {entity_name}")
|
||||||
|
|
||||||
|
|
||||||
def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
|
def generate_stitched_record(
|
||||||
|
relnships_graph: RelationshipGraph,
|
||||||
|
) -> Iterable[MetadataChangeProposalWrapper]:
|
||||||
def strip_types(field_path: str) -> str:
|
def strip_types(field_path: str) -> str:
|
||||||
|
|
||||||
final_path = field_path
|
final_path = field_path
|
||||||
final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
|
final_path = re.sub(r"(\[type=[a-zA-Z]+\]\.)", "", final_path)
|
||||||
final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
|
final_path = re.sub(r"^\[version=2.0\]\.", "", final_path)
|
||||||
@ -455,52 +457,41 @@ def generate_stitched_record(relnships_graph: RelationshipGraph) -> List[Any]:
|
|||||||
edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
|
edge_id=f"{entity_display_name}:{fkey.name}:{destination_entity_name}:{strip_types(f_field.fieldPath)}",
|
||||||
)
|
)
|
||||||
|
|
||||||
schemaMetadata = SchemaMetadataClass(
|
dataset_urn = make_dataset_urn(
|
||||||
schemaName=f"{entity_name}",
|
platform="datahub",
|
||||||
|
name=entity_display_name,
|
||||||
|
)
|
||||||
|
|
||||||
|
yield from MetadataChangeProposalWrapper.construct_many(
|
||||||
|
entityUrn=dataset_urn,
|
||||||
|
aspects=[
|
||||||
|
SchemaMetadataClass(
|
||||||
|
schemaName=str(entity_name),
|
||||||
platform=make_data_platform_urn("datahub"),
|
platform=make_data_platform_urn("datahub"),
|
||||||
platformSchema=OtherSchemaClass(rawSchema=rawSchema),
|
platformSchema=OtherSchemaClass(rawSchema=rawSchema),
|
||||||
fields=schema_fields,
|
fields=schema_fields,
|
||||||
version=0,
|
version=0,
|
||||||
hash="",
|
hash="",
|
||||||
foreignKeys=foreign_keys if foreign_keys else None,
|
foreignKeys=foreign_keys if foreign_keys else None,
|
||||||
)
|
|
||||||
|
|
||||||
dataset = DatasetSnapshotClass(
|
|
||||||
urn=make_dataset_urn(
|
|
||||||
platform="datahub",
|
|
||||||
name=f"{entity_display_name}",
|
|
||||||
),
|
),
|
||||||
aspects=[
|
|
||||||
schemaMetadata,
|
|
||||||
GlobalTagsClass(
|
GlobalTagsClass(
|
||||||
tags=[TagAssociationClass(tag="urn:li:tag:Entity")]
|
tags=[TagAssociationClass(tag="urn:li:tag:Entity")]
|
||||||
),
|
),
|
||||||
BrowsePathsClass([f"/prod/datahub/entities/{entity_display_name}"]),
|
BrowsePathsClass([f"/prod/datahub/entities/{entity_display_name}"]),
|
||||||
|
BrowsePathsV2Class(
|
||||||
|
[
|
||||||
|
BrowsePathEntryClass(id="entities"),
|
||||||
|
BrowsePathEntryClass(id=entity_display_name),
|
||||||
|
]
|
||||||
|
),
|
||||||
|
DatasetPropertiesClass(
|
||||||
|
description=make_entity_docs(
|
||||||
|
dataset_urn.split(":")[-1].split(",")[1], relnships_graph
|
||||||
|
)
|
||||||
|
),
|
||||||
|
SubTypesClass(typeNames=["entity"]),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
datasets.append(dataset)
|
|
||||||
|
|
||||||
events: List[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] = []
|
|
||||||
|
|
||||||
for d in datasets:
|
|
||||||
entity_name = d.urn.split(":")[-1].split(",")[1]
|
|
||||||
d.aspects.append(
|
|
||||||
DatasetPropertiesClass(
|
|
||||||
description=make_entity_docs(entity_name, relnships_graph)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
mce = MetadataChangeEventClass(
|
|
||||||
proposedSnapshot=d,
|
|
||||||
)
|
|
||||||
events.append(mce)
|
|
||||||
|
|
||||||
mcp = MetadataChangeProposalWrapper(
|
|
||||||
entityUrn=d.urn,
|
|
||||||
aspect=SubTypesClass(typeNames=["entity"]),
|
|
||||||
)
|
|
||||||
events.append(mcp)
|
|
||||||
return events
|
|
||||||
|
|
||||||
|
|
||||||
class EntityRegistry(ConfigModel):
|
class EntityRegistry(ConfigModel):
|
||||||
@ -614,7 +605,7 @@ def generate(
|
|||||||
]
|
]
|
||||||
|
|
||||||
relationship_graph = RelationshipGraph()
|
relationship_graph = RelationshipGraph()
|
||||||
events = generate_stitched_record(relationship_graph)
|
mcps = generate_stitched_record(relationship_graph)
|
||||||
|
|
||||||
shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
|
shutil.rmtree(f"{generated_docs_dir}/entities", ignore_errors=True)
|
||||||
entity_names = [(x, entity_registry[x]) for x in generated_documentation]
|
entity_names = [(x, entity_registry[x]) for x in generated_documentation]
|
||||||
@ -645,7 +636,7 @@ def generate(
|
|||||||
PipelineContext(run_id="generated-metaModel"),
|
PipelineContext(run_id="generated-metaModel"),
|
||||||
FileSinkConfig(filename=file),
|
FileSinkConfig(filename=file),
|
||||||
)
|
)
|
||||||
for e in events:
|
for e in mcps:
|
||||||
fileSink.write_record_async(
|
fileSink.write_record_async(
|
||||||
RecordEnvelope(e, metadata={}), write_callback=NoopWriteCallback()
|
RecordEnvelope(e, metadata={}), write_callback=NoopWriteCallback()
|
||||||
)
|
)
|
||||||
@ -674,7 +665,7 @@ def generate(
|
|||||||
assert server.startswith("http://"), "server address must start with http://"
|
assert server.startswith("http://"), "server address must start with http://"
|
||||||
emitter = DatahubRestEmitter(gms_server=server)
|
emitter = DatahubRestEmitter(gms_server=server)
|
||||||
emitter.test_connection()
|
emitter.test_connection()
|
||||||
for e in events:
|
for e in mcps:
|
||||||
emitter.emit(e)
|
emitter.emit(e)
|
||||||
|
|
||||||
if dot:
|
if dot:
|
||||||
|
|||||||
@ -27,6 +27,7 @@ from datahub.ingestion.api.closeable import Closeable
|
|||||||
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
|
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
|
||||||
from datahub.ingestion.api.report import Report
|
from datahub.ingestion.api.report import Report
|
||||||
from datahub.ingestion.api.source_helpers import (
|
from datahub.ingestion.api.source_helpers import (
|
||||||
|
auto_browse_path_v2,
|
||||||
auto_materialize_referenced_tags,
|
auto_materialize_referenced_tags,
|
||||||
auto_status_aspect,
|
auto_status_aspect,
|
||||||
auto_workunit_reporter,
|
auto_workunit_reporter,
|
||||||
@ -181,10 +182,31 @@ class Source(Closeable, metaclass=ABCMeta):
|
|||||||
"""A list of functions that transforms the workunits produced by this source.
|
"""A list of functions that transforms the workunits produced by this source.
|
||||||
Run in order, first in list is applied first. Be careful with order when overriding.
|
Run in order, first in list is applied first. Be careful with order when overriding.
|
||||||
"""
|
"""
|
||||||
|
browse_path_processor: Optional[MetadataWorkUnitProcessor] = None
|
||||||
|
if (
|
||||||
|
self.ctx.pipeline_config
|
||||||
|
and self.ctx.pipeline_config.flags.generate_browse_path_v2
|
||||||
|
):
|
||||||
|
platform = getattr(self, "platform", None) or getattr(
|
||||||
|
self.get_config(), "platform", None
|
||||||
|
)
|
||||||
|
env = getattr(self.get_config(), "env", None)
|
||||||
|
browse_path_drop_dirs = [
|
||||||
|
platform,
|
||||||
|
platform and platform.lower(),
|
||||||
|
env,
|
||||||
|
env and env.lower(),
|
||||||
|
]
|
||||||
|
browse_path_processor = partial(
|
||||||
|
auto_browse_path_v2,
|
||||||
|
[s for s in browse_path_drop_dirs if s is not None],
|
||||||
|
)
|
||||||
|
|
||||||
return [
|
return [
|
||||||
auto_status_aspect,
|
auto_status_aspect,
|
||||||
auto_materialize_referenced_tags,
|
auto_materialize_referenced_tags,
|
||||||
partial(auto_workunit_reporter, self.get_report()),
|
partial(auto_workunit_reporter, self.get_report()),
|
||||||
|
browse_path_processor,
|
||||||
]
|
]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@ -207,6 +229,18 @@ class Source(Closeable, metaclass=ABCMeta):
|
|||||||
"get_workunits_internal must be implemented if get_workunits is not overriden."
|
"get_workunits_internal must be implemented if get_workunits is not overriden."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def get_config(self) -> Optional[ConfigModel]:
|
||||||
|
"""Overridable method to return the config object for this source.
|
||||||
|
|
||||||
|
Enables defining workunit processors in this class, rather than per source.
|
||||||
|
More broadly, this method contributes to the standardization of sources,
|
||||||
|
to promote more source-generic functionality.
|
||||||
|
|
||||||
|
Eventually, would like to replace this call with a Protocol that requires
|
||||||
|
a config object to be defined on each source.
|
||||||
|
"""
|
||||||
|
return getattr(self, "config", None) or getattr(self, "source_config", None)
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_report(self) -> SourceReport:
|
def get_report(self) -> SourceReport:
|
||||||
pass
|
pass
|
||||||
|
|||||||
@ -7,6 +7,7 @@ from typing import (
|
|||||||
Iterable,
|
Iterable,
|
||||||
List,
|
List,
|
||||||
Optional,
|
Optional,
|
||||||
|
Sequence,
|
||||||
Set,
|
Set,
|
||||||
TypeVar,
|
TypeVar,
|
||||||
Union,
|
Union,
|
||||||
@ -16,6 +17,7 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
|||||||
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
from datahub.ingestion.api.workunit import MetadataWorkUnit
|
||||||
from datahub.metadata.schema_classes import (
|
from datahub.metadata.schema_classes import (
|
||||||
BrowsePathEntryClass,
|
BrowsePathEntryClass,
|
||||||
|
BrowsePathsClass,
|
||||||
BrowsePathsV2Class,
|
BrowsePathsV2Class,
|
||||||
ContainerClass,
|
ContainerClass,
|
||||||
MetadataChangeEventClass,
|
MetadataChangeEventClass,
|
||||||
@ -164,12 +166,13 @@ def auto_materialize_referenced_tags(
|
|||||||
|
|
||||||
|
|
||||||
def auto_browse_path_v2(
|
def auto_browse_path_v2(
|
||||||
|
drop_dirs: Sequence[str],
|
||||||
stream: Iterable[MetadataWorkUnit],
|
stream: Iterable[MetadataWorkUnit],
|
||||||
) -> Iterable[MetadataWorkUnit]:
|
) -> Iterable[MetadataWorkUnit]:
|
||||||
"""Generate BrowsePathsV2 from Container aspects."""
|
"""Generate BrowsePathsV2 from Container and BrowsePaths aspects."""
|
||||||
# TODO: Generate BrowsePathsV2 from BrowsePaths as well
|
|
||||||
|
|
||||||
ignore_urns: Set[str] = set()
|
ignore_urns: Set[str] = set()
|
||||||
|
legacy_browse_paths: Dict[str, List[str]] = defaultdict(list)
|
||||||
container_urns: Set[str] = set()
|
container_urns: Set[str] = set()
|
||||||
parent_container_map: Dict[str, str] = {}
|
parent_container_map: Dict[str, str] = {}
|
||||||
children: Dict[str, List[str]] = defaultdict(list)
|
children: Dict[str, List[str]] = defaultdict(list)
|
||||||
@ -181,16 +184,25 @@ def auto_browse_path_v2(
|
|||||||
container_urns.add(urn)
|
container_urns.add(urn)
|
||||||
|
|
||||||
container_aspects = wu.get_aspects_of_type(ContainerClass)
|
container_aspects = wu.get_aspects_of_type(ContainerClass)
|
||||||
for aspect in container_aspects:
|
for c_aspect in container_aspects:
|
||||||
parent = aspect.container
|
parent = c_aspect.container
|
||||||
parent_container_map[urn] = parent
|
parent_container_map[urn] = parent
|
||||||
children[parent].append(urn)
|
children[parent].append(urn)
|
||||||
|
|
||||||
|
browse_path_aspects = wu.get_aspects_of_type(BrowsePathsClass)
|
||||||
|
for b_aspect in browse_path_aspects:
|
||||||
|
if b_aspect.paths:
|
||||||
|
path = b_aspect.paths[0] # Only take first path
|
||||||
|
legacy_browse_paths[urn] = [
|
||||||
|
p for p in path.strip("/").split("/") if p.strip() not in drop_dirs
|
||||||
|
]
|
||||||
|
|
||||||
if wu.get_aspects_of_type(BrowsePathsV2Class):
|
if wu.get_aspects_of_type(BrowsePathsV2Class):
|
||||||
ignore_urns.add(urn)
|
ignore_urns.add(urn)
|
||||||
|
|
||||||
paths: Dict[str, List[str]] = {} # Maps urn -> list of urns in path
|
paths: Dict[str, List[str]] = {} # Maps urn -> list of urns in path
|
||||||
# Yield browse paths v2 in topological order, starting with root containers
|
# Yield browse paths v2 in topological order, starting with root containers
|
||||||
|
processed_urns = set()
|
||||||
nodes = container_urns - parent_container_map.keys()
|
nodes = container_urns - parent_container_map.keys()
|
||||||
while nodes:
|
while nodes:
|
||||||
node = nodes.pop()
|
node = nodes.pop()
|
||||||
@ -208,3 +220,14 @@ def auto_browse_path_v2(
|
|||||||
path=[BrowsePathEntryClass(id=urn, urn=urn) for urn in paths[node]]
|
path=[BrowsePathEntryClass(id=urn, urn=urn) for urn in paths[node]]
|
||||||
),
|
),
|
||||||
).as_workunit()
|
).as_workunit()
|
||||||
|
processed_urns.add(node)
|
||||||
|
|
||||||
|
# Yield browse paths v2 based on browse paths v1 (legacy)
|
||||||
|
# Only done if the entity is not part of a container hierarchy
|
||||||
|
for urn in legacy_browse_paths.keys() - processed_urns - ignore_urns:
|
||||||
|
yield MetadataChangeProposalWrapper(
|
||||||
|
entityUrn=urn,
|
||||||
|
aspect=BrowsePathsV2Class(
|
||||||
|
path=[BrowsePathEntryClass(id=p) for p in legacy_browse_paths[urn]]
|
||||||
|
),
|
||||||
|
).as_workunit()
|
||||||
|
|||||||
@ -37,6 +37,19 @@ class FailureLoggingConfig(ConfigModel):
|
|||||||
log_config: Optional[FileSinkConfig] = None
|
log_config: Optional[FileSinkConfig] = None
|
||||||
|
|
||||||
|
|
||||||
|
class FlagsConfig(ConfigModel):
|
||||||
|
"""Experimental flags for the ingestion pipeline.
|
||||||
|
|
||||||
|
As ingestion flags are an experimental feature, we do not guarantee backwards compatibility.
|
||||||
|
Use at your own risk!
|
||||||
|
"""
|
||||||
|
|
||||||
|
generate_browse_path_v2: bool = Field(
|
||||||
|
default=False,
|
||||||
|
description="Generate BrowsePathsV2 aspects from container hierarchy and existing BrowsePaths aspects.",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class PipelineConfig(ConfigModel):
|
class PipelineConfig(ConfigModel):
|
||||||
# Once support for discriminated unions gets merged into Pydantic, we can
|
# Once support for discriminated unions gets merged into Pydantic, we can
|
||||||
# simplify this configuration and validation.
|
# simplify this configuration and validation.
|
||||||
@ -45,6 +58,7 @@ class PipelineConfig(ConfigModel):
|
|||||||
source: SourceConfig
|
source: SourceConfig
|
||||||
sink: DynamicTypedConfig
|
sink: DynamicTypedConfig
|
||||||
transformers: Optional[List[DynamicTypedConfig]]
|
transformers: Optional[List[DynamicTypedConfig]]
|
||||||
|
flags: FlagsConfig = Field(default=FlagsConfig())
|
||||||
reporting: List[ReporterConfig] = []
|
reporting: List[ReporterConfig] = []
|
||||||
run_id: str = DEFAULT_RUN_ID
|
run_id: str = DEFAULT_RUN_ID
|
||||||
datahub_api: Optional[DatahubClientConfig] = None
|
datahub_api: Optional[DatahubClientConfig] = None
|
||||||
|
|||||||
@ -41,6 +41,7 @@ class SagemakerSource(Source):
|
|||||||
- Models, jobs, and lineage between the two (e.g. when jobs output a model or a model is used by a job)
|
- Models, jobs, and lineage between the two (e.g. when jobs output a model or a model is used by a job)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
platform = "sagemaker"
|
||||||
source_config: SagemakerSourceConfig
|
source_config: SagemakerSourceConfig
|
||||||
report = SagemakerSourceReport()
|
report = SagemakerSourceReport()
|
||||||
|
|
||||||
|
|||||||
@ -116,6 +116,7 @@ class FeastRepositorySource(Source):
|
|||||||
- Column types associated with each entity and feature
|
- Column types associated with each entity and feature
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
platform = "feast"
|
||||||
source_config: FeastRepositorySourceConfig
|
source_config: FeastRepositorySourceConfig
|
||||||
report: SourceReport
|
report: SourceReport
|
||||||
feature_store: FeatureStore
|
feature_store: FeatureStore
|
||||||
|
|||||||
@ -13,7 +13,7 @@ from okta.models import Group, GroupProfile, User, UserProfile, UserStatus
|
|||||||
from pydantic import validator
|
from pydantic import validator
|
||||||
from pydantic.fields import Field
|
from pydantic.fields import Field
|
||||||
|
|
||||||
from datahub.configuration.common import ConfigurationError
|
from datahub.configuration.common import ConfigModel, ConfigurationError
|
||||||
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
||||||
from datahub.ingestion.api.common import PipelineContext
|
from datahub.ingestion.api.common import PipelineContext
|
||||||
from datahub.ingestion.api.decorators import (
|
from datahub.ingestion.api.decorators import (
|
||||||
@ -53,7 +53,7 @@ from datahub.metadata.schema_classes import (
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class OktaConfig(StatefulIngestionConfigBase):
|
class OktaConfig(StatefulIngestionConfigBase, ConfigModel):
|
||||||
# Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
|
# Required: Domain of the Okta deployment. Example: dev-33231928.okta.com
|
||||||
okta_domain: str = Field(
|
okta_domain: str = Field(
|
||||||
description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",
|
description="The location of your Okta Domain, without a protocol. Can be found in Okta Developer console. e.g. dev-33231928.okta.com",
|
||||||
|
|||||||
@ -160,7 +160,7 @@ class ModeSource(Source):
|
|||||||
|
|
||||||
config: ModeConfig
|
config: ModeConfig
|
||||||
report: SourceReport
|
report: SourceReport
|
||||||
tool = "mode"
|
platform = "mode"
|
||||||
|
|
||||||
def __hash__(self):
|
def __hash__(self):
|
||||||
return id(self)
|
return id(self)
|
||||||
@ -200,7 +200,9 @@ class ModeSource(Source):
|
|||||||
self, space_name: str, report_info: dict
|
self, space_name: str, report_info: dict
|
||||||
) -> DashboardSnapshot:
|
) -> DashboardSnapshot:
|
||||||
report_token = report_info.get("token", "")
|
report_token = report_info.get("token", "")
|
||||||
dashboard_urn = builder.make_dashboard_urn(self.tool, report_info.get("id", ""))
|
dashboard_urn = builder.make_dashboard_urn(
|
||||||
|
self.platform, report_info.get("id", "")
|
||||||
|
)
|
||||||
dashboard_snapshot = DashboardSnapshot(
|
dashboard_snapshot = DashboardSnapshot(
|
||||||
urn=dashboard_urn,
|
urn=dashboard_urn,
|
||||||
aspects=[],
|
aspects=[],
|
||||||
@ -304,7 +306,9 @@ class ModeSource(Source):
|
|||||||
charts = self._get_charts(report_token, query.get("token", ""))
|
charts = self._get_charts(report_token, query.get("token", ""))
|
||||||
# build chart urns
|
# build chart urns
|
||||||
for chart in charts:
|
for chart in charts:
|
||||||
chart_urn = builder.make_chart_urn(self.tool, chart.get("token", ""))
|
chart_urn = builder.make_chart_urn(
|
||||||
|
self.platform, chart.get("token", "")
|
||||||
|
)
|
||||||
chart_urns.append(chart_urn)
|
chart_urns.append(chart_urn)
|
||||||
|
|
||||||
return chart_urns
|
return chart_urns
|
||||||
@ -580,7 +584,7 @@ class ModeSource(Source):
|
|||||||
def construct_chart_from_api_data(
|
def construct_chart_from_api_data(
|
||||||
self, chart_data: dict, query: dict, path: str
|
self, chart_data: dict, query: dict, path: str
|
||||||
) -> ChartSnapshot:
|
) -> ChartSnapshot:
|
||||||
chart_urn = builder.make_chart_urn(self.tool, chart_data.get("token", ""))
|
chart_urn = builder.make_chart_urn(self.platform, chart_data.get("token", ""))
|
||||||
chart_snapshot = ChartSnapshot(
|
chart_snapshot = ChartSnapshot(
|
||||||
urn=chart_urn,
|
urn=chart_urn,
|
||||||
aspects=[],
|
aspects=[],
|
||||||
|
|||||||
@ -8,6 +8,7 @@ import requests
|
|||||||
from pydantic.class_validators import root_validator, validator
|
from pydantic.class_validators import root_validator, validator
|
||||||
from pydantic.fields import Field
|
from pydantic.fields import Field
|
||||||
|
|
||||||
|
from datahub.configuration import ConfigModel
|
||||||
from datahub.emitter.mce_builder import DEFAULT_ENV
|
from datahub.emitter.mce_builder import DEFAULT_ENV
|
||||||
from datahub.ingestion.api.common import PipelineContext
|
from datahub.ingestion.api.common import PipelineContext
|
||||||
from datahub.ingestion.api.decorators import (
|
from datahub.ingestion.api.decorators import (
|
||||||
@ -69,7 +70,7 @@ chart_type_from_viz_type = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class SupersetConfig(StatefulIngestionConfigBase):
|
class SupersetConfig(StatefulIngestionConfigBase, ConfigModel):
|
||||||
# See the Superset /security/login endpoint for details
|
# See the Superset /security/login endpoint for details
|
||||||
# https://superset.apache.org/docs/rest-api
|
# https://superset.apache.org/docs/rest-api
|
||||||
connect_uri: str = Field(
|
connect_uri: str = Field(
|
||||||
|
|||||||
@ -332,7 +332,8 @@ class AddStatusRemovedTransformer(Transformer):
|
|||||||
|
|
||||||
|
|
||||||
class FakeSource(Source):
|
class FakeSource(Source):
|
||||||
def __init__(self):
|
def __init__(self, ctx: PipelineContext):
|
||||||
|
super().__init__(ctx)
|
||||||
self.source_report = SourceReport()
|
self.source_report = SourceReport()
|
||||||
self.work_units: List[MetadataWorkUnit] = [
|
self.work_units: List[MetadataWorkUnit] = [
|
||||||
MetadataWorkUnit(id="workunit-1", mce=get_initial_mce())
|
MetadataWorkUnit(id="workunit-1", mce=get_initial_mce())
|
||||||
@ -341,7 +342,7 @@ class FakeSource(Source):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
|
def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
|
||||||
assert not config_dict
|
assert not config_dict
|
||||||
return cls()
|
return cls(ctx)
|
||||||
|
|
||||||
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
|
def get_workunits(self) -> Iterable[MetadataWorkUnit]:
|
||||||
return self.work_units
|
return self.work_units
|
||||||
@ -354,8 +355,8 @@ class FakeSource(Source):
|
|||||||
|
|
||||||
|
|
||||||
class FakeSourceWithWarnings(FakeSource):
|
class FakeSourceWithWarnings(FakeSource):
|
||||||
def __init__(self):
|
def __init__(self, ctx: PipelineContext):
|
||||||
super().__init__()
|
super().__init__(ctx)
|
||||||
self.source_report.report_warning("test_warning", "warning_text")
|
self.source_report.report_warning("test_warning", "warning_text")
|
||||||
|
|
||||||
def get_report(self) -> SourceReport:
|
def get_report(self) -> SourceReport:
|
||||||
@ -363,8 +364,8 @@ class FakeSourceWithWarnings(FakeSource):
|
|||||||
|
|
||||||
|
|
||||||
class FakeSourceWithFailures(FakeSource):
|
class FakeSourceWithFailures(FakeSource):
|
||||||
def __init__(self):
|
def __init__(self, ctx: PipelineContext):
|
||||||
super().__init__()
|
super().__init__(ctx)
|
||||||
self.source_report.report_failure("test_failure", "failure_text")
|
self.source_report.report_failure("test_failure", "failure_text")
|
||||||
|
|
||||||
def get_report(self) -> SourceReport:
|
def get_report(self) -> SourceReport:
|
||||||
|
|||||||
@ -27,6 +27,7 @@ class FakeSource(Source):
|
|||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, ctx: PipelineContext):
|
def __init__(self, ctx: PipelineContext):
|
||||||
|
super().__init__(ctx)
|
||||||
self.source_report = SourceReport()
|
self.source_report = SourceReport()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|||||||
@ -1,7 +1,7 @@
|
|||||||
from typing import Any, Dict, Iterable, List, Union
|
from typing import Any, Dict, Iterable, List, Union
|
||||||
|
|
||||||
import datahub.metadata.schema_classes as models
|
import datahub.metadata.schema_classes as models
|
||||||
from datahub.emitter.mce_builder import make_container_urn
|
from datahub.emitter.mce_builder import make_container_urn, make_dataset_urn
|
||||||
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
||||||
from datahub.ingestion.api.source_helpers import (
|
from datahub.ingestion.api.source_helpers import (
|
||||||
auto_browse_path_v2,
|
auto_browse_path_v2,
|
||||||
@ -100,13 +100,19 @@ def _create_container_aspects(d: Dict[str, Any]) -> Iterable[MetadataWorkUnit]:
|
|||||||
yield from _create_container_aspects(v)
|
yield from _create_container_aspects(v)
|
||||||
|
|
||||||
|
|
||||||
def _make_browse_path_entries(path: List[str]) -> List[models.BrowsePathEntryClass]:
|
def _make_container_browse_path_entries(
|
||||||
|
path: List[str],
|
||||||
|
) -> List[models.BrowsePathEntryClass]:
|
||||||
return [
|
return [
|
||||||
models.BrowsePathEntryClass(id=make_container_urn(s), urn=make_container_urn(s))
|
models.BrowsePathEntryClass(id=make_container_urn(s), urn=make_container_urn(s))
|
||||||
for s in path
|
for s in path
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _make_browse_path_entries(path: List[str]) -> List[models.BrowsePathEntryClass]:
|
||||||
|
return [models.BrowsePathEntryClass(id=s, urn=None) for s in path]
|
||||||
|
|
||||||
|
|
||||||
def _get_browse_paths_from_wu(
|
def _get_browse_paths_from_wu(
|
||||||
stream: Iterable[MetadataWorkUnit],
|
stream: Iterable[MetadataWorkUnit],
|
||||||
) -> Dict[str, List[models.BrowsePathEntryClass]]:
|
) -> Dict[str, List[models.BrowsePathEntryClass]]:
|
||||||
@ -119,7 +125,7 @@ def _get_browse_paths_from_wu(
|
|||||||
return paths
|
return paths
|
||||||
|
|
||||||
|
|
||||||
def test_auto_browse_path_v2():
|
def test_auto_browse_path_v2_by_container_hierarchy():
|
||||||
structure = {
|
structure = {
|
||||||
"one": {
|
"one": {
|
||||||
"a": {"i": ["1", "2", "3"], "ii": ["4"]},
|
"a": {"i": ["1", "2", "3"], "ii": ["4"]},
|
||||||
@ -137,7 +143,7 @@ def test_auto_browse_path_v2():
|
|||||||
sum(len(wu.get_aspects_of_type(models.StatusClass)) for wu in wus) == 21
|
sum(len(wu.get_aspects_of_type(models.StatusClass)) for wu in wus) == 21
|
||||||
)
|
)
|
||||||
|
|
||||||
new_wus = list(auto_browse_path_v2(wus))
|
new_wus = list(auto_browse_path_v2([], wus))
|
||||||
assert (
|
assert (
|
||||||
sum(len(wu.get_aspects_of_type(models.BrowsePathsV2Class)) for wu in new_wus)
|
sum(len(wu.get_aspects_of_type(models.BrowsePathsV2Class)) for wu in new_wus)
|
||||||
== 21
|
== 21
|
||||||
@ -145,29 +151,104 @@ def test_auto_browse_path_v2():
|
|||||||
|
|
||||||
paths = _get_browse_paths_from_wu(new_wus)
|
paths = _get_browse_paths_from_wu(new_wus)
|
||||||
assert paths["one"] == []
|
assert paths["one"] == []
|
||||||
assert paths["7"] == paths["8"] == _make_browse_path_entries(["two", "c", "v"])
|
assert (
|
||||||
assert paths["d"] == _make_browse_path_entries(["three"])
|
paths["7"]
|
||||||
assert paths["i"] == _make_browse_path_entries(["one", "a"])
|
== paths["8"]
|
||||||
|
== _make_container_browse_path_entries(["two", "c", "v"])
|
||||||
|
)
|
||||||
|
assert paths["d"] == _make_container_browse_path_entries(["three"])
|
||||||
|
assert paths["i"] == _make_container_browse_path_entries(["one", "a"])
|
||||||
|
|
||||||
|
|
||||||
def test_auto_browse_path_v2_ignores_urns_already_with():
|
def test_auto_browse_path_v2_ignores_urns_already_with():
|
||||||
structure = {"a": {"b": {"c": {"d": ["e"]}}}}
|
structure = {"a": {"b": {"c": {"d": ["e"]}}}}
|
||||||
|
|
||||||
mcp = MetadataChangeProposalWrapper(
|
mcps = [
|
||||||
|
*MetadataChangeProposalWrapper.construct_many(
|
||||||
|
entityUrn=make_container_urn("f"),
|
||||||
|
aspects=[
|
||||||
|
models.BrowsePathsClass(paths=["/one/two"]),
|
||||||
|
models.BrowsePathsV2Class(
|
||||||
|
path=_make_browse_path_entries(["my", "path"])
|
||||||
|
),
|
||||||
|
],
|
||||||
|
),
|
||||||
|
MetadataChangeProposalWrapper(
|
||||||
entityUrn=make_container_urn("c"),
|
entityUrn=make_container_urn("c"),
|
||||||
aspect=models.BrowsePathsV2Class(
|
aspect=models.BrowsePathsV2Class(
|
||||||
path=_make_browse_path_entries(["custom", "path"])
|
path=_make_container_browse_path_entries(["custom", "path"])
|
||||||
),
|
),
|
||||||
|
),
|
||||||
|
]
|
||||||
|
wus = [
|
||||||
|
*auto_status_aspect(
|
||||||
|
[
|
||||||
|
*_create_container_aspects(structure),
|
||||||
|
*(mcp.as_workunit() for mcp in mcps),
|
||||||
|
]
|
||||||
)
|
)
|
||||||
wus = [*auto_status_aspect(_create_container_aspects(structure)), mcp.as_workunit()]
|
]
|
||||||
|
new_wus = list(auto_browse_path_v2([], wus))
|
||||||
new_wus = list(auto_browse_path_v2(wus))
|
|
||||||
assert (
|
assert (
|
||||||
sum(len(wu.get_aspects_of_type(models.BrowsePathsV2Class)) for wu in new_wus)
|
sum(len(wu.get_aspects_of_type(models.BrowsePathsV2Class)) for wu in new_wus)
|
||||||
== 5
|
== 6
|
||||||
)
|
)
|
||||||
|
|
||||||
paths = _get_browse_paths_from_wu(new_wus)
|
paths = _get_browse_paths_from_wu(new_wus)
|
||||||
assert paths["a"] == []
|
assert paths["a"] == []
|
||||||
assert paths["c"] == _make_browse_path_entries(["custom", "path"])
|
assert paths["c"] == _make_container_browse_path_entries(["custom", "path"])
|
||||||
assert paths["e"] == _make_browse_path_entries(["a", "b", "c", "d"])
|
assert paths["e"] == _make_container_browse_path_entries(["a", "b", "c", "d"])
|
||||||
|
assert paths["f"] == _make_browse_path_entries(["my", "path"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_browse_path_v2_legacy_browse_path():
|
||||||
|
platform = "platform"
|
||||||
|
env = "PROD"
|
||||||
|
wus = [
|
||||||
|
MetadataChangeProposalWrapper(
|
||||||
|
entityUrn=make_dataset_urn(platform, "dataset-1", env),
|
||||||
|
aspect=models.BrowsePathsClass(["/one/two"]),
|
||||||
|
).as_workunit(),
|
||||||
|
MetadataChangeProposalWrapper(
|
||||||
|
entityUrn=make_dataset_urn(platform, "dataset-2", env),
|
||||||
|
aspect=models.BrowsePathsClass([f"/{platform}/{env}/something"]),
|
||||||
|
).as_workunit(),
|
||||||
|
MetadataChangeProposalWrapper(
|
||||||
|
entityUrn=make_dataset_urn(platform, "dataset-3", env),
|
||||||
|
aspect=models.BrowsePathsClass([f"/{platform}/one/two"]),
|
||||||
|
).as_workunit(),
|
||||||
|
]
|
||||||
|
new_wus = list(auto_browse_path_v2(["platform", "PROD", "unused"], wus))
|
||||||
|
assert len(new_wus) == 6
|
||||||
|
paths = _get_browse_paths_from_wu(new_wus)
|
||||||
|
assert (
|
||||||
|
paths["platform,dataset-1,PROD)"]
|
||||||
|
== paths["platform,dataset-3,PROD)"]
|
||||||
|
== _make_browse_path_entries(["one", "two"])
|
||||||
|
)
|
||||||
|
assert paths["platform,dataset-2,PROD)"] == _make_browse_path_entries(["something"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_browse_path_v2_container_over_legacy_browse_path():
|
||||||
|
structure = {"a": {"b": ["c"]}}
|
||||||
|
wus = list(
|
||||||
|
auto_status_aspect(
|
||||||
|
[
|
||||||
|
*_create_container_aspects(structure),
|
||||||
|
MetadataChangeProposalWrapper(
|
||||||
|
entityUrn=make_container_urn("b"),
|
||||||
|
aspect=models.BrowsePathsClass(paths=["/one/two"]),
|
||||||
|
).as_workunit(),
|
||||||
|
]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
new_wus = list(auto_browse_path_v2([], wus))
|
||||||
|
assert (
|
||||||
|
sum(len(wu.get_aspects_of_type(models.BrowsePathsV2Class)) for wu in new_wus)
|
||||||
|
== 3
|
||||||
|
)
|
||||||
|
|
||||||
|
paths = _get_browse_paths_from_wu(new_wus)
|
||||||
|
assert paths["a"] == []
|
||||||
|
assert paths["b"] == _make_container_browse_path_entries(["a"])
|
||||||
|
assert paths["c"] == _make_container_browse_path_entries(["a", "b"])
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user