diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py
index 501162455c..878b8dd1bb 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py
@@ -34,21 +34,26 @@ def get_bucket_relative_path(s3_uri: str) -> str:
     return "/".join(strip_s3_prefix(s3_uri).split("/")[1:])
 
 
-def make_s3_urn(s3_uri: str, env: str) -> str:
+def make_s3_urn(s3_uri: str, env: str, remove_extension: bool = True) -> str:
     s3_name = strip_s3_prefix(s3_uri)
 
     if s3_name.endswith("/"):
         s3_name = s3_name[:-1]
 
     name, extension = os.path.splitext(s3_name)
-
-    if extension != "":
+    if remove_extension and extension != "":
         extension = extension[1:]  # remove the dot
         return f"urn:li:dataset:(urn:li:dataPlatform:s3,{name}_{extension},{env})"
 
     return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{env})"
 
 
+def make_s3_urn_for_lineage(s3_uri: str, env: str) -> str:
+    # Ideally this is the implementation for all S3 URNs
+    # Don't feel comfortable changing `make_s3_urn` for glue, sagemaker, and athena
+    return make_s3_urn(s3_uri, env, remove_extension=False)
+
+
 def get_bucket_name(s3_uri: str) -> str:
     if not is_s3_uri(s3_uri):
         raise ValueError(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
index 9a993f5774..0a15c352fc 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py
@@ -21,7 +21,7 @@ from snowflake.connector import SnowflakeConnection
 import datahub.emitter.mce_builder as builder
 from datahub.emitter.mcp import MetadataChangeProposalWrapper
 from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.aws.s3_util import make_s3_urn
+from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.snowflake.constants import (
     LINEAGE_PERMISSION_ERROR,
     SnowflakeEdition,
@@ -652,7 +652,9 @@ class SnowflakeLineageExtractor(
             # For now, populate only for S3
             if external_lineage_entry.startswith("s3://"):
                 external_upstream_table = UpstreamClass(
-                    dataset=make_s3_urn(external_lineage_entry, self.config.env),
+                    dataset=make_s3_urn_for_lineage(
+                        external_lineage_entry, self.config.env
+                    ),
                     type=DatasetLineageTypeClass.COPY,
                 )
                 external_upstreams.append(external_upstream_table)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
index a57ee39848..16820c37d5 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py
@@ -166,6 +166,14 @@ class UnityCatalogSourceConfig(
         description="Option to enable/disable lineage generation.",
     )
 
+    include_external_lineage: bool = pydantic.Field(
+        default=True,
+        description=(
+            "Option to enable/disable lineage generation for external tables."
+            " Only external S3 tables are supported at the moment."
+        ),
+    )
+
     include_notebooks: bool = pydantic.Field(
         default=False,
         description="Ingest notebooks, represented as DataHub datasets.",
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py
index 9bcdb200f1..3fb77ce512 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py
@@ -33,6 +33,7 @@ from datahub.ingestion.source.unity.proxy_types import (
     ALLOWED_STATEMENT_TYPES,
     Catalog,
     Column,
+    ExternalTableReference,
     Metastore,
     Notebook,
     Query,
@@ -248,6 +249,13 @@ class UnityCatalogApiProxy(UnityCatalogProxyProfilingMixin):
                 )
                 if table_ref:
                     table.upstreams[table_ref] = {}
+            elif "fileInfo" in item:
+                external_ref = ExternalTableReference.create_from_lineage(
+                    item["fileInfo"]
+                )
+                if external_ref:
+                    table.external_upstreams.add(external_ref)
+
             for notebook in item.get("notebookInfos") or []:
                 table.upstream_notebooks.add(notebook["notebook_id"])
 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py
index 18ac2475b5..315c1c0d20 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py
@@ -10,6 +10,7 @@ from databricks.sdk.service.catalog import (
     CatalogType,
     ColumnTypeName,
     DataSourceFormat,
+    SecurableType,
     TableType,
 )
 from databricks.sdk.service.sql import QueryStatementType
@@ -176,6 +177,35 @@
         return f"{self.catalog}/{self.schema}/{self.table}"
 
 
+@dataclass(frozen=True, order=True)
+class ExternalTableReference:
+    path: str
+    has_permission: bool
+    name: Optional[str]
+    type: Optional[SecurableType]
+    storage_location: Optional[str]
+
+    @classmethod
+    def create_from_lineage(cls, d: dict) -> Optional["ExternalTableReference"]:
+        try:
+            securable_type: Optional[SecurableType]
+            try:
+                securable_type = SecurableType(d.get("securable_type", "").lower())
+            except ValueError:
+                securable_type = None
+
+            return cls(
+                path=d["path"],
+                has_permission=d.get("has_permission") or True,
+                name=d.get("securable_name"),
+                type=securable_type,
+                storage_location=d.get("storage_location"),
+            )
+        except Exception as e:
+            logger.warning(f"Failed to create ExternalTableReference from {d}: {e}")
+            return None
+
+
 @dataclass
 class Table(CommonProperty):
     schema: Schema
@@ -193,6 +223,7 @@ class Table(CommonProperty):
     view_definition: Optional[str]
     properties: Dict[str, str]
     upstreams: Dict[TableReference, Dict[str, List[str]]] = field(default_factory=dict)
+    external_upstreams: Set[ExternalTableReference] = field(default_factory=set)
     upstream_notebooks: Set[NotebookId] = field(default_factory=set)
     downstream_notebooks: Set[NotebookId] = field(default_factory=set)
 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py
index fa61571fa9..4153d9dd88 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py
@@ -19,6 +19,8 @@ class UnityCatalogReport(IngestionStageReport, StaleEntityRemovalSourceReport):
     notebooks: EntityFilterReport = EntityFilterReport.field(type="notebook")
 
     num_column_lineage_skipped_column_count: int = 0
+    num_external_upstreams_lacking_permissions: int = 0
+    num_external_upstreams_unsupported: int = 0
 
     num_queries: int = 0
     num_queries_dropped_parse_failure: int = 0
diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py
index 27c1f341aa..b63cf65d55 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py
@@ -41,6 +41,7 @@ from datahub.ingestion.api.source import (
     TestConnectionReport,
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage
 from datahub.ingestion.source.common.subtypes import (
     DatasetContainerSubTypes,
     DatasetSubTypes,
@@ -455,6 +456,28 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource):
                     )
                 )
 
+        if self.config.include_external_lineage:
+            for external_ref in table.external_upstreams:
+                if not external_ref.has_permission or not external_ref.path:
+                    self.report.num_external_upstreams_lacking_permissions += 1
+                    logger.warning(
+                        f"Lacking permissions for external file upstream on {table.ref}"
+                    )
+                elif external_ref.path.startswith("s3://"):
+                    upstreams.append(
+                        UpstreamClass(
+                            dataset=make_s3_urn_for_lineage(
+                                external_ref.path, self.config.env
+                            ),
+                            type=DatasetLineageTypeClass.COPY,
+                        )
+                    )
+                else:
+                    self.report.num_external_upstreams_unsupported += 1
+                    logger.warning(
+                        f"Unsupported external file upstream on {table.ref}: {external_ref.path}"
+                    )
+
         if upstreams:
             return UpstreamLineageClass(
                 upstreams=upstreams,