fix(mssql): adds missing containers and browsepathsv2 for dataflow and datajob (#12483)

This commit is contained in:
Sergio Gómez Villamor 2025-01-31 09:52:26 +01:00 committed by GitHub
parent 301d628ba7
commit a7598ca20e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 300 additions and 40 deletions

View File

@ -7,7 +7,12 @@ from datahub.emitter.mce_builder import (
make_data_platform_urn,
make_dataplatform_instance_urn,
)
from datahub.emitter.mcp_builder import (
DatabaseKey,
SchemaKey,
)
from datahub.metadata.schema_classes import (
ContainerClass,
DataFlowInfoClass,
DataJobInfoClass,
DataJobInputOutputClass,
@ -171,11 +176,7 @@ class MSSQLDataJob:
flow_id=self.entity.flow.formatted_name,
job_id=self.entity.formatted_name,
cluster=self.entity.flow.cluster,
platform_instance=(
self.entity.flow.platform_instance
if self.entity.flow.platform_instance
else None
),
platform_instance=self.entity.flow.platform_instance,
)
def add_property(
@ -222,6 +223,26 @@ class MSSQLDataJob:
)
return None
@property
def as_container_aspect(self) -> ContainerClass:
    """Return the ``container`` aspect pointing at this job's parent container.

    A stored procedure is attached to its schema container; any other
    entity is attached to the database container. Both container keys are
    built from the platform, platform instance, env and database of the
    flow that owns the entity.
    """
    flow = self.entity.flow
    # Fields common to the schema-level and the database-level key.
    shared_fields = {
        "platform": flow.orchestrator,
        "instance": flow.platform_instance,
        "env": flow.env,
        "database": flow.db,
    }
    if isinstance(self.entity, StoredProcedure):
        parent_key = SchemaKey(schema=self.entity.schema, **shared_fields)
    else:
        parent_key = DatabaseKey(**shared_fields)
    return ContainerClass(container=parent_key.as_urn())
@dataclass
class MSSQLDataFlow:
@ -244,9 +265,7 @@ class MSSQLDataFlow:
orchestrator=self.entity.orchestrator,
flow_id=self.entity.formatted_name,
cluster=self.entity.cluster,
platform_instance=(
self.entity.platform_instance if self.entity.platform_instance else None
),
platform_instance=self.entity.platform_instance,
)
@property
@ -267,3 +286,13 @@ class MSSQLDataFlow:
),
)
return None
@property
def as_container_aspect(self) -> ContainerClass:
    """Return the ``container`` aspect pointing at this flow's database container.

    The container URN is derived from a ``DatabaseKey`` built out of the
    entity's orchestrator (used as platform), platform instance, env and
    database name.
    """
    flow_entity = self.entity
    parent_urn = DatabaseKey(
        platform=flow_entity.orchestrator,
        instance=flow_entity.platform_instance,
        env=flow_entity.env,
        database=flow_entity.db,
    ).as_urn()
    return ContainerClass(container=parent_urn)

View File

@ -108,6 +108,10 @@ class SQLServerConfig(BasicSQLAlchemyConfig):
default=True,
description="Enable lineage extraction for stored procedures",
)
# Opt-in switch (off by default): when enabled, the source additionally emits
# a `container` aspect for each data flow and data job it produces.
include_containers_for_pipelines: bool = Field(
default=False,
description="Enable the container aspects ingestion for both pipelines and tasks. Note that this feature requires the corresponding model support in the backend, which was introduced in version 0.15.0.1.",
)
@pydantic.validator("uri_args")
def passwords_match(cls, v, values, **kwargs):
@ -641,6 +645,12 @@ class SQLServerSource(SQLAlchemySource):
aspect=data_platform_instance_aspect,
).as_workunit()
if self.config.include_containers_for_pipelines:
yield MetadataChangeProposalWrapper(
entityUrn=data_job.urn,
aspect=data_job.as_container_aspect,
).as_workunit()
if include_lineage:
yield MetadataChangeProposalWrapper(
entityUrn=data_job.urn,
@ -683,6 +693,13 @@ class SQLServerSource(SQLAlchemySource):
entityUrn=data_flow.urn,
aspect=data_platform_instance_aspect,
).as_workunit()
if self.config.include_containers_for_pipelines:
yield MetadataChangeProposalWrapper(
entityUrn=data_flow.urn,
aspect=data_flow.as_container_aspect,
).as_workunit()
# TODO: Add SubType when it appears
def get_inspectors(self) -> Iterable[Inspector]:

View File

@ -112,11 +112,11 @@
"aspect": {
"json": {
"customProperties": {
"job_id": "f5a6c120-500a-4300-9b21-0c3225af1f80",
"job_id": "2fc72675-0c68-4260-ab00-c361b96c8c36",
"job_name": "Weekly Demo Data Backup",
"description": "No description available.",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000",
"date_created": "2025-01-31 08:02:41.167000",
"date_modified": "2025-01-31 08:02:41.360000",
"step_id": "1",
"step_name": "Set database to read only",
"subsystem": "TSQL",
@ -2279,8 +2279,8 @@
"code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n",
"input parameters": "['@ID']",
"parameter @ID": "{'type': 'int'}",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000"
"date_created": "2025-01-31 08:02:40.980000",
"date_modified": "2025-01-31 08:02:40.980000"
},
"name": "DemoData.Foo.Proc.With.SpecialChar",
"type": {
@ -2329,8 +2329,8 @@
"depending_on_procedure": "{}",
"code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n",
"input parameters": "[]",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000"
"date_created": "2025-01-31 08:02:40.987000",
"date_modified": "2025-01-31 08:02:40.987000"
},
"name": "DemoData.Foo.NewProc",
"type": {
@ -4969,7 +4969,7 @@
"actor": "urn:li:corpuser:_ingestion"
},
"lastModified": {
"time": 1735588784503,
"time": 1738310563767,
"actor": "urn:li:corpuser:_ingestion"
}
}
@ -5092,7 +5092,7 @@
"actor": "urn:li:corpuser:_ingestion"
},
"lastModified": {
"time": 1735588784511,
"time": 1738310563770,
"actor": "urn:li:corpuser:_ingestion"
}
}

View File

@ -112,11 +112,11 @@
"aspect": {
"json": {
"customProperties": {
"job_id": "f5a6c120-500a-4300-9b21-0c3225af1f80",
"job_id": "2fc72675-0c68-4260-ab00-c361b96c8c36",
"job_name": "Weekly Demo Data Backup",
"description": "No description available.",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000",
"date_created": "2025-01-31 08:02:41.167000",
"date_modified": "2025-01-31 08:02:41.360000",
"step_id": "1",
"step_name": "Set database to read only",
"subsystem": "TSQL",
@ -2279,8 +2279,8 @@
"code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n",
"input parameters": "['@ID']",
"parameter @ID": "{'type': 'int'}",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000"
"date_created": "2025-01-31 08:02:40.980000",
"date_modified": "2025-01-31 08:02:40.980000"
},
"name": "DemoData.Foo.Proc.With.SpecialChar",
"type": {
@ -2694,7 +2694,7 @@
"actor": "urn:li:corpuser:_ingestion"
},
"lastModified": {
"time": 1735588789629,
"time": 1738310565884,
"actor": "urn:li:corpuser:_ingestion"
}
}

View File

@ -128,6 +128,47 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataFlow",
"entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a"
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataFlow",
"entityUrn": "urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)"
},
{
"id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a",
"urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a"
}
]
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)",
@ -136,11 +177,11 @@
"aspect": {
"json": {
"customProperties": {
"job_id": "f5a6c120-500a-4300-9b21-0c3225af1f80",
"job_id": "5a260993-c4ce-4bb3-a273-eaf6ef6e0382",
"job_name": "Weekly Demo Data Backup",
"description": "No description available.",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000",
"date_created": "2025-01-28 15:27:31.437000",
"date_modified": "2025-01-28 15:27:31.593000",
"step_id": "1",
"step_name": "Set database to read only",
"subsystem": "TSQL",
@ -175,6 +216,22 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a"
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)",
@ -193,6 +250,31 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.Weekly Demo Data Backup,PROD),Weekly Demo Data Backup)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)"
},
{
"id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a",
"urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a"
}
]
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:5726a09b23f60be6f661206c879a3683",
@ -2516,6 +2598,47 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataFlow",
"entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a"
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataFlow",
"entityUrn": "urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)"
},
{
"id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a",
"urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a"
}
]
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)",
@ -2529,8 +2652,8 @@
"code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n",
"input parameters": "['@ID']",
"parameter @ID": "{'type': 'int'}",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000"
"date_created": "2025-01-28 15:27:31.257000",
"date_modified": "2025-01-28 15:27:31.257000"
},
"name": "DemoData.Foo.Proc.With.SpecialChar",
"type": {
@ -2561,6 +2684,22 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680"
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)",
@ -2584,6 +2723,35 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),Proc.With.SpecialChar)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)"
},
{
"id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a",
"urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a"
},
{
"id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680",
"urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680"
}
]
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)",
@ -2596,8 +2764,8 @@
"depending_on_procedure": "{}",
"code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n",
"input parameters": "[]",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000"
"date_created": "2025-01-28 15:27:31.263000",
"date_modified": "2025-01-28 15:27:31.263000"
},
"name": "DemoData.Foo.NewProc",
"type": {
@ -2628,6 +2796,22 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:6fbadfb496ee98718da210cc2fca1680"
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)",
@ -2651,6 +2835,35 @@
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "dataJob",
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,my-instance.DemoData.Foo.stored_procedures,PROD),NewProc)",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)",
"urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mssql,my-instance)"
},
{
"id": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a",
"urn": "urn:li:container:db8117ee3cc6397c503e7824ae3e0f6a"
},
{
"id": "urn:li:container:6fbadfb496ee98718da210cc2fca1680",
"urn": "urn:li:container:6fbadfb496ee98718da210cc2fca1680"
}
]
}
},
"systemMetadata": {
"lastObserved": 1615443388097,
"runId": "mssql-test",
"lastRunId": "no-run-id-provided"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:5631370915311469374ef3cb5f0ebbf0",
@ -3046,7 +3259,7 @@
"actor": "urn:li:corpuser:_ingestion"
},
"lastModified": {
"time": 1735588787786,
"time": 1738078055642,
"actor": "urn:li:corpuser:_ingestion"
}
}

View File

@ -112,11 +112,11 @@
"aspect": {
"json": {
"customProperties": {
"job_id": "f5a6c120-500a-4300-9b21-0c3225af1f80",
"job_id": "2fc72675-0c68-4260-ab00-c361b96c8c36",
"job_name": "Weekly Demo Data Backup",
"description": "No description available.",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000",
"date_created": "2025-01-31 08:02:41.167000",
"date_modified": "2025-01-31 08:02:41.360000",
"step_id": "1",
"step_name": "Set database to read only",
"subsystem": "TSQL",
@ -2279,8 +2279,8 @@
"code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n",
"input parameters": "['@ID']",
"parameter @ID": "{'type': 'int'}",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000"
"date_created": "2025-01-31 08:02:40.980000",
"date_modified": "2025-01-31 08:02:40.980000"
},
"name": "DemoData.Foo.Proc.With.SpecialChar",
"type": {
@ -2329,8 +2329,8 @@
"depending_on_procedure": "{}",
"code": "CREATE PROCEDURE [Foo].[NewProc]\n AS\n BEGIN\n --insert into items table from salesreason table\n insert into Foo.Items (ID, ItemName)\n SELECT TempID, Name\n FROM Foo.SalesReason;\n\n\n IF OBJECT_ID('Foo.age_dist', 'U') IS NULL\n BEGIN\n -- Create and populate if table doesn't exist\n SELECT Age, COUNT(*) as Count\n INTO Foo.age_dist\n FROM Foo.Persons\n GROUP BY Age\n END\n ELSE\n BEGIN\n -- Update existing table\n TRUNCATE TABLE Foo.age_dist;\n\n INSERT INTO Foo.age_dist (Age, Count)\n SELECT Age, COUNT(*) as Count\n FROM Foo.Persons\n GROUP BY Age\n END\n\n SELECT ID, Age INTO #TEMPTABLE FROM NewData.FooNew.PersonsNew\n \n UPDATE DemoData.Foo.Persons\n SET Age = t.Age\n FROM DemoData.Foo.Persons p\n JOIN #TEMPTABLE t ON p.ID = t.ID\n\n END\n",
"input parameters": "[]",
"date_created": "2024-12-30 19:59:24.690000",
"date_modified": "2024-12-30 19:59:24.690000"
"date_created": "2025-01-31 08:02:40.987000",
"date_modified": "2025-01-31 08:02:40.987000"
},
"name": "DemoData.Foo.NewProc",
"type": {
@ -5019,7 +5019,7 @@
"actor": "urn:li:corpuser:_ingestion"
},
"lastModified": {
"time": 1735588791954,
"time": 1738310566860,
"actor": "urn:li:corpuser:_ingestion"
}
}
@ -5166,7 +5166,7 @@
"actor": "urn:li:corpuser:_ingestion"
},
"lastModified": {
"time": 1735588791966,
"time": 1738310566866,
"actor": "urn:li:corpuser:_ingestion"
}
}

View File

@ -8,6 +8,7 @@ source:
database: DemoData
host_port: localhost:21433
platform_instance: my-instance
include_containers_for_pipelines: true
# use_odbc: True
# uri_args:
# driver: "ODBC Driver 17 for SQL Server"