mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-29 10:57:52 +00:00
fix(ingest/dbt): resolve more dbt ephemeral node lineage gaps (#10553)
This commit is contained in:
parent
666de9e4e6
commit
7a519ac73c
@ -142,11 +142,17 @@ _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")
|
||||
|
||||
@dataclass
|
||||
class DBTSourceReport(StaleEntityRemovalSourceReport):
|
||||
sql_statements_parsed: int = 0
|
||||
sql_statements_table_error: int = 0
|
||||
sql_statements_column_error: int = 0
|
||||
sql_parser_detach_ctes_failures: LossyList[str] = field(default_factory=LossyList)
|
||||
sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
|
||||
sql_parser_parse_failures: int = 0
|
||||
sql_parser_detach_ctes_failures: int = 0
|
||||
sql_parser_table_errors: int = 0
|
||||
sql_parser_column_errors: int = 0
|
||||
sql_parser_successes: int = 0
|
||||
|
||||
sql_parser_parse_failures_list: LossyList[str] = field(default_factory=LossyList)
|
||||
sql_parser_detach_ctes_failures_list: LossyList[str] = field(
|
||||
default_factory=LossyList
|
||||
)
|
||||
|
||||
in_manifest_but_missing_catalog: LossyList[str] = field(default_factory=LossyList)
|
||||
|
||||
@ -558,10 +564,11 @@ class DBTNode:
|
||||
assert self.is_ephemeral_model()
|
||||
|
||||
# Similar to get_db_fqn.
|
||||
fqn = self._join_parts(
|
||||
db_fqn = self._join_parts(
|
||||
[self.database, self.schema, f"__datahub__dbt__ephemeral__{self.name}"]
|
||||
)
|
||||
return fqn.replace('"', "")
|
||||
db_fqn = db_fqn.lower()
|
||||
return db_fqn.replace('"', "")
|
||||
|
||||
def get_urn_for_upstream_lineage(
|
||||
self,
|
||||
@ -819,9 +826,10 @@ def get_column_type(
|
||||
|
||||
# if still not found, report the warning
|
||||
if TypeClass is None:
|
||||
report.report_warning(
|
||||
dataset_name, f"unable to map type {column_type} to metadata schema"
|
||||
)
|
||||
if column_type:
|
||||
report.report_warning(
|
||||
dataset_name, f"unable to map type {column_type} to metadata schema"
|
||||
)
|
||||
TypeClass = NullTypeClass
|
||||
|
||||
return SchemaFieldDataType(type=TypeClass())
|
||||
@ -1041,7 +1049,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
||||
|
||||
# Iterate over the dbt nodes in topological order.
|
||||
# This ensures that we process upstream nodes before downstream nodes.
|
||||
for dbt_name in topological_sort(
|
||||
node_order = topological_sort(
|
||||
list(all_nodes_map.keys()),
|
||||
edges=list(
|
||||
(upstream, node.dbt_name)
|
||||
@ -1049,7 +1057,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
||||
for upstream in node.upstream_nodes
|
||||
if upstream in all_nodes_map
|
||||
),
|
||||
):
|
||||
)
|
||||
for dbt_name in node_order:
|
||||
node = all_nodes_map[dbt_name]
|
||||
logger.debug(f"Processing CLL/schemas for {node.dbt_name}")
|
||||
|
||||
@ -1119,55 +1128,26 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
||||
|
||||
# Run sql parser to infer the schema + generate column lineage.
|
||||
sql_result = None
|
||||
if node.node_type in {"source", "test"}:
|
||||
if node.node_type in {"source", "test", "seed"}:
|
||||
# For sources, we generate CLL as a 1:1 mapping.
|
||||
# We don't support CLL for tests (assertions).
|
||||
# We don't support CLL for tests (assertions) or seeds.
|
||||
pass
|
||||
elif node.compiled_code:
|
||||
try:
|
||||
# Add CTE stops based on the upstreams list.
|
||||
cte_mapping = {
|
||||
cte_name: upstream_node.get_fake_ephemeral_table_name()
|
||||
for upstream_node in [
|
||||
all_nodes_map[upstream_node_name]
|
||||
for upstream_node_name in node.upstream_nodes
|
||||
if upstream_node_name in all_nodes_map
|
||||
]
|
||||
if upstream_node.is_ephemeral_model()
|
||||
for cte_name in _get_dbt_cte_names(
|
||||
upstream_node.name, schema_resolver.platform
|
||||
)
|
||||
}
|
||||
preprocessed_sql = detach_ctes(
|
||||
parse_statements_and_pick(
|
||||
node.compiled_code,
|
||||
platform=schema_resolver.platform,
|
||||
),
|
||||
platform=schema_resolver.platform,
|
||||
cte_mapping=cte_mapping,
|
||||
# Add CTE stops based on the upstreams list.
|
||||
cte_mapping = {
|
||||
cte_name: upstream_node.get_fake_ephemeral_table_name()
|
||||
for upstream_node in [
|
||||
all_nodes_map[upstream_node_name]
|
||||
for upstream_node_name in node.upstream_nodes
|
||||
if upstream_node_name in all_nodes_map
|
||||
]
|
||||
if upstream_node.is_ephemeral_model()
|
||||
for cte_name in _get_dbt_cte_names(
|
||||
upstream_node.name, schema_resolver.platform
|
||||
)
|
||||
except Exception as e:
|
||||
self.report.sql_parser_detach_ctes_failures.append(node.dbt_name)
|
||||
logger.debug(
|
||||
f"Failed to detach CTEs from compiled code. {node.dbt_name} will not have column lineage."
|
||||
)
|
||||
sql_result = SqlParsingResult.make_from_error(e)
|
||||
else:
|
||||
sql_result = sqlglot_lineage(
|
||||
preprocessed_sql, schema_resolver=schema_resolver
|
||||
)
|
||||
if sql_result.debug_info.error:
|
||||
self.report.sql_statements_table_error += 1
|
||||
logger.info(
|
||||
f"Failed to parse compiled code for {node.dbt_name}: {sql_result.debug_info.error}"
|
||||
)
|
||||
elif sql_result.debug_info.column_error:
|
||||
self.report.sql_statements_column_error += 1
|
||||
logger.info(
|
||||
f"Failed to generate CLL for {node.dbt_name}: {sql_result.debug_info.column_error}"
|
||||
)
|
||||
else:
|
||||
self.report.sql_statements_parsed += 1
|
||||
}
|
||||
|
||||
sql_result = self._parse_cll(node, cte_mapping, schema_resolver)
|
||||
else:
|
||||
self.report.sql_parser_skipped_missing_code.append(node.dbt_name)
|
||||
|
||||
@ -1212,6 +1192,56 @@ class DBTSourceBase(StatefulIngestionSourceBase):
|
||||
if inferred_schema_fields:
|
||||
node.columns_setdefault(inferred_schema_fields)
|
||||
|
||||
def _parse_cll(
|
||||
self,
|
||||
node: DBTNode,
|
||||
cte_mapping: Dict[str, str],
|
||||
schema_resolver: SchemaResolver,
|
||||
) -> SqlParsingResult:
|
||||
assert node.compiled_code is not None
|
||||
|
||||
try:
|
||||
picked_statement = parse_statements_and_pick(
|
||||
node.compiled_code,
|
||||
platform=schema_resolver.platform,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug(
|
||||
f"Failed to parse compiled code. {node.dbt_name} will not have column lineage."
|
||||
)
|
||||
self.report.sql_parser_parse_failures += 1
|
||||
self.report.sql_parser_parse_failures_list.append(node.dbt_name)
|
||||
return SqlParsingResult.make_from_error(e)
|
||||
|
||||
try:
|
||||
preprocessed_sql = detach_ctes(
|
||||
picked_statement,
|
||||
platform=schema_resolver.platform,
|
||||
cte_mapping=cte_mapping,
|
||||
)
|
||||
except Exception as e:
|
||||
self.report.sql_parser_detach_ctes_failures += 1
|
||||
self.report.sql_parser_detach_ctes_failures_list.append(node.dbt_name)
|
||||
logger.debug(
|
||||
f"Failed to detach CTEs from compiled code. {node.dbt_name} will not have column lineage."
|
||||
)
|
||||
return SqlParsingResult.make_from_error(e)
|
||||
|
||||
sql_result = sqlglot_lineage(preprocessed_sql, schema_resolver=schema_resolver)
|
||||
if sql_result.debug_info.table_error:
|
||||
self.report.sql_parser_table_errors += 1
|
||||
logger.info(
|
||||
f"Failed to generate any CLL lineage for {node.dbt_name}: {sql_result.debug_info.error}"
|
||||
)
|
||||
elif sql_result.debug_info.column_error:
|
||||
self.report.sql_parser_column_errors += 1
|
||||
logger.info(
|
||||
f"Failed to generate CLL for {node.dbt_name}: {sql_result.debug_info.column_error}"
|
||||
)
|
||||
else:
|
||||
self.report.sql_parser_successes += 1
|
||||
return sql_result
|
||||
|
||||
def create_dbt_platform_mces(
|
||||
self,
|
||||
dbt_nodes: List[DBTNode],
|
||||
|
||||
@ -365,7 +365,7 @@ def _column_level_lineage( # noqa: C901
|
||||
col_normalized = col
|
||||
|
||||
table_schema_normalized_mapping[table][col_normalized] = col
|
||||
normalized_table_schema[col_normalized] = col_type
|
||||
normalized_table_schema[col_normalized] = col_type or "UNKNOWN"
|
||||
|
||||
sqlglot_db_schema.add_table(
|
||||
table.as_sqlglot_table(),
|
||||
@ -923,12 +923,20 @@ def _sqlglot_lineage_inner(
|
||||
out_urns = sorted({table_name_urn_mapping[table] for table in modified})
|
||||
column_lineage_urns = None
|
||||
if column_lineage:
|
||||
column_lineage_urns = [
|
||||
_translate_internal_column_lineage(
|
||||
table_name_urn_mapping, internal_col_lineage, dialect=dialect
|
||||
try:
|
||||
column_lineage_urns = [
|
||||
_translate_internal_column_lineage(
|
||||
table_name_urn_mapping, internal_col_lineage, dialect=dialect
|
||||
)
|
||||
for internal_col_lineage in column_lineage
|
||||
]
|
||||
except KeyError as e:
|
||||
# When this happens, it's usually because of things like PIVOT where we can't
|
||||
# really go up the scope chain.
|
||||
logger.debug(
|
||||
f"Failed to translate column lineage to urns: {e}", exc_info=True
|
||||
)
|
||||
for internal_col_lineage in column_lineage
|
||||
]
|
||||
debug_info.column_error = e
|
||||
|
||||
query_type, query_type_props = get_query_type_of_sql(
|
||||
original_statement, dialect=dialect
|
||||
|
||||
@ -81,6 +81,8 @@ def parse_statement(
|
||||
|
||||
|
||||
def parse_statements_and_pick(sql: str, platform: DialectOrStr) -> sqlglot.Expression:
|
||||
logger.debug("Parsing SQL query: %s", sql)
|
||||
|
||||
dialect = get_dialect(platform)
|
||||
statements = [
|
||||
expression for expression in sqlglot.parse(sql, dialect=dialect) if expression
|
||||
|
||||
@ -907,6 +907,31 @@
|
||||
"lastRunId": "no-run-id-provided"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "dataset",
|
||||
"entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)",
|
||||
"changeType": "UPSERT",
|
||||
"aspectName": "upstreamLineage",
|
||||
"aspect": {
|
||||
"json": {
|
||||
"upstreams": [
|
||||
{
|
||||
"auditStamp": {
|
||||
"time": 0,
|
||||
"actor": "urn:li:corpuser:unknown"
|
||||
},
|
||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD)",
|
||||
"type": "TRANSFORMED"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"systemMetadata": {
|
||||
"lastObserved": 1643871600000,
|
||||
"runId": "powerbi-test",
|
||||
"lastRunId": "no-run-id-provided"
|
||||
}
|
||||
},
|
||||
{
|
||||
"entityType": "corpuser",
|
||||
"entityUrn": "urn:li:corpuser:users.User1@foo.com",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user