fix(ingest/dbt): resolve more dbt ephemeral node lineage gaps (#10553)

Harshal Sheth 2024-05-22 12:44:54 -07:00 committed by GitHub
parent 666de9e4e6
commit 7a519ac73c
4 changed files with 127 additions and 62 deletions

View File

@@ -142,11 +142,17 @@ _DEFAULT_ACTOR = mce_builder.make_user_urn("unknown")

 @dataclass
 class DBTSourceReport(StaleEntityRemovalSourceReport):
-    sql_statements_parsed: int = 0
-    sql_statements_table_error: int = 0
-    sql_statements_column_error: int = 0
-    sql_parser_detach_ctes_failures: LossyList[str] = field(default_factory=LossyList)
     sql_parser_skipped_missing_code: LossyList[str] = field(default_factory=LossyList)
+    sql_parser_parse_failures: int = 0
+    sql_parser_detach_ctes_failures: int = 0
+    sql_parser_table_errors: int = 0
+    sql_parser_column_errors: int = 0
+    sql_parser_successes: int = 0
+    sql_parser_parse_failures_list: LossyList[str] = field(default_factory=LossyList)
+    sql_parser_detach_ctes_failures_list: LossyList[str] = field(
+        default_factory=LossyList
+    )

     in_manifest_but_missing_catalog: LossyList[str] = field(default_factory=LossyList)
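Side note on the reporting pattern above: each failure mode now gets both an exact integer counter and a `LossyList` of example node names. A minimal sketch of that pattern, using a simplified stand-in for DataHub's `LossyList` (the real one lives in `datahub.utilities.lossy_collections` and keeps memory bounded on large runs):

```python
from dataclasses import dataclass, field
from typing import List


class LossyList(List[str]):
    """Simplified stand-in: cap stored examples so reports stay small."""

    def append(self, item: str) -> None:
        if len(self) < 10:  # the real LossyList samples rather than truncating
            super().append(item)


@dataclass
class Report:
    sql_parser_parse_failures: int = 0
    sql_parser_parse_failures_list: LossyList = field(default_factory=LossyList)


report = Report()
for node_name in ["model.proj.a", "model.proj.b"]:
    report.sql_parser_parse_failures += 1  # exact total
    report.sql_parser_parse_failures_list.append(node_name)  # bounded examples
print(report.sql_parser_parse_failures, list(report.sql_parser_parse_failures_list))
```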
@@ -558,10 +564,11 @@ class DBTNode:
         assert self.is_ephemeral_model()

         # Similar to get_db_fqn.
-        fqn = self._join_parts(
+        db_fqn = self._join_parts(
             [self.database, self.schema, f"__datahub__dbt__ephemeral__{self.name}"]
         )
-        return fqn.replace('"', "")
+        db_fqn = db_fqn.lower()
+        return db_fqn.replace('"', "")

     def get_urn_for_upstream_lineage(
         self,
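The new `db_fqn.lower()` matters because FQN comparisons downstream are case-normalized; without it, the fake ephemeral table registered here could fail to match later lookups. A runnable sketch of what the method plausibly computes (the `_join_parts` stand-in here is hypothetical):

```python
from typing import Optional


def get_fake_ephemeral_table_name(
    database: Optional[str], schema: Optional[str], name: str
) -> str:
    # Hypothetical stand-in for DBTNode._join_parts: join the non-empty parts.
    db_fqn = ".".join(
        part
        for part in [database, schema, f"__datahub__dbt__ephemeral__{name}"]
        if part
    )
    # Lowercase so the fake table matches case-normalized lookups, then strip
    # quote characters.
    db_fqn = db_fqn.lower()
    return db_fqn.replace('"', "")


print(get_fake_ephemeral_table_name("Analytics", "Staging", "Orders_Base"))
# -> analytics.staging.__datahub__dbt__ephemeral__orders_base
```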
@@ -819,9 +826,10 @@ def get_column_type(
     # if still not found, report the warning
     if TypeClass is None:
-        report.report_warning(
-            dataset_name, f"unable to map type {column_type} to metadata schema"
-        )
+        if column_type:
+            report.report_warning(
+                dataset_name, f"unable to map type {column_type} to metadata schema"
+            )
         TypeClass = NullTypeClass

     return SchemaFieldDataType(type=TypeClass())
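The extra `if column_type:` guard means the fallback to `NullTypeClass` still happens for missing types, but a warning is only reported when there was a real, non-empty type string that failed to map. A small sketch of the distinction (names are illustrative, not DataHub's API):

```python
from typing import List, Optional

KNOWN_TYPES = {"varchar", "bigint"}


def resolve_type(column_type: Optional[str], warnings: List[str]) -> str:
    if column_type in KNOWN_TYPES:
        return column_type
    if column_type:  # only warn when there was something to map
        warnings.append(f"unable to map type {column_type} to metadata schema")
    return "NullType"


warnings: List[str] = []
resolve_type(None, warnings)  # silent fallback, no warning
resolve_type("geography", warnings)  # warns, then falls back
print(warnings)
```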
@@ -1041,7 +1049,7 @@ class DBTSourceBase(StatefulIngestionSourceBase):

         # Iterate over the dbt nodes in topological order.
         # This ensures that we process upstream nodes before downstream nodes.
-        for dbt_name in topological_sort(
+        node_order = topological_sort(
             list(all_nodes_map.keys()),
             edges=list(
                 (upstream, node.dbt_name)
@@ -1049,7 +1057,8 @@ class DBTSourceBase(StatefulIngestionSourceBase):
                 for upstream in node.upstream_nodes
                 if upstream in all_nodes_map
             ),
-        ):
+        )
+        for dbt_name in node_order:
             node = all_nodes_map[dbt_name]
             logger.debug(f"Processing CLL/schemas for {node.dbt_name}")
@@ -1119,55 +1128,26 @@ class DBTSourceBase(StatefulIngestionSourceBase):

             # Run sql parser to infer the schema + generate column lineage.
             sql_result = None
-            if node.node_type in {"source", "test"}:
+            if node.node_type in {"source", "test", "seed"}:
                 # For sources, we generate CLL as a 1:1 mapping.
-                # We don't support CLL for tests (assertions).
+                # We don't support CLL for tests (assertions) or seeds.
                 pass
             elif node.compiled_code:
-                try:
-                    # Add CTE stops based on the upstreams list.
-                    cte_mapping = {
-                        cte_name: upstream_node.get_fake_ephemeral_table_name()
-                        for upstream_node in [
-                            all_nodes_map[upstream_node_name]
-                            for upstream_node_name in node.upstream_nodes
-                            if upstream_node_name in all_nodes_map
-                        ]
-                        if upstream_node.is_ephemeral_model()
-                        for cte_name in _get_dbt_cte_names(
-                            upstream_node.name, schema_resolver.platform
-                        )
-                    }
-
-                    preprocessed_sql = detach_ctes(
-                        parse_statements_and_pick(
-                            node.compiled_code,
-                            platform=schema_resolver.platform,
-                        ),
-                        platform=schema_resolver.platform,
-                        cte_mapping=cte_mapping,
-                    )
-                except Exception as e:
-                    self.report.sql_parser_detach_ctes_failures.append(node.dbt_name)
-                    logger.debug(
-                        f"Failed to detach CTEs from compiled code. {node.dbt_name} will not have column lineage."
-                    )
-                    sql_result = SqlParsingResult.make_from_error(e)
-                else:
-                    sql_result = sqlglot_lineage(
-                        preprocessed_sql, schema_resolver=schema_resolver
-                    )
-                    if sql_result.debug_info.error:
-                        self.report.sql_statements_table_error += 1
-                        logger.info(
-                            f"Failed to parse compiled code for {node.dbt_name}: {sql_result.debug_info.error}"
-                        )
-                    elif sql_result.debug_info.column_error:
-                        self.report.sql_statements_column_error += 1
-                        logger.info(
-                            f"Failed to generate CLL for {node.dbt_name}: {sql_result.debug_info.column_error}"
-                        )
-                    else:
-                        self.report.sql_statements_parsed += 1
+                # Add CTE stops based on the upstreams list.
+                cte_mapping = {
+                    cte_name: upstream_node.get_fake_ephemeral_table_name()
+                    for upstream_node in [
+                        all_nodes_map[upstream_node_name]
+                        for upstream_node_name in node.upstream_nodes
+                        if upstream_node_name in all_nodes_map
+                    ]
+                    if upstream_node.is_ephemeral_model()
+                    for cte_name in _get_dbt_cte_names(
+                        upstream_node.name, schema_resolver.platform
+                    )
+                }
+
+                sql_result = self._parse_cll(node, cte_mapping, schema_resolver)
             else:
                 self.report.sql_parser_skipped_missing_code.append(node.dbt_name)
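The `cte_mapping` comprehension maps every CTE name that dbt generates for an ephemeral upstream to that upstream's fake table name, so those CTE references can be "detached" into ordinary table references before lineage parsing; the ephemeral model's own SQL is parsed separately under the same fake name, which is what chains lineage through it. A simplified illustration of the idea using plain sqlglot (this is not DataHub's actual `detach_ctes` implementation):

```python
import sqlglot
from sqlglot import exp

# Hypothetical mapping: dbt's CTE for an ephemeral model -> fake table name.
cte_mapping = {"stg_orders": "db.schema.__datahub__dbt__ephemeral__stg_orders"}

sql = "WITH stg_orders AS (SELECT * FROM raw.orders) SELECT id FROM stg_orders"
statement = sqlglot.parse_one(sql, dialect="snowflake")


def swap(node: exp.Expression) -> exp.Expression:
    # Rewrite table references that name a mapped CTE to the fake table.
    if isinstance(node, exp.Table) and node.name in cte_mapping:
        return exp.to_table(cte_mapping[node.name], dialect="snowflake")
    return node


statement = statement.transform(swap)
# Simplification: drop the now-unreferenced WITH clause entirely.
for with_clause in statement.find_all(exp.With):
    with_clause.pop()

print(statement.sql(dialect="snowflake"))
# SELECT id FROM db.schema.__datahub__dbt__ephemeral__stg_orders
```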
@@ -1212,6 +1192,56 @@ class DBTSourceBase(StatefulIngestionSourceBase):
             if inferred_schema_fields:
                 node.columns_setdefault(inferred_schema_fields)

+    def _parse_cll(
+        self,
+        node: DBTNode,
+        cte_mapping: Dict[str, str],
+        schema_resolver: SchemaResolver,
+    ) -> SqlParsingResult:
+        assert node.compiled_code is not None
+
+        try:
+            picked_statement = parse_statements_and_pick(
+                node.compiled_code,
+                platform=schema_resolver.platform,
+            )
+        except Exception as e:
+            logger.debug(
+                f"Failed to parse compiled code. {node.dbt_name} will not have column lineage."
+            )
+            self.report.sql_parser_parse_failures += 1
+            self.report.sql_parser_parse_failures_list.append(node.dbt_name)
+            return SqlParsingResult.make_from_error(e)
+
+        try:
+            preprocessed_sql = detach_ctes(
+                picked_statement,
+                platform=schema_resolver.platform,
+                cte_mapping=cte_mapping,
+            )
+        except Exception as e:
+            self.report.sql_parser_detach_ctes_failures += 1
+            self.report.sql_parser_detach_ctes_failures_list.append(node.dbt_name)
+            logger.debug(
+                f"Failed to detach CTEs from compiled code. {node.dbt_name} will not have column lineage."
+            )
+            return SqlParsingResult.make_from_error(e)
+
+        sql_result = sqlglot_lineage(preprocessed_sql, schema_resolver=schema_resolver)
+        if sql_result.debug_info.table_error:
+            self.report.sql_parser_table_errors += 1
+            logger.info(
+                f"Failed to generate any CLL for {node.dbt_name}: {sql_result.debug_info.error}"
+            )
+        elif sql_result.debug_info.column_error:
+            self.report.sql_parser_column_errors += 1
+            logger.info(
+                f"Failed to generate CLL for {node.dbt_name}: {sql_result.debug_info.column_error}"
+            )
+        else:
+            self.report.sql_parser_successes += 1
+
+        return sql_result
+
     def create_dbt_platform_mces(
         self,
         dbt_nodes: List[DBTNode],
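The refactor into `_parse_cll` gives each failure mode (statement parsing, CTE detachment, table-level errors, column-level errors) its own counter and converts exceptions into `SqlParsingResult.make_from_error(...)` values, so one bad model degrades to missing lineage instead of aborting ingestion. A minimal sketch of that errors-as-data pattern, with a simplified stand-in for `SqlParsingResult`:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ParseResult:
    # Simplified stand-in for SqlParsingResult: errors are captured as data
    # rather than raised, so one bad model can't abort the whole run.
    error: Optional[Exception] = None

    @classmethod
    def make_from_error(cls, e: Exception) -> "ParseResult":
        return cls(error=e)


def parse(sql: str) -> ParseResult:
    try:
        if not sql.strip():
            raise ValueError("empty compiled code")
        return ParseResult()
    except Exception as e:
        return ParseResult.make_from_error(e)


print(parse("").error)  # ValueError('empty compiled code'); caller keeps going
```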

View File

@@ -365,7 +365,7 @@ def _column_level_lineage(  # noqa: C901
                 col_normalized = col

             table_schema_normalized_mapping[table][col_normalized] = col
-            normalized_table_schema[col_normalized] = col_type
+            normalized_table_schema[col_normalized] = col_type or "UNKNOWN"

         sqlglot_db_schema.add_table(
             table.as_sqlglot_table(),
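Falling back to the string `"UNKNOWN"` (a type sqlglot understands) rather than a null value keeps the column registered in sqlglot's schema, so column-to-table attribution still works even when type inference failed. A small sketch of why that matters, using sqlglot's public `MappingSchema` and `qualify` APIs:

```python
from sqlglot import parse_one
from sqlglot.optimizer.qualify import qualify
from sqlglot.schema import MappingSchema

# Register a column whose type we couldn't infer as "UNKNOWN"; it stays
# visible to the resolver, so it can still be attributed to its table.
schema = MappingSchema(dialect="snowflake")
schema.add_table("db.sch.orders", {"id": "INT", "payload": "UNKNOWN"})

qualified = qualify(
    parse_one("SELECT payload FROM db.sch.orders", dialect="snowflake"),
    schema=schema,
    dialect="snowflake",
)
print(qualified.sql(dialect="snowflake"))
```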
@@ -923,12 +923,20 @@ def _sqlglot_lineage_inner(
     out_urns = sorted({table_name_urn_mapping[table] for table in modified})

     column_lineage_urns = None
     if column_lineage:
-        column_lineage_urns = [
-            _translate_internal_column_lineage(
-                table_name_urn_mapping, internal_col_lineage, dialect=dialect
-            )
-            for internal_col_lineage in column_lineage
-        ]
+        try:
+            column_lineage_urns = [
+                _translate_internal_column_lineage(
+                    table_name_urn_mapping, internal_col_lineage, dialect=dialect
+                )
+                for internal_col_lineage in column_lineage
+            ]
+        except KeyError as e:
+            # When this happens, it's usually because of things like PIVOT where we can't
+            # really go up the scope chain.
+            logger.debug(
+                f"Failed to translate column lineage to urns: {e}", exc_info=True
+            )
+            debug_info.column_error = e

     query_type, query_type_props = get_query_type_of_sql(
         original_statement, dialect=dialect
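Design-wise, the `KeyError` handler downgrades the failure: table-level lineage (`out_urns`) is already computed and survives, while column lineage is dropped and recorded on `debug_info.column_error`. A tiny sketch of that degrade-gracefully shape (illustrative names only):

```python
from typing import Dict, List, Optional


def translate_column_lineage(
    cols: List[str], urn_map: Dict[str, str]
) -> Optional[List[str]]:
    # Column translation may fail (e.g. PIVOT outputs with no resolvable
    # source column); return None instead of raising, leaving the caller's
    # table-level lineage untouched.
    try:
        return [urn_map[col] for col in cols]
    except KeyError as e:
        print(f"Failed to translate column lineage to urns: {e}")
        return None


urn_map = {"id": "urn:li:schemaField:(...,id)"}
print(translate_column_lineage(["id"], urn_map))  # full column lineage
print(translate_column_lineage(["id", "pivoted"], urn_map))  # degrades to None
```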

View File

@@ -81,6 +81,8 @@ def parse_statement(

 def parse_statements_and_pick(sql: str, platform: DialectOrStr) -> sqlglot.Expression:
+    logger.debug("Parsing SQL query: %s", sql)
+
     dialect = get_dialect(platform)
     statements = [
         expression for expression in sqlglot.parse(sql, dialect=dialect) if expression
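For context, dbt compiled code is frequently multi-statement (session settings, comments, then the real query), which is why this helper parses everything and then picks one statement. A sketch of the shape, with an assumed "pick the largest statement" heuristic that may differ from the real selection logic:

```python
import sqlglot

sql = """
SET search_path = analytics;
SELECT o.id, c.name FROM orders o JOIN customers c ON o.customer_id = c.id;
"""

# sqlglot.parse returns one expression per statement (None for empty ones),
# matching the `if expression` filter in the diff above.
statements = [s for s in sqlglot.parse(sql, dialect="postgres") if s]

# Assumed heuristic, for illustration only: treat the largest statement as
# the main query.
main = max(statements, key=lambda s: len(s.sql()))
print(main.sql(dialect="postgres"))
```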

View File

@@ -907,6 +907,31 @@
             "lastRunId": "no-run-id-provided"
         }
     },
+    {
+        "entityType": "dataset",
+        "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)",
+        "changeType": "UPSERT",
+        "aspectName": "upstreamLineage",
+        "aspect": {
+            "json": {
+                "upstreams": [
+                    {
+                        "auditStamp": {
+                            "time": 0,
+                            "actor": "urn:li:corpuser:unknown"
+                        },
+                        "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD)",
+                        "type": "TRANSFORMED"
+                    }
+                ]
+            }
+        },
+        "systemMetadata": {
+            "lastObserved": 1643871600000,
+            "runId": "powerbi-test",
+            "lastRunId": "no-run-id-provided"
+        }
+    },
     {
         "entityType": "corpuser",
         "entityUrn": "urn:li:corpuser:users.User1@foo.com",