fix(ingest): fix lineage after dbt metadata ingestion when table name and identifier differ (#2596)

Remi 2021-05-25 19:59:35 -06:00 committed by GitHub
parent d23c58734b
commit 6aa133f99c
2 changed files with 33 additions and 16 deletions


@@ -53,7 +53,7 @@ We use a plugin architecture so that you can install only the dependencies you a
 | lookml | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ |
 | kafka | `pip install 'acryl-datahub[kafka]'` | Kafka source |
 | druid | `pip install 'acryl-datahub[druid]'` | Druid Source |
-| dbt | _no additional dependencies_ | DBT source |
+| dbt | _no additional dependencies_ | dbt source |
 | datahub-rest | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API |
 | datahub-kafka | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka |
@@ -595,19 +595,19 @@ source:
     filename: ./path/to/mce/file.json
 ```
-### DBT `dbt`
-Pull metadata from DBT output files:
+### dbt `dbt`
+Pull metadata from dbt artifacts files:
 - [dbt manifest file](https://docs.getdbt.com/reference/artifacts/manifest-json)
   - This file contains model, source and lineage data.
 - [dbt catalog file](https://docs.getdbt.com/reference/artifacts/catalog-json)
   - This file contains schema data.
-  - DBT does not record schema data for Ephemeral models, as such datahub will show Ephemeral models in the lineage, however there will be no associated schema for Ephemeral models
+  - dbt does not record schema data for Ephemeral models, as such datahub will show Ephemeral models in the lineage, however there will be no associated schema for Ephemeral models
 - target_platform:
-  - The data platform you are enriching with DBT metadata.
+  - The data platform you are enriching with dbt metadata.
   - [data platforms](https://github.com/linkedin/datahub/blob/master/gms/impl/src/main/resources/DataPlatformInfo.json)
-- load_schema:
+- load_schemas:
   - Load schemas from dbt catalog file, not necessary when the underlying data platform already has this data.
 ```yml
@@ -616,10 +616,12 @@ source:
   config:
     manifest_path: "./path/dbt/manifest_file.json"
     catalog_path: "./path/dbt/catalog_file.json"
-    target_platform: "postgres" # optional eg postgres, snowflake etc.
-    load_schema: True / False
+    target_platform: "postgres" # optional, eg "postgres", "snowflake", etc.
+    load_schemas: True or False
 ```
+Note: when `load_schemas` is False, models that use [identifiers](https://docs.getdbt.com/reference/resource-properties/identifier) to reference their source tables are ingested using the model identifier as the model name to preserve the lineage.
 ### Kafka Connect `kafka-connect`
 Extracts:
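To make the note above concrete, here is a minimal sketch of the naming rule it describes. The node dict mirrors the relevant keys of a dbt manifest.json entry; the helper name `resolve_dataset_name` and the surrounding code are illustrative only, not the connector's actual API.

```python
# Illustrative only: the naming rule described in the note above.
source_node = {
    "database": "analytics",
    "schema": "raw",
    "name": "raw_events",       # name of the source as declared in dbt
    "identifier": "events_v2",  # physical table it actually points to
}


def resolve_dataset_name(node: dict, load_schemas: bool) -> str:
    """Hypothetical helper: prefer the physical identifier when schemas are
    not loaded from the catalog, so the name matches the warehouse table."""
    if not load_schemas and "identifier" in node:
        name = node["identifier"]
    else:
        name = node["name"]
    return f'{node["database"]}.{node["schema"]}.{name}'


print(resolve_dataset_name(source_node, load_schemas=False))  # analytics.raw.events_v2
print(resolve_dataset_name(source_node, load_schemas=True))   # analytics.raw.raw_events
```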


@@ -58,7 +58,7 @@ class DBTNode:
     dbt_file_path: str
     node_type: str  # source, model
     materialization: str  # table, view, ephemeral
-    name: str
+    name: str  # name, identifier
     columns: List[DBTColumn]
     upstream_urns: List[str]
     datahub_urn: str
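For orientation, a self-contained approximation of the node container this hunk touches. Only fields visible in the surrounding diff are included; the trimmed-down `DBTColumn` stub and the field order are assumptions.

```python
from typing import List


class DBTColumn:
    # Stub for illustration; the real class carries more column metadata.
    name: str


class DBTNode:
    # Plain annotated class: instances are created empty and populated
    # field by field in extract_dbt_entities (see the next hunk).
    database: str
    schema: str
    dbt_name: str
    dbt_file_path: str
    node_type: str  # source, model
    materialization: str  # table, view, ephemeral
    name: str  # name, identifier
    columns: List[DBTColumn]
    upstream_urns: List[str]
    datahub_urn: str
```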
@@ -98,17 +98,24 @@ def extract_dbt_entities(
         dbtNode = DBTNode()
         dbtNode.dbt_name = key
-        dbtNode.node_type = node["resource_type"]
-        dbtNode.name = node["name"]
         dbtNode.database = node["database"]
         dbtNode.schema = node["schema"]
         dbtNode.dbt_file_path = node["original_file_path"]
+        dbtNode.node_type = node["resource_type"]
+
+        if "identifier" in node and load_catalog is False:
+            dbtNode.name = node["identifier"]
+        else:
+            dbtNode.name = node["name"]
         if "materialized" in node["config"].keys():
             # It's a model
             dbtNode.materialization = node["config"]["materialized"]
             dbtNode.upstream_urns = get_upstreams(
-                node["depends_on"]["nodes"], nodes, target_platform, environment
+                node["depends_on"]["nodes"],
+                nodes,
+                load_catalog,
+                target_platform,
+                environment,
             )
         else:
             # It's a source
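The resolved name ultimately lands in a DataHub dataset URN via `get_urn_from_dbtNode`, which the hunk above keeps calling. Below is a sketch of what that URN assembly plausibly looks like; the exact function body is an assumption, but the `urn:li:dataset:(urn:li:dataPlatform:...,<db>.<schema>.<name>,<env>)` shape is DataHub's standard dataset URN.

```python
def build_dataset_urn(
    database: str, schema: str, name: str, platform: str, env: str
) -> str:
    # Assumed stand-in for get_urn_from_dbtNode: fully qualified name plus
    # platform and environment, laid out as a DataHub dataset URN.
    return (
        f"urn:li:dataset:(urn:li:dataPlatform:{platform},"
        f"{database}.{schema}.{name},{env})"
    )


# With load_catalog=False and an identifier present, the URN carries the
# physical table name, matching what a postgres/snowflake source would emit.
print(build_dataset_urn("analytics", "raw", "events_v2", "postgres", "PROD"))
# urn:li:dataset:(urn:li:dataPlatform:postgres,analytics.raw.events_v2,PROD)
```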
@@ -190,19 +197,27 @@ def get_custom_properties(node: DBTNode) -> Dict[str, str]:
 def get_upstreams(
     upstreams: List[str],
     all_nodes: Dict[str, dict],
+    load_catalog: bool,
     target_platform: str,
     environment: str,
 ) -> List[str]:
     upstream_urns = []

     for upstream in upstreams:
-        upstream_node = all_nodes[upstream]
+        dbtNode_upstream = DBTNode()
+        dbtNode_upstream.database = all_nodes[upstream]["database"]
+        dbtNode_upstream.schema = all_nodes[upstream]["schema"]
+        if "identifier" in all_nodes[upstream] and load_catalog is False:
+            dbtNode_upstream.name = all_nodes[upstream]["identifier"]
+        else:
+            dbtNode_upstream.name = all_nodes[upstream]["name"]
+
         upstream_urns.append(
             get_urn_from_dbtNode(
-                upstream_node["database"],
-                upstream_node["schema"],
-                upstream_node["name"],
+                dbtNode_upstream.database,
+                dbtNode_upstream.schema,
+                dbtNode_upstream.name,
                 target_platform,
                 environment,
             )
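Putting the fix together, a hedged end-to-end sketch: an upstream source that declares an identifier now resolves to the same URN the warehouse connector would produce, so the model-to-table lineage edge lines up. The `all_nodes` dict, helper name, and URN formatting here are illustrative, not the module's real API.

```python
from typing import Dict, List

# Minimal stand-in for manifest entries; only the keys used below.
all_nodes: Dict[str, dict] = {
    "source.jaffle.raw_events": {
        "database": "analytics",
        "schema": "raw",
        "name": "raw_events",
        "identifier": "events_v2",  # physical table behind the source
    },
}


def upstream_urns_for(
    upstreams: List[str],
    nodes: Dict[str, dict],
    load_catalog: bool,
    platform: str,
    env: str,
) -> List[str]:
    # Re-implementation of the identifier fallback, for demonstration only.
    urns = []
    for key in upstreams:
        node = nodes[key]
        if "identifier" in node and not load_catalog:
            name = node["identifier"]
        else:
            name = node["name"]
        urns.append(
            f"urn:li:dataset:(urn:li:dataPlatform:{platform},"
            f'{node["database"]}.{node["schema"]}.{name},{env})'
        )
    return urns


# Before the fix this edge pointed at ...raw_events..., which no other source
# emitted; now it matches the table-level URN from the postgres ingestion.
print(upstream_urns_for(["source.jaffle.raw_events"], all_nodes, False, "postgres", "PROD"))
```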