fix(ingest): fix lineage after dbt metadata ingestion when table name and identifier differ (#2596)
This commit is contained in:
parent
d23c58734b
commit
6aa133f99c
@@ -53,7 +53,7 @@ We use a plugin architecture so that you can install only the dependencies you a
 | lookml | `pip install 'acryl-datahub[lookml]'` | LookML source, requires Python 3.7+ |
 | kafka | `pip install 'acryl-datahub[kafka]'` | Kafka source |
 | druid | `pip install 'acryl-datahub[druid]'` | Druid Source |
-| dbt | _no additional dependencies_ | DBT source |
+| dbt | _no additional dependencies_ | dbt source |
 | datahub-rest | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API |
 | datahub-kafka | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka |
@@ -595,19 +595,19 @@ source:
   filename: ./path/to/mce/file.json
 ```
 
-### DBT `dbt`
+### dbt `dbt`
 
-Pull metadata from DBT output files:
+Pull metadata from dbt artifacts files:
 
 - [dbt manifest file](https://docs.getdbt.com/reference/artifacts/manifest-json)
   - This file contains model, source and lineage data.
 - [dbt catalog file](https://docs.getdbt.com/reference/artifacts/catalog-json)
   - This file contains schema data.
-  - DBT does not record schema data for Ephemeral models, as such datahub will show Ephemeral models in the lineage, however there will be no associated schema for Ephemeral models
+  - dbt does not record schema data for Ephemeral models, as such datahub will show Ephemeral models in the lineage, however there will be no associated schema for Ephemeral models
 - target_platform:
-  - The data platform you are enriching with DBT metadata.
+  - The data platform you are enriching with dbt metadata.
   - [data platforms](https://github.com/linkedin/datahub/blob/master/gms/impl/src/main/resources/DataPlatformInfo.json)
-- load_schema:
+- load_schemas:
   - Load schemas from dbt catalog file, not necessary when the underlying data platform already has this data.
 
 ```yml
@@ -616,10 +616,12 @@ source:
   config:
     manifest_path: "./path/dbt/manifest_file.json"
     catalog_path: "./path/dbt/catalog_file.json"
-    target_platform: "postgres" # optional eg postgres, snowflake etc.
-    load_schema: True / False
+    target_platform: "postgres" # optional, eg "postgres", "snowflake", etc.
+    load_schemas: True or False
 ```
 
+Note: when `load_schemas` is False, models that use [identifiers](https://docs.getdbt.com/reference/resource-properties/identifier) to reference their source tables are ingested using the model identifier as the model name to preserve the lineage.
+
 ### Kafka Connect `kafka-connect`
 
 Extracts:
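For readers who want to exercise the new `load_schemas` option outside the CLI, here is a minimal sketch of driving the dbt recipe above from Python. It assumes the `Pipeline` helper from `datahub.ingestion.run.pipeline` in `acryl-datahub` and a reachable `datahub-rest` sink; the file paths, server URL, and platform value are placeholders, not part of this commit.

```python
# Minimal sketch: run the dbt recipe above programmatically.
# Assumes acryl-datahub with the datahub-rest plugin installed;
# paths, server URL, and platform are placeholders.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "dbt",
            "config": {
                "manifest_path": "./path/dbt/manifest_file.json",
                "catalog_path": "./path/dbt/catalog_file.json",
                "target_platform": "postgres",
                # With load_schemas False, source tables keep their declared
                # identifiers in the emitted lineage (the point of this fix).
                "load_schemas": False,
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()  # fail loudly if ingestion reported errors
```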
@@ -58,7 +58,7 @@ class DBTNode:
     dbt_file_path: str
     node_type: str  # source, model
     materialization: str  # table, view, ephemeral
-    name: str
+    name: str  # name, identifier
     columns: List[DBTColumn]
     upstream_urns: List[str]
     datahub_urn: str
@@ -98,17 +98,24 @@ def extract_dbt_entities(
         dbtNode = DBTNode()
 
         dbtNode.dbt_name = key
-        dbtNode.node_type = node["resource_type"]
-        dbtNode.name = node["name"]
         dbtNode.database = node["database"]
         dbtNode.schema = node["schema"]
         dbtNode.dbt_file_path = node["original_file_path"]
+        dbtNode.node_type = node["resource_type"]
+        if "identifier" in node and load_catalog is False:
+            dbtNode.name = node["identifier"]
+        else:
+            dbtNode.name = node["name"]
 
         if "materialized" in node["config"].keys():
             # It's a model
             dbtNode.materialization = node["config"]["materialized"]
             dbtNode.upstream_urns = get_upstreams(
-                node["depends_on"]["nodes"], nodes, target_platform, environment
+                node["depends_on"]["nodes"],
+                nodes,
+                load_catalog,
+                target_platform,
+                environment,
             )
         else:
             # It's a source
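The core of the fix is the name-resolution rule added above: when the catalog is not loaded, a node's declared `identifier` takes precedence over its `name`. The standalone sketch below restates that rule with hypothetical helper and sample-data names; only the `identifier`, `name`, and `load_catalog` keys come from the diff itself.

```python
from typing import Dict


def resolve_node_name(node: Dict[str, str], load_catalog: bool) -> str:
    """Hypothetical helper mirroring the check in extract_dbt_entities:
    prefer the declared identifier when the catalog is not loaded, so
    lineage points at the physical table rather than the dbt model name."""
    if "identifier" in node and load_catalog is False:
        return node["identifier"]
    return node["name"]


# Example: a dbt source whose identifier differs from its name.
source_node = {"name": "customers", "identifier": "raw_customers_v2"}
assert resolve_node_name(source_node, load_catalog=False) == "raw_customers_v2"
assert resolve_node_name(source_node, load_catalog=True) == "customers"
```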
@@ -190,19 +197,27 @@ def get_custom_properties(node: DBTNode) -> Dict[str, str]:
 def get_upstreams(
     upstreams: List[str],
     all_nodes: Dict[str, dict],
+    load_catalog: bool,
     target_platform: str,
     environment: str,
 ) -> List[str]:
     upstream_urns = []
 
     for upstream in upstreams:
-        upstream_node = all_nodes[upstream]
+        dbtNode_upstream = DBTNode()
+
+        dbtNode_upstream.database = all_nodes[upstream]["database"]
+        dbtNode_upstream.schema = all_nodes[upstream]["schema"]
+        if "identifier" in all_nodes[upstream] and load_catalog is False:
+            dbtNode_upstream.name = all_nodes[upstream]["identifier"]
+        else:
+            dbtNode_upstream.name = all_nodes[upstream]["name"]
 
         upstream_urns.append(
             get_urn_from_dbtNode(
-                upstream_node["database"],
-                upstream_node["schema"],
-                upstream_node["name"],
+                dbtNode_upstream.database,
+                dbtNode_upstream.schema,
+                dbtNode_upstream.name,
                 target_platform,
                 environment,
             )
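To see why the upstream name matters, it helps to look at what `get_urn_from_dbtNode` presumably produces from those `database`/`schema`/`name` values. That function is not part of this diff, so the sketch below assumes DataHub's usual dataset URN layout and uses made-up database, schema, and table values.

```python
# Rough sketch of the upstream dataset URN assembled in get_upstreams.
# get_urn_from_dbtNode is not shown in this diff, so the exact string layout
# here is an assumption based on DataHub's standard dataset URN format.
def build_dataset_urn(
    database: str, schema: str, name: str, platform: str, env: str
) -> str:
    dataset_name = f"{database}.{schema}.{name}"
    return f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{env})"


# With load_catalog disabled, an upstream declared with identifier
# "raw_customers_v2" now resolves to that table in the lineage:
print(build_dataset_urn("analytics", "public", "raw_customers_v2", "postgres", "PROD"))
# urn:li:dataset:(urn:li:dataPlatform:postgres,analytics.public.raw_customers_v2,PROD)
```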