mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-02 19:58:59 +00:00
docs: improve lineage docs (#10396)
This commit is contained in:
parent
aaeeaa7ff5
commit
ad3b8f9b09
@ -15,6 +15,7 @@ This guide will show you how to
|
||||
|
||||
- Add lineage between datasets.
|
||||
- Add column-level lineage between datasets.
|
||||
- Read lineage.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
@ -109,7 +110,7 @@ Expected Response:
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Expected Outcomes of Adding Lineage
|
||||
### Expected Outcome
|
||||
|
||||
You can now see the lineage between `fct_users_deleted` and `logging_events`.
|
||||
|
||||
@ -117,6 +118,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.
|
||||
<img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/lineage-added.png"/>
|
||||
</p>
|
||||
|
||||
|
||||
## Add Column-level Lineage
|
||||
|
||||
<Tabs>
|
||||
@ -129,7 +131,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Expected Outcome of Adding Column Level Lineage
|
||||
### Expected Outcome
|
||||
|
||||
You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage.
|
||||
|
||||
@ -137,18 +139,30 @@ You can now see the column-level lineage between datasets. Note that you have to
|
||||
<img width="70%" src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/column-level-lineage-added.png"/>
|
||||
</p>
|
||||
|
||||
## Read Table Lineage
|
||||
## Add Lineage to Non-Dataset Entities
|
||||
|
||||
You can also add lineage to non-dataset entities, such as DataJobs, Charts, and Dashboards.
|
||||
Please refer to the following examples.
|
||||
|
||||
| Connection | Examples | A.K.A |
|
||||
|---------------------|-------------------|-----------------|
|
||||
| DataJob to DataFlow | - [lineage_job_dataflow.py](../../../metadata-ingestion/examples/library/lineage_job_dataflow.py) | |
|
||||
| DataJob to Dataset | - [lineage_dataset_job_dataset.py](../../../metadata-ingestion/examples/library/lineage_dataset_job_dataset.py) <br /> | Pipeline Lineage |
|
||||
| Chart to Dashboard | - [lineage_chart_dashboard.py](../../../metadata-ingestion/examples/library/lineage_chart_dashboard.py) | |
|
||||
| Chart to Dataset | - [lineage_dataset_chart.py](../../../metadata-ingestion/examples/library/lineage_dataset_chart.py) | |
|
||||
|
||||
|
||||
## Read Lineage (Lineage Impact Analysis)
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="graphql" label="GraphQL" default>
|
||||
|
||||
```graphql
|
||||
query searchAcrossLineage {
|
||||
searchAcrossLineage(
|
||||
query scrollAcrossLineage {
|
||||
scrollAcrossLineage(
|
||||
input: {
|
||||
query: "*"
|
||||
urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)"
|
||||
start: 0
|
||||
urn: "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"
|
||||
count: 10
|
||||
direction: DOWNSTREAM
|
||||
orFilters: [
|
||||
@ -175,8 +189,13 @@ query searchAcrossLineage {
|
||||
}
|
||||
}
|
||||
```
|
||||
:::info Degree
|
||||
Note that `degree` means the number of hops in the lineage. For example, `degree: 1` means the immediate downstream entities, `degree: 2` means the entities that are two hops away, and so on.
|
||||
:::
|
||||
|
||||
The GraphQL example shows using lineage degrees as a filter, but additional search filters can be included here as well.
|
||||
This will perform a multi-hop lineage search on the urn specified. For more information about the `scrollAcrossLineage` query, please refer to [scrollAcrossLineage](https://datahubproject.io/docs/graphql/queries/#scrollacrosslineage).
|
||||
|
||||
This example shows using lineage degrees as a filter, but additional search filters can be included here as well.
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="curl" label="Curl">
|
||||
@ -184,7 +203,7 @@ This example shows using lineage degrees as a filter, but additional search filt
|
||||
```shell
|
||||
curl --location --request POST 'http://localhost:8080/api/graphql' \
|
||||
--header 'Authorization: Bearer <my-access-token>' \
|
||||
--header 'Content-Type: application/json' --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}"
|
||||
--header 'Content-Type: application/json' --data-raw '{ "query": "query scrollAcrossLineage { scrollAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)\" count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}" }'
|
||||
```
|
||||
|
||||
@ -192,67 +211,116 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \
|
||||
<TabItem value="python" label="Python">
|
||||
|
||||
```python
|
||||
{{ inline /metadata-ingestion/examples/library/read_lineage_rest.py show_path_as_comment }}
|
||||
{{ inline /metadata-ingestion/examples/library/read_lineage_execute_graphql.py show_path_as_comment }}
|
||||
```
|
||||
The Python SDK example shows how to read lineage of a dataset. Please note that the `aspect_type` parameter can vary depending on the entity type.
|
||||
Below are a few examples of `aspect_type` for different entities.
|
||||
|
||||
|Entity|Aspect_type| Reference |
|
||||
|-------|------------|--------------------------------------------------------------------------|
|
||||
|Dataset|`UpstreamLineageClass`| [Link](/docs/generated/metamodel/entities/dataset.md#upstreamlineage) |
|
||||
|Datajob|`DataJobInputOutputClass`| [Link](/docs/generated/metamodel/entities/dataJob.md#datajobinputoutput) |
|
||||
|Dashboard|`DashboardInfoClass`| [Link](/docs/generated/metamodel/entities/dashboard.md#dashboardinfo) |
|
||||
|DataFlow|`DataFlowInfoClass`| [Link](/docs/generated/metamodel/entities/dataFlow.md#dataflowinfo) |
|
||||
|
||||
Learn more about lineages of different entities in the [Add Lineage to Non-Dataset Entities](#add-lineage-to-non-dataset-entities) Section.
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
|
||||
### Expected Outcome
|
||||
|
||||
As an outcome, you should see the downstream entities of `logging_events`.
|
||||
|
||||
```graphql
|
||||
{
|
||||
"data": {
|
||||
"scrollAcrossLineage": {
|
||||
"searchResults": [
|
||||
{
|
||||
"degree": 1,
|
||||
"entity": {
|
||||
"urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)",
|
||||
"type": "DATA_JOB"
|
||||
}
|
||||
},
|
||||
...
|
||||
{
|
||||
"degree": 2,
|
||||
"entity": {
|
||||
"urn": "urn:li:mlPrimaryKey:(user_analytics,user_name)",
|
||||
"type": "MLPRIMARY_KEY"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"extensions": {}
|
||||
}
|
||||
```
|
||||
|
||||
## Read Column-level Lineage
|
||||
|
||||
You can also read column-level lineage via the Python SDK.
|
||||
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="python" label="Python">
|
||||
|
||||
```python
|
||||
{{ inline /metadata-ingestion/examples/library/read_lineage_dataset_rest.py show_path_as_comment }}
|
||||
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
This will perform a multi-hop lineage search on the urn specified. For more information about the `searchAcrossLineage` query, please refer to [searchAcrossLineage](https://datahubproject.io/docs/graphql/queries/#searchacrosslineage).
|
||||
### Expected Outcome
|
||||
|
||||
## Read Column Lineage
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="graphql" label="GraphQL" default>
|
||||
As a response, you will get the full lineage information like this.
|
||||
|
||||
```graphql
|
||||
query searchAcrossLineage {
|
||||
searchAcrossLineage(
|
||||
input: {
|
||||
query: "*"
|
||||
urn: "urn:li:schemaField(urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD),profile_id)"
|
||||
start: 0
|
||||
count: 10
|
||||
direction: DOWNSTREAM
|
||||
orFilters: [
|
||||
{
|
||||
and: [
|
||||
{
|
||||
condition: EQUAL
|
||||
negated: false
|
||||
field: "degree"
|
||||
values: ["1", "2", "3+"]
|
||||
{
|
||||
"UpstreamLineageClass": {
|
||||
"upstreams": [
|
||||
{
|
||||
"UpstreamClass": {
|
||||
"auditStamp": {
|
||||
"AuditStampClass": {
|
||||
"time": 0,
|
||||
"actor": "urn:li:corpuser:unknown",
|
||||
"impersonator": null,
|
||||
"message": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"created": null,
|
||||
"dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)",
|
||||
"type": "TRANSFORMED",
|
||||
"properties": null,
|
||||
"query": null
|
||||
}
|
||||
]
|
||||
}
|
||||
) {
|
||||
searchResults {
|
||||
degree
|
||||
entity {
|
||||
urn
|
||||
type
|
||||
}
|
||||
}
|
||||
],
|
||||
"fineGrainedLineages": [
|
||||
{
|
||||
"FineGrainedLineageClass": {
|
||||
"upstreamType": "FIELD_SET",
|
||||
"upstreams": [
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD),browser_id)",
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD),user_id)"
|
||||
],
|
||||
"downstreamType": "FIELD",
|
||||
"downstreams": [
|
||||
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD),browser)"
|
||||
],
|
||||
"transformOperation": null,
|
||||
"confidenceScore": 1.0,
|
||||
"query": null
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
This example shows using lineage degrees as a filter, but additional search filters can be included here as well.
|
||||
|
||||
</TabItem>
|
||||
<TabItem value="curl" label="Curl">
|
||||
|
||||
```shell
|
||||
curl --location --request POST 'http://localhost:8080/api/graphql' \
|
||||
--header 'Authorization: Bearer <my-access-token>' \
|
||||
--header 'Content-Type: application/json' --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:schemaField(urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD),profile_id)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}"
|
||||
}}'
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
This will perform a multi-hop lineage search on the urn specified. You can see schemaField URNs are made up of two parts: first the table they are a column of, and second the path of the column. For more information about the `searchAcrossLineage` query, please refer to [searchAcrossLineage](https://datahubproject.io/docs/graphql/queries/#searchacrosslineage).
|
||||
|
||||
|
||||
@ -0,0 +1,13 @@
|
||||
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
|
||||
|
||||
# Imports for metadata model classes
|
||||
from datahub.metadata.schema_classes import DataJobInputOutputClass
|
||||
|
||||
# Get the current lineage for a datajob
|
||||
gms_endpoint = "http://localhost:8080"
|
||||
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
|
||||
|
||||
# DataJobInputOutput is an aspect of dataJob entities, so the URN must identify a dataJob (not a dataset)
urn = "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)"
|
||||
result = graph.get_aspect(entity_urn=urn, aspect_type=DataJobInputOutputClass)
|
||||
|
||||
print(result)
|
||||
@ -0,0 +1,13 @@
|
||||
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
|
||||
|
||||
# Imports for metadata model classes
|
||||
from datahub.metadata.schema_classes import UpstreamLineageClass
|
||||
|
||||
# Get the current lineage for a dataset
|
||||
gms_endpoint = "http://localhost:8080"
|
||||
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
|
||||
|
||||
urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"
|
||||
result = graph.get_aspect(entity_urn=urn, aspect_type=UpstreamLineageClass)
|
||||
|
||||
print(result)
|
||||
@ -0,0 +1,44 @@
|
||||
# Executing GraphQL queries requires a DataHubGraph client (RestEmitter is not enough)
|
||||
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
|
||||
|
||||
gms_endpoint = "http://localhost:8080"
|
||||
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
|
||||
|
||||
# Search the downstream lineage of a dataset with the scrollAcrossLineage query
|
||||
query = """
|
||||
query scrollAcrossLineage($input: ScrollAcrossLineageInput!) {
|
||||
scrollAcrossLineage(input: $input) {
|
||||
searchResults {
|
||||
degree
|
||||
entity {
|
||||
urn
|
||||
type
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
|
||||
# Variables bound to the `$input` parameter of the scrollAcrossLineage query:
# search downstream of `logging_events`, keeping only results whose lineage
# `degree` matches one of the listed values.
variables = {
    "input": {
        "query": "*",
        "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)",
        "count": 10,
        "direction": "DOWNSTREAM",
        "orFilters": [
            {
                "and": [
                    {
                        "condition": "EQUAL",
                        # `negated` is a Boolean in the GraphQL input; send a
                        # real bool rather than the string "false".
                        "negated": False,
                        "field": "degree",
                        "values": ["1", "2", "3+"],
                    }
                ]
            }
        ],
    }
}
|
||||
result = graph.execute_graphql(query=query, variables=variables)
|
||||
|
||||
print(result)
|
||||
@ -1,43 +0,0 @@
|
||||
# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough)
|
||||
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
|
||||
|
||||
gms_endpoint = "http://localhost:8080"
|
||||
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
|
||||
|
||||
# Query multiple aspects from entity
|
||||
query = """
|
||||
query searchAcrossLineage {
|
||||
searchAcrossLineage(
|
||||
input: {
|
||||
query: "*"
|
||||
urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)"
|
||||
start: 0
|
||||
count: 10
|
||||
direction: DOWNSTREAM
|
||||
orFilters: [
|
||||
{
|
||||
and: [
|
||||
{
|
||||
condition: EQUAL
|
||||
negated: false
|
||||
field: "degree"
|
||||
values: ["1", "2", "3+"]
|
||||
}
|
||||
] # Additional search filters can be included here as well
|
||||
}
|
||||
]
|
||||
}
|
||||
) {
|
||||
searchResults {
|
||||
degree
|
||||
entity {
|
||||
urn
|
||||
type
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
result = graph.execute_graphql(query=query)
|
||||
|
||||
print(result)
|
||||
Loading…
x
Reference in New Issue
Block a user