mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-11-03 20:27:50 +00:00 
			
		
		
		
	docs: improve lineage docs (#10396)
This commit is contained in:
		
							parent
							
								
									aaeeaa7ff5
								
							
						
					
					
						commit
						ad3b8f9b09
					
				@ -15,6 +15,7 @@ This guide will show you how to
 | 
			
		||||
 | 
			
		||||
- Add lineage between datasets.
 | 
			
		||||
- Add column-level lineage between datasets.
 | 
			
		||||
- Read lineage.
 | 
			
		||||
 | 
			
		||||
## Prerequisites
 | 
			
		||||
 | 
			
		||||
@ -109,7 +110,7 @@ Expected Response:
 | 
			
		||||
</TabItem>
 | 
			
		||||
</Tabs>
 | 
			
		||||
 | 
			
		||||
### Expected Outcomes of Adding Lineage
 | 
			
		||||
### Expected Outcome
 | 
			
		||||
 | 
			
		||||
You can now see the lineage between `fct_users_deleted` and `logging_events`.
 | 
			
		||||
 | 
			
		||||
@ -117,6 +118,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.
 | 
			
		||||
  <img width="70%"  src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/lineage-added.png"/>
 | 
			
		||||
</p>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Add Column-level Lineage
 | 
			
		||||
 | 
			
		||||
<Tabs>
 | 
			
		||||
@ -129,7 +131,7 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`.
 | 
			
		||||
</TabItem>
 | 
			
		||||
</Tabs>
 | 
			
		||||
 | 
			
		||||
### Expected Outcome of Adding Column Level Lineage
 | 
			
		||||
### Expected Outcome
 | 
			
		||||
 | 
			
		||||
You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage.
 | 
			
		||||
 | 
			
		||||
@ -137,18 +139,30 @@ You can now see the column-level lineage between datasets. Note that you have to
 | 
			
		||||
  <img width="70%"  src="https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/apis/tutorials/column-level-lineage-added.png"/>
 | 
			
		||||
</p>
 | 
			
		||||
 | 
			
		||||
## Read Table Lineage
 | 
			
		||||
## Add Lineage to Non-Dataset Entities
 | 
			
		||||
 | 
			
		||||
You can also add lineage to non-dataset entities, such as DataJobs, Charts, and Dashboards.
 | 
			
		||||
Please refer to the following examples.
 | 
			
		||||
 | 
			
		||||
| Connection          | Examples         | A.K.A           |
 | 
			
		||||
|---------------------|-------------------|-----------------|
 | 
			
		||||
| DataJob to DataFlow | - [lineage_job_dataflow.py](../../../metadata-ingestion/examples/library/lineage_job_dataflow.py)    | | 
 | 
			
		||||
| DataJob to Dataset  | - [lineage_dataset_job_dataset.py](../../../metadata-ingestion/examples/library/lineage_dataset_job_dataset.py) <br /> | Pipeline Lineage |
 | 
			
		||||
| Chart to Dashboard  | - [lineage_chart_dashboard.py](../../../metadata-ingestion/examples/library/lineage_chart_dashboard.py) |  |
 | 
			
		||||
| Chart to Dataset    | - [lineage_dataset_chart.py](../../../metadata-ingestion/examples/library/lineage_dataset_chart.py) |  |
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Read Lineage (Lineage Impact Analysis)
 | 
			
		||||
 | 
			
		||||
<Tabs>
 | 
			
		||||
<TabItem value="graphql" label="GraphQL" default>
 | 
			
		||||
 | 
			
		||||
```graphql
 | 
			
		||||
query searchAcrossLineage {
 | 
			
		||||
  searchAcrossLineage(
 | 
			
		||||
query scrollAcrossLineage {
 | 
			
		||||
  scrollAcrossLineage(
 | 
			
		||||
    input: {
 | 
			
		||||
      query: "*"
 | 
			
		||||
      urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)"
 | 
			
		||||
      start: 0
 | 
			
		||||
      urn: "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"
 | 
			
		||||
      count: 10
 | 
			
		||||
      direction: DOWNSTREAM
 | 
			
		||||
      orFilters: [
 | 
			
		||||
@ -175,8 +189,13 @@ query searchAcrossLineage {
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
:::info Degree
 | 
			
		||||
Note that `degree` means the number of hops in the lineage. For example, `degree: 1` means the immediate downstream entities, `degree: 2` means the entities that are two hops away, and so on.
 | 
			
		||||
:::
 | 
			
		||||
 | 
			
		||||
The GraphQL example shows using lineage degrees as a filter, but additional search filters can be included here as well.
 | 
			
		||||
This will perform a multi-hop lineage search on the urn specified. For more information about the `scrollAcrossLineage` query, please refer to [scrollAcrossLineage](https://datahubproject.io/docs/graphql/queries/#scrollacrosslineage).
 | 
			
		||||
 | 
			
		||||
This example shows using lineage degrees as a filter, but additional search filters can be included here as well.
 | 
			
		||||
 | 
			
		||||
</TabItem>
 | 
			
		||||
<TabItem value="curl" label="Curl">
 | 
			
		||||
@ -184,7 +203,7 @@ This example shows using lineage degrees as a filter, but additional search filt
 | 
			
		||||
```shell
 | 
			
		||||
curl --location --request POST 'http://localhost:8080/api/graphql' \
 | 
			
		||||
--header 'Authorization: Bearer <my-access-token>' \
 | 
			
		||||
--header 'Content-Type: application/json'  --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}"
 | 
			
		||||
--header 'Content-Type: application/json'  --data-raw '{ "query": "query scrollAcrossLineage { scrollAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)\" count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } } }" }'
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
@ -192,67 +211,116 @@ curl --location --request POST 'http://localhost:8080/api/graphql' \
 | 
			
		||||
<TabItem value="python" label="Python">
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
{{ inline /metadata-ingestion/examples/library/read_lineage_rest.py show_path_as_comment }}
 | 
			
		||||
{{ inline /metadata-ingestion/examples/library/read_lineage_execute_graphql.py show_path_as_comment }}
 | 
			
		||||
```
 | 
			
		||||
The Python SDK example shows how to read the lineage of a dataset. Please note that the `aspect_type` parameter varies depending on the entity type.
 | 
			
		||||
Below are a few examples of `aspect_type` for different entities.
 | 
			
		||||
 | 
			
		||||
|Entity|Aspect_type| Reference                                                                |
 | 
			
		||||
|-------|------------|--------------------------------------------------------------------------|
 | 
			
		||||
|Dataset|`UpstreamLineageClass`| [Link](/docs/generated/metamodel/entities/dataset.md#upstreamlineage)    |
 | 
			
		||||
|Datajob|`DataJobInputOutputClass`| [Link](/docs/generated/metamodel/entities/dataJob.md#datajobinputoutput) |
 | 
			
		||||
|Dashboard|`DashboardInfoClass`| [Link](/docs/generated/metamodel/entities/dashboard.md#dashboardinfo)    |
 | 
			
		||||
|DataFlow|`DataFlowInfoClass`| [Link](/docs/generated/metamodel/entities/dataFlow.md#dataflowinfo)      |
 | 
			
		||||
 | 
			
		||||
Learn more about the lineage of different entities in the [Add Lineage to Non-Dataset Entities](#add-lineage-to-non-dataset-entities) section.
 | 
			
		||||
 | 
			
		||||
</TabItem>
 | 
			
		||||
</Tabs>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Expected Outcome
 | 
			
		||||
 | 
			
		||||
As an outcome, you should see the downstream entities of `logging_events`.
 | 
			
		||||
 | 
			
		||||
```graphql
 | 
			
		||||
{
 | 
			
		||||
  "data": {
 | 
			
		||||
    "scrollAcrossLineage": {
 | 
			
		||||
      "searchResults": [
 | 
			
		||||
        {
 | 
			
		||||
          "degree": 1,
 | 
			
		||||
          "entity": {
 | 
			
		||||
            "urn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_123)",
 | 
			
		||||
            "type": "DATA_JOB"
 | 
			
		||||
          }
 | 
			
		||||
        },
 | 
			
		||||
        ...
 | 
			
		||||
        {
 | 
			
		||||
          "degree": 2,
 | 
			
		||||
          "entity": {
 | 
			
		||||
            "urn": "urn:li:mlPrimaryKey:(user_analytics,user_name)",
 | 
			
		||||
            "type": "MLPRIMARY_KEY"
 | 
			
		||||
          }
 | 
			
		||||
        }
 | 
			
		||||
      ]
 | 
			
		||||
    }
 | 
			
		||||
  },
 | 
			
		||||
  "extensions": {}
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
## Read Column-level Lineage 
 | 
			
		||||
 | 
			
		||||
You can also read column-level lineage via the Python SDK.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
<Tabs>
 | 
			
		||||
<TabItem value="python" label="Python">
 | 
			
		||||
 | 
			
		||||
```python
 | 
			
		||||
{{ inline /metadata-ingestion/examples/library/read_lineage_dataset_rest.py show_path_as_comment }}
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
</TabItem>
 | 
			
		||||
</Tabs>
 | 
			
		||||
 | 
			
		||||
This will perform a multi-hop lineage search on the urn specified. For more information about the `searchAcrossLineage` query, please refer to [searchAcrossLineage](https://datahubproject.io/docs/graphql/queries/#searchacrosslineage).
 | 
			
		||||
### Expected Outcome
 | 
			
		||||
 | 
			
		||||
## Read Column Lineage
 | 
			
		||||
 | 
			
		||||
<Tabs>
 | 
			
		||||
<TabItem value="graphql" label="GraphQL" default>
 | 
			
		||||
In response, you will get the full lineage information, like this:
 | 
			
		||||
 | 
			
		||||
```graphql
 | 
			
		||||
query searchAcrossLineage {
 | 
			
		||||
  searchAcrossLineage(
 | 
			
		||||
    input: {
 | 
			
		||||
      query: "*"
 | 
			
		||||
      urn: "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD),profile_id)"
 | 
			
		||||
      start: 0
 | 
			
		||||
      count: 10
 | 
			
		||||
      direction: DOWNSTREAM
 | 
			
		||||
      orFilters: [
 | 
			
		||||
        {
 | 
			
		||||
          and: [
 | 
			
		||||
            {
 | 
			
		||||
              condition: EQUAL
 | 
			
		||||
              negated: false
 | 
			
		||||
              field: "degree"
 | 
			
		||||
              values: ["1", "2", "3+"]
 | 
			
		||||
{
 | 
			
		||||
  "UpstreamLineageClass": {
 | 
			
		||||
    "upstreams": [
 | 
			
		||||
      {
 | 
			
		||||
        "UpstreamClass": {
 | 
			
		||||
          "auditStamp": {
 | 
			
		||||
            "AuditStampClass": {
 | 
			
		||||
              "time": 0,
 | 
			
		||||
              "actor": "urn:li:corpuser:unknown",
 | 
			
		||||
              "impersonator": null,
 | 
			
		||||
              "message": null
 | 
			
		||||
            }
 | 
			
		||||
          ]
 | 
			
		||||
          },
 | 
			
		||||
          "created": null,
 | 
			
		||||
          "dataset": "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)",
 | 
			
		||||
          "type": "TRANSFORMED",
 | 
			
		||||
          "properties": null,
 | 
			
		||||
          "query": null
 | 
			
		||||
        }
 | 
			
		||||
      ]
 | 
			
		||||
    }
 | 
			
		||||
  ) {
 | 
			
		||||
    searchResults {
 | 
			
		||||
      degree
 | 
			
		||||
      entity {
 | 
			
		||||
        urn
 | 
			
		||||
        type
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
    ],
 | 
			
		||||
    "fineGrainedLineages": [
 | 
			
		||||
      {
 | 
			
		||||
        "FineGrainedLineageClass": {
 | 
			
		||||
          "upstreamType": "FIELD_SET",
 | 
			
		||||
          "upstreams": [
 | 
			
		||||
            "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD),browser_id)",
 | 
			
		||||
            "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD),user_id)"
 | 
			
		||||
          ],
 | 
			
		||||
          "downstreamType": "FIELD",
 | 
			
		||||
          "downstreams": [
 | 
			
		||||
            "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD),browser)"
 | 
			
		||||
          ],
 | 
			
		||||
          "transformOperation": null,
 | 
			
		||||
          "confidenceScore": 1.0,
 | 
			
		||||
          "query": null
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    ]
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
This example shows using lineage degrees as a filter, but additional search filters can be included here as well.
 | 
			
		||||
 | 
			
		||||
</TabItem>
 | 
			
		||||
<TabItem value="curl" label="Curl">
 | 
			
		||||
 | 
			
		||||
```shell
 | 
			
		||||
curl --location --request POST 'http://localhost:8080/api/graphql' \
 | 
			
		||||
--header 'Authorization: Bearer <my-access-token>' \
 | 
			
		||||
--header 'Content-Type: application/json'  --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:schemaField(urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD),profile_id)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}"
 | 
			
		||||
}}'
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
</TabItem>
 | 
			
		||||
</Tabs>
 | 
			
		||||
 | 
			
		||||
This will perform a multi-hop lineage search on the urn specified. You can see schemaField URNs are made up of two parts: first the table they are a column of, and second the path of the column. For more information about the `searchAcrossLineage` query, please refer to [searchAcrossLineage](https://datahubproject.io/docs/graphql/queries/#searchacrosslineage).
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -0,0 +1,13 @@
 | 
			
		||||
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
 | 
			
		||||
 | 
			
		||||
# Imports for metadata model classes
 | 
			
		||||
from datahub.metadata.schema_classes import DataJobInputOutputClass
 | 
			
		||||
 | 
			
		||||
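# Get the current lineage for a datajob (NOTE: DataJobInputOutputClass is a dataJob aspect, so `urn` below should be a dataJob URN, not a dataset URN)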
 | 
			
		||||
gms_endpoint = "http://localhost:8080"
 | 
			
		||||
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
 | 
			
		||||
 | 
			
		||||
urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"
 | 
			
		||||
result = graph.get_aspect(entity_urn=urn, aspect_type=DataJobInputOutputClass)
 | 
			
		||||
 | 
			
		||||
print(result)
 | 
			
		||||
@ -0,0 +1,13 @@
 | 
			
		||||
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
 | 
			
		||||
 | 
			
		||||
# Imports for metadata model classes
 | 
			
		||||
from datahub.metadata.schema_classes import UpstreamLineageClass
 | 
			
		||||
 | 
			
		||||
# Get the current lineage for a dataset
 | 
			
		||||
gms_endpoint = "http://localhost:8080"
 | 
			
		||||
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
 | 
			
		||||
 | 
			
		||||
urn = "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)"
 | 
			
		||||
result = graph.get_aspect(entity_urn=urn, aspect_type=UpstreamLineageClass)
 | 
			
		||||
 | 
			
		||||
print(result)
 | 
			
		||||
@ -0,0 +1,44 @@
 | 
			
		||||
# Executing GraphQL queries requires access to the DataHubGraph (RestEmitter is not enough)
 | 
			
		||||
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
 | 
			
		||||
 | 
			
		||||
gms_endpoint = "http://localhost:8080"
 | 
			
		||||
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
 | 
			
		||||
 | 
			
		||||
# Search for entities downstream of the dataset, across multiple lineage hops
 | 
			
		||||
query = """
 | 
			
		||||
query scrollAcrossLineage($input: ScrollAcrossLineageInput!) {
 | 
			
		||||
  scrollAcrossLineage(input: $input) {
 | 
			
		||||
    searchResults {
 | 
			
		||||
      degree
 | 
			
		||||
      entity {
 | 
			
		||||
        urn
 | 
			
		||||
        type
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
"""
 | 
			
		||||
 | 
			
		||||
variables = {
 | 
			
		||||
    "input": {
 | 
			
		||||
        "query": "*",
 | 
			
		||||
        "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,logging_events,PROD)",
 | 
			
		||||
        "count": 10,
 | 
			
		||||
        "direction": "DOWNSTREAM",
 | 
			
		||||
        "orFilters": [
 | 
			
		||||
            {
 | 
			
		||||
                "and": [
 | 
			
		||||
                    {
 | 
			
		||||
                        "condition": "EQUAL",
 | 
			
		||||
                        "negated": "false",
 | 
			
		||||
                        "field": "degree",
 | 
			
		||||
                        "values": ["1", "2", "3+"],
 | 
			
		||||
                    }
 | 
			
		||||
                ]
 | 
			
		||||
            }
 | 
			
		||||
        ],
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
result = graph.execute_graphql(query=query, variables=variables)
 | 
			
		||||
 | 
			
		||||
print(result)
 | 
			
		||||
@ -1,43 +0,0 @@
 | 
			
		||||
# read-modify-write requires access to the DataHubGraph (RestEmitter is not enough)
 | 
			
		||||
from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
 | 
			
		||||
 | 
			
		||||
gms_endpoint = "http://localhost:8080"
 | 
			
		||||
graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
 | 
			
		||||
 | 
			
		||||
# Query multiple aspects from entity
 | 
			
		||||
query = """
 | 
			
		||||
query searchAcrossLineage {
 | 
			
		||||
  searchAcrossLineage(
 | 
			
		||||
    input: {
 | 
			
		||||
      query: "*"
 | 
			
		||||
      urn: "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)"
 | 
			
		||||
      start: 0
 | 
			
		||||
      count: 10
 | 
			
		||||
      direction: DOWNSTREAM
 | 
			
		||||
      orFilters: [
 | 
			
		||||
        {
 | 
			
		||||
          and: [
 | 
			
		||||
            {
 | 
			
		||||
              condition: EQUAL
 | 
			
		||||
              negated: false
 | 
			
		||||
              field: "degree"
 | 
			
		||||
              values: ["1", "2", "3+"]
 | 
			
		||||
            }
 | 
			
		||||
          ]                                     # Additional search filters can be included here as well
 | 
			
		||||
        }
 | 
			
		||||
      ]
 | 
			
		||||
    }
 | 
			
		||||
  ) {
 | 
			
		||||
    searchResults {
 | 
			
		||||
      degree
 | 
			
		||||
      entity {
 | 
			
		||||
        urn
 | 
			
		||||
        type
 | 
			
		||||
      }
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
}
 | 
			
		||||
"""
 | 
			
		||||
result = graph.execute_graphql(query=query)
 | 
			
		||||
 | 
			
		||||
print(result)
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user