mirror of
https://github.com/datahub-project/datahub.git
synced 2025-08-14 12:16:52 +00:00
feat: add docs on column-level lineage (#8062)
This commit is contained in:
parent
820981072f
commit
a8b622a016
@ -12,7 +12,8 @@ For more information about lineage, refer to [About DataHub Lineage](/docs/linea
|
||||
|
||||
This guide will show you how to
|
||||
|
||||
- Add lineage between two hive datasets.
|
||||
- Add lineage between datasets.
|
||||
- Add column-level lineage between datasets.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
@ -112,3 +113,21 @@ Expected Response:
|
||||
You can now see the lineage between `fct_users_deleted` and `logging_events`.
|
||||
|
||||

|
||||
|
||||
## Add Column-level Lineage
|
||||
|
||||
<Tabs>
|
||||
<TabItem value="python" label="Python">
|
||||
|
||||
```python
|
||||
{{ inline /metadata-ingestion/examples/library/lineage_emitter_dataset_finegrained_sample.py show_path_as_comment }}
|
||||
```
|
||||
|
||||
</TabItem>
|
||||
</Tabs>
|
||||
|
||||
### Expected Outcome of Adding Column-level Lineage
|
||||
|
||||
You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage.
|
||||
|
||||

|
||||
|
BIN
docs/imgs/apis/tutorials/column-level-lineage-added.png
Normal file
BIN
docs/imgs/apis/tutorials/column-level-lineage-added.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 129 KiB |
@ -0,0 +1,53 @@
|
||||
import datahub.emitter.mce_builder as builder
|
||||
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
||||
from datahub.emitter.rest_emitter import DatahubRestEmitter
|
||||
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
|
||||
DatasetLineageType,
|
||||
FineGrainedLineage,
|
||||
FineGrainedLineageDownstreamType,
|
||||
FineGrainedLineageUpstreamType,
|
||||
Upstream,
|
||||
UpstreamLineage,
|
||||
)
|
||||
|
||||
|
||||
def datasetUrn(tbl):
    """Return the dataset URN for table ``tbl`` on the "hive" platform."""
    platform = "hive"
    return builder.make_dataset_urn(platform, tbl)
|
||||
|
||||
|
||||
def fldUrn(tbl, fld):
    """Return the schema-field URN for field ``fld`` of table ``tbl``."""
    parent_urn = datasetUrn(tbl)
    return builder.make_schema_field_urn(parent_urn, fld)
|
||||
|
||||
|
||||
# Column-level (fine-grained) lineage: a set of upstream fields feeds a
# single downstream field.
upstreamFields = [
    fldUrn("fct_users_deleted", "browser_id"),
    fldUrn("fct_users_created", "user_id"),
]
downstreamFields = [fldUrn("logging_events", "browser")]

fineGrainedLineages = [
    FineGrainedLineage(
        upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
        upstreams=upstreamFields,
        downstreamType=FineGrainedLineageDownstreamType.FIELD,
        downstreams=downstreamFields,
    ),
]


# A dataset-level upstream is included alongside the fine-grained lineage to
# check that it does not conflict with an existing Upstream entry, in
# particular the DownstreamOf relationship.
upstream = Upstream(
    dataset=datasetUrn("fct_users_deleted"),
    type=DatasetLineageType.TRANSFORMED,
)

# Combine dataset-level and column-level lineage into one aspect.
fieldLineages = UpstreamLineage(
    upstreams=[upstream],
    fineGrainedLineages=fineGrainedLineages,
)

# The aspect is attached to the downstream dataset.
lineageMcp = MetadataChangeProposalWrapper(
    entityUrn=datasetUrn("logging_events"),
    aspect=fieldLineages,
)

# Create an emitter to the GMS REST API.
emitter = DatahubRestEmitter("http://localhost:8080")

# Emit metadata!
emitter.emit_mcp(lineageMcp)
|
Loading…
x
Reference in New Issue
Block a user