diff --git a/docs/api/tutorials/lineage.md b/docs/api/tutorials/lineage.md index db23de4f05..e3a4e65ca5 100644 --- a/docs/api/tutorials/lineage.md +++ b/docs/api/tutorials/lineage.md @@ -12,7 +12,8 @@ For more information about lineage, refer to [About DataHub Lineage](/docs/linea This guide will show you how to -- Add lineage between two hive datasets. +- Add lineage between datasets. +- Add column-level lineage between datasets. ## Prerequisites @@ -112,3 +113,21 @@ Expected Response: You can now see the lineage between `fct_users_deleted` and `logging_events`. ![lineage-added](../../imgs/apis/tutorials/lineage-added.png) + +## Add Column-level Lineage + + + + +```python +{{ inline /metadata-ingestion/examples/library/lineage_emitter_dataset_finegrained_sample.py show_path_as_comment }} +``` + + + + +### Expected Outcome of Adding Column-level Lineage + +You can now see the column-level lineage between datasets. Note that you must enable `Show Columns` to see the column-level lineage. 
+ +![column-level-lineage-added](../../imgs/apis/tutorials/column-level-lineage-added.png) diff --git a/docs/imgs/apis/tutorials/column-level-lineage-added.png b/docs/imgs/apis/tutorials/column-level-lineage-added.png new file mode 100644 index 0000000000..6092436e0a Binary files /dev/null and b/docs/imgs/apis/tutorials/column-level-lineage-added.png differ diff --git a/metadata-ingestion/examples/library/lineage_emitter_dataset_finegrained_sample.py b/metadata-ingestion/examples/library/lineage_emitter_dataset_finegrained_sample.py new file mode 100644 index 0000000000..637b24887f --- /dev/null +++ b/metadata-ingestion/examples/library/lineage_emitter_dataset_finegrained_sample.py @@ -0,0 +1,53 @@ +import datahub.emitter.mce_builder as builder +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.rest_emitter import DatahubRestEmitter +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + DatasetLineageType, + FineGrainedLineage, + FineGrainedLineageDownstreamType, + FineGrainedLineageUpstreamType, + Upstream, + UpstreamLineage, +) + + +def datasetUrn(tbl): + return builder.make_dataset_urn("hive", tbl) + + +def fldUrn(tbl, fld): + return builder.make_schema_field_urn(datasetUrn(tbl), fld) + + +fineGrainedLineages = [ + FineGrainedLineage( + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=[ + fldUrn("fct_users_deleted", "browser_id"), + fldUrn("fct_users_created", "user_id"), + ], + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=[fldUrn("logging_events", "browser")], + ), +] + + +# Dataset-level upstream, included only to check for conflicts with an existing Upstream, particularly the DownstreamOf relationship +upstream = Upstream( + dataset=datasetUrn("fct_users_deleted"), type=DatasetLineageType.TRANSFORMED +) + +fieldLineages = UpstreamLineage( + upstreams=[upstream], fineGrainedLineages=fineGrainedLineages +) + +lineageMcp = MetadataChangeProposalWrapper( + entityUrn=datasetUrn("logging_events"), 
aspect=fieldLineages, +) + +# Create an emitter to the GMS REST API. +emitter = DatahubRestEmitter("http://localhost:8080") + +# Emit metadata! +emitter.emit_mcp(lineageMcp)