| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  | import datahub.emitter.mce_builder as builder | 
					
						
							|  |  |  | from datahub.emitter.mcp import MetadataChangeProposalWrapper | 
					
						
							|  |  |  | from datahub.emitter.rest_emitter import DatahubRestEmitter | 
					
						
							|  |  |  | from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( | 
					
						
							|  |  |  |     DatasetLineageType, | 
					
						
							|  |  |  |     FineGrainedLineage, | 
					
						
							|  |  |  |     FineGrainedLineageDownstreamType, | 
					
						
							|  |  |  |     FineGrainedLineageUpstreamType, | 
					
						
							|  |  |  |     Upstream, | 
					
						
							| 
									
										
										
										
											2022-03-30 01:55:51 +05:30
										 |  |  |     UpstreamLineage, | 
					
						
							| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2022-03-30 01:55:51 +05:30
										 |  |  | from datahub.metadata.schema_classes import ChangeTypeClass | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | def datasetUrn(tbl): | 
					
						
							|  |  |  |     return builder.make_dataset_urn("postgres", tbl) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-30 01:55:51 +05:30
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  | def fldUrn(tbl, fld): | 
					
						
							| 
									
										
										
										
											2022-03-30 01:55:51 +05:30
										 |  |  |     return builder.make_schema_field_urn(datasetUrn(tbl), fld) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-25 21:05:48 +05:30
										 |  |  | # Lineage of fields in a dataset | 
					
						
							| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  | # c1      <-- unknownFunc(bar2.c1, bar4.c1) | 
					
						
							|  |  |  | # c2      <-- myfunc(bar3.c2) | 
					
						
							|  |  |  | # {c3,c4} <-- unknownFunc(bar2.c2, bar2.c3, bar3.c1) | 
					
						
							|  |  |  | # c5      <-- unknownFunc(bar3) | 
					
						
							|  |  |  | # {c6,c7} <-- unknownFunc(bar4) | 
					
						
							|  |  |  | 
 | 
					
						
# Note that the semantics of the "transformOperation" value are contextual.
# In the example above it is regarded as some kind of UDF, but it could also be an expression, etc.
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-30 01:55:51 +05:30
										 |  |  | fineGrainedLineages = [ | 
					
						
							|  |  |  |     FineGrainedLineage( | 
					
						
							|  |  |  |         upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, | 
					
						
							|  |  |  |         upstreams=[fldUrn("bar2", "c1"), fldUrn("bar4", "c1")], | 
					
						
							|  |  |  |         downstreamType=FineGrainedLineageDownstreamType.FIELD, | 
					
						
							|  |  |  |         downstreams=[fldUrn("bar", "c1")], | 
					
						
							|  |  |  |     ), | 
					
						
							|  |  |  |     FineGrainedLineage( | 
					
						
							|  |  |  |         upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, | 
					
						
							|  |  |  |         upstreams=[fldUrn("bar3", "c2")], | 
					
						
							|  |  |  |         downstreamType=FineGrainedLineageDownstreamType.FIELD, | 
					
						
							|  |  |  |         downstreams=[fldUrn("bar", "c2")], | 
					
						
							|  |  |  |         confidenceScore=0.8, | 
					
						
							|  |  |  |         transformOperation="myfunc", | 
					
						
							|  |  |  |     ), | 
					
						
							|  |  |  |     FineGrainedLineage( | 
					
						
							|  |  |  |         upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, | 
					
						
							|  |  |  |         upstreams=[fldUrn("bar2", "c2"), fldUrn("bar2", "c3"), fldUrn("bar3", "c1")], | 
					
						
							|  |  |  |         downstreamType=FineGrainedLineageDownstreamType.FIELD_SET, | 
					
						
							|  |  |  |         downstreams=[fldUrn("bar", "c3"), fldUrn("bar", "c4")], | 
					
						
							|  |  |  |         confidenceScore=0.7, | 
					
						
							|  |  |  |     ), | 
					
						
							|  |  |  |     FineGrainedLineage( | 
					
						
							|  |  |  |         upstreamType=FineGrainedLineageUpstreamType.DATASET, | 
					
						
							|  |  |  |         upstreams=[datasetUrn("bar3")], | 
					
						
							|  |  |  |         downstreamType=FineGrainedLineageDownstreamType.FIELD, | 
					
						
							|  |  |  |         downstreams=[fldUrn("bar", "c5")], | 
					
						
							|  |  |  |     ), | 
					
						
							|  |  |  |     FineGrainedLineage( | 
					
						
							|  |  |  |         upstreamType=FineGrainedLineageUpstreamType.DATASET, | 
					
						
							|  |  |  |         upstreams=[datasetUrn("bar4")], | 
					
						
							|  |  |  |         downstreamType=FineGrainedLineageDownstreamType.FIELD_SET, | 
					
						
							|  |  |  |         downstreams=[fldUrn("bar", "c6"), fldUrn("bar", "c7")], | 
					
						
							|  |  |  |     ), | 
					
						
							|  |  |  | ] | 
					
						
							| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # this is just to check if any conflicts with existing Upstream, particularly the DownstreamOf relationship | 
					
						
							|  |  |  | upstream = Upstream(dataset=datasetUrn("bar2"), type=DatasetLineageType.TRANSFORMED) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-03-30 01:55:51 +05:30
										 |  |  | fieldLineages = UpstreamLineage( | 
					
						
							|  |  |  |     upstreams=[upstream], fineGrainedLineages=fineGrainedLineages | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-02-25 21:05:48 +05:30
										 |  |  | lineageMcp = MetadataChangeProposalWrapper( | 
					
						
							| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  |     entityType="dataset", | 
					
						
							|  |  |  |     changeType=ChangeTypeClass.UPSERT, | 
					
						
							|  |  |  |     entityUrn=datasetUrn("bar"), | 
					
						
							|  |  |  |     aspectName="upstreamLineage", | 
					
						
							| 
									
										
										
										
											2022-03-30 01:55:51 +05:30
										 |  |  |     aspect=fieldLineages, | 
					
						
							| 
									
										
										
										
											2022-02-24 14:36:19 -08:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Create an emitter to the GMS REST API. | 
					
						
							|  |  |  | emitter = DatahubRestEmitter("http://localhost:8080") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Emit metadata! | 
					
						
							| 
									
										
										
										
											2022-03-30 01:55:51 +05:30
										 |  |  | emitter.emit_mcp(lineageMcp) |