| 
									
										
										
										
											2022-02-18 09:45:45 -08:00
										 |  |  | import logging | 
					
						
							|  |  |  | import time | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from datahub.emitter.mce_builder import make_dataset_urn, make_term_urn | 
					
						
							|  |  |  | from datahub.emitter.mcp import MetadataChangeProposalWrapper | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # read-modify-write requires access to the DataHubGraph (RestEmitter is not enough) | 
					
						
							|  |  |  | from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Imports for metadata model classes | 
					
						
							|  |  |  | from datahub.metadata.schema_classes import ( | 
					
						
							|  |  |  |     AuditStampClass, | 
					
						
							|  |  |  |     EditableSchemaFieldInfoClass, | 
					
						
							|  |  |  |     EditableSchemaMetadataClass, | 
					
						
							|  |  |  |     GlossaryTermAssociationClass, | 
					
						
							|  |  |  |     GlossaryTermsClass, | 
					
						
							|  |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-11-30 18:11:36 -05:00
										 |  |  | from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path | 
					
						
							| 
									
										
										
										
											2022-02-18 09:45:45 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | log = logging.getLogger(__name__) | 
					
						
							|  |  |  | logging.basicConfig(level=logging.INFO) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Inputs -> the column, dataset and the term to set | 
					
						
							|  |  |  | column = "address.zipcode" | 
					
						
							|  |  |  | dataset_urn = make_dataset_urn(platform="hive", name="realestate_db.sales", env="PROD") | 
					
						
							|  |  |  | term_to_add = make_term_urn("Classification.Location") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # First we get the current editable schema metadata | 
					
						
							|  |  |  | gms_endpoint = "http://localhost:8080" | 
					
						
							|  |  |  | graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint)) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2022-10-11 20:35:41 +00:00
										 |  |  | current_editable_schema_metadata = graph.get_aspect( | 
					
						
							|  |  |  |     entity_urn=dataset_urn, aspect_type=EditableSchemaMetadataClass | 
					
						
							| 
									
										
										
										
											2022-02-18 09:45:45 -08:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | # Some pre-built objects to help all the conditional pathways | 
					
						
							|  |  |  | now = int(time.time() * 1000)  # milliseconds since epoch | 
					
						
							|  |  |  | current_timestamp = AuditStampClass(time=now, actor="urn:li:corpuser:ingestion") | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | term_association_to_add = GlossaryTermAssociationClass(urn=term_to_add) | 
					
						
							|  |  |  | term_aspect_to_set = GlossaryTermsClass( | 
					
						
							|  |  |  |     terms=[term_association_to_add], auditStamp=current_timestamp | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | field_info_to_set = EditableSchemaFieldInfoClass( | 
					
						
							|  |  |  |     fieldPath=column, glossaryTerms=term_aspect_to_set | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | need_write = False | 
					
						
							|  |  |  | field_match = False | 
					
						
							|  |  |  | if current_editable_schema_metadata: | 
					
						
							|  |  |  |     for fieldInfo in current_editable_schema_metadata.editableSchemaFieldInfo: | 
					
						
							|  |  |  |         if get_simple_field_path_from_v2_field_path(fieldInfo.fieldPath) == column: | 
					
						
							|  |  |  |             # we have some editable schema metadata for this field | 
					
						
							|  |  |  |             field_match = True | 
					
						
							|  |  |  |             if fieldInfo.glossaryTerms: | 
					
						
							|  |  |  |                 if term_to_add not in [x.urn for x in fieldInfo.glossaryTerms.terms]: | 
					
						
							|  |  |  |                     # this tag is not present | 
					
						
							|  |  |  |                     fieldInfo.glossaryTerms.terms.append(term_association_to_add) | 
					
						
							|  |  |  |                     need_write = True | 
					
						
							|  |  |  |             else: | 
					
						
							|  |  |  |                 fieldInfo.glossaryTerms = term_aspect_to_set | 
					
						
							|  |  |  |                 need_write = True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     if not field_match: | 
					
						
							|  |  |  |         # this field isn't present in the editable schema metadata aspect, add it | 
					
						
							|  |  |  |         field_info = field_info_to_set | 
					
						
							|  |  |  |         current_editable_schema_metadata.editableSchemaFieldInfo.append(field_info) | 
					
						
							|  |  |  |         need_write = True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | else: | 
					
						
							|  |  |  |     # create a brand new editable schema metadata aspect | 
					
						
							|  |  |  |     current_editable_schema_metadata = EditableSchemaMetadataClass( | 
					
						
							|  |  |  |         editableSchemaFieldInfo=[field_info_to_set], | 
					
						
							|  |  |  |         created=current_timestamp, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     need_write = True | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | if need_write: | 
					
						
							|  |  |  |     event: MetadataChangeProposalWrapper = MetadataChangeProposalWrapper( | 
					
						
							|  |  |  |         entityUrn=dataset_urn, | 
					
						
							|  |  |  |         aspect=current_editable_schema_metadata, | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     graph.emit(event) | 
					
						
							| 
									
										
										
										
											2023-05-03 07:32:23 +09:00
										 |  |  |     log.info(f"Term {term_to_add} added to column {column} of dataset {dataset_urn}") | 
					
						
							| 
									
										
										
										
											2022-02-18 09:45:45 -08:00
										 |  |  | 
 | 
					
						
							|  |  |  | else: | 
					
						
							| 
									
										
										
										
											2023-05-03 07:32:23 +09:00
										 |  |  |     log.info(f"Term {term_to_add} already attached to column {column}, omitting write") |