mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-10-31 10:49:00 +00:00 
			
		
		
		
	feat(ingest/transform): extend ownership transformer to other entities (#11700)
This commit is contained in:
		
							parent
							
								
									bea253a064
								
							
						
					
					
						commit
						02f0a3dee7
					
				| @ -13,9 +13,7 @@ from datahub.emitter.mce_builder import Aspect | |||||||
| from datahub.emitter.mcp import MetadataChangeProposalWrapper | from datahub.emitter.mcp import MetadataChangeProposalWrapper | ||||||
| from datahub.ingestion.api.common import PipelineContext | from datahub.ingestion.api.common import PipelineContext | ||||||
| from datahub.ingestion.graph.client import DataHubGraph | from datahub.ingestion.graph.client import DataHubGraph | ||||||
| from datahub.ingestion.transformer.dataset_transformer import ( | from datahub.ingestion.transformer.dataset_transformer import OwnershipTransformer | ||||||
|     DatasetOwnershipTransformer, |  | ||||||
| ) |  | ||||||
| from datahub.metadata.schema_classes import ( | from datahub.metadata.schema_classes import ( | ||||||
|     BrowsePathsV2Class, |     BrowsePathsV2Class, | ||||||
|     MetadataChangeProposalClass, |     MetadataChangeProposalClass, | ||||||
| @ -37,7 +35,7 @@ class AddDatasetOwnershipConfig(TransformerSemanticsConfigModel): | |||||||
|     is_container: bool = False |     is_container: bool = False | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class AddDatasetOwnership(DatasetOwnershipTransformer): | class AddDatasetOwnership(OwnershipTransformer): | ||||||
|     """Transformer that adds owners to datasets according to a callback function.""" |     """Transformer that adds owners to datasets according to a callback function.""" | ||||||
| 
 | 
 | ||||||
|     ctx: PipelineContext |     ctx: PipelineContext | ||||||
|  | |||||||
| @ -27,6 +27,22 @@ class DatasetTransformer(BaseTransformer, SingleAspectTransformer, metaclass=ABC | |||||||
|         return ["dataset"] |         return ["dataset"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | class OwnershipTransformer( | ||||||
|  |     DatasetTransformer, SingleAspectTransformer, metaclass=ABCMeta | ||||||
|  | ): | ||||||
|  |     def aspect_name(self) -> str: | ||||||
|  |         return "ownership" | ||||||
|  | 
 | ||||||
|  |     def entity_types(self) -> List[str]: | ||||||
|  |         return [ | ||||||
|  |             "dataset", | ||||||
|  |             "dataJob", | ||||||
|  |             "dataFlow", | ||||||
|  |             "chart", | ||||||
|  |             "dashboard", | ||||||
|  |         ] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| class TagTransformer(BaseTransformer, SingleAspectTransformer, metaclass=ABCMeta): | class TagTransformer(BaseTransformer, SingleAspectTransformer, metaclass=ABCMeta): | ||||||
|     """Transformer that does transform sequentially on each tag.""" |     """Transformer that does transform sequentially on each tag.""" | ||||||
| 
 | 
 | ||||||
| @ -47,11 +63,6 @@ class ContainerTransformer(BaseTransformer, SingleAspectTransformer, metaclass=A | |||||||
|         return ["container"] |         return ["container"] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class DatasetOwnershipTransformer(DatasetTransformer, metaclass=ABCMeta): |  | ||||||
|     def aspect_name(self) -> str: |  | ||||||
|         return "ownership" |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class DatasetDomainTransformer(DatasetTransformer, metaclass=ABCMeta): | class DatasetDomainTransformer(DatasetTransformer, metaclass=ABCMeta): | ||||||
|     def aspect_name(self) -> str: |     def aspect_name(self) -> str: | ||||||
|         return "domains" |         return "domains" | ||||||
|  | |||||||
| @ -4,9 +4,7 @@ from typing import List, Optional, Set, cast | |||||||
| import datahub.emitter.mce_builder as builder | import datahub.emitter.mce_builder as builder | ||||||
| from datahub.configuration.common import ConfigModel | from datahub.configuration.common import ConfigModel | ||||||
| from datahub.ingestion.api.common import PipelineContext | from datahub.ingestion.api.common import PipelineContext | ||||||
| from datahub.ingestion.transformer.dataset_transformer import ( | from datahub.ingestion.transformer.dataset_transformer import OwnershipTransformer | ||||||
|     DatasetOwnershipTransformer, |  | ||||||
| ) |  | ||||||
| from datahub.metadata.schema_classes import ( | from datahub.metadata.schema_classes import ( | ||||||
|     OwnerClass, |     OwnerClass, | ||||||
|     OwnershipClass, |     OwnershipClass, | ||||||
| @ -20,7 +18,7 @@ class PatternCleanUpOwnershipConfig(ConfigModel): | |||||||
|     pattern_for_cleanup: List[str] |     pattern_for_cleanup: List[str] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class PatternCleanUpOwnership(DatasetOwnershipTransformer): | class PatternCleanUpOwnership(OwnershipTransformer): | ||||||
|     """Transformer that clean the ownership URN.""" |     """Transformer that clean the ownership URN.""" | ||||||
| 
 | 
 | ||||||
|     ctx: PipelineContext |     ctx: PipelineContext | ||||||
|  | |||||||
| @ -3,9 +3,7 @@ from typing import Optional, cast | |||||||
| from datahub.configuration.common import ConfigModel | from datahub.configuration.common import ConfigModel | ||||||
| from datahub.emitter.mce_builder import Aspect | from datahub.emitter.mce_builder import Aspect | ||||||
| from datahub.ingestion.api.common import PipelineContext | from datahub.ingestion.api.common import PipelineContext | ||||||
| from datahub.ingestion.transformer.dataset_transformer import ( | from datahub.ingestion.transformer.dataset_transformer import OwnershipTransformer | ||||||
|     DatasetOwnershipTransformer, |  | ||||||
| ) |  | ||||||
| from datahub.metadata.schema_classes import OwnershipClass | from datahub.metadata.schema_classes import OwnershipClass | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @ -13,7 +11,7 @@ class ClearDatasetOwnershipConfig(ConfigModel): | |||||||
|     pass |     pass | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class SimpleRemoveDatasetOwnership(DatasetOwnershipTransformer): | class SimpleRemoveDatasetOwnership(OwnershipTransformer): | ||||||
|     """Transformer that clears all owners on each dataset.""" |     """Transformer that clears all owners on each dataset.""" | ||||||
| 
 | 
 | ||||||
|     def __init__(self, config: ClearDatasetOwnershipConfig, ctx: PipelineContext): |     def __init__(self, config: ClearDatasetOwnershipConfig, ctx: PipelineContext): | ||||||
|  | |||||||
| @ -220,7 +220,7 @@ def make_dataset_with_properties() -> models.MetadataChangeEventClass: | |||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_simple_dataset_ownership_transformation(mock_time): | def test_dataset_ownership_transformation(mock_time): | ||||||
|     no_owner_aspect = make_generic_dataset() |     no_owner_aspect = make_generic_dataset() | ||||||
| 
 | 
 | ||||||
|     with_owner_aspect = make_dataset_with_owner() |     with_owner_aspect = make_dataset_with_owner() | ||||||
| @ -254,7 +254,7 @@ def test_simple_dataset_ownership_transformation(mock_time): | |||||||
|         transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs]) |         transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs]) | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     assert len(outputs) == len(inputs) + 1 |     assert len(outputs) == len(inputs) + 2 | ||||||
| 
 | 
 | ||||||
|     # Check the first entry. |     # Check the first entry. | ||||||
|     first_ownership_aspect = builder.get_aspect_if_available( |     first_ownership_aspect = builder.get_aspect_if_available( | ||||||
| @ -287,11 +287,21 @@ def test_simple_dataset_ownership_transformation(mock_time): | |||||||
|         ] |         ] | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  |     third_ownership_aspect = outputs[4].record.aspect | ||||||
|  |     assert third_ownership_aspect | ||||||
|  |     assert len(third_ownership_aspect.owners) == 2 | ||||||
|  |     assert all( | ||||||
|  |         [ | ||||||
|  |             owner.type == models.OwnershipTypeClass.DATAOWNER and owner.typeUrn is None | ||||||
|  |             for owner in second_ownership_aspect.owners | ||||||
|  |         ] | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|     # Verify that the third entry is unchanged. |     # Verify that the third entry is unchanged. | ||||||
|     assert inputs[2] == outputs[2].record |     assert inputs[2] == outputs[2].record | ||||||
| 
 | 
 | ||||||
|     # Verify that the last entry is EndOfStream |     # Verify that the last entry is EndOfStream | ||||||
|     assert inputs[3] == outputs[4].record |     assert inputs[-1] == outputs[-1].record | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_simple_dataset_ownership_with_type_transformation(mock_time): | def test_simple_dataset_ownership_with_type_transformation(mock_time): | ||||||
| @ -1003,6 +1013,7 @@ def test_pattern_dataset_ownership_transformation(mock_time): | |||||||
|                 "rules": { |                 "rules": { | ||||||
|                     ".*example1.*": [builder.make_user_urn("person1")], |                     ".*example1.*": [builder.make_user_urn("person1")], | ||||||
|                     ".*example2.*": [builder.make_user_urn("person2")], |                     ".*example2.*": [builder.make_user_urn("person2")], | ||||||
|  |                     ".*dag_abc.*": [builder.make_user_urn("person2")], | ||||||
|                 } |                 } | ||||||
|             }, |             }, | ||||||
|             "ownership_type": "DATAOWNER", |             "ownership_type": "DATAOWNER", | ||||||
| @ -1014,7 +1025,9 @@ def test_pattern_dataset_ownership_transformation(mock_time): | |||||||
|         transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs]) |         transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs]) | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     assert len(outputs) == len(inputs) + 1  # additional MCP due to the no-owner MCE |     assert ( | ||||||
|  |         len(outputs) == len(inputs) + 2 | ||||||
|  |     )  # additional MCP due to the no-owner MCE + datajob | ||||||
| 
 | 
 | ||||||
|     # Check the first entry. |     # Check the first entry. | ||||||
|     assert inputs[0] == outputs[0].record |     assert inputs[0] == outputs[0].record | ||||||
| @ -1042,6 +1055,16 @@ def test_pattern_dataset_ownership_transformation(mock_time): | |||||||
|         ] |         ] | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  |     third_ownership_aspect = outputs[4].record.aspect | ||||||
|  |     assert third_ownership_aspect | ||||||
|  |     assert len(third_ownership_aspect.owners) == 1 | ||||||
|  |     assert all( | ||||||
|  |         [ | ||||||
|  |             owner.type == models.OwnershipTypeClass.DATAOWNER | ||||||
|  |             for owner in third_ownership_aspect.owners | ||||||
|  |         ] | ||||||
|  |     ) | ||||||
|  | 
 | ||||||
|     # Verify that the third entry is unchanged. |     # Verify that the third entry is unchanged. | ||||||
|     assert inputs[2] == outputs[2].record |     assert inputs[2] == outputs[2].record | ||||||
| 
 | 
 | ||||||
| @ -1122,14 +1145,14 @@ def test_pattern_container_and_dataset_ownership_transformation( | |||||||
|     pipeline_context.graph.get_aspect = fake_get_aspect  # type: ignore |     pipeline_context.graph.get_aspect = fake_get_aspect  # type: ignore | ||||||
| 
 | 
 | ||||||
|     # No owner aspect for the first dataset |     # No owner aspect for the first dataset | ||||||
|     no_owner_aspect = models.MetadataChangeEventClass( |     no_owner_aspect_dataset = models.MetadataChangeEventClass( | ||||||
|         proposedSnapshot=models.DatasetSnapshotClass( |         proposedSnapshot=models.DatasetSnapshotClass( | ||||||
|             urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)", |             urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)", | ||||||
|             aspects=[models.StatusClass(removed=False)], |             aspects=[models.StatusClass(removed=False)], | ||||||
|         ), |         ), | ||||||
|     ) |     ) | ||||||
|     # Dataset with an existing owner |     # Dataset with an existing owner | ||||||
|     with_owner_aspect = models.MetadataChangeEventClass( |     with_owner_aspect_dataset = models.MetadataChangeEventClass( | ||||||
|         proposedSnapshot=models.DatasetSnapshotClass( |         proposedSnapshot=models.DatasetSnapshotClass( | ||||||
|             urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)", |             urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)", | ||||||
|             aspects=[ |             aspects=[ | ||||||
| @ -1148,8 +1171,7 @@ def test_pattern_container_and_dataset_ownership_transformation( | |||||||
|         ), |         ), | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     # Not a dataset, should be ignored |     datajob = models.MetadataChangeEventClass( | ||||||
|     not_a_dataset = models.MetadataChangeEventClass( |  | ||||||
|         proposedSnapshot=models.DataJobSnapshotClass( |         proposedSnapshot=models.DataJobSnapshotClass( | ||||||
|             urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)", |             urn="urn:li:dataJob:(urn:li:dataFlow:(airflow,dag_abc,PROD),task_456)", | ||||||
|             aspects=[ |             aspects=[ | ||||||
| @ -1163,9 +1185,9 @@ def test_pattern_container_and_dataset_ownership_transformation( | |||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     inputs = [ |     inputs = [ | ||||||
|         no_owner_aspect, |         no_owner_aspect_dataset, | ||||||
|         with_owner_aspect, |         with_owner_aspect_dataset, | ||||||
|         not_a_dataset, |         datajob, | ||||||
|         EndOfStream(), |         EndOfStream(), | ||||||
|     ] |     ] | ||||||
| 
 | 
 | ||||||
| @ -1176,6 +1198,7 @@ def test_pattern_container_and_dataset_ownership_transformation( | |||||||
|                 "rules": { |                 "rules": { | ||||||
|                     ".*example1.*": [builder.make_user_urn("person1")], |                     ".*example1.*": [builder.make_user_urn("person1")], | ||||||
|                     ".*example2.*": [builder.make_user_urn("person2")], |                     ".*example2.*": [builder.make_user_urn("person2")], | ||||||
|  |                     ".*dag_abc.*": [builder.make_user_urn("person3")], | ||||||
|                 } |                 } | ||||||
|             }, |             }, | ||||||
|             "ownership_type": "DATAOWNER", |             "ownership_type": "DATAOWNER", | ||||||
| @ -1188,9 +1211,9 @@ def test_pattern_container_and_dataset_ownership_transformation( | |||||||
|         transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs]) |         transformer.transform([RecordEnvelope(input, metadata={}) for input in inputs]) | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|     assert len(outputs) == len(inputs) + 3 |     assert len(outputs) == len(inputs) + 4 | ||||||
| 
 | 
 | ||||||
|     # Check the first entry. |     # Check that DatasetSnapshotClass has not changed | ||||||
|     assert inputs[0] == outputs[0].record |     assert inputs[0] == outputs[0].record | ||||||
| 
 | 
 | ||||||
|     # Check the ownership for the first dataset (example1) |     # Check the ownership for the first dataset (example1) | ||||||
| @ -1217,12 +1240,16 @@ def test_pattern_container_and_dataset_ownership_transformation( | |||||||
|         ] |         ] | ||||||
|     ) |     ) | ||||||
| 
 | 
 | ||||||
|  |     third_ownership_aspect = outputs[4].record.aspect | ||||||
|  |     assert third_ownership_aspect | ||||||
|  |     assert len(third_ownership_aspect.owners) == 1  # new for datajob | ||||||
|  | 
 | ||||||
|     # Check container ownerships |     # Check container ownerships | ||||||
|     for i in range(2): |     for i in range(2): | ||||||
|         container_ownership_aspect = outputs[i + 4].record.aspect |         container_ownership_aspect = outputs[i + 5].record.aspect | ||||||
|         assert container_ownership_aspect |         assert container_ownership_aspect | ||||||
|         ownership = json.loads(container_ownership_aspect.value.decode("utf-8")) |         ownership = json.loads(container_ownership_aspect.value.decode("utf-8")) | ||||||
|         assert len(ownership) == 2 |         assert len(ownership) == 3 | ||||||
|         assert ownership[0]["value"]["owner"] == builder.make_user_urn("person1") |         assert ownership[0]["value"]["owner"] == builder.make_user_urn("person1") | ||||||
|         assert ownership[1]["value"]["owner"] == builder.make_user_urn("person2") |         assert ownership[1]["value"]["owner"] == builder.make_user_urn("person2") | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Aseem Bansal
						Aseem Bansal