feat(ingest): add transformer to add properties (#3480)

This commit is contained in:
Noé López 2021-11-09 22:03:21 -06:00 committed by GitHub
parent 3a432b7154
commit cde1ce043d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 156 additions and 5 deletions

View File

@ -0,0 +1,61 @@
from abc import ABC, abstractmethod
from typing import Dict, Type
import datahub.emitter.mce_builder as builder
from datahub.configuration.common import ConfigModel
from datahub.configuration.import_resolver import pydantic_resolve_key
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.transformer.dataset_transformer import DatasetTransformer
from datahub.metadata.schema_classes import (
DatasetPropertiesClass,
DatasetSnapshotClass,
MetadataChangeEventClass,
)
class AddDatasetPropertiesResolverBase(ABC):
    """Interface for user-supplied callbacks that compute custom properties.

    Subclasses implement :meth:`get_properties_to_add` to decide, per dataset
    snapshot, which key/value pairs the transformer should merge into the
    dataset's custom properties.
    """

    @abstractmethod
    def get_properties_to_add(self, current: DatasetSnapshotClass) -> Dict[str, str]:
        """Return the custom properties to add for the given snapshot."""
        ...
class AddDatasetPropertiesConfig(ConfigModel):
    """Config for the ``add_dataset_properties`` transformer.

    ``add_properties_resolver_class`` may be given in the recipe as a dotted
    import path string; the resolver validator below turns it into the class.
    """

    add_properties_resolver_class: Type[AddDatasetPropertiesResolverBase]

    class Config:
        # The resolver is an arbitrary user-defined class, which pydantic
        # cannot validate natively.
        arbitrary_types_allowed = True

    # Validator that resolves a "module.ClassName" string from the recipe
    # into the actual class object for the field above.
    _resolve_properties_class = pydantic_resolve_key("add_properties_resolver_class")
class AddDatasetProperties(DatasetTransformer):
    """Transformer that adds properties to datasets according to a callback function."""

    ctx: PipelineContext
    config: AddDatasetPropertiesConfig

    def __init__(self, config: AddDatasetPropertiesConfig, ctx: PipelineContext):
        self.ctx = ctx
        self.config = config

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddDatasetProperties":
        """Alternate constructor: build the transformer from a raw config dict."""
        return cls(AddDatasetPropertiesConfig.parse_obj(config_dict), ctx)

    def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
        """Merge resolver-provided properties into the dataset's properties aspect."""
        snapshot = mce.proposedSnapshot
        # Only dataset snapshots carry a DatasetProperties aspect.
        if not isinstance(snapshot, DatasetSnapshotClass):
            return mce

        # A fresh resolver instance is created per MCE.
        resolver = self.config.add_properties_resolver_class()
        new_properties = resolver.get_properties_to_add(snapshot)
        if not new_properties:
            return mce

        properties_aspect = builder.get_or_add_aspect(
            mce, DatasetPropertiesClass(customProperties={})
        )
        properties_aspect.customProperties.update(new_properties)
        return mce

View File

@ -8,6 +8,7 @@ from datahub.ingestion.transformer.add_dataset_ownership import (
PatternAddDatasetOwnership,
SimpleAddDatasetOwnership,
)
from datahub.ingestion.transformer.add_dataset_properties import AddDatasetProperties
from datahub.ingestion.transformer.add_dataset_tags import (
AddDatasetTags,
SimpleAddDatasetTags,
@ -31,3 +32,5 @@ transform_registry.register("pattern_add_dataset_ownership", PatternAddDatasetOw
transform_registry.register("add_dataset_tags", AddDatasetTags)
transform_registry.register("simple_add_dataset_tags", SimpleAddDatasetTags)
transform_registry.register("add_dataset_properties", AddDatasetProperties)

View File

@ -1,4 +1,4 @@
from typing import List, Union
from typing import Dict, List, Union
from unittest import mock
import pytest
@ -14,6 +14,10 @@ from datahub.ingestion.transformer.add_dataset_ownership import (
PatternAddDatasetOwnership,
SimpleAddDatasetOwnership,
)
from datahub.ingestion.transformer.add_dataset_properties import (
AddDatasetProperties,
AddDatasetPropertiesResolverBase,
)
from datahub.ingestion.transformer.add_dataset_tags import (
AddDatasetTags,
SimpleAddDatasetTags,
@ -22,9 +26,10 @@ from datahub.ingestion.transformer.mark_dataset_status import MarkDatasetStatus
from datahub.ingestion.transformer.remove_dataset_ownership import (
SimpleRemoveDatasetOwnership,
)
from datahub.metadata.schema_classes import DatasetSnapshotClass
def make_generic_dataset():
def make_generic_dataset() -> models.MetadataChangeEventClass:
return models.MetadataChangeEventClass(
proposedSnapshot=models.DatasetSnapshotClass(
urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
@ -35,7 +40,7 @@ def make_generic_dataset():
)
def make_dataset_with_owner():
def make_dataset_with_owner() -> models.MetadataChangeEventClass:
return models.MetadataChangeEventClass(
proposedSnapshot=models.DatasetSnapshotClass(
urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example2,PROD)",
@ -56,6 +61,21 @@ def make_dataset_with_owner():
)
EXISTING_PROPERTIES = {"my_existing_property": "existing property value"}


def make_dataset_with_properties() -> models.MetadataChangeEventClass:
    """Build an MCE for a dataset that already has custom properties set.

    Passes a *copy* of ``EXISTING_PROPERTIES`` into the aspect: the
    transformer under test updates ``customProperties`` in place, and passing
    the shared module-level dict directly would let that mutation leak into
    the constant, weakening assertions and polluting other tests.
    """
    return models.MetadataChangeEventClass(
        proposedSnapshot=models.DatasetSnapshotClass(
            urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,example1,PROD)",
            aspects=[
                models.StatusClass(removed=False),
                models.DatasetPropertiesClass(
                    customProperties=EXISTING_PROPERTIES.copy()
                ),
            ],
        ),
    )
def test_simple_dataset_ownership_transformation(mock_time):
no_owner_aspect = make_generic_dataset()
@ -543,3 +563,39 @@ def test_ownership_patching_with_different_types_2(mock_time):
assert ("baz", models.OwnershipTypeClass.DATAOWNER) in [
(o.owner, o.type) for o in test_ownership.owners
]
PROPERTIES_TO_ADD = {"my_new_property": "property value"}


class DummyPropertiesResolverClass(AddDatasetPropertiesResolverBase):
    """Test resolver: returns the same fixed properties for every dataset."""

    def get_properties_to_add(self, current: DatasetSnapshotClass) -> Dict[str, str]:
        # Ignores the snapshot entirely; the fixture only needs determinism.
        return PROPERTIES_TO_ADD
def test_add_dataset_properties(mock_time):
    """Resolver-provided properties should be merged with existing ones.

    Runs a dataset MCE (which already has custom properties) through the
    ``add_dataset_properties`` transformer configured with the dummy resolver,
    and checks the resulting aspect contains the union of both property sets.
    """
    dataset_mce = make_dataset_with_properties()
    transformer = AddDatasetProperties.create(
        {
            "add_properties_resolver_class": "tests.unit.test_transform_dataset.DummyPropertiesResolverClass"
        },
        PipelineContext(run_id="test-properties"),
    )

    # NOTE: the original comprehension variable was named `input`, shadowing
    # the builtin; renamed to `mce`.
    outputs = list(
        transformer.transform(
            [RecordEnvelope(mce, metadata={}) for mce in [dataset_mce]]
        )
    )
    assert len(outputs) == 1

    custom_properties = builder.get_aspect_if_available(
        outputs[0].record, models.DatasetPropertiesClass
    )
    assert custom_properties is not None
    # Existing properties are preserved; new ones are added on top.
    assert custom_properties.customProperties == {
        **EXISTING_PROPERTIES,
        **PROPERTIES_TO_ADD,
    }

View File

@ -2,13 +2,13 @@
## What's a transformer?
Oftentimes we want to modify metadata before it reaches the ingestion sink for instance, we might want to add custom tags, ownership, or patch some fields. A transformer allows us to do exactly these things.
Oftentimes we want to modify metadata before it reaches the ingestion sink — for instance, we might want to add custom tags, ownership, properties, or patch some fields. A transformer allows us to do exactly these things.
Moreover, a transformer allows one to have fine-grained control over the metadata that's ingested without having to modify the ingestion framework's code yourself. Instead, you can write your own module that can take MCEs however you like. To configure the recipe, all that's needed is a module name as well as any arguments.
## Provided transformers
Aside from the option of writing your own transformer (see below), we provide two simple transformers for the use cases of adding dataset tags and ownership information.
Aside from the option of writing your own transformer (see below), we provide some simple transformers for the use cases of adding dataset tags, dataset properties, and ownership information.
### Adding a set of tags
@ -191,6 +191,37 @@ transformers:
In this case, the resulting dataset will have only 1 browse path, the one from the transform.
Note that whatever browse paths you send via this will overwrite the browse paths present in the UI.
### Adding a set of properties
If you'd like to add more complex logic for assigning properties, you can use the `add_dataset_properties` transformer, which calls a user-provided class (that extends from `AddDatasetPropertiesResolverBase` class) to determine the properties for each dataset.
The config, which we'd append to our ingestion recipe YAML, would look like this:
```yaml
transformers:
- type: "add_dataset_properties"
config:
add_properties_resolver_class: "<your_module>.<your_class>"
```
Then define your class to return a dictionary of custom properties, for example:
```python
import logging
from typing import Dict
from datahub.ingestion.transformer.add_dataset_properties import AddDatasetPropertiesResolverBase
from datahub.metadata.schema_classes import DatasetSnapshotClass
class MyPropertiesResolver(AddDatasetPropertiesResolverBase):
def get_properties_to_add(self, current: DatasetSnapshotClass) -> Dict[str, str]:
### Add custom logic here
    properties = {'my_custom_property': 'property value'}
logging.info(f"Adding properties: {properties} to dataset: {current.urn}.")
return properties
```
## Writing a custom transformer from scratch
In the above couple of examples, we use classes that have already been implemented in the ingestion framework. However, it's common for more advanced cases to pop up where custom code is required, for instance if you'd like to utilize conditional logic or rewrite properties. In such cases, we can add our own modules and define the arguments it takes as a custom transformer.