mirror of
https://github.com/datahub-project/datahub.git
synced 2025-10-05 14:06:40 +00:00
feat(docs): tutorial for writing a custom transformer (#2959)
This commit is contained in:
parent
14bca4c2f6
commit
a1d1dd4269
@ -22,6 +22,8 @@ function list_ids_in_directory(directory) {
|
|||||||
return ids;
|
return ids;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// note: if a markdown file should not appear in the sidebar, add it to the list below as a commented-out entry.
|
||||||
|
// this will fix errors like `Error: File not accounted for in sidebar: ...`
|
||||||
module.exports = {
|
module.exports = {
|
||||||
// users
|
// users
|
||||||
// architects
|
// architects
|
||||||
@ -73,6 +75,8 @@ module.exports = {
|
|||||||
"docs/docker/development",
|
"docs/docker/development",
|
||||||
"metadata-ingestion/adding-source",
|
"metadata-ingestion/adding-source",
|
||||||
"metadata-ingestion/s3-ingestion",
|
"metadata-ingestion/s3-ingestion",
|
||||||
|
//"metadata-ingestion/examples/transforms/README"
|
||||||
|
"metadata-ingestion/transformers",
|
||||||
//"docs/what/graph",
|
//"docs/what/graph",
|
||||||
//"docs/what/search-index",
|
//"docs/what/search-index",
|
||||||
//"docs/how/add-new-aspect",
|
//"docs/how/add-new-aspect",
|
||||||
|
@ -948,58 +948,9 @@ sink:
|
|||||||
|
|
||||||
## Transformations
|
## Transformations
|
||||||
|
|
||||||
Beyond basic ingestion, sometimes there might exist a need to modify the source data before passing it on to the sink.
|
If you'd like to modify data before it reaches the ingestion sinks – for instance, adding additional owners or tags – you can use a transformer to write your own module and integrate it with DataHub.
|
||||||
Example use cases could be to add ownership information, add extra tags etc.
|
|
||||||
|
|
||||||
In such a scenario, it is possible to configure a recipe with a list of transformers.
|
Check out the [transformers guide](./transformers.md) for more info!
|
||||||
|
|
||||||
```yml
|
|
||||||
transformers:
|
|
||||||
- type: "fully-qualified-class-name-of-transformer"
|
|
||||||
config:
|
|
||||||
some_property: "some.value"
|
|
||||||
```
|
|
||||||
|
|
||||||
A transformer class needs to inherit from [`Transformer`](./src/datahub/ingestion/api/transform.py).
|
|
||||||
|
|
||||||
### `simple_add_dataset_ownership`
|
|
||||||
|
|
||||||
Adds a set of owners to every dataset.
|
|
||||||
|
|
||||||
```yml
|
|
||||||
transformers:
|
|
||||||
- type: "simple_add_dataset_ownership"
|
|
||||||
config:
|
|
||||||
owner_urns:
|
|
||||||
- "urn:li:corpuser:username1"
|
|
||||||
- "urn:li:corpuser:username2"
|
|
||||||
- "urn:li:corpGroup:groupname"
|
|
||||||
```
|
|
||||||
|
|
||||||
:::tip
|
|
||||||
|
|
||||||
If you'd like to add more complex logic for assigning ownership, you can use the more generic [`add_dataset_ownership` transformer](./src/datahub/ingestion/transformer/add_dataset_ownership.py), which calls a user-provided function to determine the ownership of each dataset.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
### `simple_add_dataset_tags`
|
|
||||||
|
|
||||||
Adds a set of tags to every dataset.
|
|
||||||
|
|
||||||
```yml
|
|
||||||
transformers:
|
|
||||||
- type: "simple_add_dataset_tags"
|
|
||||||
config:
|
|
||||||
tag_urns:
|
|
||||||
- "urn:li:tag:NeedsDocumentation"
|
|
||||||
- "urn:li:tag:Legacy"
|
|
||||||
```
|
|
||||||
|
|
||||||
:::tip
|
|
||||||
|
|
||||||
If you'd like to add more complex logic for assigning tags, you can use the more generic [`add_dataset_tags` transformer](./src/datahub/ingestion/transformer/add_dataset_tags.py), which calls a user-provided function to determine the tags for each dataset.
|
|
||||||
|
|
||||||
:::
|
|
||||||
|
|
||||||
## Using as a library
|
## Using as a library
|
||||||
|
|
||||||
@ -1067,4 +1018,4 @@ In order to use this example, you must first configure the Datahub hook. Like in
|
|||||||
|
|
||||||
## Developing
|
## Developing
|
||||||
|
|
||||||
See the [developing guide](./developing.md) or the [adding a source guide](./adding-source.md).
|
See the guides on [developing](./developing.md), [adding a source](./adding-source.md) and [using transformers](./transformers.md).
|
||||||
|
5
metadata-ingestion/examples/transforms/README.md
Normal file
5
metadata-ingestion/examples/transforms/README.md
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
# Custom transformer script
|
||||||
|
|
||||||
|
This script sets up a transformer that reads in a list of owner URNs from a JSON file specified via `owners_json` and appends these owners to every dataset MCE.
|
||||||
|
|
||||||
|
See the transformers tutorial (https://datahubproject.io/docs/metadata-ingestion/transformers) for how this module is built and run.
|
@ -0,0 +1,77 @@
|
|||||||
|
# see https://datahubproject.io/docs/metadata-ingestion/transformers for original tutorial
|
||||||
|
from datahub.configuration.common import ConfigModel
|
||||||
|
|
||||||
|
|
||||||
|
class AddCustomOwnershipConfig(ConfigModel):
    """Options accepted by the `AddCustomOwnership` transformer."""

    # path to a JSON file holding a list of owner URNs
    owners_json: str
|
||||||
|
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import datahub.emitter.mce_builder as builder
|
||||||
|
from datahub.configuration.common import ConfigModel
|
||||||
|
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
|
||||||
|
from datahub.ingestion.api.transform import Transformer
|
||||||
|
from datahub.metadata.schema_classes import (
|
||||||
|
DatasetSnapshotClass,
|
||||||
|
MetadataChangeEventClass,
|
||||||
|
OwnerClass,
|
||||||
|
OwnershipClass,
|
||||||
|
OwnershipTypeClass,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class AddCustomOwnership(Transformer):
    """Transformer that adds owners, read from a JSON file, to every dataset MCE."""

    # context param to generate run metadata such as a run ID
    ctx: PipelineContext
    # parsed transformer options; `owners_json` is the path to the owners file
    config: AddCustomOwnershipConfig

    def __init__(self, config: AddCustomOwnershipConfig, ctx: PipelineContext):
        """Load the owner URNs listed in the file at `config.owners_json`.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the file is not valid JSON.
            ValueError: if the JSON document is not a list of strings.
        """
        self.ctx = ctx
        self.config = config

        # JSON is UTF-8 by specification, so don't depend on the locale default
        with open(self.config.owners_json, "r", encoding="utf-8") as f:
            raw_owner_urns = json.load(f)

        # a bare string or an object would otherwise be iterated silently,
        # producing one bogus "owner" per character or per key
        if not isinstance(raw_owner_urns, list) or not all(
            isinstance(urn, str) for urn in raw_owner_urns
        ):
            raise ValueError(
                f"{self.config.owners_json} must contain a JSON list of owner URN strings"
            )

        self.owners = [
            OwnerClass(owner=urn, type=OwnershipTypeClass.DATAOWNER)
            for urn in raw_owner_urns
        ]

    @classmethod
    def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddCustomOwnership":
        """Parse the raw config dictionary and build the transformer from it."""
        config = AddCustomOwnershipConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def transform(
        self, record_envelopes: Iterable[RecordEnvelope]
    ) -> Iterable[RecordEnvelope]:
        """Yield every envelope, with MCE records passed through `transform_one`."""
        for envelope in record_envelopes:
            # records that are not MCEs are yielded untouched
            if isinstance(envelope.record, MetadataChangeEventClass):
                envelope.record = self.transform_one(envelope.record)
            yield envelope

    def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
        """Extend the ownership aspect of a dataset MCE with the loaded owners.

        MCEs whose proposed snapshot is not a dataset snapshot are returned
        unchanged, as is every MCE when the owners file held an empty list.
        """
        if not isinstance(mce.proposedSnapshot, DatasetSnapshotClass):
            return mce

        if self.owners:
            # presumably returns the MCE's existing ownership aspect, or attaches
            # and returns the empty default given here - verify against mce_builder
            ownership = builder.get_or_add_aspect(mce, OwnershipClass(owners=[]))
            ownership.owners.extend(self.owners)

        return mce
|
7
metadata-ingestion/examples/transforms/setup.py
Normal file
7
metadata-ingestion/examples/transforms/setup.py
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
from setuptools import find_packages, setup

setup(
    name="custom_transform_example",
    version="1.0",
    # the transformer lives in a single top-level file, which find_packages()
    # does not discover (it only returns directories containing __init__.py),
    # so it is listed explicitly to be included in a regular install
    py_modules=["custom_transform_example"],
    packages=find_packages(),
)
|
254
metadata-ingestion/transformers.md
Normal file
254
metadata-ingestion/transformers.md
Normal file
@ -0,0 +1,254 @@
|
|||||||
|
# Using transformers
|
||||||
|
|
||||||
|
## What’s a transformer?
|
||||||
|
|
||||||
|
Oftentimes we want to modify metadata before it reaches the ingestion sink – for instance, we might want to add custom tags, ownership, or patch some fields. A transformer allows us to do exactly these things.
|
||||||
|
|
||||||
|
Moreover, a transformer gives you fine-grained control over the metadata that’s ingested without having to modify the ingestion framework's code yourself. Instead, you can write your own module that transforms MCEs however you like. To configure the recipe, all that's needed is a module name as well as any arguments.
|
||||||
|
|
||||||
|
## Provided transformers
|
||||||
|
|
||||||
|
Aside from the option of writing your own transformer (see below), we provide two simple transformers for the use cases of adding dataset tags and ownership information.
|
||||||
|
|
||||||
|
### Adding a set of tags
|
||||||
|
|
||||||
|
Let’s suppose we’d like to add a set of dataset tags. To do so, we can use the `simple_add_dataset_tags` module that’s included in the ingestion framework.
|
||||||
|
|
||||||
|
The config, which we’d append to our ingestion recipe YAML, would look like this:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
transformers:
|
||||||
|
- type: "simple_add_dataset_tags"
|
||||||
|
config:
|
||||||
|
tag_urns:
|
||||||
|
- "urn:li:tag:NeedsDocumentation"
|
||||||
|
- "urn:li:tag:Legacy"
|
||||||
|
```
|
||||||
|
|
||||||
|
If you'd like to add more complex logic for assigning tags, you can use the more generic `add_dataset_tags` transformer, which calls a user-provided function to determine the tags for each dataset.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
transformers:
|
||||||
|
- type: "add_dataset_tags"
|
||||||
|
config:
|
||||||
|
get_tags_to_add: "<your_module>.<your_function>"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Setting ownership
|
||||||
|
|
||||||
|
Let’s suppose we’d like to append a series of users who we know to own a dataset but aren't detected during normal ingestion. To do so, we can use the `simple_add_dataset_ownership` module that’s included in the ingestion framework.
|
||||||
|
|
||||||
|
The config, which we’d append to our ingestion recipe YAML, would look like this:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
transformers:
|
||||||
|
- type: "simple_add_dataset_ownership"
|
||||||
|
config:
|
||||||
|
owner_urns:
|
||||||
|
- "urn:li:corpuser:username1"
|
||||||
|
- "urn:li:corpuser:username2"
|
||||||
|
- "urn:li:corpGroup:groupname"
|
||||||
|
```
|
||||||
|
|
||||||
|
If you'd like to add more complex logic for assigning ownership, you can use the more generic `add_dataset_ownership` transformer, which calls a user-provided function to determine the ownership of each dataset.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
transformers:
|
||||||
|
- type: "add_dataset_ownership"
|
||||||
|
config:
|
||||||
|
get_owners_to_add: "<your_module>.<your_function>"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Writing a custom transformer from scratch
|
||||||
|
|
||||||
|
In the above couple of examples, we use classes that have already been implemented in the ingestion framework. However, it’s common for more advanced cases to pop up where custom code is required, for instance if you'd like to utilize conditional logic or rewrite properties. In such cases, we can add our own modules and define the arguments they take as a custom transformer.
|
||||||
|
|
||||||
|
As an example, suppose we want to append a set of ownership fields to our metadata that are dependent upon an external source – for instance, an API endpoint or file – rather than a preset list like above. In this case, we can set a JSON file as an argument to our custom config, and our transformer will read this file and append the included ownership classes to all our MCEs (if you'd like, you could also include filtering logic for specific MCEs).
|
||||||
|
|
||||||
|
Our JSON file might look like the following:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
"urn:li:corpuser:athos",
|
||||||
|
"urn:li:corpuser:porthos",
|
||||||
|
"urn:li:corpuser:aramis",
|
||||||
|
"urn:li:corpGroup:the_three_musketeers"
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Defining a config
|
||||||
|
|
||||||
|
To get started, we’ll define an `AddCustomOwnershipConfig` class that inherits from [`datahub.configuration.common.ConfigModel`](./src/datahub/configuration/common.py). The sole parameter will be an `owners_json` which expects a path to a JSON file containing a list of owner URNs. This will go in a file called `custom_transform_example.py`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from datahub.configuration.common import ConfigModel
|
||||||
|
|
||||||
|
class AddCustomOwnershipConfig(ConfigModel):
    """Options accepted by the `AddCustomOwnership` transformer."""

    # path to a JSON file holding a list of owner URNs
    owners_json: str
|
||||||
|
```
|
||||||
|
|
||||||
|
### Defining the transformer
|
||||||
|
|
||||||
|
Next, we’ll define the transformer itself, which must inherit from [`datahub.ingestion.api.transform.Transformer`](./src/datahub/ingestion/api/transform.py). First, let's get all our imports in:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# add these to the top of custom_transform_example.py
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
# for constructing URNs
|
||||||
|
import datahub.emitter.mce_builder as builder
|
||||||
|
# for typing the config model
|
||||||
|
from datahub.configuration.common import ConfigModel
|
||||||
|
# for typing context and records
|
||||||
|
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope
|
||||||
|
# base transformer class
|
||||||
|
from datahub.ingestion.api.transform import Transformer
|
||||||
|
# MCE-related classes
|
||||||
|
from datahub.metadata.schema_classes import (
|
||||||
|
DatasetSnapshotClass,
|
||||||
|
MetadataChangeEventClass,
|
||||||
|
OwnerClass,
|
||||||
|
OwnershipClass,
|
||||||
|
OwnershipTypeClass,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
Next, let's define the base scaffolding for the class:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# append this to the end of custom_transform_example.py
|
||||||
|
|
||||||
|
class AddCustomOwnership(Transformer):
    """Transformer that adds owners, read from a JSON file, to datasets."""

    # context param to generate run metadata such as a run ID
    ctx: PipelineContext
    # as defined in the previous block
    config: AddCustomOwnershipConfig

    def __init__(self, config: AddCustomOwnershipConfig, ctx: PipelineContext):
        """Load the owner URNs listed in the file at `config.owners_json`."""
        self.ctx = ctx
        self.config = config

        # `owners_json` is a path, so the file has to be opened and parsed;
        # calling json.loads() on the path string itself would fail
        with open(self.config.owners_json, "r") as f:
            raw_owner_urns = json.load(f)

        self.owners = [
            OwnerClass(owner=owner, type=OwnershipTypeClass.DATAOWNER)
            for owner in raw_owner_urns
        ]
|
||||||
|
```
|
||||||
|
|
||||||
|
A transformer must have two methods: a `create()` class method for initialization and a `transform()` method for executing the transformation.
|
||||||
|
|
||||||
|
Let's begin by adding a `create()` method for parsing our configuration dictionary:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# add this as a function of AddCustomOwnership
|
||||||
|
|
||||||
|
@classmethod
def create(cls, config_dict: dict, ctx: PipelineContext) -> "AddCustomOwnership":
    """Build the transformer from the raw configuration dictionary."""
    # parse the untyped dict into the config model before constructing
    return cls(AddCustomOwnershipConfig.parse_obj(config_dict), ctx)
|
||||||
|
```
|
||||||
|
|
||||||
|
Now we need to add a `transform()` method that does the work of adding our custom ownership classes. This method will take an iterable of record envelopes as input and yield each envelope back, with any MCE it contains transformed. Let's offload the processing of each MCE to a separate `transform_one()` method.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# add this as a function of AddCustomOwnership
|
||||||
|
|
||||||
|
def transform(
    self, record_envelopes: Iterable[RecordEnvelope]
) -> Iterable[RecordEnvelope]:
    """Yield every incoming envelope, rewriting the MCE records on the way."""
    for record_envelope in record_envelopes:
        record = record_envelope.record
        # records that are not MCEs pass through untouched
        if isinstance(record, MetadataChangeEventClass):
            record_envelope.record = self.transform_one(record)
        yield record_envelope
|
||||||
|
```
|
||||||
|
|
||||||
|
With the main `transform()` method set up, the `transform_one()` method will take a single MCE and add the owners that we loaded from the JSON.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# add this as a function of AddCustomOwnership
|
||||||
|
|
||||||
|
def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventClass:
    """Append the loaded owners to `mce` when it proposes a dataset snapshot."""
    is_dataset = isinstance(mce.proposedSnapshot, DatasetSnapshotClass)
    if is_dataset and self.owners:
        # presumably returns the existing ownership aspect, or attaches and
        # returns the empty default given here - verify against mce_builder
        ownership_aspect = builder.get_or_add_aspect(mce, OwnershipClass(owners=[]))
        ownership_aspect.owners.extend(self.owners)
    return mce
|
||||||
|
```
|
||||||
|
|
||||||
|
### Installing the package
|
||||||
|
|
||||||
|
Now that we've defined the transformer, we need to make it visible to DataHub. The easiest way to do this is to just place it in the same directory as your recipe, in which case the module name is the same as the file – in this case, `custom_transform_example`.
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Advanced: installing as a package</summary>
|
||||||
|
Alternatively, create a `setup.py` in the same directory as our transform script to make it visible globally. After installing this package (e.g. with `python setup.py install` or `pip install -e .`), our module will be installed and importable as `custom_transform_example`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from setuptools import find_packages, setup

setup(
    name="custom_transform_example",
    version="1.0",
    # the transformer lives in a single top-level file, which find_packages()
    # does not discover (it only returns directories containing __init__.py),
    # so it is listed explicitly to be included in a regular install
    py_modules=["custom_transform_example"],
    packages=find_packages(),
    # if you don't already have DataHub installed, add it under install_requires
    # install_requires=["acryl-datahub"]
)
|
||||||
|
```
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
|
### Running the transform
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
transformers:
|
||||||
|
- type: "custom_transform_example.AddCustomOwnership"
|
||||||
|
config:
|
||||||
|
owners_json: "<path_to_owners_json>" # the JSON file mentioned at the start
|
||||||
|
```
|
||||||
|
|
||||||
|
After running `datahub ingest -c <path_to_recipe>`, our MCEs will now have the following owners appended:
|
||||||
|
|
||||||
|
```json
|
||||||
|
"owners": [
|
||||||
|
{
|
||||||
|
"owner": "urn:li:corpuser:athos",
|
||||||
|
"type": "DATAOWNER",
|
||||||
|
"source": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"owner": "urn:li:corpuser:porthos",
|
||||||
|
"type": "DATAOWNER",
|
||||||
|
"source": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"owner": "urn:li:corpuser:aramis",
|
||||||
|
"type": "DATAOWNER",
|
||||||
|
"source": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"owner": "urn:li:corpGroup:the_three_musketeers",
|
||||||
|
"type": "DATAOWNER",
|
||||||
|
"source": null
|
||||||
|
},
|
||||||
|
// ...and any additional owners
|
||||||
|
],
|
||||||
|
```
|
||||||
|
|
||||||
|
All the files for this tutorial may be found [here](./examples/transforms/).
|
Loading…
x
Reference in New Issue
Block a user