feat(ingest): use entry point for registering transformers (#6628)

This commit is contained in:
Felix Lüdin 2022-12-08 05:08:08 +01:00 committed by GitHub
parent f0f0355a83
commit 05e18a0ae7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 53 additions and 72 deletions

View File

@ -1145,7 +1145,7 @@ def transform_one(self, mce: MetadataChangeEventClass) -> MetadataChangeEventCla
Now that we've defined the transformer, we need to make it visible to DataHub. The easiest way to do this is to place it in the same directory as your recipe, in which case the module name is the same as the file name (here, `custom_transform_example`).
<details>
<summary>Advanced: installing as a package</summary>
<summary>Advanced: Installing as a package and enabling discoverability</summary>
Alternatively, create a `setup.py` in the same directory as our transform script to make it visible globally. After installing this package (e.g. with `python setup.py install` or `pip install -e .`), our module will be installed and importable as `custom_transform_example`.
```python
@ -1156,17 +1156,25 @@ setup(
version="1.0",
packages=find_packages(),
# if you don't already have DataHub installed, add it under install_requires
# install_requires=["acryl-datahub"]
# install_requires=["acryl-datahub"],
entry_points={
"datahub.ingestion.transformer.plugins": [
"custom_transform_example_alias = custom_transform_example:AddCustomOwnership",
],
},
)
```
Additionally, declare the transformer under the `entry_points` argument of `setup()` in the setup script. This enables the transformer to be
listed when running `datahub check plugins`, and sets up the transformer's shortened alias for use in recipes.
</details>
### Running the transform
```yaml
transformers:
- type: "custom_transform_example.AddCustomOwnership"
- type: "custom_transform_example_alias"
config:
owners_json: "<path_to_owners_json>" # the JSON file mentioned at the start
```

View File

@ -545,6 +545,27 @@ entry_points = {
"demo-data = datahub.ingestion.source.demo_data.DemoDataSource",
"unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource",
],
"datahub.ingestion.transformer.plugins": [
"simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership",
"mark_dataset_status = datahub.ingestion.transformer.mark_dataset_status:MarkDatasetStatus",
"set_dataset_browse_path = datahub.ingestion.transformer.add_dataset_browse_path:AddDatasetBrowsePathTransformer",
"add_dataset_ownership = datahub.ingestion.transformer.add_dataset_ownership:AddDatasetOwnership",
"simple_add_dataset_ownership = datahub.ingestion.transformer.add_dataset_ownership:SimpleAddDatasetOwnership",
"pattern_add_dataset_ownership = datahub.ingestion.transformer.add_dataset_ownership:PatternAddDatasetOwnership",
"add_dataset_domain = datahub.ingestion.transformer.dataset_domain:AddDatasetDomain",
"simple_add_dataset_domain = datahub.ingestion.transformer.dataset_domain:SimpleAddDatasetDomain",
"pattern_add_dataset_domain = datahub.ingestion.transformer.dataset_domain:PatternAddDatasetDomain",
"add_dataset_tags = datahub.ingestion.transformer.add_dataset_tags:AddDatasetTags",
"simple_add_dataset_tags = datahub.ingestion.transformer.add_dataset_tags:SimpleAddDatasetTags",
"pattern_add_dataset_tags = datahub.ingestion.transformer.add_dataset_tags:PatternAddDatasetTags",
"add_dataset_terms = datahub.ingestion.transformer.add_dataset_terms:AddDatasetTerms",
"simple_add_dataset_terms = datahub.ingestion.transformer.add_dataset_terms:SimpleAddDatasetTerms",
"pattern_add_dataset_terms = datahub.ingestion.transformer.add_dataset_terms:PatternAddDatasetTerms",
"add_dataset_properties = datahub.ingestion.transformer.add_dataset_properties:AddDatasetProperties",
"simple_add_dataset_properties = datahub.ingestion.transformer.add_dataset_properties:SimpleAddDatasetProperties",
"pattern_add_dataset_schema_terms = datahub.ingestion.transformer.add_dataset_schema_terms:PatternAddDatasetSchemaTerms",
"pattern_add_dataset_schema_tags = datahub.ingestion.transformer.add_dataset_schema_tags:PatternAddDatasetSchemaTags",
],
"datahub.ingestion.sink.plugins": [
"file = datahub.ingestion.sink.file:FileSink",
"console = datahub.ingestion.sink.console:ConsoleSink",

View File

@ -1,74 +1,26 @@
from datahub.ingestion.api.registry import PluginRegistry
from datahub.ingestion.api.transform import Transformer
from datahub.ingestion.transformer import dataset_domain
from datahub.ingestion.transformer.add_dataset_browse_path import (
AddDatasetBrowsePathTransformer,
)
from datahub.ingestion.transformer.add_dataset_ownership import (
AddDatasetOwnership,
PatternAddDatasetOwnership,
SimpleAddDatasetOwnership,
)
from datahub.ingestion.transformer.add_dataset_properties import (
AddDatasetProperties,
SimpleAddDatasetProperties,
)
from datahub.ingestion.transformer.add_dataset_schema_tags import (
PatternAddDatasetSchemaTags,
)
from datahub.ingestion.transformer.add_dataset_schema_terms import (
PatternAddDatasetSchemaTerms,
)
from datahub.ingestion.transformer.add_dataset_tags import (
AddDatasetTags,
PatternAddDatasetTags,
SimpleAddDatasetTags,
)
from datahub.ingestion.transformer.add_dataset_terms import (
AddDatasetTerms,
PatternAddDatasetTerms,
SimpleAddDatasetTerms,
)
from datahub.ingestion.transformer.mark_dataset_status import MarkDatasetStatus
from datahub.ingestion.transformer.remove_dataset_ownership import (
SimpleRemoveDatasetOwnership,
)
# Module-level registry of Transformer plugins, looked up by the string names
# used in the calls below.
transform_registry = PluginRegistry[Transformer]()
# Populate the registry from the "datahub.ingestion.transformer.plugins"
# entry-point group.
# NOTE(review): presumably this discovers every installed package that
# declares entries under that group -- confirm against PluginRegistry.
transform_registry.register_from_entrypoint("datahub.ingestion.transformer.plugins")
# Explicit name -> class registrations for the built-in transformers.
# NOTE(review): these names are the same ones asserted on below, and this
# listing appears to interleave the pre-change and post-change lines of a diff
# (the hunk header reads "-1,74 +1,26"). Confirm whether the explicit
# register() calls and the entry-point call are really meant to coexist.
transform_registry.register(
"simple_remove_dataset_ownership", SimpleRemoveDatasetOwnership
)
transform_registry.register("mark_dataset_status", MarkDatasetStatus)
transform_registry.register("set_dataset_browse_path", AddDatasetBrowsePathTransformer)
transform_registry.register("add_dataset_ownership", AddDatasetOwnership)
transform_registry.register("simple_add_dataset_ownership", SimpleAddDatasetOwnership)
transform_registry.register("pattern_add_dataset_ownership", PatternAddDatasetOwnership)
transform_registry.register("add_dataset_domain", dataset_domain.AddDatasetDomain)
transform_registry.register(
"simple_add_dataset_domain", dataset_domain.SimpleAddDatasetDomain
)
transform_registry.register(
"pattern_add_dataset_domain", dataset_domain.PatternAddDatasetDomain
)
transform_registry.register("add_dataset_tags", AddDatasetTags)
transform_registry.register("simple_add_dataset_tags", SimpleAddDatasetTags)
transform_registry.register("pattern_add_dataset_tags", PatternAddDatasetTags)
transform_registry.register("add_dataset_terms", AddDatasetTerms)
transform_registry.register("simple_add_dataset_terms", SimpleAddDatasetTerms)
transform_registry.register("pattern_add_dataset_terms", PatternAddDatasetTerms)
transform_registry.register("add_dataset_properties", AddDatasetProperties)
transform_registry.register("simple_add_dataset_properties", SimpleAddDatasetProperties)
transform_registry.register(
"pattern_add_dataset_schema_terms", PatternAddDatasetSchemaTerms
)
transform_registry.register(
"pattern_add_dataset_schema_tags", PatternAddDatasetSchemaTags
)
# These transformers are always enabled
# Import-time sanity check: look up each built-in transformer name in the
# registry. Because these are `assert` statements, the lookups are skipped
# entirely when Python runs with -O.
# NOTE(review): presumably get() raises or returns a falsy value for a name
# that is not registered -- confirm against PluginRegistry.get.
assert transform_registry.get("simple_remove_dataset_ownership")
assert transform_registry.get("mark_dataset_status")
assert transform_registry.get("set_dataset_browse_path")
assert transform_registry.get("add_dataset_ownership")
assert transform_registry.get("simple_add_dataset_ownership")
assert transform_registry.get("pattern_add_dataset_ownership")
assert transform_registry.get("add_dataset_domain")
assert transform_registry.get("simple_add_dataset_domain")
assert transform_registry.get("pattern_add_dataset_domain")
assert transform_registry.get("add_dataset_tags")
assert transform_registry.get("simple_add_dataset_tags")
assert transform_registry.get("pattern_add_dataset_tags")
assert transform_registry.get("add_dataset_terms")
assert transform_registry.get("simple_add_dataset_terms")
assert transform_registry.get("pattern_add_dataset_terms")
assert transform_registry.get("add_dataset_properties")
assert transform_registry.get("simple_add_dataset_properties")
assert transform_registry.get("pattern_add_dataset_schema_terms")
assert transform_registry.get("pattern_add_dataset_schema_tags")