mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-14 18:31:59 +00:00
feat(ingest): lineage-file - add ability to provide lineage manually through a file (#4116)
This commit is contained in:
parent
f17c033d51
commit
6ff551cbcd
@ -57,12 +57,13 @@ We use a plugin architecture so that you can install only the dependencies you a
|
|||||||
|
|
||||||
### Sources
|
### Sources
|
||||||
|
|
||||||
| Plugin Name | Install Command | Provides |
|
| Plugin Name | Install Command | Provides |
|
||||||
|-----------------------------------------------------------------|------------------------------------------------------------| ----------------------------------- |
|
|-------------------------------------------------------------------------------------|------------------------------------------------------------| ----------------------------------- |
|
||||||
| [file](../metadata-ingestion/source_docs/file.md) | _included by default_ | File source and sink |
|
| [file](../metadata-ingestion/source_docs/file.md) | _included by default_ | File source and sink |
|
||||||
| [athena](../metadata-ingestion/source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
|
| [athena](../metadata-ingestion/source_docs/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
|
||||||
| [bigquery](../metadata-ingestion/source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
|
| [bigquery](../metadata-ingestion/source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
|
||||||
| [bigquery-usage](../metadata-ingestion/source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source |
|
| [bigquery-usage](../metadata-ingestion/source_docs/bigquery.md) | `pip install 'acryl-datahub[bigquery-usage]'` | BigQuery usage statistics source |
|
||||||
|
| [datahub-lineage-file](../metadata-ingestion/source_docs/file_lineage.md) | _no additional dependencies_ | Lineage File source |
|
||||||
| [datahub-business-glossary](../metadata-ingestion/source_docs/business_glossary.md) | _no additional dependencies_ | Business Glossary File source |
|
| [datahub-business-glossary](../metadata-ingestion/source_docs/business_glossary.md) | _no additional dependencies_ | Business Glossary File source |
|
||||||
| [dbt](../metadata-ingestion/source_docs/dbt.md) | _no additional dependencies_ | dbt source |
|
| [dbt](../metadata-ingestion/source_docs/dbt.md) | _no additional dependencies_ | dbt source |
|
||||||
| [druid](../metadata-ingestion/source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source |
|
| [druid](../metadata-ingestion/source_docs/druid.md) | `pip install 'acryl-datahub[druid]'` | Druid Source |
|
||||||
|
|||||||
31
metadata-ingestion/examples/bootstrap_data/file_lineage.yml
Normal file
31
metadata-ingestion/examples/bootstrap_data/file_lineage.yml
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
---
|
||||||
|
version: 1
|
||||||
|
lineage:
|
||||||
|
- entity:
|
||||||
|
name: topic3
|
||||||
|
type: dataset
|
||||||
|
env: DEV
|
||||||
|
platform: kafka
|
||||||
|
upstream:
|
||||||
|
- entity:
|
||||||
|
name: topic2
|
||||||
|
type: dataset
|
||||||
|
env: DEV
|
||||||
|
platform: kafka
|
||||||
|
- entity:
|
||||||
|
name: topic1
|
||||||
|
type: dataset
|
||||||
|
env: DEV
|
||||||
|
platform: kafka
|
||||||
|
- entity:
|
||||||
|
name: topic2
|
||||||
|
type: dataset
|
||||||
|
env: DEV
|
||||||
|
platform: kafka
|
||||||
|
upstream:
|
||||||
|
- entity:
|
||||||
|
name: kafka.topic2
|
||||||
|
env: PROD
|
||||||
|
platform: snowflake
|
||||||
|
platform_instance: test
|
||||||
|
type: dataset
|
||||||
@ -114,6 +114,7 @@ plugins: Dict[str, Set[str]] = {
|
|||||||
"bigquery-usage": bigquery_common | {"cachetools"},
|
"bigquery-usage": bigquery_common | {"cachetools"},
|
||||||
"clickhouse": sql_common | {"clickhouse-sqlalchemy==0.1.8"},
|
"clickhouse": sql_common | {"clickhouse-sqlalchemy==0.1.8"},
|
||||||
"clickhouse-usage": sql_common | {"clickhouse-sqlalchemy==0.1.8"},
|
"clickhouse-usage": sql_common | {"clickhouse-sqlalchemy==0.1.8"},
|
||||||
|
"datahub-lineage-file": set(),
|
||||||
"datahub-business-glossary": set(),
|
"datahub-business-glossary": set(),
|
||||||
"data-lake": {*aws_common, "pydeequ==1.0.1", "pyspark==3.0.3", "parse==1.19.0"},
|
"data-lake": {*aws_common, "pydeequ==1.0.1", "pyspark==3.0.3", "parse==1.19.0"},
|
||||||
"dbt": {"requests"},
|
"dbt": {"requests"},
|
||||||
@ -314,6 +315,7 @@ entry_points = {
|
|||||||
"ldap = datahub.ingestion.source.ldap:LDAPSource",
|
"ldap = datahub.ingestion.source.ldap:LDAPSource",
|
||||||
"looker = datahub.ingestion.source.looker:LookerDashboardSource",
|
"looker = datahub.ingestion.source.looker:LookerDashboardSource",
|
||||||
"lookml = datahub.ingestion.source.lookml:LookMLSource",
|
"lookml = datahub.ingestion.source.lookml:LookMLSource",
|
||||||
|
"datahub-lineage-file = datahub.ingestion.source.metadata.lineage:LineageFileSource",
|
||||||
"datahub-business-glossary = datahub.ingestion.source.metadata.business_glossary:BusinessGlossaryFileSource",
|
"datahub-business-glossary = datahub.ingestion.source.metadata.business_glossary:BusinessGlossaryFileSource",
|
||||||
"mode = datahub.ingestion.source.mode:ModeSource",
|
"mode = datahub.ingestion.source.mode:ModeSource",
|
||||||
"mongodb = datahub.ingestion.source.mongodb:MongoDBSource",
|
"mongodb = datahub.ingestion.source.mongodb:MongoDBSource",
|
||||||
|
|||||||
76
metadata-ingestion/source_docs/file_lineage.md
Normal file
76
metadata-ingestion/source_docs/file_lineage.md
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
# File Based Lineage
|
||||||
|
|
||||||
|
For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
Works with `acryl-datahub` out of the box.
|
||||||
|
|
||||||
|
## Capabilities
|
||||||
|
|
||||||
|
This plugin pulls lineage metadata from a yaml-formatted file. An example of one such file is located in the examples
|
||||||
|
directory [here](../examples/bootstrap_data/file_lineage.yml).
|
||||||
|
|
||||||
|
## Quickstart recipe
|
||||||
|
|
||||||
|
Check out the following recipe to get started with ingestion! See [below](#config-details) for full configuration
|
||||||
|
options.
|
||||||
|
|
||||||
|
For general pointers on writing and running a recipe, see our [main recipe guide](../README.md#recipes).
|
||||||
|
|
||||||
|
```yml
|
||||||
|
source:
|
||||||
|
type: datahub-lineage-file
|
||||||
|
config:
|
||||||
|
# Coordinates
|
||||||
|
file: /path/to/file_lineage.yml
|
||||||
|
# Whether we want to query datahub-gms for upstream data
|
||||||
|
preserve_upstream: False
|
||||||
|
|
||||||
|
sink:
|
||||||
|
# sink configs
|
||||||
|
```
|
||||||
|
|
||||||
|
## Config details
|
||||||
|
|
||||||
|
Note that a `.` is used to denote nested fields in the YAML recipe.
|
||||||
|
|
||||||
|
| Field | Required | Default | Description |
|
||||||
|
|---------------------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
|
| `file` | ✅ | | Path to lineage file to ingest. |
|
||||||
|
| `preserve_upstream` | | `True` | Whether we want to query datahub-gms for upstream data. `False` means it will hard replace upstream data for a given entity. `True` means it will query the backend for existing upstreams and include it in the ingestion run |
|
||||||
|
|
||||||
|
### Lineage File Format
|
||||||
|
|
||||||
|
The lineage source file should be a `.yml` file with the following top-level keys:
|
||||||
|
|
||||||
|
**version**: the version of the lineage file format that the file conforms to. Currently, the only version released
|
||||||
|
is `1`.
|
||||||
|
|
||||||
|
**lineage**: the top level key of the lineage file containing a list of **EntityNodeConfig** objects
|
||||||
|
|
||||||
|
**EntityNodeConfig**:
|
||||||
|
|
||||||
|
- **entity**: **EntityConfig** object
|
||||||
|
- **upstream**: (optional) list of child **EntityNodeConfig** objects
|
||||||
|
|
||||||
|
**EntityConfig**:
|
||||||
|
|
||||||
|
- **name** : name of the entity
|
||||||
|
- **type**: type of the entity (only `dataset` is supported as of now)
|
||||||
|
- **env**: the environment of this entity. Should match the values in the
|
||||||
|
table [here](https://datahubproject.io/docs/graphql/enums/#fabrictype)
|
||||||
|
- **platform**: a valid platform like kafka, snowflake, etc.
|
||||||
|
- **platform_instance**: optional string specifying the platform instance of this entity
|
||||||
|
|
||||||
|
You can also view an example lineage file checked in [here](../examples/bootstrap_data/file_lineage.yml)
|
||||||
|
|
||||||
|
## Compatibility
|
||||||
|
|
||||||
|
Compatible with version 1 of lineage format. The source will be evolved as we publish newer versions of this
|
||||||
|
format.
|
||||||
|
|
||||||
|
## Questions
|
||||||
|
|
||||||
|
If you've got any questions on configuring this source, feel free to ping us
|
||||||
|
on [our Slack](https://slack.datahubproject.io/)!
|
||||||
@ -165,3 +165,7 @@ class KeyValuePattern(ConfigModel):
|
|||||||
"""Return the list of allowed strings as a list, after taking into account deny patterns, if possible"""
|
"""Return the list of allowed strings as a list, after taking into account deny patterns, if possible"""
|
||||||
assert self.is_fully_specified_key()
|
assert self.is_fully_specified_key()
|
||||||
return self.rules
|
return self.rules
|
||||||
|
|
||||||
|
|
||||||
|
class VersionedConfig(ConfigModel):
    """Base config mixin for file formats that carry an explicit schema version."""

    # Schema version string; "1" is the default (and currently only) version.
    version: str = "1"
||||||
|
|||||||
@ -0,0 +1,192 @@
|
|||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Dict, Iterable, List, Optional, Union
|
||||||
|
|
||||||
|
from pydantic import validator
|
||||||
|
|
||||||
|
import datahub.metadata.schema_classes as models
|
||||||
|
from datahub.cli.cli_utils import get_aspects_for_entity
|
||||||
|
from datahub.configuration.common import (
|
||||||
|
ConfigModel,
|
||||||
|
ConfigurationError,
|
||||||
|
VersionedConfig,
|
||||||
|
)
|
||||||
|
from datahub.configuration.config_loader import load_config_file
|
||||||
|
from datahub.configuration.source_common import EnvBasedSourceConfigBase
|
||||||
|
from datahub.emitter.mce_builder import (
|
||||||
|
get_sys_time,
|
||||||
|
make_dataset_urn_with_platform_instance,
|
||||||
|
)
|
||||||
|
from datahub.emitter.mcp import MetadataChangeProposalWrapper
|
||||||
|
from datahub.ingestion.api.common import PipelineContext
|
||||||
|
from datahub.ingestion.api.source import Source, SourceReport
|
||||||
|
from datahub.ingestion.api.workunit import MetadataWorkUnit, UsageStatsWorkUnit
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
auditStamp = models.AuditStampClass(
|
||||||
|
time=get_sys_time(), actor="urn:li:corpUser:pythonEmitter"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class EntityConfig(EnvBasedSourceConfigBase):
    """A single entity referenced in the lineage file (name/type/platform triple)."""

    # Name of the entity, e.g. a dataset/table/topic name.
    name: str
    # Entity type; validated below — only "dataset" is accepted today.
    type: str
    # Platform the entity lives on (e.g. kafka, snowflake).
    platform: str
    # Optional platform-instance qualifier for multi-instance platforms.
    platform_instance: Optional[str]

    @validator("type")
    def type_must_be_supported(cls, v: str) -> str:
        """Reject entity types the source does not yet know how to emit."""
        allowed_types = ["dataset"]
        if v in allowed_types:
            return v
        raise ConfigurationError(
            f"Type must be one of {allowed_types}, {v} is not yet supported."
        )
||||||
|
|
||||||
|
|
||||||
|
class EntityNodeConfig(ConfigModel):
    """One node of the lineage tree: an entity plus its (optional) upstream nodes."""

    entity: EntityConfig
    upstream: Optional[List["EntityNodeConfig"]]


# The model references itself in `upstream`, so the forward reference must be
# resolved explicitly — see
# https://pydantic-docs.helpmanual.io/usage/postponed_annotations/
EntityNodeConfig.update_forward_refs()
|
||||||
|
|
||||||
|
|
||||||
|
class LineageFileSourceConfig(ConfigModel):
    """Recipe-level configuration for the lineage-file source."""

    # Path to the lineage YAML file to ingest.
    file: str
    # When True, upstreams already stored in DataHub are fetched and merged
    # into the emitted aspect; when False, the file hard-replaces upstreams.
    preserve_upstream: bool = True
||||||
|
|
||||||
|
|
||||||
|
class LineageConfig(VersionedConfig):
    """Top-level schema of the lineage file: a version plus a list of entity nodes."""

    lineage: List[EntityNodeConfig]

    @validator("version")
    def version_must_be_1(cls, v):
        """Only version "1" of the lineage file format is supported."""
        if v != "1":
            raise ValueError("Only version 1 is supported")
        # Pydantic validators must return the (possibly transformed) value;
        # without this return, an explicitly supplied version would be
        # silently replaced by None on the parsed model.
        return v
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LineageFileSource(Source):
    """Ingestion source that reads lineage from a YAML file and emits
    ``upstreamLineage`` aspects as MetadataChangeProposals."""

    config: LineageFileSourceConfig
    report: SourceReport = field(default_factory=SourceReport)

    @classmethod
    def create(
        cls, config_dict: Dict[str, Any], ctx: PipelineContext
    ) -> "LineageFileSource":
        """Standard source factory: parse the recipe config and build the source."""
        parsed_config = LineageFileSourceConfig.parse_obj(config_dict)
        return cls(ctx, parsed_config)

    @staticmethod
    def load_lineage_config(file_name: str) -> LineageConfig:
        """Load the on-disk YAML file and validate it into a LineageConfig."""
        raw_config = load_config_file(file_name)
        return LineageConfig.parse_obj(raw_config)

    @staticmethod
    def get_lineage_metadata_change_event_proposal(
        entities: List[EntityNodeConfig], preserve_upstream: bool
    ) -> Iterable[MetadataChangeProposalWrapper]:
        """
        Builds a list of events to be emitted to datahub by going through each entity and its upstream nodes

        :param preserve_upstream: This field determines if we want to query the datahub backend to extract
        the existing upstream lineages for each entity and preserve it
        :param entities: A list of entities we want to build a proposal on
        :return: Returns a list of metadata change event proposals to be emitted to datahub
        """

        def _get_entity_urn(entity_config: EntityConfig) -> Optional[str]:
            """Build the urn for a supported entity; None marks an unsupported type."""
            if entity_config.type == "dataset":
                return make_dataset_urn_with_platform_instance(
                    platform=entity_config.platform,
                    name=entity_config.name,
                    env=entity_config.env,
                    platform_instance=entity_config.platform_instance,
                )
            logger.warning(f"Entity type: {entity_config.type} is not supported!")
            return None

        for node in entities:
            # A node with no upstream entries produces no MCP at all.
            if not node.upstream:
                continue

            entity = node.entity
            logger.info(f"Upstream detected for {entity}. Extracting urn...")
            entity_urn = _get_entity_urn(entity)
            if not entity_urn:
                logger.warning(
                    f"Entity type: {entity.type} is unsupported. Entity node {entity.name} and its "
                    f"upstream lineages will be skipped"
                )
                continue

            merged_upstreams: List[models.UpstreamClass] = []

            # Optionally pull the lineage already stored in DataHub so the new
            # aspect extends rather than replaces it.
            if preserve_upstream:
                existing_lineage = get_aspects_for_entity(
                    entity_urn=entity_urn,
                    aspects=["upstreamLineage"],
                    typed=True,
                ).get("upstreamLineage")
                if existing_lineage:
                    # mypy cannot prove Optional[Any] is Iterable[UpstreamClass]
                    merged_upstreams.extend(
                        existing_lineage.get("upstreams")  # type: ignore
                    )

            for upstream_node in node.upstream:
                upstream_entity = upstream_node.entity
                upstream_urn = _get_entity_urn(upstream_entity)
                if upstream_urn:
                    merged_upstreams.append(
                        models.UpstreamClass(
                            dataset=upstream_urn,
                            type=models.DatasetLineageTypeClass.TRANSFORMED,
                            auditStamp=auditStamp,
                        )
                    )
                else:
                    logger.warning(
                        f"Entity type: {upstream_entity.type} is unsupported. Upstream lineage will be skipped "
                        f"for {upstream_entity.name}->{entity.name}"
                    )

            yield MetadataChangeProposalWrapper(
                entityType=entity.type,
                changeType=models.ChangeTypeClass.UPSERT,
                entityUrn=entity_urn,
                aspectName="upstreamLineage",
                aspect=models.UpstreamLineageClass(upstreams=merged_upstreams),
            )

    def get_workunits(self) -> Iterable[Union[MetadataWorkUnit, UsageStatsWorkUnit]]:
        """Parse the configured file and wrap each lineage MCP in a work unit."""
        lineage_config = self.load_lineage_config(self.config.file)
        logger.debug(lineage_config)
        logger.info(f"preserve_upstream is set to {self.config.preserve_upstream}")
        for proposal in self.get_lineage_metadata_change_event_proposal(
            lineage_config.lineage, self.config.preserve_upstream
        ):
            work_unit = MetadataWorkUnit(
                f"lineage-{proposal.entityUrn}",
                mcp=proposal,
            )
            self.report.report_workunit(work_unit)
            yield work_unit

    def get_report(self):
        return self.report

    def close(self):
        pass
||||||
222
metadata-ingestion/tests/unit/test_file_lineage_source.py
Normal file
222
metadata-ingestion/tests/unit/test_file_lineage_source.py
Normal file
@ -0,0 +1,222 @@
|
|||||||
|
import logging
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from datahub.configuration.common import ConfigurationError
|
||||||
|
from datahub.ingestion.source.metadata.lineage import LineageConfig, LineageFileSource
|
||||||
|
from datahub.metadata.schema_classes import UpstreamClass
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def basic_mcp():
    """
    Lineage represented by this fixture:
        topic1 -
                ->topic3
        topic2 -
    :return: list of MCPs built from the sample lineage
    """
    sample_lineage = """
lineage:
  - entity:
      name: topic3
      type: dataset
      env: DEV
      platform: kafka
    upstream:
      - entity:
          name: topic1
          type: dataset
          env: DEV
          platform: kafka
      - entity:
          name: topic2
          type: dataset
          env: DEV
          platform: kafka
"""
    parsed = yaml.safe_load(sample_lineage)
    lineage_config: LineageConfig = LineageConfig.parse_obj(parsed)
    return list(
        LineageFileSource.get_lineage_metadata_change_event_proposal(
            entities=lineage_config.lineage, preserve_upstream=False
        )
    )
||||||
|
|
||||||
|
|
||||||
|
def unsupported_entity_type_mcp():
    """Build MCPs from a lineage whose root entity type is invalid.

    Parsing is expected to raise ConfigurationError during validation.
    """
    sample_lineage = """
lineage:
  - entity:
      name: topic3
      type: NotSupported!
      env: DEV
      platform: kafka
    upstream:
      - entity:
          name: topic1
          type: dataset
          env: DEV
          platform: kafka
      - entity:
          name: topic2
          type: dataset
          env: DEV
          platform: kafka
  - entity:
      name: topic6
      type: dataset
      env: DEV
      platform: kafka
    upstream:
      - entity:
          name: topic4
          type: dataset
          env: DEV
          platform: kafka
      - entity:
          name: topic5
          type: dataset
          env: DEV
          platform: kafka
"""
    parsed = yaml.safe_load(sample_lineage)
    lineage_config: LineageConfig = LineageConfig.parse_obj(parsed)
    return list(
        LineageFileSource.get_lineage_metadata_change_event_proposal(
            entities=lineage_config.lineage, preserve_upstream=False
        )
    )
||||||
|
|
||||||
|
|
||||||
|
def unsupported_upstream_entity_type_mcp():
    """Build MCPs from a lineage with an invalid type on an upstream entity.

    Parsing is expected to raise ConfigurationError during validation.
    """
    sample_lineage = """
lineage:
  - entity:
      name: topic3
      type: dataset
      env: DEV
      platform: kafka
    upstream:
      - entity:
          name: topic1
          type: NotSupported
          env: DEV
          platform: kafka
      - entity:
          name: topic2
          type: dataset
          env: DEV
          platform: kafka
"""
    parsed = yaml.safe_load(sample_lineage)
    lineage_config: LineageConfig = LineageConfig.parse_obj(parsed)
    return list(
        LineageFileSource.get_lineage_metadata_change_event_proposal(
            entities=lineage_config.lineage, preserve_upstream=False
        )
    )
||||||
|
|
||||||
|
|
||||||
|
def unsupported_entity_env_mcp():
    """Build MCPs from a lineage whose root entity has an invalid env value.

    Parsing is expected to raise ConfigurationError during validation.
    """
    sample_lineage = """
lineage:
  - entity:
      name: topic3
      type: dataset
      env: NotSupported!
      platform: kafka
    upstream:
      - entity:
          name: topic1
          type: dataset
          env: DEV
          platform: kafka
      - entity:
          name: topic2
          type: dataset
          env: DEV
          platform: kafka
  - entity:
      name: topic6
      type: dataset
      env: DEV
      platform: kafka
    upstream:
      - entity:
          name: topic4
          type: dataset
          env: DEV
          platform: kafka
      - entity:
          name: topic5
          type: dataset
          env: DEV
          platform: kafka
"""
    parsed = yaml.safe_load(sample_lineage)
    lineage_config: LineageConfig = LineageConfig.parse_obj(parsed)
    return list(
        LineageFileSource.get_lineage_metadata_change_event_proposal(
            entities=lineage_config.lineage, preserve_upstream=False
        )
    )
||||||
|
|
||||||
|
|
||||||
|
def test_basic_lineage_entity_root_node_urn(basic_mcp):
    """
    The root entity node of the basic fixture should produce the expected urn.
    """
    expected_urn = "urn:li:dataset:(urn:li:dataPlatform:kafka,topic3,DEV)"
    assert basic_mcp[0].entityUrn == expected_urn
||||||
|
|
||||||
|
|
||||||
|
def test_basic_lineage_upstream_urns(basic_mcp):
    """
    The upstream urns of the basic fixture should be topic1 and topic2.
    """
    upstreams: List[UpstreamClass] = basic_mcp[0].aspect.upstreams
    assert upstreams[0].dataset == (
        "urn:li:dataset:(urn:li:dataPlatform:kafka,topic1,DEV)"
    )
    assert upstreams[1].dataset == (
        "urn:li:dataset:(urn:li:dataPlatform:kafka,topic2,DEV)"
    )
||||||
|
|
||||||
|
|
||||||
|
def test_unsupported_entity_type():
    """
    An unsupported root entity type must be rejected at validation time
    with a ConfigurationError.
    """
    with pytest.raises(ConfigurationError):
        unsupported_entity_type_mcp()
||||||
|
|
||||||
|
|
||||||
|
def test_unsupported_upstream_entity_type():
    """
    An unsupported type on an upstream node must be rejected at validation
    time with a ConfigurationError.
    """
    with pytest.raises(ConfigurationError):
        unsupported_upstream_entity_type_mcp()
||||||
|
|
||||||
|
|
||||||
|
def test_unsupported_entity_env():
    """
    An invalid env value must be rejected at validation time with a
    ConfigurationError.
    """
    with pytest.raises(ConfigurationError):
        unsupported_entity_env_mcp()
||||||
Loading…
x
Reference in New Issue
Block a user