feat(presto-on-hive): allow v1 fieldpaths in the presto-on-hive source (#8474)
parent a4a8182001 · commit 843f82b943
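For orientation (this note is not part of the commit itself): DataHub can encode schema field paths in two ways. A v2 path carries a `[version=2.0].` prefix and `[type=...]` tokens between path segments; a v1 path is plain dotted notation. This commit lets the presto-on-hive source emit v1 paths when doing so is lossless, falling back to v2 whenever array or union types are present. A minimal illustration with made-up column names:

```python
# Hypothetical v2 field path for a nested "address.zipcode" column: a version
# prefix plus [type=...] tokens interleaved with the actual path segments.
v2_field_path = "[version=2.0].[type=struct].address.[type=string].zipcode"

# The same column in v1 notation: simple dot-separated names.
v1_field_path = "address.zipcode"
```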
@@ -438,7 +438,7 @@ class CSVEnricherSource(Source):
             field_match = False
             for field_info in current_editable_schema_metadata.editableSchemaFieldInfo:
                 if (
-                    DatasetUrn._get_simple_field_path_from_v2_field_path(
+                    DatasetUrn.get_simple_field_path_from_v2_field_path(
                         field_info.fieldPath
                     )
                     == field_path
@@ -134,6 +134,11 @@ class PrestoOnHiveConfig(BasicSQLAlchemyConfig):
         description="By default, the connector overwrites properties every time. Set this to True to enable merging of properties with what exists on the server.",
     )

+    simplify_nested_field_paths: bool = Field(
+        default=False,
+        description="Simplify v2 field paths to v1 by default. If the schema has Union or Array types, still falls back to v2",
+    )
+
     def get_sql_alchemy_url(
         self, uri_opts: Optional[Dict[str, Any]] = None, database: Optional[str] = None
     ) -> str:
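A hedged sketch of how a user might turn the new option on, mirroring the config shape used by the integration test further down. The host, sink, and filename values are placeholders rather than values from the diff; `Pipeline.create` is DataHub's standard programmatic entry point:

```python
from datahub.ingestion.run.pipeline import Pipeline

# Illustrative recipe: ingest from a Hive metastore DB and write metadata
# events to a local file, emitting v1 field paths where the schema allows it.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "presto-on-hive",
            "config": {
                "host_port": "localhost:5432",  # placeholder metastore address
                "simplify_nested_field_paths": True,  # new option in this commit
            },
        },
        "sink": {
            "type": "file",
            "config": {"filename": "presto_on_hive_mces.json"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```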
@@ -527,6 +532,7 @@ class PrestoOnHiveSource(SQLAlchemySource):
             None,
             None,
             schema_fields,
+            self.config.simplify_nested_field_paths,
         )
         dataset_snapshot.aspects.append(schema_metadata)

@@ -756,6 +762,7 @@ class PrestoOnHiveSource(SQLAlchemySource):
             self.platform,
             dataset.columns,
             canonical_schema=schema_fields,
+            simplify_nested_field_paths=self.config.simplify_nested_field_paths,
         )
         dataset_snapshot.aspects.append(schema_metadata)

@@ -41,11 +41,13 @@ from datahub.ingestion.source.common.subtypes import (
 from datahub.ingestion.source.sql.sql_config import SQLAlchemyConfig
 from datahub.ingestion.source.sql.sql_utils import (
     add_table_to_schema_container,
+    downgrade_schema_from_v2,
     gen_database_container,
     gen_database_key,
     gen_schema_container,
     gen_schema_key,
     get_domain_wu,
+    schema_requires_v2,
 )
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
@@ -287,7 +289,15 @@ def get_schema_metadata(
     pk_constraints: Optional[dict] = None,
     foreign_keys: Optional[List[ForeignKeyConstraint]] = None,
     canonical_schema: Optional[List[SchemaField]] = None,
+    simplify_nested_field_paths: bool = False,
 ) -> SchemaMetadata:
+    if (
+        simplify_nested_field_paths
+        and canonical_schema is not None
+        and not schema_requires_v2(canonical_schema)
+    ):
+        canonical_schema = downgrade_schema_from_v2(canonical_schema)
+
     schema_metadata = SchemaMetadata(
         schemaName=dataset_name,
         platform=make_data_platform_urn(platform),
@@ -17,9 +17,16 @@ from datahub.emitter.mcp_builder import (
 )
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage
+from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
 from datahub.metadata.schema_classes import DataPlatformInstanceClass
 from datahub.specific.dataset import DatasetPatchBuilder
 from datahub.utilities.registries.domain_registry import DomainRegistry
+from datahub.utilities.urns.dataset_urn import DatasetUrn
+
+ARRAY_TOKEN = "[type=array]"
+UNION_TOKEN = "[type=union]"
+KEY_SCHEMA_PREFIX = "[key=True]."
+VERSION_PREFIX = "[version=2.0]."


 def gen_schema_key(
@@ -223,3 +230,27 @@ def gen_lineage(

     for wu in lineage_workunits:
         yield wu
+
+
+# downgrade a schema field
+def downgrade_schema_field_from_v2(field: SchemaField) -> SchemaField:
+    field.fieldPath = DatasetUrn.get_simple_field_path_from_v2_field_path(
+        field.fieldPath
+    )
+    return field
+
+
+# downgrade a list of schema fields
+def downgrade_schema_from_v2(
+    canonical_schema: List[SchemaField],
+) -> List[SchemaField]:
+    return [downgrade_schema_field_from_v2(field) for field in canonical_schema]
+
+
+# v2 is only required in case UNION or ARRAY types are present- all other types can be represented in v1 paths
+def schema_requires_v2(canonical_schema: List[SchemaField]) -> bool:
+    for field in canonical_schema:
+        field_name = field.fieldPath
+        if ARRAY_TOKEN in field_name or UNION_TOKEN in field_name:
+            return True
+    return False
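A small sketch of how these helpers behave, assuming the `SchemaFieldClass` constructor from `datahub.metadata.schema_classes` and the v2 token layout shown above; the field paths and assertions are illustrative, not taken from the test suite:

```python
from datahub.ingestion.source.sql.sql_utils import (
    downgrade_schema_from_v2,
    schema_requires_v2,
)
from datahub.metadata.schema_classes import (
    SchemaFieldClass,
    SchemaFieldDataTypeClass,
    StringTypeClass,
)

# A plain nested string column: representable in v1 notation.
plain = SchemaFieldClass(
    fieldPath="[version=2.0].[type=struct].address.[type=string].zipcode",
    type=SchemaFieldDataTypeClass(type=StringTypeClass()),
    nativeDataType="string",
)
# An array element: its fieldPath contains ARRAY_TOKEN, so v2 is required.
arrayish = SchemaFieldClass(
    fieldPath="[version=2.0].[type=array].[type=string].tags",
    type=SchemaFieldDataTypeClass(type=StringTypeClass()),
    nativeDataType="array<string>",
)

assert schema_requires_v2([plain, arrayish])  # array present: keep v2 paths
assert not schema_requires_v2([plain])        # lossless downgrade possible
assert downgrade_schema_from_v2([plain])[0].fieldPath == "address.zipcode"
```

Note that `downgrade_schema_field_from_v2` mutates the field in place and returns it, so callers should not expect the original v2 paths to survive the call.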
@@ -328,7 +328,7 @@ class SchemaResolver(Closeable):
         cls, schema_metadata: SchemaMetadataClass
     ) -> SchemaInfo:
         return {
-            DatasetUrn._get_simple_field_path_from_v2_field_path(col.fieldPath): (
+            DatasetUrn.get_simple_field_path_from_v2_field_path(col.fieldPath): (
                 # The actual types are more of a "nice to have".
                 col.nativeDataType
                 or "str"
@@ -336,7 +336,7 @@ class SchemaResolver(Closeable):
             for col in schema_metadata.fields
             # TODO: We can't generate lineage to columns nested within structs yet.
             if "."
-            not in DatasetUrn._get_simple_field_path_from_v2_field_path(col.fieldPath)
+            not in DatasetUrn.get_simple_field_path_from_v2_field_path(col.fieldPath)
         }

     # TODO add a method to load all from graphql
@@ -97,7 +97,7 @@ class DatasetUrn(Urn):
     """A helper function to extract simple . path notation from the v2 field path"""

     @staticmethod
-    def _get_simple_field_path_from_v2_field_path(field_path: str) -> str:
+    def get_simple_field_path_from_v2_field_path(field_path: str) -> str:
         if field_path.startswith("[version=2.0]"):
             # this is a v2 field path
             tokens = [
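With the leading underscore dropped, the helper becomes part of `DatasetUrn`'s public surface. An illustrative use (a sketch, assuming the token-stripping behavior shown in the hunk above):

```python
from datahub.utilities.urns.dataset_urn import DatasetUrn

# v2 input: the version prefix and [type=...] tokens are stripped away.
v2 = "[version=2.0].[type=struct].address.[type=string].zipcode"
print(DatasetUrn.get_simple_field_path_from_v2_field_path(v2))
# -> address.zipcode

# Input without the "[version=2.0]" prefix is not a v2 path and is
# returned unchanged.
print(DatasetUrn.get_simple_field_path_from_v2_field_path("address.zipcode"))
# -> address.zipcode
```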
File diff suppressed because it is too large
@@ -53,12 +53,14 @@ def loaded_presto_on_hive(presto_on_hive_runner):
 @freeze_time(FROZEN_TIME)
 @pytest.mark.integration_batch_1
 @pytest.mark.parametrize(
-    "mode,use_catalog_subtype,use_dataset_pascalcase_subtype,include_catalog_name_in_ids,test_suffix",
+    "mode,use_catalog_subtype,use_dataset_pascalcase_subtype,include_catalog_name_in_ids,simplify_nested_field_paths,"
+    "test_suffix",
     [
-        ("hive", False, False, False, "_1"),
-        ("presto-on-hive", True, True, False, "_2"),
-        ("hive", False, False, True, "_3"),
-        ("presto-on-hive", True, True, True, "_4"),
+        ("hive", False, False, False, False, "_1"),
+        ("presto-on-hive", True, True, False, False, "_2"),
+        ("hive", False, False, True, False, "_3"),
+        ("presto-on-hive", True, True, True, False, "_4"),
+        ("hive", False, False, False, True, "_5"),
     ],
 )
 def test_presto_on_hive_ingest(
@@ -71,6 +73,7 @@ def test_presto_on_hive_ingest(
     use_catalog_subtype,
     use_dataset_pascalcase_subtype,
     include_catalog_name_in_ids,
+    simplify_nested_field_paths,
     test_suffix,
 ):
     # Run the metadata ingestion pipeline.
@@ -97,6 +100,7 @@ def test_presto_on_hive_ingest(
                 "mode": mode,
                 "use_catalog_subtype": use_catalog_subtype,
                 "use_dataset_pascalcase_subtype": use_dataset_pascalcase_subtype,
+                "simplify_nested_field_paths": simplify_nested_field_paths,
             },
         },
         "sink": {