mirror of
https://github.com/datahub-project/datahub.git
synced 2025-07-27 19:39:52 +00:00
fix(ingest): update athena type mapping (#9061)
This commit is contained in:
parent
9a59c452bf
commit
916235d31a
@ -37,7 +37,7 @@ from datahub.ingestion.source.sql.sql_utils import (
|
|||||||
gen_database_key,
|
gen_database_key,
|
||||||
)
|
)
|
||||||
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
|
||||||
from datahub.metadata.schema_classes import RecordTypeClass
|
from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass
|
||||||
from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
|
from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column
|
||||||
from datahub.utilities.sqlalchemy_type_converter import (
|
from datahub.utilities.sqlalchemy_type_converter import (
|
||||||
MapType,
|
MapType,
|
||||||
@ -46,7 +46,9 @@ from datahub.utilities.sqlalchemy_type_converter import (
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
assert STRUCT, "required type modules are not available"
|
||||||
register_custom_type(STRUCT, RecordTypeClass)
|
register_custom_type(STRUCT, RecordTypeClass)
|
||||||
|
register_custom_type(MapType, MapTypeClass)
|
||||||
|
|
||||||
|
|
||||||
class CustomAthenaRestDialect(AthenaRestDialect):
|
class CustomAthenaRestDialect(AthenaRestDialect):
|
||||||
|
@ -80,7 +80,6 @@ from datahub.metadata.schema_classes import (
|
|||||||
DatasetLineageTypeClass,
|
DatasetLineageTypeClass,
|
||||||
DatasetPropertiesClass,
|
DatasetPropertiesClass,
|
||||||
GlobalTagsClass,
|
GlobalTagsClass,
|
||||||
MapTypeClass,
|
|
||||||
SubTypesClass,
|
SubTypesClass,
|
||||||
TagAssociationClass,
|
TagAssociationClass,
|
||||||
UpstreamClass,
|
UpstreamClass,
|
||||||
@ -90,7 +89,6 @@ from datahub.telemetry import telemetry
|
|||||||
from datahub.utilities.lossy_collections import LossyList
|
from datahub.utilities.lossy_collections import LossyList
|
||||||
from datahub.utilities.registries.domain_registry import DomainRegistry
|
from datahub.utilities.registries.domain_registry import DomainRegistry
|
||||||
from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
|
from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
|
||||||
from datahub.utilities.sqlalchemy_type_converter import MapType
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from datahub.ingestion.source.ge_data_profiler import (
|
from datahub.ingestion.source.ge_data_profiler import (
|
||||||
@ -140,6 +138,7 @@ class SqlWorkUnit(MetadataWorkUnit):
|
|||||||
|
|
||||||
|
|
||||||
_field_type_mapping: Dict[Type[TypeEngine], Type] = {
|
_field_type_mapping: Dict[Type[TypeEngine], Type] = {
|
||||||
|
# Note: to add dialect-specific types to this mapping, use the `register_custom_type` function.
|
||||||
types.Integer: NumberTypeClass,
|
types.Integer: NumberTypeClass,
|
||||||
types.Numeric: NumberTypeClass,
|
types.Numeric: NumberTypeClass,
|
||||||
types.Boolean: BooleanTypeClass,
|
types.Boolean: BooleanTypeClass,
|
||||||
@ -156,8 +155,6 @@ _field_type_mapping: Dict[Type[TypeEngine], Type] = {
|
|||||||
types.DATETIME: TimeTypeClass,
|
types.DATETIME: TimeTypeClass,
|
||||||
types.TIMESTAMP: TimeTypeClass,
|
types.TIMESTAMP: TimeTypeClass,
|
||||||
types.JSON: RecordTypeClass,
|
types.JSON: RecordTypeClass,
|
||||||
# additional type definitions that are used by the Athena source
|
|
||||||
MapType: MapTypeClass, # type: ignore
|
|
||||||
# Because the postgresql dialect is used internally by many other dialects,
|
# Because the postgresql dialect is used internally by many other dialects,
|
||||||
# we add some postgres types here. This is ok to do because the postgresql
|
# we add some postgres types here. This is ok to do because the postgresql
|
||||||
# dialect is built-in to sqlalchemy.
|
# dialect is built-in to sqlalchemy.
|
||||||
|
@ -7,7 +7,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|||||||
BytesType,
|
BytesType,
|
||||||
DateType,
|
DateType,
|
||||||
EnumType,
|
EnumType,
|
||||||
MapType as MapTypeAvro,
|
MapType,
|
||||||
NullType,
|
NullType,
|
||||||
NumberType,
|
NumberType,
|
||||||
RecordType,
|
RecordType,
|
||||||
@ -15,7 +15,6 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
|
|||||||
TimeType,
|
TimeType,
|
||||||
UnionType,
|
UnionType,
|
||||||
)
|
)
|
||||||
from datahub.utilities.sqlalchemy_type_converter import MapType
|
|
||||||
|
|
||||||
# these can be obtained by running `select format_type(oid, null),* from pg_type;`
|
# these can be obtained by running `select format_type(oid, null),* from pg_type;`
|
||||||
# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
|
# we've omitted the types without a meaningful DataHub type (e.g. postgres-specific types, index vectors, etc.)
|
||||||
@ -364,7 +363,7 @@ TRINO_SQL_TYPES_MAP: Dict[str, Any] = {
|
|||||||
"time": TimeType,
|
"time": TimeType,
|
||||||
"timestamp": TimeType,
|
"timestamp": TimeType,
|
||||||
"row": RecordType,
|
"row": RecordType,
|
||||||
"map": MapTypeAvro,
|
"map": MapType,
|
||||||
"array": ArrayType,
|
"array": ArrayType,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4,7 +4,6 @@ import uuid
|
|||||||
from typing import Any, Dict, List, Optional, Type, Union
|
from typing import Any, Dict, List, Optional, Type, Union
|
||||||
|
|
||||||
from sqlalchemy import types
|
from sqlalchemy import types
|
||||||
from sqlalchemy_bigquery import STRUCT
|
|
||||||
|
|
||||||
from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields
|
from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields
|
||||||
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
|
from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField
|
||||||
@ -12,6 +11,12 @@ from datahub.metadata.schema_classes import NullTypeClass, SchemaFieldDataTypeCl
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# This is used for both BigQuery and Athena.
|
||||||
|
from sqlalchemy_bigquery import STRUCT
|
||||||
|
except ImportError:
|
||||||
|
STRUCT = None
|
||||||
|
|
||||||
|
|
||||||
class MapType(types.TupleType):
|
class MapType(types.TupleType):
|
||||||
# Wrapper class around SQLalchemy's TupleType to increase compatibility with DataHub
|
# Wrapper class around SQLalchemy's TupleType to increase compatibility with DataHub
|
||||||
@ -42,7 +47,9 @@ class SqlAlchemyColumnToAvroConverter:
|
|||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
"""Determines the concrete AVRO schema type for a SQLalchemy-typed column"""
|
"""Determines the concrete AVRO schema type for a SQLalchemy-typed column"""
|
||||||
|
|
||||||
if type(column_type) in cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE.keys():
|
if isinstance(
|
||||||
|
column_type, tuple(cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE.keys())
|
||||||
|
):
|
||||||
return {
|
return {
|
||||||
"type": cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE[type(column_type)],
|
"type": cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE[type(column_type)],
|
||||||
"native_data_type": str(column_type),
|
"native_data_type": str(column_type),
|
||||||
@ -88,7 +95,7 @@ class SqlAlchemyColumnToAvroConverter:
|
|||||||
"key_type": cls.get_avro_type(column_type=key_type, nullable=nullable),
|
"key_type": cls.get_avro_type(column_type=key_type, nullable=nullable),
|
||||||
"key_native_data_type": str(key_type),
|
"key_native_data_type": str(key_type),
|
||||||
}
|
}
|
||||||
if isinstance(column_type, STRUCT):
|
if STRUCT and isinstance(column_type, STRUCT):
|
||||||
fields = []
|
fields = []
|
||||||
for field_def in column_type._STRUCT_fields:
|
for field_def in column_type._STRUCT_fields:
|
||||||
field_name, field_type = field_def
|
field_name, field_type = field_def
|
||||||
|
Loading…
x
Reference in New Issue
Block a user