Mirror of https://github.com/datahub-project/datahub.git, synced 2025-07-24 18:10:11 +00:00
fix(ingest): downgrade column type mapping warning to info (#11115)
Commit d6e46b9bcf (parent a25df8e6a0)
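The diff replaces free-form report_warning(dataset_name, "unable to map type ...") calls with structured info-level report entries (title, message, context, log=False). Judging from the imports, the first group of hunks appears to belong to the ABS data-lake source and a later group to the S3 source, with two further get_column_type helpers adjusted at the end. Below is a minimal, self-contained sketch of the before/after call shapes; _DemoReport, dataset_name, and column_type are illustrative stand-ins, not DataHub's actual SourceReport implementation.

class _DemoReport:
    """Stand-in exposing the two call shapes seen in the hunks below; not the real SourceReport."""

    def report_warning(self, key: str, reason: str) -> None:
        print(f"WARNING  {key}: {reason}")

    def info(self, title: str, message: str, context: str, log: bool = True) -> None:
        print(f"INFO     {title} | {message} | context={context} | log={log}")


report = _DemoReport()
dataset_name, column_type = "container/path/to/table", "GEOMETRY"  # hypothetical values

# Before: a per-dataset warning with a free-form reason string.
report.report_warning(
    dataset_name, f"unable to map type {column_type} to metadata schema"
)

# After: an info-level structured entry, with log=False carried over from the diff.
report.info(
    title="Unable to map column types to DataHub types",
    message="Got an unexpected column type. The column's parsed field type will not be populated.",
    context=f"{dataset_name} - {column_type}",
    log=False,
)

The structured form carries a stable title and message, which makes it easier to group repeated occurrences under one entry instead of emitting a separate warning per column.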
@@ -8,29 +8,10 @@ import time
 from collections import OrderedDict
 from datetime import datetime
 from pathlib import PurePath
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple
 
 import smart_open.compression as so_compression
 from more_itertools import peekable
-from pyspark.sql.types import (
-    ArrayType,
-    BinaryType,
-    BooleanType,
-    ByteType,
-    DateType,
-    DecimalType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    LongType,
-    MapType,
-    NullType,
-    ShortType,
-    StringType,
-    StructField,
-    StructType,
-    TimestampType,
-)
 from smart_open import open as smart_open
 
 from datahub.emitter.mce_builder import (
@@ -48,7 +29,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.abs.config import DataLakeSourceConfig, PathSpec
 from datahub.ingestion.source.abs.report import DataLakeSourceReport
@@ -72,22 +53,14 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    BooleanTypeClass,
-    BytesTypeClass,
-    DateTypeClass,
-    NullTypeClass,
-    NumberTypeClass,
-    RecordTypeClass,
     SchemaField,
     SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
-    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
-    MapTypeClass,
     OperationClass,
     OperationTypeClass,
     OtherSchemaClass,
@@ -100,55 +73,12 @@ from datahub.utilities.perf_timer import PerfTimer
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
-# for a list of all types, see https://spark.apache.org/docs/3.0.3/api/python/_modules/pyspark/sql/types.html
-_field_type_mapping = {
-    NullType: NullTypeClass,
-    StringType: StringTypeClass,
-    BinaryType: BytesTypeClass,
-    BooleanType: BooleanTypeClass,
-    DateType: DateTypeClass,
-    TimestampType: TimeTypeClass,
-    DecimalType: NumberTypeClass,
-    DoubleType: NumberTypeClass,
-    FloatType: NumberTypeClass,
-    ByteType: BytesTypeClass,
-    IntegerType: NumberTypeClass,
-    LongType: NumberTypeClass,
-    ShortType: NumberTypeClass,
-    ArrayType: NullTypeClass,
-    MapType: MapTypeClass,
-    StructField: RecordTypeClass,
-    StructType: RecordTypeClass,
-}
 PAGE_SIZE = 1000
 
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
 
 
-def get_column_type(
-    report: SourceReport, dataset_name: str, column_type: str
-) -> SchemaFieldDataType:
-    """
-    Maps known Spark types to datahub types
-    """
-    TypeClass: Any = None
-
-    for field_type, type_class in _field_type_mapping.items():
-        if isinstance(column_type, field_type):
-            TypeClass = type_class
-            break
-
-    # if still not found, report the warning
-    if TypeClass is None:
-        report.report_warning(
-            dataset_name, f"unable to map type {column_type} to metadata schema"
-        )
-        TypeClass = NullTypeClass
-
-    return SchemaFieldDataType(type=TypeClass())
-
-
 # config flags to emit telemetry for
 config_options_to_report = [
     "platform",
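For reference, the helper removed above resolved a column's DataHub type by isinstance-matching the pyspark type against a mapping of Spark type classes to DataHub type classes, falling back to NullTypeClass when nothing matched. A trimmed, runnable sketch of that dispatch follows, assuming pyspark and acryl-datahub are installed; the function name spark_type_to_datahub_type and the reduced mapping are illustrative, with the individual entries taken from the hunk.

from typing import Optional, Type

from pyspark.sql.types import DataType, DecimalType, IntegerType, StringType, TimestampType

from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    NullTypeClass,
    NumberTypeClass,
    SchemaFieldDataType,
    StringTypeClass,
    TimeTypeClass,
)

# Subset of the mapping removed in the hunk above.
_field_type_mapping = {
    StringType: StringTypeClass,
    DecimalType: NumberTypeClass,
    IntegerType: NumberTypeClass,
    TimestampType: TimeTypeClass,
}


def spark_type_to_datahub_type(column_type: DataType) -> SchemaFieldDataType:
    type_class: Optional[Type] = None
    for spark_type, datahub_type in _field_type_mapping.items():
        if isinstance(column_type, spark_type):
            type_class = datahub_type
            break
    # Unknown Spark types fall back to NullTypeClass, mirroring the removed helper.
    return SchemaFieldDataType(type=(type_class or NullTypeClass)())


print(spark_type_to_datahub_type(IntegerType()))  # wraps NumberTypeClass
print(spark_type_to_datahub_type(StringType()))   # wraps StringTypeClass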
@@ -849,8 +849,11 @@ def get_column_type(
     # if still not found, report the warning
     if TypeClass is None:
         if column_type:
-            report.report_warning(
-                dataset_name, f"unable to map type {column_type} to metadata schema"
+            report.info(
+                title="Unable to map column types to DataHub types",
+                message="Got an unexpected column type. The column's parsed field type will not be populated.",
+                context=f"{dataset_name} - {column_type}",
+                log=False,
             )
         TypeClass = NullTypeClass
 
@@ -8,32 +8,13 @@ import time
 from collections import OrderedDict
 from datetime import datetime
 from pathlib import PurePath
-from typing import Any, Dict, Iterable, List, Optional, Tuple
+from typing import Dict, Iterable, List, Optional, Tuple
 
 import smart_open.compression as so_compression
 from more_itertools import peekable
 from pyspark.conf import SparkConf
 from pyspark.sql import SparkSession
 from pyspark.sql.dataframe import DataFrame
-from pyspark.sql.types import (
-    ArrayType,
-    BinaryType,
-    BooleanType,
-    ByteType,
-    DateType,
-    DecimalType,
-    DoubleType,
-    FloatType,
-    IntegerType,
-    LongType,
-    MapType,
-    NullType,
-    ShortType,
-    StringType,
-    StructField,
-    StructType,
-    TimestampType,
-)
 from pyspark.sql.utils import AnalysisException
 from smart_open import open as smart_open
 
@@ -52,7 +33,7 @@ from datahub.ingestion.api.decorators import (
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags, list_folders
 from datahub.ingestion.source.aws.s3_util import (
@@ -72,22 +53,13 @@ from datahub.ingestion.source.state.stateful_ingestion_base import (
     StatefulIngestionSourceBase,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
-    BooleanTypeClass,
-    BytesTypeClass,
-    DateTypeClass,
-    NullTypeClass,
-    NumberTypeClass,
-    RecordTypeClass,
     SchemaField,
-    SchemaFieldDataType,
     SchemaMetadata,
     StringTypeClass,
-    TimeTypeClass,
 )
 from datahub.metadata.schema_classes import (
     DataPlatformInstanceClass,
     DatasetPropertiesClass,
-    MapTypeClass,
     OperationClass,
     OperationTypeClass,
     OtherSchemaClass,
@@ -101,55 +73,12 @@ from datahub.utilities.perf_timer import PerfTimer
 logging.getLogger("py4j").setLevel(logging.ERROR)
 logger: logging.Logger = logging.getLogger(__name__)
 
-# for a list of all types, see https://spark.apache.org/docs/3.0.3/api/python/_modules/pyspark/sql/types.html
-_field_type_mapping = {
-    NullType: NullTypeClass,
-    StringType: StringTypeClass,
-    BinaryType: BytesTypeClass,
-    BooleanType: BooleanTypeClass,
-    DateType: DateTypeClass,
-    TimestampType: TimeTypeClass,
-    DecimalType: NumberTypeClass,
-    DoubleType: NumberTypeClass,
-    FloatType: NumberTypeClass,
-    ByteType: BytesTypeClass,
-    IntegerType: NumberTypeClass,
-    LongType: NumberTypeClass,
-    ShortType: NumberTypeClass,
-    ArrayType: NullTypeClass,
-    MapType: MapTypeClass,
-    StructField: RecordTypeClass,
-    StructType: RecordTypeClass,
-}
 PAGE_SIZE = 1000
 
 # Hack to support the .gzip extension with smart_open.
 so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"])
 
 
-def get_column_type(
-    report: SourceReport, dataset_name: str, column_type: str
-) -> SchemaFieldDataType:
-    """
-    Maps known Spark types to datahub types
-    """
-    TypeClass: Any = None
-
-    for field_type, type_class in _field_type_mapping.items():
-        if isinstance(column_type, field_type):
-            TypeClass = type_class
-            break
-
-    # if still not found, report the warning
-    if TypeClass is None:
-        report.report_warning(
-            dataset_name, f"unable to map type {column_type} to metadata schema"
-        )
-        TypeClass = NullTypeClass
-
-    return SchemaFieldDataType(type=TypeClass())
-
-
 # config flags to emit telemetry for
 config_options_to_report = [
     "platform",
@@ -490,9 +419,7 @@ class S3Source(StatefulIngestionSourceBase):
                     if not is_fieldpath_v2
                     else f"[version=2.0].[type=string].{partition_key}",
                     nativeDataType="string",
-                    type=SchemaFieldDataType(StringTypeClass())
-                    if not is_fieldpath_v2
-                    else SchemaFieldDataTypeClass(type=StringTypeClass()),
+                    type=SchemaFieldDataTypeClass(StringTypeClass()),
                     isPartitioningKey=True,
                     nullable=True,
                     recursive=False,
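The hunk above drops the fieldpath-version conditional and always wraps the partition key's type in SchemaFieldDataTypeClass. A small sketch of the resulting field construction follows, assuming acryl-datahub is installed; the fieldPath value and the partition key name "year" are hypothetical.

from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField, StringTypeClass
from datahub.metadata.schema_classes import SchemaFieldDataTypeClass

# Partition keys are modeled as nullable string fields flagged as partitioning keys.
partition_field = SchemaField(
    fieldPath="[version=2.0].[type=string].year",  # hypothetical partition key
    nativeDataType="string",
    type=SchemaFieldDataTypeClass(StringTypeClass()),
    isPartitioningKey=True,
    nullable=True,
    recursive=False,
)
print(partition_field.fieldPath, partition_field.isPartitioningKey)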
@@ -263,8 +263,11 @@ def get_column_type(
                 break
 
     if TypeClass is None:
-        sql_report.report_warning(
-            dataset_name, f"unable to map type {column_type!r} to metadata schema"
+        sql_report.info(
+            title="Unable to map column types to DataHub types",
+            message="Got an unexpected column type. The column's parsed field type will not be populated.",
+            context=f"{dataset_name} - {column_type!r}",
+            log=False,
         )
         TypeClass = NullTypeClass
 