Properly extract kafka schema

Harshal Sheth 2021-02-10 14:37:14 -08:00 committed by Shirshanka Das
parent affd5728e6
commit 82c29d3c94
2 changed files with 70 additions and 36 deletions

gometa/ingestion/extractor/schema_util.py

@@ -1,38 +1,65 @@
 import logging
+from typing import List, Dict, Any
+
 import avro.schema
-"""A helper file for Avro schema -> MCE schema transformations"""
+
+from gometa.metadata.com.linkedin.pegasus2avro.schema import (
+    SchemaMetadata, KafkaSchema, SchemaField, SchemaFieldDataType,
+    BooleanTypeClass, FixedTypeClass, StringTypeClass, BytesTypeClass,
+    NumberTypeClass, EnumTypeClass, NullTypeClass, MapTypeClass,
+    ArrayTypeClass, UnionTypeClass, RecordTypeClass,
+)
+
+"""A helper file for Avro schema -> MCE schema transformations"""
 
 logger = logging.getLogger(__name__)
 
-# TODO: Broken (UnionSchemas)
 _field_type_mapping = {
-    "int": "int",
-    "string": "string",
-    "record": "struct",
+    "null": NullTypeClass,
+    "bool": BooleanTypeClass,
+    "int": NumberTypeClass,
+    "long": NumberTypeClass,
+    "float": NumberTypeClass,
+    "double": NumberTypeClass,
+    "bytes": BytesTypeClass,
+    "string": StringTypeClass,
+    "record": RecordTypeClass,
+    "enum": EnumTypeClass,
+    "array": ArrayTypeClass,
+    "union": UnionTypeClass,
+    "fixed": FixedTypeClass,
 }
 
-# TODO: Broken
-def _get_column_type(field_type):
-    return _field_type_mapping.get(str(field_type), str(field_type))
+def _get_column_type(field_type) -> SchemaFieldDataType:
+    tp = field_type
+    if hasattr(tp, 'type'):
+        tp = tp.type
+    tp = str(tp)
+    TypeClass: Any = _field_type_mapping.get(tp)
+    # TODO: we could populate the nestedTypes field for unions and similar fields
+    # for the other types as well. However, since we already populate the nativeDataType
+    # field below, it is mostly ok to leave this as not fully initialized.
+    dt = SchemaFieldDataType(type=TypeClass())
+    return dt
 
-# TODO: Broken
-def avro_schema_to_mce_fields(avro_schema_string):
+def avro_schema_to_mce_fields(avro_schema_string: str) -> List[SchemaField]:
     """Converts an avro schema into a schema compatible with MCE"""
-    schema: avro.schema.RecordSchema = avro.schema.Parse(avro_schema_string)
-    canonical_fields = []
-    fields_skipped = 0
-    for field in schema.fields:
-        # only transform the fields we can, ignore the rest
-        if _field_type_mapping.get(str(field.type), None):
-            canonical_field = {
-                'fieldPath': field.name,
-                'nativeDataType': str(field.type),
-                'type': {"type": _get_column_type(field.type)},
-                'description': field.doc,
-            }
-            canonical_fields.append(canonical_field)
-        else:
-            fields_skipped = fields_skipped + 1
-    logger.warn(f'Schema {schema.name}: Skipped {fields_skipped} fields during Avro schema to canonical schema conversion')
-    return canonical_fields
+    # Handle some library compatibility issues: the parser entry point is
+    # named avro.schema.parse in some releases of the avro package and
+    # avro.schema.Parse in others.
+    if hasattr(avro.schema, 'parse'):
+        schema_parse_fn = avro.schema.parse
+    else:
+        schema_parse_fn = avro.schema.Parse
+    parsed_schema: avro.schema.RecordSchema = schema_parse_fn(avro_schema_string)
+    fields: List[SchemaField] = []
+    for parsed_field in parsed_schema.fields:
+        field = SchemaField(
+            fieldPath=parsed_field.name,
+            nativeDataType=str(parsed_field.type),
+            type=_get_column_type(parsed_field.type),
+            description=parsed_field.props.get('doc', None),
+        )
+        fields.append(field)
+    return fields
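
For context on the new helper: avro_schema_to_mce_fields parses the serialized Avro schema and emits one SchemaField per top-level field, preserving the raw Avro type in nativeDataType and mapping it onto an MCE type class. A minimal usage sketch, with a made-up two-field record schema (the sample schema and printed attributes are illustrative, not part of this commit):

import json

from gometa.ingestion.extractor.schema_util import avro_schema_to_mce_fields

# Hypothetical Avro record schema with two primitive fields.
sample_avro_schema = json.dumps({
    "type": "record",
    "name": "UserEvent",
    "fields": [
        {"name": "user_id", "type": "string", "doc": "Unique user identifier"},
        {"name": "event_count", "type": "long"},
    ],
})

for field in avro_schema_to_mce_fields(sample_avro_schema):
    # fieldPath comes from the Avro field name; nativeDataType keeps the raw
    # Avro type string; type wraps the mapped *TypeClass instance.
    print(field.fieldPath, field.nativeDataType, field.description)

Here "user_id" maps through _field_type_mapping to StringTypeClass and "event_count" to NumberTypeClass; the Avro doc attribute, when present, becomes the field description.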

gometa/ingestion/source/kafka.py

@@ -16,7 +16,7 @@ import gometa.ingestion.extractor.schema_util as schema_util
 from gometa.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from gometa.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot
-from gometa.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata, KafkaSchema
+from gometa.metadata.com.linkedin.pegasus2avro.schema import SchemaMetadata, KafkaSchema, SchemaField
 from gometa.metadata.com.linkedin.pegasus2avro.common import AuditStamp
 
 logger = logging.getLogger(__name__)
@@ -99,11 +99,18 @@ class KafkaSource(Source):
             except Exception as e:
                 logger.debug(f"failed to get schema for {topic} with {e}")
 
+            fields: Optional[List[SchemaField]] = None
+            if schema and schema.schema_type == 'AVRO':
+                fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)
+            elif schema:
+                logger.debug(f"unable to parse kafka schema type {schema.schema_type}")
+
-            is_incomplete = True
             if schema:
-                # TODO: add schema parsing capabilities
-                # canonical_schema = []
-                # if schema.schema_type == "AVRO":
-                #     canonical_schema = schema_util.avro_schema_to_mce_fields(schema.schema_str)
-                is_incomplete = False
+                if not fields:
+                    fields = []
 
                 schema_metadata = SchemaMetadata(
                     schemaName=topic,
@@ -111,16 +118,16 @@ class KafkaSource(Source):
                     hash=str(schema._hash),
                     platform=f"urn:li:dataPlatform:{platform}",
                     platformSchema=KafkaSchema(
+                        # TODO: keySchema
                        documentSchema=schema.schema_str
                     ),
-                    fields=[],
+                    fields=(fields if fields is not None else []),
                     created=AuditStamp(time=sys_time, actor=actor),
                     lastModified=AuditStamp(time=sys_time, actor=actor),
                 )
                 dataset_snapshot.aspects.append(schema_metadata)
-            if is_incomplete:
+            else:
                 self.report.report_schema_incomplete(topic)
 
             metadata_record.proposedSnapshot = dataset_snapshot
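
For reference, the schema object that this source hands to schema_util comes from the Confluent schema registry. A rough sketch of that lookup, assuming confluent-kafka's SchemaRegistryClient with a hypothetical registry URL and topic name (the source itself builds the client and the topic list from its config):

from confluent_kafka.schema_registry import SchemaRegistryClient

import gometa.ingestion.extractor.schema_util as schema_util

# Hypothetical registry location; the real source reads this from its config.
registry = SchemaRegistryClient({"url": "http://localhost:8081"})

topic = "user-events"  # hypothetical topic name
# Value schemas are registered under the "<topic>-value" subject.
registered = registry.get_latest_version(f"{topic}-value")
schema = registered.schema

fields = []
if schema.schema_type == "AVRO":
    fields = schema_util.avro_schema_to_mce_fields(schema.schema_str)

Only AVRO-typed schemas are converted into fields; any other schema_type still produces a SchemaMetadata aspect carrying the raw schema string but an empty field list, matching the diff above.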