build(ingest): Bump avro pin: security vulnerability (#9042)

Andrew Sikowitz 2023-10-25 13:06:12 -04:00 committed by GitHub
parent dd5d997390
commit 8a80e858a7
7 changed files with 99 additions and 54 deletions

@@ -152,7 +152,8 @@ def merge_schemas(schemas_obj: List[dict]) -> str:
     return encoded

-autogen_header = """# flake8: noqa
+autogen_header = """# mypy: ignore-errors
+# flake8: noqa
 # This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py
 # Do not modify manually!
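
Aside on the hunk above: "# mypy: ignore-errors" is mypy's per-module inline configuration comment, so every module the codegen script emits now suppresses mypy errors in addition to the existing flake8 suppression. Assembled from the hunk, the emitted header begins:

    # mypy: ignore-errors
    # flake8: noqa
    # This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py
    # Do not modify manually!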

@@ -351,8 +351,8 @@ def generate_stitched_record(
     field_objects = []
     for f in entity_fields:
         field = avro.schema.Field(
-            type=f["type"],
-            name=f["name"],
+            f["type"],
+            f["name"],
             has_default=False,
         )
         field_objects.append(field)
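
Aside: the change above drops the type=/name= keywords because those keyword names apparently no longer match the leading parameters of avro.schema.Field's constructor in the avro releases this PR pins to, so only positional calls work on both sides of the pin. A minimal sketch of the new call pattern, assuming avro 1.11.x; the field type and name below are made up for illustration:

    import avro.schema

    # Pass the field's type and name positionally; has_default stays a keyword,
    # mirroring the call in the hunk above.
    example_field = avro.schema.Field(
        "string",         # hypothetical field type (JSON-style schema data)
        "example_field",  # hypothetical field name
        has_default=False,
    )
    print(example_field.name, example_field.type)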

@@ -32,7 +32,7 @@ framework_common = {
     "expandvars>=0.6.5",
     "avro-gen3==0.7.11",
     # "avro-gen3 @ git+https://github.com/acryldata/avro_gen@master#egg=avro-gen3",
-    "avro>=1.10.2,<1.11",
+    "avro>=1.11.3,<1.12",
     "python-dateutil>=2.8.0",
     "tabulate",
     "progressbar2",
@@ -355,7 +355,11 @@ plugins: Dict[str, Set[str]] = {
     | {"psycopg2-binary", "pymysql>=1.0.2"},
     "pulsar": {"requests"},
     "redash": {"redash-toolbelt", "sql-metadata"} | sqllineage_lib,
-    "redshift": sql_common | redshift_common | usage_common | {"redshift-connector"} | sqlglot_lib,
+    "redshift": sql_common
+    | redshift_common
+    | usage_common
+    | {"redshift-connector"}
+    | sqlglot_lib,
     "redshift-legacy": sql_common | redshift_common | sqlglot_lib,
     "redshift-usage-legacy": sql_common | redshift_common | sqlglot_lib | usage_common,
     "s3": {*s3_base, *data_lake_profiling},

@@ -1,6 +1,18 @@
 import json
 import logging
-from typing import Any, Callable, Dict, Generator, List, Optional, Type, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Mapping,
+    Optional,
+    Type,
+    Union,
+    cast,
+    overload,
+)

 import avro.schema
@@ -54,6 +66,8 @@ AvroNonNestedSchemas = Union[
     avro.schema.PrimitiveSchema,
 ]

+SchemaOrField = Union[avro.schema.Schema, avro.schema.Field]
+
 FieldStack = List[avro.schema.Field]

 # The latest avro code contains this type definition in a compatibility module,
@@ -124,16 +138,22 @@ class AvroToMceSchemaConverter:
         self._meta_mapping_processor = meta_mapping_processor
         self._schema_tags_field = schema_tags_field
         self._tag_prefix = tag_prefix
         # Map of avro schema type to the conversion handler
-        self._avro_type_to_mce_converter_map: Dict[
-            avro.schema.Schema,
-            Callable[[ExtendedAvroNestedSchemas], Generator[SchemaField, None, None]],
+        # TODO: Clean up this type... perhaps refactor
+        self._avro_type_to_mce_converter_map: Mapping[
+            Union[
+                Type[avro.schema.Schema],
+                Type[avro.schema.Field],
+                Type[avro.schema.LogicalSchema],
+            ],
+            Callable[[SchemaOrField], Iterable[SchemaField]],
         ] = {
             avro.schema.RecordSchema: self._gen_from_non_field_nested_schemas,
             avro.schema.UnionSchema: self._gen_from_non_field_nested_schemas,
             avro.schema.ArraySchema: self._gen_from_non_field_nested_schemas,
             avro.schema.MapSchema: self._gen_from_non_field_nested_schemas,
-            avro.schema.Field: self._gen_nested_schema_from_field,
+            avro.schema.Field: self._gen_nested_schema_from_field,  # type: ignore
             avro.schema.PrimitiveSchema: self._gen_non_nested_to_mce_fields,
             avro.schema.FixedSchema: self._gen_non_nested_to_mce_fields,
             avro.schema.EnumSchema: self._gen_non_nested_to_mce_fields,
@@ -142,20 +162,22 @@ class AvroToMceSchemaConverter:
     @staticmethod
     def _get_type_name(
-        avro_schema: avro.schema.Schema, logical_if_present: bool = False
+        avro_schema: SchemaOrField, logical_if_present: bool = False
     ) -> str:
         logical_type_name: Optional[str] = None
         if logical_if_present:
-            logical_type_name = getattr(
-                avro_schema, "logical_type", None
-            ) or avro_schema.props.get("logicalType")
+            logical_type_name = cast(
+                Optional[str],
+                getattr(avro_schema, "logical_type", None)
+                or avro_schema.props.get("logicalType"),
+            )
         return logical_type_name or str(
             getattr(avro_schema.type, "type", avro_schema.type)
         )

     @staticmethod
     def _get_column_type(
-        avro_schema: avro.schema.Schema, logical_type: Optional[str]
+        avro_schema: SchemaOrField, logical_type: Optional[str]
     ) -> SchemaFieldDataType:
         type_name: str = AvroToMceSchemaConverter._get_type_name(avro_schema)
         TypeClass: Optional[Type] = AvroToMceSchemaConverter.field_type_mapping.get(
@@ -186,7 +208,7 @@ class AvroToMceSchemaConverter:
         )
         return dt

-    def _is_nullable(self, schema: avro.schema.Schema) -> bool:
+    def _is_nullable(self, schema: SchemaOrField) -> bool:
         if isinstance(schema, avro.schema.Field):
             return self._is_nullable(schema.type)
         if isinstance(schema, avro.schema.UnionSchema):
@@ -208,7 +230,7 @@ class AvroToMceSchemaConverter:
         return name_or_fullname.rsplit(".", maxsplit=1)[-1]

     @staticmethod
-    def _get_simple_native_type(schema: ExtendedAvroNestedSchemas) -> str:
+    def _get_simple_native_type(schema: SchemaOrField) -> str:
         if isinstance(schema, (avro.schema.RecordSchema, avro.schema.Field)):
             # For Records, fields, always return the name.
             return AvroToMceSchemaConverter._strip_namespace(schema.name)
@@ -226,7 +248,7 @@ class AvroToMceSchemaConverter:
             return schema.type

     @staticmethod
-    def _get_type_annotation(schema: ExtendedAvroNestedSchemas) -> str:
+    def _get_type_annotation(schema: SchemaOrField) -> str:
         simple_native_type = AvroToMceSchemaConverter._get_simple_native_type(schema)
         if simple_native_type.startswith("__struct_"):
             simple_native_type = "struct"
@@ -238,9 +260,23 @@ class AvroToMceSchemaConverter:
         return f"[type={simple_native_type}]"

     @staticmethod
+    @overload
     def _get_underlying_type_if_option_as_union(
-        schema: AvroNestedSchemas, default: Optional[AvroNestedSchemas] = None
-    ) -> AvroNestedSchemas:
+        schema: SchemaOrField, default: SchemaOrField
+    ) -> SchemaOrField:
+        ...
+
+    @staticmethod
+    @overload
+    def _get_underlying_type_if_option_as_union(
+        schema: SchemaOrField, default: Optional[SchemaOrField] = None
+    ) -> Optional[SchemaOrField]:
+        ...
+
+    @staticmethod
+    def _get_underlying_type_if_option_as_union(
+        schema: SchemaOrField, default: Optional[SchemaOrField] = None
+    ) -> Optional[SchemaOrField]:
         if isinstance(schema, avro.schema.UnionSchema) and len(schema.schemas) == 2:
             (first, second) = schema.schemas
             if first.type == AVRO_TYPE_NULL:
@@ -258,8 +294,8 @@ class AvroToMceSchemaConverter:
         def __init__(
             self,
-            schema: avro.schema.Schema,
-            actual_schema: avro.schema.Schema,
+            schema: SchemaOrField,
+            actual_schema: SchemaOrField,
             converter: "AvroToMceSchemaConverter",
             description: Optional[str] = None,
             default_value: Optional[str] = None,
@@ -275,7 +311,7 @@ class AvroToMceSchemaConverter:
             self._converter._prefix_name_stack.append(type_annotation)
             return self

-        def emit(self) -> Generator[SchemaField, None, None]:
+        def emit(self) -> Iterable[SchemaField]:
             if (
                 not isinstance(
                     self._actual_schema,
@@ -307,7 +343,7 @@ class AvroToMceSchemaConverter:
             description = self._description
             if not description and actual_schema.props.get("doc"):
-                description = actual_schema.props.get("doc")
+                description = cast(Optional[str], actual_schema.props.get("doc"))

             if self._default_value is not None:
                 description = f"{description if description else ''}\nField default value: {self._default_value}"
@@ -320,12 +356,12 @@ class AvroToMceSchemaConverter:
                 native_data_type = native_data_type[
                     slice(len(type_prefix), len(native_data_type) - 1)
                 ]
-            native_data_type = actual_schema.props.get(
-                "native_data_type", native_data_type
+            native_data_type = cast(
+                str, actual_schema.props.get("native_data_type", native_data_type)
             )

             field_path = self._converter._get_cur_field_path()

-            merged_props = {}
+            merged_props: Dict[str, Any] = {}
             merged_props.update(self._schema.other_props)
             merged_props.update(schema.other_props)
@@ -363,12 +399,13 @@ class AvroToMceSchemaConverter:
             meta_terms_aspect = meta_aspects.get(Constants.ADD_TERM_OPERATION)

-            logical_type_name: Optional[str] = (
+            logical_type_name: Optional[str] = cast(
+                Optional[str],
                 # logicalType nested inside type
                 getattr(actual_schema, "logical_type", None)
                 or actual_schema.props.get("logicalType")
                 # bare logicalType
-                or self._actual_schema.props.get("logicalType")
+                or self._actual_schema.props.get("logicalType"),
             )

             field = SchemaField(
@@ -392,14 +429,12 @@ class AvroToMceSchemaConverter:
         def __exit__(self, exc_type, exc_val, exc_tb):
             self._converter._prefix_name_stack.pop()

-    def _get_sub_schemas(
-        self, schema: ExtendedAvroNestedSchemas
-    ) -> Generator[avro.schema.Schema, None, None]:
+    def _get_sub_schemas(self, schema: SchemaOrField) -> Iterable[SchemaOrField]:
         """Responsible for generation for appropriate sub-schemas for every nested AVRO type."""

         def gen_items_from_list_tuple_or_scalar(
             val: Any,
-        ) -> Generator[avro.schema.Schema, None, None]:
+        ) -> Iterable[avro.schema.Schema]:
             if isinstance(val, (list, tuple)):
                 for i in val:
                     yield i
@@ -433,7 +468,7 @@ class AvroToMceSchemaConverter:
     def _gen_nested_schema_from_field(
         self,
         field: avro.schema.Field,
-    ) -> Generator[SchemaField, None, None]:
+    ) -> Iterable[SchemaField]:
         """Handles generation of MCE SchemaFields for an AVRO Field type."""
         # NOTE: Here we only manage the field stack and trigger MCE Field generation from this field's type.
         # The actual emitting of a field happens when
@@ -447,7 +482,7 @@ class AvroToMceSchemaConverter:
     def _gen_from_last_field(
         self, schema_to_recurse: Optional[AvroNestedSchemas] = None
-    ) -> Generator[SchemaField, None, None]:
+    ) -> Iterable[SchemaField]:
         """Emits the field most-recent field, optionally triggering sub-schema generation under the field."""
         last_field_schema = self._fields_stack[-1]
         # Generate the custom-description for the field.
@@ -467,8 +502,8 @@ class AvroToMceSchemaConverter:
             yield from self._to_mce_fields(sub_schema)

     def _gen_from_non_field_nested_schemas(
-        self, schema: AvroNestedSchemas
-    ) -> Generator[SchemaField, None, None]:
+        self, schema: SchemaOrField
+    ) -> Iterable[SchemaField]:
         """Handles generation of MCE SchemaFields for all standard AVRO nested types."""
         # Handle recursive record definitions
         recurse: bool = True
@@ -511,8 +546,8 @@ class AvroToMceSchemaConverter:
             yield from self._to_mce_fields(sub_schema)

     def _gen_non_nested_to_mce_fields(
-        self, schema: AvroNonNestedSchemas
-    ) -> Generator[SchemaField, None, None]:
+        self, schema: SchemaOrField
+    ) -> Iterable[SchemaField]:
         """Handles generation of MCE SchemaFields for non-nested AVRO types."""
         with AvroToMceSchemaConverter.SchemaFieldEmissionContextManager(
             schema,
@@ -521,9 +556,7 @@ class AvroToMceSchemaConverter:
         ) as non_nested_emitter:
             yield from non_nested_emitter.emit()

-    def _to_mce_fields(
-        self, avro_schema: avro.schema.Schema
-    ) -> Generator[SchemaField, None, None]:
+    def _to_mce_fields(self, avro_schema: SchemaOrField) -> Iterable[SchemaField]:
         # Invoke the relevant conversion handler for the schema element type.
         schema_type = (
             type(avro_schema)
@@ -541,7 +574,7 @@ class AvroToMceSchemaConverter:
     meta_mapping_processor: Optional[OperationProcessor] = None,
     schema_tags_field: Optional[str] = None,
     tag_prefix: Optional[str] = None,
-) -> Generator[SchemaField, None, None]:
+) -> Iterable[SchemaField]:
     """
     Converts a key or value type AVRO schema string to appropriate MCE SchemaFields.
     :param avro_schema_string: String representation of the AVRO schema.
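
Aside: most of the signature churn in this file swaps Generator[SchemaField, None, None] for Iterable[SchemaField]. The two are interchangeable for generator functions that are only iterated over (nothing is ever sent into them and their return value is unused), which is the case here; a minimal illustration:

    from typing import Generator, Iterable

    def fields_old() -> Generator[int, None, None]:
        yield 1
        yield 2

    def fields_new() -> Iterable[int]:  # the broader annotation this PR adopts
        yield 1
        yield 2

    assert list(fields_old()) == list(fields_new()) == [1, 2]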

@@ -3,7 +3,7 @@ import json
 import logging
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import Any, Dict, Iterable, List, Optional, Type
+from typing import Any, Dict, Iterable, List, Optional, Type, cast

 import avro.schema
 import confluent_kafka
@@ -316,13 +316,20 @@ class KafkaSource(StatefulIngestionSourceBase):
             avro_schema = avro.schema.parse(
                 schema_metadata.platformSchema.documentSchema
             )
-            description = avro_schema.doc
+            description = getattr(avro_schema, "doc", None)
             # set the tags
             all_tags: List[str] = []
-            for tag in avro_schema.other_props.get(
-                self.source_config.schema_tags_field, []
-            ):
-                all_tags.append(self.source_config.tag_prefix + tag)
+            try:
+                schema_tags = cast(
+                    Iterable[str],
+                    avro_schema.other_props.get(
+                        self.source_config.schema_tags_field, []
+                    ),
+                )
+                for tag in schema_tags:
+                    all_tags.append(self.source_config.tag_prefix + tag)
+            except TypeError:
+                pass

             if self.source_config.enable_meta_mapping:
                 meta_aspects = self.meta_processor.process(avro_schema.other_props)
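
Aside: the getattr and try/except guards above exist because avro.schema.parse() may hand back a schema object that carries no doc attribute, and the configured tag property, when present at all, is not guaranteed to be an iterable of strings. A minimal sketch of the benign case, with a made-up tag property name:

    import avro.schema

    # A registry entry that is a bare primitive rather than a record.
    parsed = avro.schema.parse('"string"')

    print(getattr(parsed, "doc", None))        # None when the schema carries no doc
    tags = parsed.other_props.get("tags", [])  # "tags" is a hypothetical property name
    for tag in tags:                           # defaults to an empty list when absent
        print(tag)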

@@ -4,7 +4,7 @@ import operator
 import re
 import time
 from functools import reduce
-from typing import Any, Dict, List, Match, Optional, Union, cast
+from typing import Any, Dict, List, Mapping, Match, Optional, Union, cast

 from datahub.emitter import mce_builder
 from datahub.emitter.mce_builder import OwnerType
@@ -111,7 +111,7 @@ class OperationProcessor:
         self.owner_source_type = owner_source_type
         self.match_nested_props = match_nested_props

-    def process(self, raw_props: Dict[str, Any]) -> Dict[str, Any]:
+    def process(self, raw_props: Mapping[str, Any]) -> Dict[str, Any]:
         # Defining the following local variables -
         # operations_map - the final resulting map when operations are processed.
         # Against each operation the values to be applied are stored.

@@ -1,14 +1,14 @@
 import tempfile
 from typing import List, Type

+import avro.schema
 import pandas as pd
 import ujson
 from avro import schema as avro_schema
 from avro.datafile import DataFileWriter
 from avro.io import DatumWriter

-from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet
+from datahub.ingestion.source.schema_inference import csv_tsv, json, parquet
+from datahub.ingestion.source.schema_inference.avro import AvroInferrer
 from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     BooleanTypeClass,
     NumberTypeClass,
@@ -123,7 +123,7 @@ def test_infer_schema_avro():
     file.seek(0)

-    fields = avro.AvroInferrer().infer_schema(file)
+    fields = AvroInferrer().infer_schema(file)
     fields.sort(key=lambda x: x.fieldPath)

     assert_field_paths_match(fields, expected_field_paths_avro)