diff --git a/datahub-web-react/src/app/entity/dataset/profile/schema/utils/constants.ts b/datahub-web-react/src/app/entity/dataset/profile/schema/utils/constants.ts index 23dfb6ae6d..ae842ab955 100644 --- a/datahub-web-react/src/app/entity/dataset/profile/schema/utils/constants.ts +++ b/datahub-web-react/src/app/entity/dataset/profile/schema/utils/constants.ts @@ -1,2 +1,4 @@ export const KEY_SCHEMA_PREFIX = '[key=True].'; export const VERSION_PREFIX = '[version=2.0].'; +export const ARRAY_TOKEN = '[type=array]'; +export const UNION_TOKEN = '[type=union]'; diff --git a/datahub-web-react/src/app/entity/dataset/profile/schema/utils/translateFieldPathSegment.tsx b/datahub-web-react/src/app/entity/dataset/profile/schema/utils/translateFieldPathSegment.tsx index 88043c0fb9..7153a38a32 100644 --- a/datahub-web-react/src/app/entity/dataset/profile/schema/utils/translateFieldPathSegment.tsx +++ b/datahub-web-react/src/app/entity/dataset/profile/schema/utils/translateFieldPathSegment.tsx @@ -1,5 +1,4 @@ -const ARRAY_TOKEN = '[type=array]'; -const UNION_TOKEN = '[type=union]'; +import { ARRAY_TOKEN, UNION_TOKEN } from './constants'; export default function translateFieldPathSegment(fieldPathSegment, i, fieldPathParts) { // for each segment, convert its fieldPath representation into a human readable version diff --git a/datahub-web-react/src/app/entity/dataset/profile/schema/utils/utils.ts b/datahub-web-react/src/app/entity/dataset/profile/schema/utils/utils.ts index e012ce06cd..f217e2049d 100644 --- a/datahub-web-react/src/app/entity/dataset/profile/schema/utils/utils.ts +++ b/datahub-web-react/src/app/entity/dataset/profile/schema/utils/utils.ts @@ -9,7 +9,7 @@ import { } from '../../../../../../types.generated'; import { convertTagsForUpdate } from '../../../../../shared/tags/utils/convertTagsForUpdate'; import { SchemaDiffSummary } from '../components/SchemaVersionSummary'; -import { KEY_SCHEMA_PREFIX, VERSION_PREFIX } from './constants'; +import { KEY_SCHEMA_PREFIX, UNION_TOKEN, VERSION_PREFIX } from './constants'; import { ExtendedSchemaFields } from './types'; export function convertEditableSchemaMeta( @@ -85,9 +85,24 @@ export function groupByFieldPath( const row = { children: undefined, ...rows[rowIndex] }; for (let j = rowIndex - 1; j >= 0; j--) { - if (row.fieldPath.indexOf(rows[j].fieldPath) >= 0) { - parentRow = outputRowByPath[rows[j].fieldPath]; - break; + const rowTokens = row.fieldPath.split('.'); + const isQualifyingUnionField = rowTokens[rowTokens.length - 3] === UNION_TOKEN; + if (isQualifyingUnionField) { + // in the case of unions, parent will not be a subset of the child + rowTokens.splice(rowTokens.length - 2, 1); + const parentPath = rowTokens.join('.'); + console.log({ parentPath }); + + if (rows[j].fieldPath === parentPath) { + parentRow = outputRowByPath[rows[j].fieldPath]; + break; + } + } else if (!isQualifyingUnionField) { + // in the case of structs, arrays, etc, parent will be a subset + if (row.fieldPath.indexOf(rows[j].fieldPath) >= 0) { + parentRow = outputRowByPath[rows[j].fieldPath]; + break; + } } } diff --git a/docs/advanced/field-path-spec-v2.md b/docs/advanced/field-path-spec-v2.md index 63ee883c97..3154996b96 100644 --- a/docs/advanced/field-path-spec-v2.md +++ b/docs/advanced/field-path-spec-v2.md @@ -14,7 +14,6 @@ usage stats and data profiles. Therefore, it must satisfy the following requirem * must be unique across all fields within a schema. * make schema navigation in the UI more intuitive. -* allow for capturing and editing documentation of intermediate structs/records leading to a filed, and not just fields. * allow for identifying the type of schema the field is part of, such as a `key-schema` or a `value-schema`. * allow for future-evolution @@ -49,7 +48,8 @@ the record type `A` or `B`. The syntax for V2 encoding of the `fieldPath` is captured in the following grammar. The `FieldPathSpec` is essentially the type annotated path of the member, with each token along the path representing one level of nested member, starting from the most-enclosing type, leading up to the member. In the case of `unions` that have `one-of` semantics, -the corresponding field will be emitted once for each `member` of the union as its `type`. +the corresponding field will be emitted once for each `member` of the union as its `type`, along with one path +corresponding to the `union` itself. ### Formal Spec: @@ -58,9 +58,8 @@ the corresponding field will be emitted once for each `member` of the union as i | . // when part of a value schema := [version=] // [version=2.0] for v2 := [key=True] // when part of a key schema - := + // this is the type prefixed path upto the intermediate type or a field. - := | // intermeidate type, or a field - := . // type of field + field name + := + // this is the type prefixed path field (nested if repeats). + := . // type prefixed path of a field. := . | := [type=] := [type=] @@ -68,16 +67,12 @@ the corresponding field will be emitted once for each `member` of the union as i := int | float | double | string | fixed | enum ``` -For the [example above](#example-ambiguous-field-path), this encoding would produce the following 5 unique field paths. -Notice that there are 3 unique paths corresponding to the `union`, record `A`, and record `B` intermediate types, and 2 -unique paths corresponding to the `A.f` and `B.f` fields. +For the [example above](#example-ambiguous-field-path), this encoding would produce the following 2 unique paths +corresponding to the `A.f` and `B.f` fields. ```python unique_v2_field_paths = [ - "[version=2.0].[type=union]", - "[version=2.0].[type=union].[type=A]", "[version=2.0].[type=union].[type=A].[type=string].f", - "[version=2.0].[type=union].[type=B]", "[version=2.0].[type=union].[type=B].[type=string].f" ] ``` @@ -86,9 +81,8 @@ NOTE: - this encoding always ensures uniqueness within a schema since the full type annotation leading to a field is encoded in the fieldPath itself. -- field paths that end with an intermediate type allow us to capture details at record/struct level as well, in addition - to the fields themselves. -- processing a fieldPath, such as from UI, gets simplified simply by walking each token along the path from left-to-right. +- processing a fieldPath, such as from UI, gets simplified simply by walking each token along the path from + left-to-right. - adding PartOfKeySchemaToken allows for identifying if the field is part of key-schema. - adding VersionToken allows for future evolvability. - to represent `optional` fields, which sometimes are modeled as `unions` in formats like `Avro`, instead of treating it @@ -133,7 +127,6 @@ avro_schema = """ """ unique_v2_field_paths = [ - "[version=2.0].[type=E]", # this is the record E itself that we can maintain documentation for! "[version=2.0].[type=E].[type=string].a", "[version=2.0].[type=E].[type=string].b", ] @@ -160,8 +153,6 @@ avro_schema = """ """ unique_v2_field_paths = [ - "[version=2.0].[key=True].[type=SimpleNested]", - "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd]", "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd", "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd.[type=string].aStringField", ] @@ -189,11 +180,8 @@ avro_schema = """ """ unique_v2_field_paths = [ - "[version=2.0].[type=Recursive]", - "[version=2.0].[type=Recursive].[type=R]", "[version=2.0].[type=Recursive].[type=R].r", "[version=2.0].[type=Recursive].[type=R].r.[type=int].anIntegerField", - "[version=2.0].[type=Recursive].[type=R].r.[type=R]", "[version=2.0].[type=Recursive].[type=R].r.[type=R].aRecursiveField" ] ``` @@ -216,10 +204,7 @@ avro_schema =""" } """ unique_v2_field_paths = [ - "[version=2.0].[type=TreeNode]", "[version=2.0].[type=TreeNode].[type=long].value", - "[version=2.0].[type=TreeNode].[type=array]", - "[version=2.0].[type=TreeNode].[type=array].[type=TreeNode]", "[version=2.0].[type=TreeNode].[type=array].[type=TreeNode].children", ] ``` @@ -246,14 +231,11 @@ avro_schema = """ } """ unique_v2_field_paths: List[str] = [ - "[version=2.0].[key=True].[type=ABFooUnion]", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union]", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A]", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B]", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f", + "[version=2.0].[key=True].[type=ABUnion].[type=union].a", + "[version=2.0].[key=True].[type=ABUnion].[type=union].[type=A].a", + "[version=2.0].[key=True].[type=ABUnion].[type=union].[type=A].a.[type=string].f", + "[version=2.0].[key=True].[type=ABUnion].[type=union].[type=B].a", + "[version=2.0].[key=True].[type=ABUnion].[type=union].[type=B].a.[type=string].f", ] ``` ### Arrays @@ -286,10 +268,6 @@ avro_schema = """ } """ unique_v2_field_paths: List[str] = [ - "[version=2.0].[type=NestedArray]", - "[version=2.0].[type=NestedArray].[type=array]", - "[version=2.0].[type=NestedArray].[type=array].[type=array]", - "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo]", "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar", "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar.[type=long].a", ] @@ -313,8 +291,6 @@ avro_schema = """ } """ unique_v2_field_paths = [ - "[version=2.0].[type=R]", - "[version=2.0].[type=R].[type=map]", "[version=2.0].[type=R].[type=map].[type=long].a_map_of_longs_field", ] @@ -357,25 +333,18 @@ avro_schema = """ """ unique_v2_field_paths: List[str] = [ - "[version=2.0].[type=ABFooUnion]", - "[version=2.0].[type=ABFooUnion].[type=union]", - "[version=2.0].[type=ABFooUnion].[type=union].[type=A]", + "[version=2.0].[type=ABFooUnion].[type=union].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f", - "[version=2.0].[type=ABFooUnion].[type=union].[type=B]", "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array]", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array]", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo]", "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=long].f", ] ``` For more examples, see -the [unit-tests for AvroToMceSchemaConverter](https://github.com/linkedin/datahub/blob/master/metadata-ingestion/tests/unit/test_schema_util.py) -. +the [unit-tests for AvroToMceSchemaConverter](https://github.com/linkedin/datahub/blob/master/metadata-ingestion/tests/unit/test_schema_util.py). ### Backward-compatibility diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py index c1aa7e6d1c..4873358293 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py @@ -129,25 +129,42 @@ class AvroToMceSchemaConverter: return ".".join(self._prefix_name_stack) @staticmethod - def _get_type_annotation(schema: ExtendedAvroNestedSchemas) -> str: - if isinstance(schema, avro.schema.RecordSchema): - return f"[type={schema.name}]" - - # For fields, just return the plain name. - if isinstance(schema, avro.schema.Field): - return f"{schema.name}" + def _get_simple_native_type(schema: ExtendedAvroNestedSchemas) -> str: + if isinstance(schema, (avro.schema.RecordSchema, avro.schema.Field)): + # For Records, fields, always return the name. + return schema.name # For optional, use the underlying non-null type if isinstance(schema, avro.schema.UnionSchema) and len(schema.schemas) == 2: # Optional types as unions in AVRO. Return underlying non-null sub-type. (first, second) = schema.schemas if first.type == avro.schema.NULL: - return f"[type={second.type}]" + return second.type elif second.type == avro.schema.NULL: - return f"[type={first.type}]" + return first.type # For everything else, use the schema's type - return f"[type={schema.type}]" + return schema.type + + @staticmethod + def _get_type_annotation(schema: ExtendedAvroNestedSchemas) -> str: + simple_native_type = AvroToMceSchemaConverter._get_simple_native_type(schema) + if isinstance(schema, avro.schema.Field): + return simple_native_type + else: + return f"[type={simple_native_type}]" + + @staticmethod + def _get_underlying_type_if_option_as_union( + schema: AvroNestedSchemas, default: Optional[AvroNestedSchemas] = None + ) -> AvroNestedSchemas: + if isinstance(schema, avro.schema.UnionSchema) and len(schema.schemas) == 2: + (first, second) = schema.schemas + if first.type == avro.schema.NULL: + return second + elif second.type == avro.schema.NULL: + return first + return default class SchemaFieldEmissionContextManager: """Context Manager for MCE SchemaFiled emission @@ -178,32 +195,16 @@ class AvroToMceSchemaConverter: avro.schema.ArraySchema, avro.schema.Field, avro.schema.MapSchema, - avro.schema.UnionSchema, avro.schema.RecordSchema, ), ) and self._converter._fields_stack ): - # We are in the context of a non-nested(simple) field. - last_field_schema = self._converter._fields_stack[-1] - - description = ( - last_field_schema.doc - if last_field_schema.doc - else "No description available." - ) - - if last_field_schema.has_default: - description = f"{description}\nField default value: {last_field_schema.default}" - - with AvroToMceSchemaConverter.SchemaFieldEmissionContextManager( - last_field_schema, - last_field_schema, - self._converter, - description, - ) as field_emitter: - yield from field_emitter.emit() + # We are in the context of a non-nested(simple) field or the special-cased union. + yield from self._converter._gen_from_last_field() else: + # Just emit the SchemaField from schema provided in the Ctor. + schema = self._schema actual_schema = self._actual_schema if isinstance(schema, avro.schema.Field): @@ -214,15 +215,24 @@ class AvroToMceSchemaConverter: schema, schema ) ) - # Emit the schema field provided in the Ctor. + description = self._description if description is None: description = schema.props.get("doc", None) + native_data_type = self._converter._prefix_name_stack[-1] + if isinstance(schema, (avro.schema.Field, avro.schema.UnionSchema)): + native_data_type = self._converter._prefix_name_stack[-2] + type_prefix = "[type=" + if native_data_type.startswith(type_prefix): + native_data_type = native_data_type[ + slice(len(type_prefix), len(native_data_type) - 1) + ] + field = SchemaField( fieldPath=self._converter._get_cur_field_path(), - # Not populating this field for Avro, since this is blowing up the KAFKA message size. - nativeDataType="", + # Populate it with the simple native type for now. + nativeDataType=native_data_type, type=self._converter._get_column_type(actual_schema.type), description=description, recursive=False, @@ -234,18 +244,6 @@ class AvroToMceSchemaConverter: def __exit__(self, exc_type, exc_val, exc_tb): self._converter._prefix_name_stack.pop() - @staticmethod - def _get_underlying_type_if_option_as_union( - schema: AvroNestedSchemas, default: Optional[AvroNestedSchemas] = None - ) -> AvroNestedSchemas: - if isinstance(schema, avro.schema.UnionSchema) and len(schema.schemas) == 2: - (first, second) = schema.schemas - if first.type == avro.schema.NULL: - return second - elif second.type == avro.schema.NULL: - return first - return default - def _get_sub_schemas( self, schema: ExtendedAvroNestedSchemas ) -> Generator[avro.schema.Schema, None, None]: @@ -290,13 +288,41 @@ class AvroToMceSchemaConverter: ) -> Generator[SchemaField, None, None]: """Handles generation of MCE SchemaFields for an AVRO Field type.""" # NOTE: Here we only manage the field stack and trigger MCE Field generation from this field's type. - # The actual emitting of a field happens - # (a) when another nested record is encountered or (b) a non-nested type has been reached. + # The actual emitting of a field happens when + # (a) another nested record is encountered or + # (b) a non-nested type has been reached or + # (c) during the special-casing for unions. self._fields_stack.append(field) for sub_schema in self._get_sub_schemas(field): yield from self._to_mce_fields(sub_schema) self._fields_stack.pop() + def _gen_from_last_field( + self, schema_to_recurse: Optional[AvroNestedSchemas] = None + ) -> Generator[SchemaField, None, None]: + """Emits the field most-recent field, optionally triggering sub-schema generation under the field.""" + last_field_schema = self._fields_stack[-1] + # Generate the custom-description for the field. + description = ( + last_field_schema.doc + if last_field_schema.doc + else "No description available." + ) + if last_field_schema.has_default: + description = ( + f"{description}\nField default value: {last_field_schema.default}" + ) + + with AvroToMceSchemaConverter.SchemaFieldEmissionContextManager( + last_field_schema, last_field_schema, self, description + ) as f_emit: + yield from f_emit.emit() + + if schema_to_recurse is not None: + # Generate the nested sub-schemas under the most-recent field. + for sub_schema in self._get_sub_schemas(schema_to_recurse): + yield from self._to_mce_fields(sub_schema) + def _gen_from_non_field_nested_schemas( self, schema: AvroNestedSchemas ) -> Generator[SchemaField, None, None]: @@ -315,30 +341,24 @@ class AvroToMceSchemaConverter: with AvroToMceSchemaConverter.SchemaFieldEmissionContextManager( schema, actual_schema, self ) as fe_schema: - # Emit non-AVRO field complex schemas(even optional unions that become primitives). - yield from fe_schema.emit() + if isinstance( + actual_schema, + ( + avro.schema.UnionSchema, + avro.schema.PrimitiveSchema, + avro.schema.FixedSchema, + avro.schema.EnumSchema, + ), + ): + # Emit non-AVRO field complex schemas(even optional unions that become primitives) and special-casing for extra union emission. + yield from fe_schema.emit() if ( isinstance(actual_schema, avro.schema.RecordSchema) and self._fields_stack ): # We have encountered a nested record, emit the most-recently seen field. - last_field_schema = self._fields_stack[-1] - description = ( - last_field_schema.doc - if last_field_schema.doc - else "No description available." - ) - if last_field_schema.has_default: - description = f"{description}\nField default value: {last_field_schema.default}" - with AvroToMceSchemaConverter.SchemaFieldEmissionContextManager( - last_field_schema, last_field_schema, self, description - ) as f_emit: - yield from f_emit.emit() - # Generate the nested sub-schemas under the most-recent field. - if recurse: - for sub_schema in self._get_sub_schemas(actual_schema): - yield from self._to_mce_fields(sub_schema) + yield from self._gen_from_last_field(actual_schema if recurse else None) else: # We are not yet in the context of any field. Generate all nested sub-schemas under the complex type. if recurse: diff --git a/metadata-ingestion/tests/unit/test_schema_util.py b/metadata-ingestion/tests/unit/test_schema_util.py index 882c96914d..971bbfa65b 100644 --- a/metadata-ingestion/tests/unit/test_schema_util.py +++ b/metadata-ingestion/tests/unit/test_schema_util.py @@ -98,8 +98,8 @@ def assret_field_paths_match( ) def test_avro_schema_to_mce_fields_events_with_nullable_fields(schema): fields = avro_schema_to_mce_fields(schema) - assert 2 == len(fields) - assert fields[1].nullable + assert 1 == len(fields) + assert fields[0].nullable def test_avro_schema_to_mce_fields_sample_events_with_different_field_types(): @@ -121,8 +121,6 @@ def test_avro_schema_to_mce_fields_sample_events_with_different_field_types(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ - "[version=2.0].[type=R]", - "[version=2.0].[type=R].[type=map]", "[version=2.0].[type=R].[type=map].[type=long].a_map_of_longs_field", ] assret_field_paths_match(fields, expected_field_paths) @@ -150,7 +148,6 @@ def test_avro_schema_to_mce_fields_record_with_two_fields(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ - "[version=2.0].[type=name]", "[version=2.0].[type=name].[type=string].a", "[version=2.0].[type=name].[type=string].b", ] @@ -186,11 +183,10 @@ def test_avro_schema_to_mce_fields_with_default(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ - "[version=2.0].[type=name]", "[version=2.0].[type=name].[type=string].aStringField", ] assret_field_paths_match(fields, expected_field_paths) - description = fields[1].description + description = fields[0].description assert description and "custom, default value" in description @@ -213,10 +209,7 @@ def test_avro_schema_with_recursion(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ - "[version=2.0].[type=TreeNode]", "[version=2.0].[type=TreeNode].[type=long].value", - "[version=2.0].[type=TreeNode].[type=array]", - "[version=2.0].[type=TreeNode].[type=array].[type=TreeNode]", "[version=2.0].[type=TreeNode].[type=array].[type=TreeNode].children", ] assret_field_paths_match(fields, expected_field_paths) @@ -263,11 +256,9 @@ def test_avro_sample_payment_schema_to_mce_fields_with_nesting(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ - "[version=2.0].[type=Payment]", "[version=2.0].[type=Payment].[type=string].id", "[version=2.0].[type=Payment].[type=double].amount", "[version=2.0].[type=Payment].[type=string].name", - "[version=2.0].[type=Payment].[type=PhoneNumber]", "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber", "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].areaCode", "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].countryCode", @@ -302,13 +293,10 @@ def test_avro_schema_to_mce_fields_with_nesting_across_records(): fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ "[version=2.0].[type=union]", - "[version=2.0].[type=union].[type=Address]", "[version=2.0].[type=union].[type=Address].[type=string].streetAddress", "[version=2.0].[type=union].[type=Address].[type=string].city", - "[version=2.0].[type=union].[type=Person]", "[version=2.0].[type=union].[type=Person].[type=string].firstname", "[version=2.0].[type=union].[type=Person].[type=string].lastname", - "[version=2.0].[type=union].[type=Person].[type=Address]", "[version=2.0].[type=union].[type=Person].[type=Address].address", ] assret_field_paths_match(fields, expected_field_paths) @@ -328,7 +316,7 @@ def test_simple_record_with_primitive_types(): "name": "enumField", "type": { "type": "enum", - "name": "EnumField", + "name": "MyTestEnumField", "symbols": [ "TEST", "TEST1" ], "symbolDoc": { "TEST": "test enum", @@ -341,7 +329,6 @@ def test_simple_record_with_primitive_types(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ - "[version=2.0].[type=Simple]", "[version=2.0].[type=Simple].[type=string].stringField", "[version=2.0].[type=Simple].[type=boolean].booleanField", "[version=2.0].[type=Simple].[type=int].intField", @@ -371,8 +358,6 @@ def test_simple_nested_record_with_a_string_field_for_key_schema(): """ fields = avro_schema_to_mce_fields(schema, True) expected_field_paths: List[str] = [ - "[version=2.0].[key=True].[type=SimpleNested]", - "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd]", "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd", "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd.[type=string].aStringField", ] @@ -407,15 +392,17 @@ def test_union_with_nested_record_of_union(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ - "[version=2.0].[type=UnionSample]", - "[version=2.0].[type=UnionSample].[type=union]", + "[version=2.0].[type=UnionSample].[type=union].aUnion", "[version=2.0].[type=UnionSample].[type=union].[type=boolean].aUnion", - "[version=2.0].[type=UnionSample].[type=union].[type=Rcd]", "[version=2.0].[type=UnionSample].[type=union].[type=Rcd].aUnion", "[version=2.0].[type=UnionSample].[type=union].[type=Rcd].aUnion.[type=string].aNullableStringField", ] assret_field_paths_match(fields, expected_field_paths) - assert isinstance(fields[5].type.type, StringTypeClass) + assert isinstance(fields[3].type.type, StringTypeClass) + assert fields[0].nativeDataType == "union" + assert fields[1].nativeDataType == "boolean" + assert fields[2].nativeDataType == "Rcd" + assert fields[3].nativeDataType == "string" def test_nested_arrays(): @@ -448,10 +435,6 @@ def test_nested_arrays(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths: List[str] = [ - "[version=2.0].[type=NestedArray]", - "[version=2.0].[type=NestedArray].[type=array]", - "[version=2.0].[type=NestedArray].[type=array].[type=array]", - "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo]", "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar", "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar.[type=long].a", ] @@ -485,13 +468,10 @@ def test_map_of_union_of_int_and_record_of_union(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ - "[version=2.0].[type=MapSample]", - "[version=2.0].[type=MapSample].[type=map]", - "[version=2.0].[type=MapSample].[type=map].[type=union]", + "[version=2.0].[type=MapSample].[type=map].[type=union].aMap", "[version=2.0].[type=MapSample].[type=map].[type=union].[type=int].aMap", - "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd]", "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap", - "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union]", + "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].aUnion", "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].[type=string].aUnion", "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].[type=int].aUnion", ] @@ -519,11 +499,8 @@ def test_recursive_avro(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths = [ - "[version=2.0].[type=Recursive]", - "[version=2.0].[type=Recursive].[type=R]", "[version=2.0].[type=Recursive].[type=R].r", "[version=2.0].[type=Recursive].[type=R].r.[type=int].anIntegerField", - "[version=2.0].[type=Recursive].[type=R].r.[type=R]", "[version=2.0].[type=Recursive].[type=R].r.[type=R].aRecursiveField", ] assret_field_paths_match(fields, expected_field_paths) @@ -564,17 +541,11 @@ def test_needs_disambiguation_nested_union_of_records_with_same_field_name(): """ fields = avro_schema_to_mce_fields(schema) expected_field_paths: List[str] = [ - "[version=2.0].[type=ABFooUnion]", - "[version=2.0].[type=ABFooUnion].[type=union]", - "[version=2.0].[type=ABFooUnion].[type=union].[type=A]", + "[version=2.0].[type=ABFooUnion].[type=union].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f", - "[version=2.0].[type=ABFooUnion].[type=union].[type=B]", "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array]", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array]", - "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo]", "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a", "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=long].f", ] @@ -637,17 +608,11 @@ def test_key_schema_handling(): """ fields: List[SchemaField] = avro_schema_to_mce_fields(schema, is_key_schema=True) expected_field_paths: List[str] = [ - "[version=2.0].[key=True].[type=ABFooUnion]", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union]", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A]", + "[version=2.0].[key=True].[type=ABFooUnion].[type=union].a", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B]", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array]", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array]", - "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo]", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a", "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=long].f", ]