nested objects inside array dtype in schema

This commit is contained in:
harshsoni2024 2025-06-26 16:33:14 +05:30
parent c77f6d5db6
commit ef755100f7
2 changed files with 242 additions and 8 deletions

View File

@ -300,38 +300,79 @@ class RestSource(ApiServiceSource):
logger.warning(f"Error while parsing response schema: {err}")
return None
def process_schema_fields(self, schema_ref: str) -> Optional[List[FieldModel]]:
def process_schema_fields(
self, schema_ref: str, parent_refs: Optional[List[str]] = None
) -> Optional[List[FieldModel]]:
try:
if parent_refs is None:
parent_refs = []
schema_name = schema_ref.split("/")[-1]
schema_fields = (
self.json_response.get("components").get("schemas").get(schema_name)
)
parent_refs.append(schema_ref)
fetched_fields = []
for key, val in schema_fields.get("properties", {}).items():
dtype = val.get("type")
if dtype:
dtype = "INT" if dtype.upper() == "INTEGER" else dtype
parsed_dtype = (
DataTypeTopic[dtype.upper()]
if dtype.upper() in DataTypeTopic.__members__
else DataTypeTopic.UNKNOWN
)
fetched_fields.append(FieldModel(name=key, dataType=parsed_dtype))
children = None
if parsed_dtype.value == DataTypeTopic.ARRAY.value:
# If field of array type then parse children
children_ref = val.get("items", {}).get("$ref")
if children_ref:
# check infinite recursion by checking pre-processed schemas(parent_refs)
if children_ref not in parent_refs:
logger.debug(
f"Processing array fields inside schema: {children_ref}"
)
children = self.process_schema_fields(
children_ref, parent_refs
)
logger.debug(
f"Completed processing array fields inside schema: {children_ref}"
)
else:
logger.debug(
f"Skipping array fields inside schema: {children_ref} to avoid infinite recursion"
)
fetched_fields.append(
FieldModel(name=key, dataType=parsed_dtype, children=children)
)
else:
# If type of field is not defined then check for sub-schema
# Check if it's `object` type field
# check infinite recrusrion by comparing with parent(schema_ref)
object_children = None
if val.get("$ref") and val.get("$ref") != schema_ref:
object_children = self.process_schema_fields(val.get("$ref"))
children = None
if val.get("$ref"):
# check infinite recursion by checking pre-processed schemas(parent_refs)
if val.get("$ref") not in parent_refs:
children = self.process_schema_fields(
val.get("$ref"), parent_refs
)
else:
logger.debug(
f"Skipping object fields inside schema: {val.get('$ref')} to avoid infinite recursion"
)
fetched_fields.append(
FieldModel(
name=key,
dataType=DataTypeTopic.UNKNOWN,
children=object_children,
children=children,
)
)
if parent_refs and (schema_ref in parent_refs):
parent_refs.pop()
return fetched_fields
except Exception as err:
logger.warning(f"Error while processing schema fields: {err}")
if parent_refs and (schema_ref in parent_refs):
parent_refs.pop()
logger.debug(
f"Popping {schema_ref} from parent_refs due to processing error"
)
return None

View File

@ -35,6 +35,7 @@ from metadata.generated.schema.type.basic import (
FullyQualifiedEntityName,
Markdown,
)
from metadata.generated.schema.type.schema import DataTypeTopic
from metadata.ingestion.api.models import Either
from metadata.ingestion.source.api.rest.metadata import RestSource
from metadata.ingestion.source.api.rest.models import RESTCollection, RESTEndpoint
@ -142,6 +143,106 @@ MOCK_JSON_RESPONSE = {
],
}
# Mock data for testing process_schema_fields
MOCK_SCHEMA_RESPONSE_SIMPLE = {
"components": {
"schemas": {
"User": {
"type": "object",
"properties": {
"id": {"type": "integer"},
"name": {"type": "string"},
"email": {"type": "string"},
},
}
}
}
}
MOCK_SCHEMA_RESPONSE_WITH_ARRAY = {
"components": {
"schemas": {
"User": {
"type": "object",
"properties": {
"id": {"type": "integer"},
"name": {"type": "string"},
"tags": {
"type": "array",
"items": {"$ref": "#/components/schemas/Tag"},
},
},
},
"Tag": {
"type": "object",
"properties": {"id": {"type": "integer"}, "name": {"type": "string"}},
},
}
}
}
MOCK_SCHEMA_RESPONSE_WITH_OBJECT_REF = {
"components": {
"schemas": {
"User": {
"type": "object",
"properties": {
"id": {"type": "integer"},
"name": {"type": "string"},
"address": {"$ref": "#/components/schemas/Address"},
},
},
"Address": {
"type": "object",
"properties": {
"street": {"type": "string"},
"city": {"type": "string"},
},
},
}
}
}
MOCK_SCHEMA_RESPONSE_WITH_ARRAY_CIRCULAR = {
"components": {
"schemas": {
"User": {
"type": "object",
"properties": {
"id": {"type": "integer"},
"name": {"type": "string"},
"friends": {
"type": "array",
"items": {"$ref": "#/components/schemas/User"},
},
},
}
}
}
}
MOCK_SCHEMA_RESPONSE_WITH_OBJECT_REF_CIRCULAR = {
"components": {
"schemas": {
"User": {
"type": "object",
"properties": {
"id": {"type": "integer"},
"name": {"type": "string"},
"profile": {"$ref": "#/components/schemas/Profile"},
},
},
"Profile": {
"type": "object",
"properties": {
"bio": {"type": "string"},
"user": {"$ref": "#/components/schemas/User"},
},
},
}
}
}
class RESTTest(TestCase):
@patch("metadata.ingestion.source.api.api_service.ApiServiceSource.test_connection")
@ -244,3 +345,95 @@ class RESTTest(TestCase):
)
collections_invalid = list(rest_source_invalid.get_api_collections())
assert len(collections_invalid) == 0
def test_process_schema_fields_simple(self):
"""Test processing simple schema fields without references"""
self.rest_source.json_response = MOCK_SCHEMA_RESPONSE_SIMPLE
result = self.rest_source.process_schema_fields("#/components/schemas/User")
assert result is not None
assert len(result) == 3
# Check field names and types
field_names = {field.name.root for field in result}
assert field_names == {"id", "name", "email"}
# Check specific field types
id_field = next(field for field in result if field.name.root == "id")
assert id_field.dataType == DataTypeTopic.INT
assert id_field.children is None
def test_process_schema_fields_with_array(self):
"""Test processing schema fields with array references"""
self.rest_source.json_response = MOCK_SCHEMA_RESPONSE_WITH_ARRAY
result = self.rest_source.process_schema_fields("#/components/schemas/User")
assert result is not None
assert len(result) == 3
# Find the tags field (array type)
tags_field = next(field for field in result if field.name.root == "tags")
assert tags_field.dataType == DataTypeTopic.ARRAY
assert tags_field.children is not None
assert len(tags_field.children) == 2
# Check array children fields
child_names = {child.name.root for child in tags_field.children}
assert child_names == {"id", "name"}
def test_process_schema_fields_with_object_reference(self):
"""Test processing schema fields with object references"""
self.rest_source.json_response = MOCK_SCHEMA_RESPONSE_WITH_OBJECT_REF
result = self.rest_source.process_schema_fields("#/components/schemas/User")
assert result is not None
assert len(result) == 3
# Find the address field (object reference)
address_field = next(field for field in result if field.name.root == "address")
assert address_field.dataType == DataTypeTopic.UNKNOWN
assert address_field.children is not None
assert len(address_field.children) == 2
# Check object children fields
child_names = {child.name.root for child in address_field.children}
assert child_names == {"street", "city"}
def test_process_schema_fields_circular_reference_array(self):
"""Test processing schema fields with circular references in arrays"""
self.rest_source.json_response = MOCK_SCHEMA_RESPONSE_WITH_ARRAY_CIRCULAR
result = self.rest_source.process_schema_fields("#/components/schemas/User")
assert result is not None
assert len(result) == 3
# Find the friends field (array with circular reference)
friends_field = next(field for field in result if field.name.root == "friends")
assert friends_field.dataType == DataTypeTopic.ARRAY
# Should be None due to circular reference prevention
assert friends_field.children is None
def test_process_schema_fields_circular_reference_object(self):
"""Test processing schema fields with circular references in objects"""
self.rest_source.json_response = MOCK_SCHEMA_RESPONSE_WITH_OBJECT_REF_CIRCULAR
result = self.rest_source.process_schema_fields("#/components/schemas/User")
assert result is not None
assert len(result) == 3
# Find the profile field
profile_field = next(field for field in result if field.name.root == "profile")
assert profile_field.dataType == DataTypeTopic.UNKNOWN
assert profile_field.children is not None
assert len(profile_field.children) == 2
# Check that the circular reference is prevented
user_field_in_profile = next(
child for child in profile_field.children if child.name.root == "user"
)
# Should be None due to circular reference prevention
assert user_field_in_profile.children is None