diff --git a/ingestion/src/metadata/parsers/json_schema_parser.py b/ingestion/src/metadata/parsers/json_schema_parser.py index 412d3be013b..716fd237984 100644 --- a/ingestion/src/metadata/parsers/json_schema_parser.py +++ b/ingestion/src/metadata/parsers/json_schema_parser.py @@ -16,7 +16,7 @@ Utils module to parse the jsonschema import json import traceback from enum import Enum -from typing import List, Optional, Type +from typing import List, Optional, Tuple, Type from pydantic import BaseModel @@ -66,6 +66,49 @@ def parse_json_schema( return None +def get_child_models(key, value, field_models, cls: Type[BaseModel] = FieldModel): + """ + Method to parse the child objects in the json schema + """ + try: + cls_obj = cls( + name=key, + displayName=value.get("title"), + dataType=JsonSchemaDataTypes(value.get("type", "unknown")).name, + description=value.get("description"), + ) + children = None + if value.get("type") == JsonSchemaDataTypes.RECORD.value: + children = get_json_schema_fields(value.get("properties"), cls=cls) + if value.get("type") == JsonSchemaDataTypes.ARRAY.value: + datatype_display, children = get_json_schema_array_fields( + value.get("items"), cls=cls + ) + cls_obj.dataTypeDisplay = f"ARRAY<{datatype_display}>" + cls_obj.children = children + field_models.append(cls_obj) + except Exception as exc: # pylint: disable=broad-except + logger.debug(traceback.format_exc()) + logger.warning(f"Unable to parse the json schema into models: {exc}") + + +def get_json_schema_array_fields( + array_items, cls: Type[BaseModel] = FieldModel +) -> Optional[Tuple[str, List[FieldModel]]]: + """ + Recursively convert the parsed array schema into required models + """ + field_models = [] + if array_items.get("type") == JsonSchemaDataTypes.RECORD.value: + for key, value in array_items.get("properties", {}).items(): + get_child_models(key, value, field_models, cls) + + return ( + JsonSchemaDataTypes(array_items.get("type", "unknown")).name, + field_models or None, + ) + + def get_json_schema_fields( properties, cls: Type[BaseModel] = FieldModel ) -> Optional[List[FieldModel]]: @@ -74,20 +117,6 @@ def get_json_schema_fields( """ field_models = [] for key, value in properties.items(): - try: - field_models.append( - cls( - name=key, - displayName=value.get("title"), - dataType=JsonSchemaDataTypes(value.get("type", "unknown")).name, - description=value.get("description"), - children=get_json_schema_fields(value.get("properties"), cls=cls) - if value.get("type") == "object" - else None, - ) - ) - except Exception as exc: # pylint: disable=broad-except - logger.debug(traceback.format_exc()) - logger.warning(f"Unable to parse the json schema into models: {exc}") + get_child_models(key, value, field_models, cls) return field_models diff --git a/ingestion/tests/unit/test_json_schema_parser.py b/ingestion/tests/unit/test_json_schema_parser.py index fd1bb1d9ca8..7c5ce9acc40 100644 --- a/ingestion/tests/unit/test_json_schema_parser.py +++ b/ingestion/tests/unit/test_json_schema_parser.py @@ -91,8 +91,81 @@ class JsonSchemaParserTests(TestCase): } }""" + sample_array_json_schema = """ + { + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "firstName": { + "type": "string" + }, + "lastName": { + "type": "string" + }, + "age": { + "type": "integer" + }, + "address": { + "type": "object", + "properties": { + "streetAddress": { + "type": "string" + }, + "city": { + "type": "string" + }, + "state": { + "type": "string" + }, + "postalCode": { + "type": "string" + } + }, + "required": [ + "streetAddress", + "city", + "state", + "postalCode" + ] + }, + "phoneNumbers": { + "type": "array", + "items": { + "type": "object", + "properties": { + "type": { + "type": "string" + }, + "number": { + "type": "string" + } + }, + "required": [ + "type", + "number" + ] + } + }, + "hobbies": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "firstName", + "lastName", + "age", + "address", + "phoneNumbers" + ] + } + """ + parsed_schema = parse_json_schema(sample_json_schema) parsed_postgres_schema = parse_json_schema(sample_postgres_json_schema, Column) + parsed_array_schema = parse_json_schema(sample_array_json_schema) def test_schema_name(self): self.assertEqual(self.parsed_schema[0].name.root, "Person") @@ -143,3 +216,37 @@ class JsonSchemaParserTests(TestCase): ) self.assertEqual(len(self.parsed_postgres_schema[0].children), 3) self.assertEqual(len(self.parsed_postgres_schema[0].children[1].children), 4) + + def test_parse_postgres_json_fields(self): + self.assertEqual(self.parsed_array_schema[0].name.root, "default") + self.assertEqual(len(self.parsed_array_schema[0].children), 6) + + # Validate the complex array datatype + self.assertEqual( + self.parsed_array_schema[0].children[4].name.root, "phoneNumbers" + ) + self.assertEqual(self.parsed_array_schema[0].children[4].dataType.name, "ARRAY") + self.assertEqual( + self.parsed_array_schema[0].children[4].dataTypeDisplay, "ARRAY" + ) + self.assertEqual(len(self.parsed_array_schema[0].children[4].children), 2) + self.assertEqual( + self.parsed_array_schema[0].children[4].children[0].name.root, "type" + ) + self.assertEqual( + self.parsed_array_schema[0].children[4].children[0].dataType.name, "STRING" + ) + self.assertEqual( + self.parsed_array_schema[0].children[4].children[1].name.root, "number" + ) + self.assertEqual( + self.parsed_array_schema[0].children[4].children[1].dataType.name, "STRING" + ) + + # Validate the primitive array datatype + self.assertEqual(self.parsed_array_schema[0].children[5].name.root, "hobbies") + self.assertEqual(self.parsed_array_schema[0].children[5].dataType.name, "ARRAY") + self.assertEqual( + self.parsed_array_schema[0].children[5].dataTypeDisplay, "ARRAY" + ) + self.assertIsNone(self.parsed_array_schema[0].children[5].children)