datahub/metadata-ingestion/tests/unit/schema/test_json_schema_util.py
2025-02-28 17:49:52 +05:30

890 lines
33 KiB
Python

import json
import logging
import os
import re
from pathlib import Path
from typing import Dict, Iterable, List, Union
import pytest
from datahub.ingestion.extractor.json_schema_util import JsonSchemaTranslator
from datahub.ingestion.run.pipeline import Pipeline
from datahub.metadata.com.linkedin.pegasus2avro.schema import (
NumberTypeClass,
SchemaField,
StringTypeClass,
)
from datahub.metadata.schema_classes import (
ArrayTypeClass,
MapTypeClass,
RecordTypeClass,
UnionTypeClass,
)
# Module-level logger; the helpers in this file use it for debug/info output.
logger = logging.getLogger(__name__)

# JSON schema for a record with a single property whose "type" is the list
# ["string", "null"]. It is the parametrized input of the nullable-fields test.
SCHEMA_WITH_OPTIONAL_FIELD_VIA_UNION_TYPE = {
    "type": "object",
    "title": "TestRecord",
    "properties": {
        "my.field": {
            "type": ["string", "null"],
            "description": "some.doc",
        }
    },
}
def log_field_paths(fields: Iterable[SchemaField]) -> None:
    """Emit one debug log record listing every field path, each quoted on its own line."""
    quoted_paths = '",\n"'.join(field.fieldPath for field in fields)
    logger.debug('FieldPaths=\n"' + quoted_paths + '"')
def assert_field_paths_are_unique(fields: Iterable[SchemaField]) -> None:
    """Assert that no two considered field paths are equal.

    Only paths matched by the pattern below (at least one character, the last
    of which is not ``]``) take part in the comparison; other paths are ignored.
    """
    ends_without_bracket = re.compile(".*[^]]$")
    considered = [
        field.fieldPath
        for field in fields
        if ends_without_bracket.match(field.fieldPath)
    ]
    if not considered:
        return
    assert len(considered) == len(set(considered))
def assert_fields_are_valid(fields: Iterable[SchemaField]) -> None:
    """Call ``validate()`` on every field and assert its ``nativeDataType`` is a ``str``."""
    for schema_field in fields:
        schema_field.validate()
        native_type = schema_field.nativeDataType
        assert isinstance(native_type, str)
def assert_field_paths_match(
    fields: Iterable[SchemaField], expected_field_paths: Union[List[str], List[Dict]]
) -> None:
    """Assert that ``fields`` match ``expected_field_paths`` position by position.

    Args:
        fields: The schema fields to check. Any iterable is accepted, including
            a one-shot iterator or generator.
        expected_field_paths: Either plain expected ``fieldPath`` strings, or
            dicts with a ``"path"`` key (expected ``fieldPath``) and a ``"type"``
            key (class that ``field.type.type`` must be an instance of).

    Raises:
        AssertionError: If the counts differ, a path or type does not match, or
            the uniqueness check on the field paths fails.
    """
    # The fields are traversed several times below (logging, counting, pairwise
    # comparison, uniqueness). Materialize them once so that a generator is not
    # exhausted by the first pass, which would make the later checks vacuous.
    field_list = list(fields)
    log_field_paths(field_list)
    assert len(field_list) == len(expected_field_paths)
    for field, expected in zip(field_list, expected_field_paths):
        if isinstance(expected, dict):
            assert field.fieldPath == expected["path"]
            assert isinstance(field.type.type, expected["type"])
        else:
            assert field.fieldPath == expected
    assert_field_paths_are_unique(field_list)
def json_schema_to_schema_fields(schema):
    """Return the fields ``JsonSchemaTranslator`` yields for ``schema`` as a list."""
    translated = JsonSchemaTranslator.get_fields_from_schema(schema)
    return list(translated)
@pytest.mark.parametrize(
    "schema",
    [SCHEMA_WITH_OPTIONAL_FIELD_VIA_UNION_TYPE],
    ids=["optional_field_via_union_type"],
)
def test_json_schema_to_events_with_nullable_fields(schema):
    """Each parametrized schema must translate to exactly one field flagged nullable."""
    schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(schema))
    assert len(schema_fields) == 1
    only_field = schema_fields[0]
    assert only_field.nullable
def test_json_schema_to_mce_fields_sample_events_with_different_field_types():
    """An object property with integer ``additionalProperties`` yields one map-typed field."""
    map_schema = {
        "type": "object",
        "title": "R",
        "namespace": "some.namespace",
        "properties": {
            "a_map_of_longs_field": {
                "type": "object",
                "additionalProperties": {"type": "integer"},
            }
        },
    }
    schema_fields = json_schema_to_schema_fields(map_schema)
    expected = [
        {
            "path": "[version=2.0].[type=R].[type=map].[type=integer].a_map_of_longs_field",
            "type": MapTypeClass,
        }
    ]
    assert_field_paths_match(schema_fields, expected)
    assert_fields_are_valid(schema_fields)
def test_json_schema_to_record_with_two_fields():
    """A record with two string properties yields two string fields, in declaration order."""
    record_schema = {
        "type": "object",
        "title": "some_event_name",
        "namespace": "not.relevant.namespace",
        "properties": {
            "a": {"type": "string", "description": "some.doc"},
            "b": {"type": "string", "description": "some.doc"},
        },
    }
    schema_fields = list(JsonSchemaTranslator.get_fields_from_schema(record_schema))
    expected = [
        {
            "path": "[version=2.0].[type=some_event_name].[type=string].a",
            "type": StringTypeClass,
        },
        {
            "path": "[version=2.0].[type=some_event_name].[type=string].b",
            "type": StringTypeClass,
        },
    ]
    assert_field_paths_match(schema_fields, expected)
    assert_fields_are_valid(schema_fields)
def test_json_schema_to_mce_fields_toplevel_isnt_a_record():
    """A bare ``{"type": "string"}`` schema yields a single string field."""
    primitive_schema = {"type": "string"}
    schema_fields = json_schema_to_schema_fields(primitive_schema)
    expected = [{"path": "[version=2.0].[type=string]", "type": StringTypeClass}]
    assert_field_paths_match(schema_fields, expected)
    assert_fields_are_valid(schema_fields)
def test_json_schema_with_recursion():
    """A schema whose array items ``$ref`` the root (``"#"``) yields a finite field list."""
    tree_schema = {
        "type": "object",
        "title": "TreeNode",
        "properties": {
            "value": {"type": "integer"},
            "children": {"type": "array", "items": {"$ref": "#"}},
        },
    }
    schema_fields = json_schema_to_schema_fields(tree_schema)
    expected = [
        {
            "path": "[version=2.0].[type=TreeNode].[type=integer].value",
            "type": NumberTypeClass,
        },
        {
            "path": "[version=2.0].[type=TreeNode].[type=array].children",
            "type": ArrayTypeClass,
        },
        {
            "path": "[version=2.0].[type=TreeNode].[type=array].children.[type=TreeNode].TreeNode",
            "type": RecordTypeClass,
        },
    ]
    assert_field_paths_match(schema_fields, expected)
    assert_fields_are_valid(schema_fields)
def test_json_sample_payment_schema_to_schema_fields_with_nesting():
    """Nested records are flattened into paths, and descriptions include declared defaults."""
    payment_schema = {
        "type": "object",
        "title": "Payment",
        "namespace": "some.event.namespace",
        "properties": {
            "id": {"type": "string"},
            "amount": {"type": "number", "description": "amountDoc"},
            "name": {"type": "string", "default": ""},
            "phoneNumber": {
                "type": "object",
                "title": "PhoneNumber",
                "description": "testDoc",
                "properties": {
                    "areaCode": {
                        "type": "string",
                        "description": "areaCodeDoc",
                        "default": "",
                    },
                    "countryCode": {"type": "string", "default": ""},
                    "prefix": {"type": "string", "default": ""},
                    "number": {"type": "string", "default": ""},
                },
                "default": "null",
            },
            "address": {
                "type": "object",
                "title": "Address",
                "properties": {"street": {"type": "string", "default": ""}},
                "description": "addressDoc",
                "default": "null",
            },
        },
    }
    schema_fields = json_schema_to_schema_fields(payment_schema)
    expected_paths = [
        "[version=2.0].[type=Payment].[type=string].id",
        "[version=2.0].[type=Payment].[type=number].amount",
        "[version=2.0].[type=Payment].[type=string].name",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].areaCode",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].countryCode",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].prefix",
        "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].number",
        "[version=2.0].[type=Payment].[type=Address].address",
        "[version=2.0].[type=Payment].[type=Address].address.[type=string].street",
    ]
    assert_field_paths_match(schema_fields, expected_paths)

    # Field index -> expected description; fields with a "default" carry it
    # appended after the documented description.
    expected_descriptions = {
        1: "amountDoc",
        3: "testDoc\nField default value: null",
        4: "areaCodeDoc\nField default value: ",
        8: "addressDoc\nField default value: null",
    }
    for index, description in expected_descriptions.items():
        assert schema_fields[index].description == description
    assert_fields_are_valid(schema_fields)
def test_json_schema_to_schema_fields_with_nesting_across_records():
    """A top-level ``oneOf`` of two records, one referencing the other via ``$ref``, is expanded per branch."""
    union_schema = {
        "definitions": {
            "Address": {
                "type": "object",
                "title": "Address",
                "properties": {
                    "streetAddress": {"type": "string"},
                    "city": {"type": "string"},
                },
            }
        },
        "oneOf": [
            {"$ref": "#/definitions/Address"},
            {
                "type": "object",
                "title": "Person",
                "properties": {
                    "firstname": {"type": "string"},
                    "lastname": {"type": "string"},
                    "address": {"$ref": "#/definitions/Address"},
                },
            },
        ],
    }
    schema_fields = json_schema_to_schema_fields(union_schema)
    expected_paths = [
        "[version=2.0].[type=union].[type=Address].[type=string].streetAddress",
        "[version=2.0].[type=union].[type=Address].[type=string].city",
        "[version=2.0].[type=union].[type=Person].[type=string].firstname",
        "[version=2.0].[type=union].[type=Person].[type=string].lastname",
        "[version=2.0].[type=union].[type=Person].[type=Address].address",
        "[version=2.0].[type=union].[type=Person].[type=Address].address.[type=string].streetAddress",
        "[version=2.0].[type=union].[type=Person].[type=Address].address.[type=string].city",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
def test_simple_record_with_primitive_types():
    """String, boolean, integer and enum properties each yield one field with the matching type tag."""
    simple_schema = {
        "type": "object",
        "title": "Simple",
        "namespace": "com.linkedin",
        "properties": {
            "stringField": {"type": "string", "description": "string field"},
            "booleanField": {"type": "boolean"},
            "intField": {"type": "integer"},
            "enumField": {"title": "MyTestEnumField", "enum": ["TEST", "TEST1"]},
        },
    }
    schema_fields = json_schema_to_schema_fields(simple_schema)
    expected_paths = [
        "[version=2.0].[type=Simple].[type=string].stringField",
        "[version=2.0].[type=Simple].[type=boolean].booleanField",
        "[version=2.0].[type=Simple].[type=integer].intField",
        "[version=2.0].[type=Simple].[type=enum].enumField",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
def test_simple_nested_record_with_a_string_field_for_key_schema():
    """With ``is_key_schema=True`` the paths of a nested record carry a ``[key=True]`` segment."""
    nested_schema = {
        "type": "object",
        "title": "SimpleNested",
        "namespace": "com.linkedin",
        "properties": {
            "nestedRcd": {
                "type": "object",
                "title": "InnerRcd",
                "properties": {"aStringField": {"type": "string"}},
            }
        },
    }
    translated = JsonSchemaTranslator.get_fields_from_schema(
        nested_schema, is_key_schema=True
    )
    schema_fields = list(translated)
    expected_paths: List[str] = [
        "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd",
        "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd.[type=string].aStringField",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
def test_union_with_nested_record_of_union():
    """A ``oneOf`` property yields the union field, one field per branch, and the record branch's children."""
    union_schema = {
        "type": "object",
        "title": "UnionSample",
        "namespace": "com.linkedin",
        "properties": {
            "aUnion": {
                "oneOf": [
                    {"type": "boolean"},
                    {
                        "type": "object",
                        "title": "Rcd",
                        "properties": {"aNullableStringField": {"type": "string"}},
                    },
                ]
            }
        },
    }
    schema_fields = json_schema_to_schema_fields(union_schema)
    expected = [
        {
            "path": "[version=2.0].[type=UnionSample].[type=union].aUnion",
            "type": UnionTypeClass,
        },
        {
            "path": "[version=2.0].[type=UnionSample].[type=union].[type=boolean].aUnion",
            "type": UnionTypeClass,
        },
        {
            "path": "[version=2.0].[type=UnionSample].[type=union].[type=Rcd].aUnion",
            "type": UnionTypeClass,
        },
        {
            "path": "[version=2.0].[type=UnionSample].[type=union].[type=Rcd].aUnion.[type=string].aNullableStringField",
            "type": StringTypeClass,
        },
    ]
    assert_field_paths_match(schema_fields, expected)
    assert isinstance(schema_fields[3].type.type, StringTypeClass)

    # Native data types are checked in field order.
    expected_native_types = ["union(oneOf)", "boolean", "Rcd", "string"]
    for schema_field, native_type in zip(schema_fields, expected_native_types):
        assert schema_field.nativeDataType == native_type
def test_nested_arrays():
    """An array of arrays of records yields one field per nesting level plus the record's property."""
    nested_array_schema = {
        "type": "object",
        "title": "NestedArray",
        "namespace": "com.linkedin",
        "properties": {
            "ar": {
                "type": "array",
                "items": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "title": "Foo",
                        "properties": {"a": {"type": "integer"}},
                    },
                },
            }
        },
    }
    schema_fields = json_schema_to_schema_fields(nested_array_schema)
    expected_paths: List[str] = [
        "[version=2.0].[type=NestedArray].[type=array].ar",
        "[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array",
        "[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array.[type=Foo].Foo",
        "[version=2.0].[type=NestedArray].[type=array].ar.[type=array].array.[type=Foo].Foo.[type=integer].a",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
    outer_array = schema_fields[0]
    assert isinstance(outer_array.type.type, ArrayTypeClass)
def test_map_of_union_of_int_and_record_of_union():
    """A map whose values are a ``oneOf`` of integer and a record (itself holding a union) is fully expanded."""
    map_schema = {
        "type": "object",
        "title": "MapSample",
        "namespace": "com.linkedin",
        "properties": {
            "aMap": {
                "type": "object",
                "additionalProperties": {
                    "oneOf": [
                        {"type": "integer"},
                        {
                            "type": "object",
                            "title": "Rcd",
                            "properties": {
                                "aUnion": {
                                    "oneOf": [{"type": "string"}, {"type": "integer"}]
                                }
                            },
                        },
                    ]
                },
            }
        },
    }
    schema_fields = json_schema_to_schema_fields(map_schema)
    expected = [
        {
            "path": "[version=2.0].[type=MapSample].[type=map].[type=union].aMap",
            "type": MapTypeClass,
        },
        {
            "path": "[version=2.0].[type=MapSample].[type=map].[type=union].[type=integer].aMap",
            "type": MapTypeClass,
        },
        {
            "path": "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap",
            "type": MapTypeClass,
        },
        {
            "path": "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].aUnion",
            "type": UnionTypeClass,
        },
        {
            "path": "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].[type=string].aUnion",
            "type": UnionTypeClass,
        },
        {
            "path": "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].[type=integer].aUnion",
            "type": UnionTypeClass,
        },
    ]
    assert_field_paths_match(schema_fields, expected)
def test_recursive_json():
    """A property that ``$ref``s its own enclosing record is emitted once and flagged recursive."""
    schema = {
        "type": "object",
        "title": "Recursive",
        "namespace": "com.linkedin",
        "properties": {
            "r": {
                "type": "object",
                "title": "R",
                "properties": {
                    "anIntegerField": {"type": "integer"},
                    "aRecursiveField": {"$ref": "#/properties/r"},
                },
            }
        },
    }
    fields = list(JsonSchemaTranslator.get_fields_from_schema(schema))
    expected_field_paths = [
        "[version=2.0].[type=Recursive].[type=R].r",
        "[version=2.0].[type=Recursive].[type=R].r.[type=integer].anIntegerField",
        "[version=2.0].[type=Recursive].[type=R].r.[type=R].aRecursiveField",
    ]
    # Validate the count and paths first, so a regression in the number of
    # emitted fields fails with a meaningful assertion (and logged paths)
    # instead of an IndexError on fields[2] below.
    assert_field_paths_match(fields, expected_field_paths)

    recursive_field = fields[2]
    assert recursive_field.recursive
    assert isinstance(recursive_field.type.type, RecordTypeClass)
    assert recursive_field.nativeDataType == "R"
def test_needs_disambiguation_nested_union_of_records_with_same_field_name():
    """Union branches that each declare a property ``f`` get distinct, type-qualified paths."""
    union_schema = {
        "type": "object",
        "title": "ABFooUnion",
        "namespace": "com.linkedin",
        "properties": {
            "a": {
                "oneOf": [
                    {
                        "type": "object",
                        "title": "A",
                        "properties": {"f": {"type": "string"}},
                    },
                    {
                        "type": "object",
                        "title": "B",
                        "properties": {"f": {"type": "string"}},
                    },
                    {
                        "type": "array",
                        "items": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "title": "Foo",
                                "properties": {"f": {"type": "integer"}},
                            },
                        },
                    },
                ]
            }
        },
    }
    schema_fields = json_schema_to_schema_fields(union_schema)
    expected_paths: List[str] = [
        "[version=2.0].[type=ABFooUnion].[type=union].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo.[type=integer].f",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
def test_datahub_json_schemas_parses_okay(tmp_path):
    """Integration-style check that exercises parsing complexity and catches unexpected regressions.

    Runs a ``json-schema`` source over the generated JSON schema directory,
    located relative to this file, with a file sink writing into ``tmp_path``.
    The test fails if the pipeline status reports a failure.
    """
    test_dir = Path(os.path.dirname(__file__))
    schema_dir: Path = test_dir / Path(
        "../../../../metadata-models/src/generatedJsonSchema/json/"
    )
    output_file = f"{tmp_path}/json_schema_test.json"

    source_config = {
        "type": "json-schema",
        "config": {
            "path": str(schema_dir),
            "platform": "schemaregistry",
        },
    }
    sink_config = {
        "type": "file",
        "config": {
            "filename": output_file,
        },
    }
    pipeline = Pipeline.create(
        config_dict={"source": source_config, "sink": sink_config}
    )
    pipeline.run()
    pipeline.raise_from_status()
    logger.info(f"Wrote file to {output_file}")
def test_key_schema_handling():
    """With ``is_key_schema=True`` every path carries ``[key=True]`` and every field is part of the key."""
    union_schema = {
        "type": "object",
        "title": "ABFooUnion",
        "properties": {
            "a": {
                "oneOf": [
                    {
                        "type": "object",
                        "title": "A",
                        "properties": {"f": {"type": "string"}},
                    },
                    {
                        "type": "object",
                        "title": "B",
                        "properties": {"f": {"type": "string"}},
                    },
                    {
                        "type": "array",
                        "items": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "title": "Foo",
                                "properties": {"f": {"type": "number"}},
                            },
                        },
                    },
                ]
            }
        },
    }
    translated = JsonSchemaTranslator.get_fields_from_schema(
        union_schema, is_key_schema=True
    )
    schema_fields: List[SchemaField] = list(translated)
    expected_paths: List[str] = [
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].a.[type=array].array.[type=Foo].Foo.[type=number].f",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
    assert all(schema_field.isPartOfKey for schema_field in schema_fields)
def test_ignore_exceptions():
    """A malformed schema must translate to no fields at all, without raising."""
    malformed_schema = {
        "name": "event_ts",
        "type": "long",
        "logicalType": "timestamp-millis",
        "tags": ["business-timestamp"],
    }
    translated = JsonSchemaTranslator.get_fields_from_schema(malformed_schema)
    schema_fields: List[SchemaField] = list(translated)
    assert not schema_fields
# Record schema in which every property's "type" is written as a list, either
# a single-element list such as ["integer"] or a pair with "null".
# Consumed by test_array_handling.
SCHEMA_WITH_ARRAY_TYPE = {
    "title": "Administrative-Unit",
    "type": "object",
    "properties": {
        # Identity, validity window and hierarchy.
        "Identifier": {"type": ["integer"]},
        "ValidFrom": {"format": "date", "type": ["string"]},
        "ValidTo": {"format": "date", "type": ["string", "null"]},
        "Level": {"minimum": 1, "maximum": 3, "type": ["integer"]},
        "Parent": {"type": ["integer", "null"]},
        # Names.
        "Name_en": {"type": ["string", "null"]},
        "Name_fr": {"type": ["string", "null"]},
        "Name_de": {"type": ["string", "null"]},
        "Name_it": {"type": ["string", "null"]},
        # Abbreviations.
        "ABBREV_1_Text_en": {"type": ["string", "null"]},
        "ABBREV_1_Text_fr": {"type": ["string", "null"]},
        "ABBREV_1_Text_de": {"type": ["string", "null"]},
        "ABBREV_1_Text_it": {"type": ["string", "null"]},
        "ABBREV_1_Text": {"type": ["string", "null"]},
        # Codes.
        "CODE_OFS_1_Text_en": {"type": ["integer", "null"]},
        "CODE_OFS_1_Text_fr": {"type": ["integer", "null"]},
        "CODE_OFS_1_Text_de": {"type": ["integer", "null"]},
        "CODE_OFS_1_Text_it": {"type": ["integer", "null"]},
        "CODE_OFS_1_Text": {"type": ["integer", "null"]},
    },
}
def test_array_handling():
    """Properties whose ``type`` is given as a list translate to one field each, tagged with the non-null type."""
    schema_fields = json_schema_to_schema_fields(SCHEMA_WITH_ARRAY_TYPE)
    expected_paths: List[str] = [
        "[version=2.0].[type=Administrative-Unit].[type=integer].Identifier",
        "[version=2.0].[type=Administrative-Unit].[type=string(date)].ValidFrom",
        "[version=2.0].[type=Administrative-Unit].[type=string(date)].ValidTo",
        "[version=2.0].[type=Administrative-Unit].[type=integer].Level",
        "[version=2.0].[type=Administrative-Unit].[type=integer].Parent",
        "[version=2.0].[type=Administrative-Unit].[type=string].Name_en",
        "[version=2.0].[type=Administrative-Unit].[type=string].Name_fr",
        "[version=2.0].[type=Administrative-Unit].[type=string].Name_de",
        "[version=2.0].[type=Administrative-Unit].[type=string].Name_it",
        "[version=2.0].[type=Administrative-Unit].[type=string].ABBREV_1_Text_en",
        "[version=2.0].[type=Administrative-Unit].[type=string].ABBREV_1_Text_fr",
        "[version=2.0].[type=Administrative-Unit].[type=string].ABBREV_1_Text_de",
        "[version=2.0].[type=Administrative-Unit].[type=string].ABBREV_1_Text_it",
        "[version=2.0].[type=Administrative-Unit].[type=string].ABBREV_1_Text",
        "[version=2.0].[type=Administrative-Unit].[type=integer].CODE_OFS_1_Text_en",
        "[version=2.0].[type=Administrative-Unit].[type=integer].CODE_OFS_1_Text_fr",
        "[version=2.0].[type=Administrative-Unit].[type=integer].CODE_OFS_1_Text_de",
        "[version=2.0].[type=Administrative-Unit].[type=integer].CODE_OFS_1_Text_it",
        "[version=2.0].[type=Administrative-Unit].[type=integer].CODE_OFS_1_Text",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
    assert_fields_are_valid(schema_fields)
def test_simple_array():
    """An array of strings yields the array field followed by a field for its item type."""
    array_schema = {
        "type": "object",
        "title": "ObjectWithArray",
        "namespace": "com.linkedin",
        "properties": {"ar": {"type": "array", "items": {"type": "string"}}},
    }
    schema_fields = json_schema_to_schema_fields(array_schema)
    expected_paths: List[str] = [
        "[version=2.0].[type=ObjectWithArray].[type=array].ar",
        "[version=2.0].[type=ObjectWithArray].[type=array].ar.[type=string].string",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
    array_field = schema_fields[0]
    assert isinstance(array_field.type.type, ArrayTypeClass)
def test_simple_object():
    """A property declared only as ``{"type": "object"}`` yields a single record-typed field."""
    object_schema = {
        "type": "object",
        "title": "Object With Object",
        "namespace": "io.datahubproject",
        "properties": {"inner": {"type": "object"}},
    }
    schema_fields = json_schema_to_schema_fields(object_schema)
    expected_paths: List[str] = [
        "[version=2.0].[type=Object With Object].[type=object].inner",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
    inner_field = schema_fields[0]
    assert isinstance(inner_field.type.type, RecordTypeClass)
def test_required_field():
    """``nullable`` follows the declared type, while ``required`` in ``jsonProps`` follows the ``required`` list."""

    def required_flag(schema_field):
        # jsonProps may be empty/None; fall back to an empty JSON object.
        return json.loads(schema_field.jsonProps or "{}")["required"]

    required_schema = {
        "$id": "test",
        "$schema": "http://json-schema.org/draft-06/schema#",
        "properties": {
            "a_str": {
                "description": "Example String",
                "type": "string",
            },
            "b_str": {"type": ["string", "null"]},
        },
        "required": ["b_str"],
    }
    schema_fields = json_schema_to_schema_fields(required_schema)
    expected_paths: List[str] = [
        "[version=2.0].[type=object].[type=string].a_str",
        "[version=2.0].[type=object].[type=string].b_str",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
    assert_fields_are_valid(schema_fields)

    a_str, b_str = schema_fields[0], schema_fields[1]
    assert a_str.nullable is False
    assert b_str.nullable is True
    assert required_flag(b_str) is True
    assert required_flag(a_str) is False
def test_non_str_enums():
    """An enum mixing a string, a number and ``None`` is described with JSON-rendered values."""
    enum_schema = {
        "$id": "test",
        "$schema": "http://json-schema.org/draft-06/schema#",
        "properties": {"bar": {"description": "Mixed enum", "enum": ["baz", 1, None]}},
    }
    schema_fields = json_schema_to_schema_fields(enum_schema)
    expected_paths: List[str] = ["[version=2.0].[type=object].[type=enum].bar"]
    assert_field_paths_match(schema_fields, expected_paths)
    enum_field = schema_fields[0]
    assert enum_field.description == 'One of: "baz", 1, null'
def test_const_description_pulled_correctly():
    """A property with a ``const`` and no description gets a description stating the const value."""
    const_schema = {
        "$id": "test",
        "$schema": "http://json-schema.org/draft-07/schema#",
        "properties": {"bar": {"type": "string", "const": "not_defined"}},
    }
    schema_fields = json_schema_to_schema_fields(const_schema)
    expected_paths: List[str] = ["[version=2.0].[type=object].[type=string].bar"]
    assert_field_paths_match(schema_fields, expected_paths)
    const_field = schema_fields[0]
    assert const_field.description == "Const value: not_defined"
def test_anyof_with_properties():
    """A top-level ``anyOf`` of ``required`` clauses repeats all properties per branch with per-branch ``required`` flags."""

    def required_flag(schema_field):
        # jsonProps may be empty/None; fall back to an empty JSON object.
        return json.loads(schema_field.jsonProps or "{}")["required"]

    # We expect the event / timestamp fields to be included in both branches of the anyOf.
    anyof_schema = {
        "$id": "test",
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "additionalProperties": False,
        "anyOf": [{"required": ["anonymousId"]}, {"required": ["userId"]}],
        "properties": {
            "anonymousId": {
                "description": "A temporary user id, used before a user logs in.",
                "format": "uuid",
                "type": "string",
            },
            "userId": {
                "description": "Unique user id.",
                "type": "string",
            },
            "event": {"description": "Unique name of the event.", "type": "string"},
            "timestamp": {
                "description": "Timestamp of when the message itself took place.",
                "type": "string",
            },
        },
        "required": ["event"],
        "type": "object",
    }
    schema_fields = json_schema_to_schema_fields(anyof_schema)
    expected_paths: List[str] = [
        "[version=2.0].[type=union].[type=union_0].[type=string(uuid)].anonymousId",
        "[version=2.0].[type=union].[type=union_0].[type=string].userId",
        "[version=2.0].[type=union].[type=union_0].[type=string].event",
        "[version=2.0].[type=union].[type=union_0].[type=string].timestamp",
        "[version=2.0].[type=union].[type=union_1].[type=string(uuid)].anonymousId",
        "[version=2.0].[type=union].[type=union_1].[type=string].userId",
        "[version=2.0].[type=union].[type=union_1].[type=string].event",
        "[version=2.0].[type=union].[type=union_1].[type=string].timestamp",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
    assert_fields_are_valid(schema_fields)

    # In the first one, the anonymousId is required, but the userId is not.
    assert required_flag(schema_fields[0]) is True
    assert required_flag(schema_fields[1]) is False

    # In the second one, the userId is required, but the anonymousId is not.
    assert required_flag(schema_fields[4]) is False
    assert required_flag(schema_fields[5]) is True

    # The event field is required in both branches.
    assert required_flag(schema_fields[2]) is True
    assert required_flag(schema_fields[6]) is True

    # The timestamp field is not required in either branch.
    assert required_flag(schema_fields[3]) is False
    assert required_flag(schema_fields[7]) is False
def test_top_level_trival_allof():
    """A single-entry top-level ``allOf`` is merged with the top-level properties into one flat record."""

    def required_flag(schema_field):
        # jsonProps may be empty/None; fall back to an empty JSON object.
        return json.loads(schema_field.jsonProps or "{}")["required"]

    wrapper_schema = {
        "$schema": "https://json-schema.org/draft/2020-12/schema",
        "$id": "event-wrapper",
        "type": "object",
        "allOf": [
            {
                "$schema": "https://json-schema.org/draft/2020-12/schema",
                "$id": "event",
                "properties": {
                    "userId": {
                        "description": "Unique user id.",
                        "type": "string",
                    },
                    "event": {
                        "description": "Unique name of the event.",
                        "type": "string",
                    },
                    "timestamp": {
                        "description": "Timestamp of when the message itself took place.",
                        "type": "string",
                    },
                },
                "required": ["event"],
                "type": "object",
                "additionalProperties": False,
            },
        ],
        "properties": {
            "extra-top-level-property": {
                "type": "string",
            },
        },
    }
    schema_fields = json_schema_to_schema_fields(wrapper_schema)
    expected_paths: List[str] = [
        "[version=2.0].[type=object].[type=string].extra-top-level-property",
        "[version=2.0].[type=object].[type=string].userId",
        "[version=2.0].[type=object].[type=string].event",
        "[version=2.0].[type=object].[type=string].timestamp",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
    assert_fields_are_valid(schema_fields)

    # Only "event" is listed under "required" in the schema above.
    assert required_flag(schema_fields[0]) is False
    assert required_flag(schema_fields[1]) is False
    assert required_flag(schema_fields[2]) is True
    assert required_flag(schema_fields[3]) is False
def test_description_extraction():
    """The ``description`` declared on an array property is carried onto the array field."""
    described_schema = {
        "$id": "test",
        "$schema": "http://json-schema.org/draft-07/schema#",
        "properties": {
            "bar": {
                "type": "array",
                "items": {"type": "string"},
                "description": "XYZ",
            }
        },
    }
    schema_fields = json_schema_to_schema_fields(described_schema)
    expected_paths: List[str] = [
        "[version=2.0].[type=object].[type=array].bar",
        "[version=2.0].[type=object].[type=array].bar.[type=string].string",
    ]
    assert_field_paths_match(schema_fields, expected_paths)
    assert_fields_are_valid(schema_fields)

    # Additional check for the description extraction
    array_path = "[version=2.0].[type=object].[type=array].bar"
    matching_fields = (
        candidate for candidate in schema_fields if candidate.fieldPath == array_path
    )
    array_field = next(matching_fields)
    assert array_field.description == "XYZ"