mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-15 03:56:59 +00:00
fix(ingest): json serializable fix (#12246)
This commit is contained in:
parent
554a37a551
commit
ea249caee1
@ -146,12 +146,55 @@ class DatahubIngestionRunSummaryProvider(PipelineRunListener):
|
|||||||
aspect_value=source_info_aspect,
|
aspect_value=source_info_aspect,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _convert_sets_to_lists(obj: Any) -> Any:
|
||||||
|
"""
|
||||||
|
Recursively converts all sets to lists in a Python object.
|
||||||
|
Works with nested dictionaries, lists, and sets.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
obj: Any Python object that might contain sets
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The object with all sets converted to lists
|
||||||
|
"""
|
||||||
|
if isinstance(obj, dict):
|
||||||
|
return {
|
||||||
|
key: DatahubIngestionRunSummaryProvider._convert_sets_to_lists(value)
|
||||||
|
for key, value in obj.items()
|
||||||
|
}
|
||||||
|
elif isinstance(obj, list):
|
||||||
|
return [
|
||||||
|
DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
|
||||||
|
for element in obj
|
||||||
|
]
|
||||||
|
elif isinstance(obj, set):
|
||||||
|
return [
|
||||||
|
DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
|
||||||
|
for element in obj
|
||||||
|
]
|
||||||
|
elif isinstance(obj, tuple):
|
||||||
|
return tuple(
|
||||||
|
DatahubIngestionRunSummaryProvider._convert_sets_to_lists(element)
|
||||||
|
for element in obj
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return obj
|
||||||
|
|
||||||
def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
|
def _get_recipe_to_report(self, ctx: PipelineContext) -> str:
|
||||||
assert ctx.pipeline_config
|
assert ctx.pipeline_config
|
||||||
if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
|
if not self.report_recipe or not ctx.pipeline_config.get_raw_dict():
|
||||||
return ""
|
return ""
|
||||||
else:
|
else:
|
||||||
return json.dumps(redact_raw_config(ctx.pipeline_config.get_raw_dict()))
|
redacted_recipe = redact_raw_config(ctx.pipeline_config.get_raw_dict())
|
||||||
|
# This is required otherwise json dumps will fail
|
||||||
|
# with a TypeError: Object of type set is not JSON serializable
|
||||||
|
converted_recipe = (
|
||||||
|
DatahubIngestionRunSummaryProvider._convert_sets_to_lists(
|
||||||
|
redacted_recipe
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return json.dumps(converted_recipe)
|
||||||
|
|
||||||
def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
|
def _emit_aspect(self, entity_urn: Urn, aspect_value: _Aspect) -> None:
|
||||||
self.sink.write_record_async(
|
self.sink.write_record_async(
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
from typing import Any, Dict, List, Set, Tuple, Union
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from datahub.ingestion.reporting.datahub_ingestion_run_summary_provider import (
|
from datahub.ingestion.reporting.datahub_ingestion_run_summary_provider import (
|
||||||
@ -50,3 +52,136 @@ def test_default_config():
|
|||||||
typed_config = DatahubIngestionRunSummaryProviderConfig.parse_obj({})
|
typed_config = DatahubIngestionRunSummaryProviderConfig.parse_obj({})
|
||||||
assert typed_config.sink is None
|
assert typed_config.sink is None
|
||||||
assert typed_config.report_recipe is True
|
assert typed_config.report_recipe is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_simple_set() -> None:
|
||||||
|
"""Test conversion of a simple set"""
|
||||||
|
input_data: Set[int] = {1, 2, 3}
|
||||||
|
expected: List[int] = [1, 2, 3]
|
||||||
|
result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data)
|
||||||
|
assert sorted(result) == sorted(expected)
|
||||||
|
assert isinstance(result, list)
|
||||||
|
|
||||||
|
|
||||||
|
def test_nested_dict_with_sets() -> None:
|
||||||
|
"""Test conversion of nested dictionary containing sets"""
|
||||||
|
input_data: Dict[str, Union[Set[int], Dict[str, Set[str]]]] = {
|
||||||
|
"set1": {1, 2, 3},
|
||||||
|
"dict1": {"set2": {"a", "b"}},
|
||||||
|
}
|
||||||
|
expected = {
|
||||||
|
"set1": [1, 2, 3],
|
||||||
|
"dict1": {"set2": ["a", "b"]},
|
||||||
|
}
|
||||||
|
result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data)
|
||||||
|
|
||||||
|
def sort_nested_lists(d):
|
||||||
|
return {
|
||||||
|
k: (
|
||||||
|
sorted(v)
|
||||||
|
if isinstance(v, list)
|
||||||
|
else (sort_nested_lists(v) if isinstance(v, dict) else v)
|
||||||
|
)
|
||||||
|
for k, v in d.items()
|
||||||
|
}
|
||||||
|
|
||||||
|
assert sort_nested_lists(result) == sort_nested_lists(expected)
|
||||||
|
|
||||||
|
|
||||||
|
def test_nested_lists_with_sets() -> None:
|
||||||
|
"""Test conversion of nested lists containing sets"""
|
||||||
|
input_data = [{1, 2}, [{3, 4}, {5, 6}]]
|
||||||
|
expected = [[1, 2], [[3, 4], [5, 6]]]
|
||||||
|
result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data)
|
||||||
|
assert [
|
||||||
|
sorted(x)
|
||||||
|
if isinstance(x, list) and len(x) > 0 and not isinstance(x[0], list)
|
||||||
|
else x
|
||||||
|
for x in result
|
||||||
|
] == [
|
||||||
|
sorted(x)
|
||||||
|
if isinstance(x, list) and len(x) > 0 and not isinstance(x[0], list)
|
||||||
|
else x
|
||||||
|
for x in expected
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_tuple_with_sets() -> None:
|
||||||
|
"""Test conversion of tuples containing sets"""
|
||||||
|
input_data = (1, {2, 3}, 4)
|
||||||
|
expected = (1, [2, 3], 4)
|
||||||
|
result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data)
|
||||||
|
assert (result[0], sorted(result[1]), result[2]) == (
|
||||||
|
expected[0],
|
||||||
|
sorted(expected[1]),
|
||||||
|
expected[2],
|
||||||
|
)
|
||||||
|
assert isinstance(result, tuple)
|
||||||
|
|
||||||
|
|
||||||
|
def test_mixed_nested_structure() -> None:
|
||||||
|
"""Test conversion of a complex nested structure"""
|
||||||
|
input_data = {
|
||||||
|
"simple_set": {1, 2, 3},
|
||||||
|
"nested_dict": {
|
||||||
|
"another_set": {"a", "b", "c"},
|
||||||
|
"mixed_list": [1, {2, 3}, {"x", "y"}],
|
||||||
|
},
|
||||||
|
"tuple_with_set": (1, {4, 5}, 6),
|
||||||
|
"list_of_sets": [{1, 2}, {3, 4}],
|
||||||
|
}
|
||||||
|
result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data)
|
||||||
|
|
||||||
|
# Verify structure types
|
||||||
|
assert isinstance(result["simple_set"], list)
|
||||||
|
assert isinstance(result["nested_dict"]["another_set"], list)
|
||||||
|
assert isinstance(result["nested_dict"]["mixed_list"][1], list)
|
||||||
|
assert isinstance(result["nested_dict"]["mixed_list"][2], list)
|
||||||
|
assert isinstance(result["tuple_with_set"], tuple)
|
||||||
|
assert isinstance(result["tuple_with_set"][1], list)
|
||||||
|
assert isinstance(result["list_of_sets"][0], list)
|
||||||
|
|
||||||
|
|
||||||
|
def test_non_set_data() -> None:
|
||||||
|
"""Test that non-set data remains unchanged"""
|
||||||
|
input_data = {
|
||||||
|
"string": "hello",
|
||||||
|
"int": 42,
|
||||||
|
"float": 3.14,
|
||||||
|
"bool": True,
|
||||||
|
"none": None,
|
||||||
|
"list": [1, 2, 3],
|
||||||
|
"dict": {"a": 1, "b": 2},
|
||||||
|
}
|
||||||
|
result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data)
|
||||||
|
assert result == input_data
|
||||||
|
|
||||||
|
|
||||||
|
def test_empty_structures() -> None:
|
||||||
|
"""Test handling of empty structures"""
|
||||||
|
input_data: Dict[
|
||||||
|
str, Union[Set[Any], Dict[Any, Any], List[Any], Tuple[Any, ...]]
|
||||||
|
] = {"empty_set": set(), "empty_dict": {}, "empty_list": [], "empty_tuple": ()}
|
||||||
|
expected: Dict[
|
||||||
|
str, Union[List[Any], Dict[Any, Any], List[Any], Tuple[Any, ...]]
|
||||||
|
] = {"empty_set": [], "empty_dict": {}, "empty_list": [], "empty_tuple": ()}
|
||||||
|
result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data)
|
||||||
|
assert result == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_json_serializable() -> None:
|
||||||
|
"""Test that the converted structure is JSON serializable"""
|
||||||
|
import json
|
||||||
|
|
||||||
|
input_data = {
|
||||||
|
"set": {1, 2, 3},
|
||||||
|
"nested": {"set": {"a", "b"}},
|
||||||
|
"mixed": [1, {2, 3}, {"x"}],
|
||||||
|
}
|
||||||
|
result = DatahubIngestionRunSummaryProvider._convert_sets_to_lists(input_data)
|
||||||
|
try:
|
||||||
|
json.dumps(result)
|
||||||
|
serializable = True
|
||||||
|
except TypeError:
|
||||||
|
serializable = False
|
||||||
|
assert serializable
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user