From 570ee078a4b5a6b82905133aa6517b828d6325e3 Mon Sep 17 00:00:00 2001 From: jordan-homan <90481160+jordan-homan@users.noreply.github.com> Date: Mon, 19 May 2025 14:24:44 -0400 Subject: [PATCH] fix: throw validation error when json is passed with invalid unstructured json (#4002) ### Notes Adds validation that raises an error when non-empty `json` / `ndjson` input yields no valid Unstructured elements. ### Testing Manually tested serverless API with example json: ``` test_length = [] = 200 test_invalid = [{"invalid": "schema"}] = 422 test_invalid_ndjson = {"hi": "there"} = 422 test_chunk = [{"type":"Header","element_id":"a23fdadef9277f217563e217ebd074d5" ... = 200 ``` --- CHANGELOG.md | 1 + test_unstructured/partition/test_json.py | 11 +++++++++++ test_unstructured/partition/test_ndjson.py | 16 ++++++++++++++-- unstructured/partition/json.py | 5 +++++ unstructured/partition/ndjson.py | 5 +++++ 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20a4bcaf7..4169fa951 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ ### Fixes - **Fix image extraction for PNG files.** When `extract_image_block_to_payload` is True, and the image is a PNG, we get a Pillow error. We need to remove the PNG transparency layer before saving the image. 
+- **Throw validation error when json is passed with invalid unstructured json.** `partition_json` and `partition_ndjson` now raise a `ValueError` when the input contains JSON items but none of them match the Unstructured element schema. ## 0.17.6 diff --git a/test_unstructured/partition/test_json.py b/test_unstructured/partition/test_json.py index 7a591953d..388cbb23c 100644 --- a/test_unstructured/partition/test_json.py +++ b/test_unstructured/partition/test_json.py @@ -187,6 +187,11 @@ def test_partition_json_works_with_empty_string(): assert partition_json(text="") == [] +def test_partition_json_fails_with_empty_item(): + with pytest.raises(ValueError): + partition_json(text="{}") + + def test_partition_json_works_with_empty_list(): assert partition_json(text="[]") == [] @@ -288,6 +293,12 @@ def test_partition_json_from_text_prefers_metadata_last_modified(): # ------------------------------------------------------------------------------------------------ +def test_partition_json_raises_with_unprocessable_json_array(): + text = '[{"invalid": "schema"}]' + with pytest.raises(ValueError): + partition_json(text=text) + + def test_partition_json_raises_with_unprocessable_json(): # NOTE(robinson) - This is unprocessable because it is not a list of dicts, # per the Unstructured ISD format diff --git a/test_unstructured/partition/test_ndjson.py b/test_unstructured/partition/test_ndjson.py index c86ce1c8e..3ac5aca98 100644 --- a/test_unstructured/partition/test_ndjson.py +++ b/test_unstructured/partition/test_ndjson.py @@ -189,8 +189,14 @@ def test_partition_ndjson_works_with_empty_string(): assert partition_ndjson(text="") == [] -def test_partition_ndjson_works_with_empty_list(): - assert partition_ndjson(text="{}") == [] +def test_partition_ndjson_fails_with_empty_item(): + with pytest.raises(ValueError): + partition_ndjson(text="{}") + + +def test_partition_ndjson_fails_with_empty_list(): + with pytest.raises(ValueError): + partition_ndjson(text="[]") def test_partition_ndjson_raises_with_too_many_specified(): @@ -293,6 +299,12 @@ def test_partition_ndjson_from_text_prefers_metadata_last_modified(): # 
------------------------------------------------------------------------------------------------ +def test_partition_json_raises_with_unprocessable_json(): + text = '{"invalid": "schema"}' + with pytest.raises(ValueError): + partition_ndjson(text=text) + + def test_partition_json_raises_with_invalid_json(): text = '[{"hi": "there"}]]' with pytest.raises(ValueError): diff --git a/unstructured/partition/json.py b/unstructured/partition/json.py index 4a900de38..40654487d 100644 --- a/unstructured/partition/json.py +++ b/unstructured/partition/json.py @@ -74,6 +74,11 @@ def partition_json( try: element_dicts = json.loads(file_text) elements = elements_from_dicts(element_dicts) + # json items were found but none are unstructured elements: raise (API maps this to a 422) + if len(element_dicts) > 0 and len(elements) == 0: + raise ValueError( + "JSON cannot be partitioned. Schema does not match the Unstructured schema.", + ) except json.JSONDecodeError: raise ValueError("Not a valid json") diff --git a/unstructured/partition/ndjson.py b/unstructured/partition/ndjson.py index 925e71e95..2f4d22343 100644 --- a/unstructured/partition/ndjson.py +++ b/unstructured/partition/ndjson.py @@ -75,6 +75,11 @@ def partition_ndjson( try: element_dicts = ndjson_loads(file_text) elements = elements_from_dicts(element_dicts) + # json items were found but none are unstructured elements: raise (API maps this to a 422) + if len(element_dicts) > 0 and len(elements) == 0: + raise ValueError( + "JSON cannot be partitioned. Schema does not match the Unstructured schema.", + ) except json.JSONDecodeError: raise ValueError("Not a valid ndjson")