fix: throw validation error when json is passed with invalid unstructured json (#4002)

### Notes
Adds validation that raises an error when `json` / `ndjson` input does not match the Unstructured element schema.

### Testing
Manually tested serverless API with example json:

```

test_length = [] = 200

test_invalid = [{"invalid": "schema"}] = 422
test_invalid_ndjson ={"hi": "there"} = 422

test_chunk = [{"type":"Header","element_id":"a23fdadef9277f217563e217ebd074d5" ... = 200

```
This commit is contained in:
jordan-homan 2025-05-19 14:24:44 -04:00 committed by GitHub
parent e3417d7e98
commit 570ee078a4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 36 additions and 2 deletions

View File

@ -6,6 +6,7 @@
### Fixes
- **Fix image extraction for PNG files.** When `extract_image_block_to_payload` is True, and the image is a PNG, we get a Pillow error. We need to remove the PNG transparency layer before saving the image.
- **Throw validation error when json is passed with invalid unstructured json.**
## 0.17.6

View File

@ -187,6 +187,11 @@ def test_partition_json_works_with_empty_string():
assert partition_json(text="") == []
def test_partition_json_fails_with_empty_item():
with pytest.raises(ValueError):
partition_json(text="{}")
def test_partition_json_works_with_empty_list():
assert partition_json(text="[]") == []
@ -288,6 +293,12 @@ def test_partition_json_from_text_prefers_metadata_last_modified():
# ------------------------------------------------------------------------------------------------
def test_partition_json_raises_with_unprocessable_json_array():
text = '[{"invalid": "schema"}]'
with pytest.raises(ValueError):
partition_json(text=text)
def test_partition_json_raises_with_unprocessable_json():
# NOTE(robinson) - This is unprocessable because it is not a list of dicts,
# per the Unstructured ISD format

View File

@ -189,8 +189,14 @@ def test_partition_ndjson_works_with_empty_string():
assert partition_ndjson(text="") == []
def test_partition_ndjson_works_with_empty_list():
assert partition_ndjson(text="{}") == []
def test_partition_ndjson_fails_with_empty_item():
with pytest.raises(ValueError):
partition_ndjson(text="{}")
def test_partition_ndjson_fails_with_empty_list():
with pytest.raises(ValueError):
partition_ndjson(text="[]")
def test_partition_ndjson_raises_with_too_many_specified():
@ -293,6 +299,12 @@ def test_partition_ndjson_from_text_prefers_metadata_last_modified():
# ------------------------------------------------------------------------------------------------
def test_partition_json_raises_with_unprocessable_json():
text = '{"invalid": "schema"}'
with pytest.raises(ValueError):
partition_ndjson(text=text)
def test_partition_json_raises_with_invalid_json():
text = '[{"hi": "there"}]]'
with pytest.raises(ValueError):

View File

@ -74,6 +74,11 @@ def partition_json(
try:
element_dicts = json.loads(file_text)
elements = elements_from_dicts(element_dicts)
# if we found at least one json element, but no unstructured elements were found, throw 422
if len(element_dicts) > 0 and len(elements) == 0:
raise ValueError(
"JSON cannot be partitioned. Schema does not match the Unstructured schema.",
)
except json.JSONDecodeError:
raise ValueError("Not a valid json")

View File

@ -75,6 +75,11 @@ def partition_ndjson(
try:
element_dicts = ndjson_loads(file_text)
elements = elements_from_dicts(element_dicts)
# if we found at least one json element, but no unstructured elements were found, throw 422
if len(element_dicts) > 0 and len(elements) == 0:
raise ValueError(
"JSON cannot be partitioned. Schema does not match the Unstructured schema.",
)
except json.JSONDecodeError:
raise ValueError("Not a valid ndjson")