From 882370022ef9a4348682e17f7967b02d8a8b2c16 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Wed, 14 Feb 2024 08:41:43 -0500 Subject: [PATCH] fix: don't treat double quote enclosed text as JSON (#2544) ### Summary Closes #2444. Treats JSON serializable content that results in a string as plain text. Even though this is valid JSON per [RFC 4627](https://www.ietf.org/rfc/rfc4627.txt), in almost every case we really want to treat this as a text file. ### Testing 1. Put `"This is not a JSON"` in a text file `notajson.txt` 2. Run the following ```python from unstructured.file_utils.filetype import _is_text_file_a_json _is_text_file_a_json(filename="notajson.txt") # Should be False ``` --- CHANGELOG.md | 6 ++++-- test_unstructured/file_utils/test_filetype.py | 12 ++++++++++++ test_unstructured/test_utils.py | 10 ++++++++++ unstructured/__version__.py | 2 +- unstructured/file_utils/filetype.py | 10 +++++++++- 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4ee9e474..cc46ca7b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.12.5-dev3 +## 0.12.5-dev4 ### Enhancements @@ -8,7 +8,9 @@ ### Fixes -* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors ** +* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors ** +* **Fix don't treat plain text files with double quotes as JSON ** If a file can be deserialized as JSON but it deserializes as a string, treat it as plain text even though it's valid JSON. +* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors ** * **Fix cluster of bugs in `partition_xlsx()` that dropped content.** Algorithm for detecting "subtables" within a worksheet dropped table elements for certain patterns of populated cells such as when a trailing single-cell row appeared in a contiguous block of populated cells. 
## 0.12.4 diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 2388b07aa..dc641945d 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -166,6 +166,17 @@ def test_detects_go_mime_type(): assert _is_code_mime_type("text/x-go") is True +def test_detect_xml_application_go(monkeypatch, tmpdir): + monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-go") + + filename = os.path.join(tmpdir, "fake.go") + with open(filename, "w") as f: + f.write("") + + with open(filename, "rb") as f: + assert detect_filetype(filename=filename) == FileType.TXT + + def test_detect_xml_application_rtf(monkeypatch): monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/rtf") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.rtf") @@ -411,6 +422,7 @@ def test_filetype_order(): (b"d\xe2\x80", False), # Invalid JSON (b'[{"key": "value"}]', True), # Valid JSON (b"", False), # Empty content + (b'"This is not a JSON"', False), # Serializable as JSON, but we want to treat it as txt ], ) def test_is_text_file_a_json(content, expected): diff --git a/test_unstructured/test_utils.py b/test_unstructured/test_utils.py index cb5a8799f..aed633b0d 100644 --- a/test_unstructured/test_utils.py +++ b/test_unstructured/test_utils.py @@ -327,3 +327,13 @@ def test_catch_overlapping_and_nested_bboxes( ) assert overlapping_flag == expectation[0] assert overlapping_cases == expectation[1] + + +def test_validate_data_args(): + assert utils.validate_date_args("2020-10-10") is True + + with pytest.raises(ValueError): + utils.validate_date_args("blah") + + with pytest.raises(ValueError): + utils.validate_date_args(None) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a35bf0ce8..593ff8878 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.5-dev3" # pragma: no cover 
+__version__ = "0.12.5-dev4" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index ca4a0fd0d..b3dcb01d2 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -477,7 +477,15 @@ def _is_text_file_a_json( encoding=encoding, ) try: - json.loads(file_text) + output = json.loads(file_text) + # NOTE(robinson) - Per RFC 4627 which defines the application/json media type, + # a string is a valid JSON. For our purposes, however, we want to treat that + # as a text file even if it is serializable as json. + # References: + # https://stackoverflow.com/questions/7487869/is-this-simple-string-considered-valid-json + # https://www.ietf.org/rfc/rfc4627.txt + if isinstance(output, str): + return False return True except json.JSONDecodeError: return False