fix: don't treat double quote enclosed text as JSON (#2544)

### Summary

Closes #2444. Treats JSON serializable content that results in a string
as plain text. Even though this is valid JSON per [RFC
4627](https://www.ietf.org/rfc/rfc4627.txt), in almost every case we
really want to treat this as a text file.

### Testing

1. Put `"This is not a JSON"` in a text file `notajson.txt`
2. Run the following

```python
from unstructured.file_utils.filetype import _is_text_file_a_json

_is_text_file_a_json(filename="notajson.txt") # Should be False
```
This commit is contained in:
Matt Robinson 2024-02-14 08:41:43 -05:00 committed by GitHub
parent d11a83ce65
commit 882370022e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 36 additions and 4 deletions

View File

@ -1,4 +1,4 @@
## 0.12.5-dev3
## 0.12.5-dev4
### Enhancements
@ -8,7 +8,9 @@
### Fixes
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors **
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors **
* **Fix don't treat plain text files with double quotes as JSON ** If a file can be deserialized as JSON but it deserializes as a string, treat it as plain text even though it's valid JSON.
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors **
* **Fix cluster of bugs in `partition_xlsx()` that dropped content.** Algorithm for detecting "subtables" within a worksheet dropped table elements for certain patterns of populated cells such as when a trailing single-cell row appeared in a contiguous block of populated cells.
## 0.12.4

View File

@ -166,6 +166,17 @@ def test_detects_go_mime_type():
assert _is_code_mime_type("text/x-go") is True
def test_detect_xml_application_go(monkeypatch, tmpdir):
    """A file that libmagic reports as ``text/x-go`` is detected as ``FileType.TXT``.

    ``magic.from_file`` is patched to always return ``"text/x-go"``, so the
    outcome depends only on how ``detect_filetype`` maps that MIME type, not on
    the (empty) file contents.
    """
    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-go")

    # An empty placeholder file is enough: the MIME type comes from the patch above.
    filename = os.path.join(tmpdir, "fake.go")
    with open(filename, "w") as f:
        f.write("")

    # Detection is driven by `filename` alone, so no file handle needs to be
    # opened for the assertion.
    assert detect_filetype(filename=filename) == FileType.TXT
def test_detect_xml_application_rtf(monkeypatch):
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/rtf")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.rtf")
@ -411,6 +422,7 @@ def test_filetype_order():
(b"d\xe2\x80", False), # Invalid JSON
(b'[{"key": "value"}]', True), # Valid JSON
(b"", False), # Empty content
(b'"This is not a JSON"', False), # Serializable as JSON, but we want to treat it as txt
],
)
def test_is_text_file_a_json(content, expected):

View File

@ -327,3 +327,13 @@ def test_catch_overlapping_and_nested_bboxes(
)
assert overlapping_flag == expectation[0]
assert overlapping_cases == expectation[1]
def test_validate_data_args():
    """``utils.validate_date_args`` returns True for "2020-10-10" and raises otherwise."""
    # A well-formed YYYY-MM-DD string validates successfully.
    assert utils.validate_date_args("2020-10-10") is True

    # A non-date string and a missing value must both be rejected with ValueError.
    for rejected in ("blah", None):
        with pytest.raises(ValueError):
            utils.validate_date_args(rejected)

View File

@ -1 +1 @@
__version__ = "0.12.5-dev3" # pragma: no cover
__version__ = "0.12.5-dev4" # pragma: no cover

View File

@ -477,7 +477,15 @@ def _is_text_file_a_json(
encoding=encoding,
)
try:
json.loads(file_text)
output = json.loads(file_text)
# NOTE(robinson) - Per RFC 4627 which defines the application/json media type,
# a string is a valid JSON. For our purposes, however, we want to treat that
# as a text file even if it is serializable as json.
# References:
# https://stackoverflow.com/questions/7487869/is-this-simple-string-considered-valid-json
# https://www.ietf.org/rfc/rfc4627.txt
if isinstance(output, str):
return False
return True
except json.JSONDecodeError:
return False