mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-29 08:05:08 +00:00
fix: don't treat double quote enclosed text as JSON (#2544)
### Summary Closes #2444. Treats JSON-serializable content that deserializes to a string as plain text. Even though such content is valid JSON per [RFC 4627](https://www.ietf.org/rfc/rfc4627.txt), in almost every case we really want to treat it as a text file. ### Testing 1. Put `"This is not a JSON"` in a text file `notajson.txt` 2. Run the following ```python from unstructured.file_utils.filetype import _is_text_file_a_json _is_text_file_a_json(filename="notajson.txt") # Should be False ```
This commit is contained in:
parent
d11a83ce65
commit
882370022e
@ -1,4 +1,4 @@
|
||||
## 0.12.5-dev3
|
||||
## 0.12.5-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -8,7 +8,9 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors **
|
||||
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors **
|
||||
* **Fix: don't treat plain text files with double quotes as JSON.** If a file can be deserialized as JSON but deserializes to a string, treat it as plain text even though it's valid JSON.
|
||||
* **Fix `check_connection` in opensearch, databricks, postgres, azure connectors **
|
||||
* **Fix cluster of bugs in `partition_xlsx()` that dropped content.** Algorithm for detecting "subtables" within a worksheet dropped table elements for certain patterns of populated cells such as when a trailing single-cell row appeared in a contiguous block of populated cells.
|
||||
|
||||
## 0.12.4
|
||||
|
||||
@ -166,6 +166,17 @@ def test_detects_go_mime_type():
|
||||
assert _is_code_mime_type("text/x-go") is True
|
||||
|
||||
|
||||
def test_detect_xml_application_go(monkeypatch, tmpdir):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-go")
|
||||
|
||||
filename = os.path.join(tmpdir, "fake.go")
|
||||
with open(filename, "w") as f:
|
||||
f.write("")
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
assert detect_filetype(filename=filename) == FileType.TXT
|
||||
|
||||
|
||||
def test_detect_xml_application_rtf(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/rtf")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.rtf")
|
||||
@ -411,6 +422,7 @@ def test_filetype_order():
|
||||
(b"d\xe2\x80", False), # Invalid JSON
|
||||
(b'[{"key": "value"}]', True), # Valid JSON
|
||||
(b"", False), # Empty content
|
||||
(b'"This is not a JSON"', False), # Serializable as JSON, but we want to treat it as txt
|
||||
],
|
||||
)
|
||||
def test_is_text_file_a_json(content, expected):
|
||||
|
||||
@ -327,3 +327,13 @@ def test_catch_overlapping_and_nested_bboxes(
|
||||
)
|
||||
assert overlapping_flag == expectation[0]
|
||||
assert overlapping_cases == expectation[1]
|
||||
|
||||
|
||||
def test_validate_data_args():
|
||||
assert utils.validate_date_args("2020-10-10") is True
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
utils.validate_date_args("blah")
|
||||
|
||||
with pytest.raises(ValueError):
|
||||
utils.validate_date_args(None)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.12.5-dev3" # pragma: no cover
|
||||
__version__ = "0.12.5-dev4" # pragma: no cover
|
||||
|
||||
@ -477,7 +477,15 @@ def _is_text_file_a_json(
|
||||
encoding=encoding,
|
||||
)
|
||||
try:
|
||||
json.loads(file_text)
|
||||
output = json.loads(file_text)
|
||||
# NOTE(robinson) - Per RFC 4627 which defines the application/json media type,
|
||||
# a string is a valid JSON. For our purposes, however, we want to treat that
|
||||
# as a text file even if it is serializable as json.
|
||||
# References:
|
||||
# https://stackoverflow.com/questions/7487869/is-this-simple-string-considered-valid-json
|
||||
# https://www.ietf.org/rfc/rfc4627.txt
|
||||
if isinstance(output, str):
|
||||
return False
|
||||
return True
|
||||
except json.JSONDecodeError:
|
||||
return False
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user