enhancement: improve json detection by detect_filetype (#971)

* update regex pattern
* improve json regex pattern checks and add test file
* update file name
* update tests and formatting
* update changelog and version
2025-11-14 17:37:27 +00:00 · 2023-07-25 11:47:39 -05:00 · 2023-07-25 11:47:39 -05:00 · f282a10715
commit f282a10715
parent f7def03d55
6 changed files with 43 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.8.2-dev5
+## 0.8.2-dev6
 ### Enhancements
@ -9,6 +9,7 @@
 * set the file's current position to the beginning after reading the file in `convert_to_bytes`
 * Add slide notes to pptx
 * Add `--encoding` directive to ingest
 * Improve json detection by `detect_filetype`
 ### Features
--- a/example-docs/fake-incomplete-json.txt
+++ b/example-docs/fake-incomplete-json.txt
@ -0,0 +1,10 @@
 {
  "name": "John Doe",
  "age": 30,
  "email": "johndoe@example.com",
  "is_student": true,
  "address": {
    "city": "New York",
    "zipcode": "10001"
  },
  "hobbies": ["reading", "running", "cooking"]
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -77,6 +77,7 @@ class MockDocumentLayout(layout.DocumentLayout):
        ("README.rst", FileType.RST),
        ("README.md", FileType.MD),
        ("fake.odt", FileType.ODT),
        ("fake-incomplete-json.txt", FileType.JSON),
    ],
 )
 def test_detect_filetype_from_filename(file, expected):
@ -103,6 +104,7 @@ def test_detect_filetype_from_filename(file, expected):
        ("fake-doc.rtf", FileType.RTF),
        ("spring-weather.html.json", FileType.JSON),
        ("fake.odt", FileType.ODT),
        ("fake-incomplete-json.txt", FileType.TXT),
    ],
 )
 def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
@ -139,6 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
        ("fake-incomplete-json.txt", FileType.JSON),
    ],
 )
 def test_detect_filetype_from_file(file, expected):
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.8.2-dev5"  # pragma: no cover
+__version__ = "0.8.2-dev6"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -11,7 +11,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
 from unstructured.documents.coordinates import PixelSpace
 from unstructured.documents.elements import Element, PageBreak
 from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
-from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
+from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
 from unstructured.partition.common import (
    _add_element_metadata,
    _remove_element_metadata,
@ -300,9 +300,6 @@ def detect_filetype(
            encoding = "utf-8"
        formatted_encoding = format_encoding_str(encoding)
        if extension in PLAIN_TEXT_EXTENSIONS:
            return EXT_TO_FILETYPE.get(extension)
        # NOTE(crag): for older versions of the OS libmagic package, such as is currently
        # installed on the Unstructured docker image, .json files resolve to "text/plain"
        # rather than "application/json". this corrects for that case.
@ -315,6 +312,9 @@ def detect_filetype(
        if file and _check_eml_from_buffer(file=file) is True:
            return FileType.EML
        if extension in PLAIN_TEXT_EXTENSIONS:
            return EXT_TO_FILETYPE.get(extension)
        # Safety catch
        if mime_type in STR_TO_FILETYPE:
            return STR_TO_FILETYPE[mime_type]
@ -417,7 +417,15 @@ def _is_text_file_a_json(
 ):
    """Detects if a file that has a text/plain MIME type is a JSON file."""
    file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
-    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
+    text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)
    if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
        return False
    if not re.match(JSON_PATTERN, file_text):
        return False
    return True
 def _count_commas(text: str):
--- a/unstructured/nlp/patterns.py
+++ b/unstructured/nlp/patterns.py
@ -108,4 +108,17 @@ ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
 # NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
 # format for document elements
 LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
-JSON_PATTERN = r"^(?:\{.*\}|\[.*\])$"
+
 # (?s) dot all (including newline characters)
 # \{(?=.*:) opening brace and at least one colon
 # .*? any characters (non-greedy)
 # (?:\}|$) non-capturing group that matches either the closing brace } or the end of
 # the string to handle cases where the JSON is cut off
 # | or
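 # \[(?s:.*?)\] matches an opening bracket [, any characters (non-greedy, including
 # newlines), and the closing bracket ] of a JSON array
 # (?:$|,|\]) non-capturing group that matches what follows the closing bracket: the end
 # of the string, a comma, or another closing bracket. NOTE: because a closing ] is
 # required, this branch does not match an array that is cut off before its first ]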
 JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
 # taken from https://stackoverflow.com/a/3845829/12406158
 VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"
@ -1 +1 @@
-__version__ = "0.8.2-dev5"  # pragma: no cover
+__version__ = "0.8.2-dev6"  # pragma: no cover