enhancement: improve json detection by detect_filetype (#971)

* update regex pattern

* improve json regex pattern checks and add test file

* update file name

* update tests and formatting

* update changelog and version
This commit is contained in:
John 2023-07-25 11:47:39 -05:00 committed by GitHub
parent f7def03d55
commit f282a10715
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 43 additions and 8 deletions

View File

@ -1,4 +1,4 @@
## 0.8.2-dev5
## 0.8.2-dev6
### Enhancements
@ -9,6 +9,7 @@
* set the file's current position to the beginning after reading the file in `convert_to_bytes`
* Add slide notes to pptx
* Add `--encoding` directive to ingest
* Improve JSON detection by `detect_filetype`
### Features

View File

@ -0,0 +1,10 @@
{
"name": "John Doe",
"age": 30,
"email": "johndoe@example.com",
"is_student": true,
"address": {
"city": "New York",
"zipcode": "10001"
},
"hobbies": ["reading", "running", "cooking"]

View File

@ -77,6 +77,7 @@ class MockDocumentLayout(layout.DocumentLayout):
("README.rst", FileType.RST),
("README.md", FileType.MD),
("fake.odt", FileType.ODT),
("fake-incomplete-json.txt", FileType.JSON),
],
)
def test_detect_filetype_from_filename(file, expected):
@ -103,6 +104,7 @@ def test_detect_filetype_from_filename(file, expected):
("fake-doc.rtf", FileType.RTF),
("spring-weather.html.json", FileType.JSON),
("fake.odt", FileType.ODT),
("fake-incomplete-json.txt", FileType.TXT),
],
)
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
@ -139,6 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
("stanley-cups.tsv", FileType.TSV),
("fake-power-point.pptx", FileType.PPTX),
("winter-sports.epub", FileType.EPUB),
("fake-incomplete-json.txt", FileType.JSON),
],
)
def test_detect_filetype_from_file(file, expected):

View File

@ -1 +1 @@
__version__ = "0.8.2-dev5" # pragma: no cover
__version__ = "0.8.2-dev6" # pragma: no cover

View File

@ -11,7 +11,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import Element, PageBreak
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
from unstructured.partition.common import (
_add_element_metadata,
_remove_element_metadata,
@ -300,9 +300,6 @@ def detect_filetype(
encoding = "utf-8"
formatted_encoding = format_encoding_str(encoding)
if extension in PLAIN_TEXT_EXTENSIONS:
return EXT_TO_FILETYPE.get(extension)
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
# installed on the Unstructured docker image, .json files resolve to "text/plain"
# rather than "application/json". this corrects for that case.
@ -315,6 +312,9 @@ def detect_filetype(
if file and _check_eml_from_buffer(file=file) is True:
return FileType.EML
if extension in PLAIN_TEXT_EXTENSIONS:
return EXT_TO_FILETYPE.get(extension)
# Safety catch
if mime_type in STR_TO_FILETYPE:
return STR_TO_FILETYPE[mime_type]
@ -417,7 +417,15 @@ def _is_text_file_a_json(
):
"""Detects if a file that has a text/plain MIME type is a JSON file."""
file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)
if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
return False
if not re.match(JSON_PATTERN, file_text):
return False
return True
def _count_commas(text: str):

View File

@ -108,4 +108,17 @@ ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
# NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
# format for document elements
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
# JSON_PATTERN detects text that looks like the start of a JSON object or array.
# It is deliberately lenient so that input which has been cut off part-way
# through can still match.
#
# (?s)          dot-all: "." also matches newline characters
# \{(?=.*:)     an opening brace that is followed, somewhere later, by at least one colon
# .*?           any characters (non-greedy)
# (?:\}|$)      either the closing brace or the end of the string, to handle cases
#               where the JSON object is cut off
# |             or
# \[(?s:.*?)\]  an opening bracket, any characters (non-greedy), and a closing bracket
# (?:$|,|\])    followed by the end of the string, a comma, or another closing bracket,
#               so an array that is cut off after a nested array still matches
JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
# Character class of the characters that can appear in JSON text outside of string
# literals (punctuation, digits, number syntax, the letters of true/false/null, whitespace).
# taken from https://stackoverflow.com/a/3845829/12406158
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"