mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
enhancement: improve json detection by detect_filetype (#971)
* update regex pattern
* improve json regex pattern checks and add test file
* update file name
* update tests and formatting
* update changelog and version
This commit is contained in:
parent
f7def03d55
commit
f282a10715
@ -1,4 +1,4 @@
|
||||
## 0.8.2-dev5
|
||||
## 0.8.2-dev6
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
* Set the file's current position to the beginning after reading the file in `convert_to_bytes`
|
||||
* Add slide notes to pptx
|
||||
* Add `--encoding` directive to ingest
|
||||
* Improve json detection by `detect_filetype`
|
||||
|
||||
### Features
|
||||
|
||||
|
10
example-docs/fake-incomplete-json.txt
Normal file
10
example-docs/fake-incomplete-json.txt
Normal file
@ -0,0 +1,10 @@
|
||||
{
|
||||
"name": "John Doe",
|
||||
"age": 30,
|
||||
"email": "johndoe@example.com",
|
||||
"is_student": true,
|
||||
"address": {
|
||||
"city": "New York",
|
||||
"zipcode": "10001"
|
||||
},
|
||||
"hobbies": ["reading", "running", "cooking"]
|
@ -77,6 +77,7 @@ class MockDocumentLayout(layout.DocumentLayout):
|
||||
("README.rst", FileType.RST),
|
||||
("README.md", FileType.MD),
|
||||
("fake.odt", FileType.ODT),
|
||||
("fake-incomplete-json.txt", FileType.JSON),
|
||||
],
|
||||
)
|
||||
def test_detect_filetype_from_filename(file, expected):
|
||||
@ -103,6 +104,7 @@ def test_detect_filetype_from_filename(file, expected):
|
||||
("fake-doc.rtf", FileType.RTF),
|
||||
("spring-weather.html.json", FileType.JSON),
|
||||
("fake.odt", FileType.ODT),
|
||||
("fake-incomplete-json.txt", FileType.TXT),
|
||||
],
|
||||
)
|
||||
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
|
||||
@ -139,6 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
|
||||
("stanley-cups.tsv", FileType.TSV),
|
||||
("fake-power-point.pptx", FileType.PPTX),
|
||||
("winter-sports.epub", FileType.EPUB),
|
||||
("fake-incomplete-json.txt", FileType.JSON),
|
||||
],
|
||||
)
|
||||
def test_detect_filetype_from_file(file, expected):
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.8.2-dev5" # pragma: no cover
|
||||
__version__ = "0.8.2-dev6" # pragma: no cover
|
||||
|
@ -11,7 +11,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
|
||||
from unstructured.documents.coordinates import PixelSpace
|
||||
from unstructured.documents.elements import Element, PageBreak
|
||||
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
|
||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
|
||||
from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
|
||||
from unstructured.partition.common import (
|
||||
_add_element_metadata,
|
||||
_remove_element_metadata,
|
||||
@ -300,9 +300,6 @@ def detect_filetype(
|
||||
encoding = "utf-8"
|
||||
formatted_encoding = format_encoding_str(encoding)
|
||||
|
||||
if extension in PLAIN_TEXT_EXTENSIONS:
|
||||
return EXT_TO_FILETYPE.get(extension)
|
||||
|
||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||
# rather than "application/json". This corrects for that case.
|
||||
@ -315,6 +312,9 @@ def detect_filetype(
|
||||
if file and _check_eml_from_buffer(file=file) is True:
|
||||
return FileType.EML
|
||||
|
||||
if extension in PLAIN_TEXT_EXTENSIONS:
|
||||
return EXT_TO_FILETYPE.get(extension)
|
||||
|
||||
# Safety catch
|
||||
if mime_type in STR_TO_FILETYPE:
|
||||
return STR_TO_FILETYPE[mime_type]
|
||||
@ -417,7 +417,15 @@ def _is_text_file_a_json(
|
||||
):
|
||||
"""Detects if a file that has a text/plain MIME type is a JSON file."""
|
||||
file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
|
||||
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
|
||||
text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)
|
||||
|
||||
if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
|
||||
return False
|
||||
|
||||
if not re.match(JSON_PATTERN, file_text):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def _count_commas(text: str):
|
||||
|
@ -108,4 +108,17 @@ ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
||||
# NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
|
||||
# format for document elements
|
||||
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
|
||||
JSON_PATTERN = r"^(?:\{.*\}|\[.*\])$"
|
||||
|
||||
# (?s) dot all (including newline characters)
|
||||
# \{(?=.*:) opening brace and at least one colon
|
||||
# .*? any characters (non-greedy)
|
||||
# (?:\}|$) non-capturing group that matches either the closing brace } or the end of
|
||||
# the string to handle cases where the JSON is cut off
|
||||
# | or
|
||||
# \[(?s:.*?)\] matches the opening bracket [ of a JSON array, any characters inside it (non-greedy), and the first closing bracket ]
|
||||
# (?:$|,|\]) non-capturing group that matches either the end of the string, a comma,
|
||||
# or the closing bracket to handle cases where the JSON array is cut off
|
||||
JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
|
||||
|
||||
# taken from https://stackoverflow.com/a/3845829/12406158
|
||||
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"
|
||||
|
Loading…
x
Reference in New Issue
Block a user