mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
enhancement: improve json detection by detect_filetype (#971)
* update regex pattern
* improve json regex pattern checks and add test file
* update file name
* update tests and formatting
* update changelog and version
This commit is contained in:
parent
f7def03d55
commit
f282a10715
@ -1,4 +1,4 @@
|
|||||||
## 0.8.2-dev5
|
## 0.8.2-dev6
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -9,6 +9,7 @@
|
|||||||
* set the file's current position to the beginning after reading the file in `convert_to_bytes`
|
* set the file's current position to the beginning after reading the file in `convert_to_bytes`
|
||||||
* Add slide notes to pptx
|
* Add slide notes to pptx
|
||||||
* Add `--encoding` directive to ingest
|
* Add `--encoding` directive to ingest
|
||||||
|
* Improve json detection by `detect_filetype`
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
10
example-docs/fake-incomplete-json.txt
Normal file
10
example-docs/fake-incomplete-json.txt
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
{
|
||||||
|
"name": "John Doe",
|
||||||
|
"age": 30,
|
||||||
|
"email": "johndoe@example.com",
|
||||||
|
"is_student": true,
|
||||||
|
"address": {
|
||||||
|
"city": "New York",
|
||||||
|
"zipcode": "10001"
|
||||||
|
},
|
||||||
|
"hobbies": ["reading", "running", "cooking"]
|
@ -77,6 +77,7 @@ class MockDocumentLayout(layout.DocumentLayout):
|
|||||||
("README.rst", FileType.RST),
|
("README.rst", FileType.RST),
|
||||||
("README.md", FileType.MD),
|
("README.md", FileType.MD),
|
||||||
("fake.odt", FileType.ODT),
|
("fake.odt", FileType.ODT),
|
||||||
|
("fake-incomplete-json.txt", FileType.JSON),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_detect_filetype_from_filename(file, expected):
|
def test_detect_filetype_from_filename(file, expected):
|
||||||
@ -103,6 +104,7 @@ def test_detect_filetype_from_filename(file, expected):
|
|||||||
("fake-doc.rtf", FileType.RTF),
|
("fake-doc.rtf", FileType.RTF),
|
||||||
("spring-weather.html.json", FileType.JSON),
|
("spring-weather.html.json", FileType.JSON),
|
||||||
("fake.odt", FileType.ODT),
|
("fake.odt", FileType.ODT),
|
||||||
|
("fake-incomplete-json.txt", FileType.TXT),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
|
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
|
||||||
@ -139,6 +141,7 @@ def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expecte
|
|||||||
("stanley-cups.tsv", FileType.TSV),
|
("stanley-cups.tsv", FileType.TSV),
|
||||||
("fake-power-point.pptx", FileType.PPTX),
|
("fake-power-point.pptx", FileType.PPTX),
|
||||||
("winter-sports.epub", FileType.EPUB),
|
("winter-sports.epub", FileType.EPUB),
|
||||||
|
("fake-incomplete-json.txt", FileType.JSON),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_detect_filetype_from_file(file, expected):
|
def test_detect_filetype_from_file(file, expected):
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.8.2-dev5" # pragma: no cover
|
__version__ = "0.8.2-dev6" # pragma: no cover
|
||||||
|
@ -11,7 +11,7 @@ from typing import IO, TYPE_CHECKING, Callable, List, Optional
|
|||||||
from unstructured.documents.coordinates import PixelSpace
|
from unstructured.documents.coordinates import PixelSpace
|
||||||
from unstructured.documents.elements import Element, PageBreak
|
from unstructured.documents.elements import Element, PageBreak
|
||||||
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
|
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
|
||||||
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
|
from unstructured.nlp.patterns import JSON_PATTERN, VALID_JSON_CHARACTERS
|
||||||
from unstructured.partition.common import (
|
from unstructured.partition.common import (
|
||||||
_add_element_metadata,
|
_add_element_metadata,
|
||||||
_remove_element_metadata,
|
_remove_element_metadata,
|
||||||
@ -300,9 +300,6 @@ def detect_filetype(
|
|||||||
encoding = "utf-8"
|
encoding = "utf-8"
|
||||||
formatted_encoding = format_encoding_str(encoding)
|
formatted_encoding = format_encoding_str(encoding)
|
||||||
|
|
||||||
if extension in PLAIN_TEXT_EXTENSIONS:
|
|
||||||
return EXT_TO_FILETYPE.get(extension)
|
|
||||||
|
|
||||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||||
# rather than "application/json". this corrects for that case.
|
# rather than "application/json". this corrects for that case.
|
||||||
@ -315,6 +312,9 @@ def detect_filetype(
|
|||||||
if file and _check_eml_from_buffer(file=file) is True:
|
if file and _check_eml_from_buffer(file=file) is True:
|
||||||
return FileType.EML
|
return FileType.EML
|
||||||
|
|
||||||
|
if extension in PLAIN_TEXT_EXTENSIONS:
|
||||||
|
return EXT_TO_FILETYPE.get(extension)
|
||||||
|
|
||||||
# Safety catch
|
# Safety catch
|
||||||
if mime_type in STR_TO_FILETYPE:
|
if mime_type in STR_TO_FILETYPE:
|
||||||
return STR_TO_FILETYPE[mime_type]
|
return STR_TO_FILETYPE[mime_type]
|
||||||
@ -417,7 +417,15 @@ def _is_text_file_a_json(
|
|||||||
):
|
):
|
||||||
"""Detects if a file that has a text/plain MIME type is a JSON file."""
|
"""Detects if a file that has a text/plain MIME type is a JSON file."""
|
||||||
file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
|
file_text = _read_file_start_for_type_check(file=file, filename=filename, encoding=encoding)
|
||||||
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
|
text_without_strings = re.sub(r'"(?:\\.|[^"\\])*"', "", file_text)
|
||||||
|
|
||||||
|
if not re.match(VALID_JSON_CHARACTERS, text_without_strings):
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not re.match(JSON_PATTERN, file_text):
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def _count_commas(text: str):
|
def _count_commas(text: str):
|
||||||
|
@ -108,4 +108,17 @@ ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
|||||||
# NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
|
# NOTE(robinson) - Used to detect if text is in the expected "list of dicts"
|
||||||
# format for document elements
|
# format for document elements
|
||||||
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
|
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
|
||||||
JSON_PATTERN = r"^(?:\{.*\}|\[.*\])$"
|
|
||||||
|
# (?s) DOTALL flag: "." matches any character, including newline characters
|
||||||
|
# \{(?=.*:) opening brace, with a lookahead requiring at least one colon later in the text
|
||||||
|
# .*? any characters (non-greedy)
|
||||||
|
# (?:\}|$) non-capturing group that matches either the closing brace } or the end of
|
||||||
|
# the string to handle cases where the JSON is cut off
|
||||||
|
# | or
|
||||||
|
# \[(?s:.*?)\] matches the opening bracket [ of a JSON array, any characters inside it (non-greedy), and the closing bracket ]
|
||||||
|
# (?:$|,|\]) non-capturing group that matches either the end of the string, a comma,
|
||||||
|
# or the closing bracket to handle cases where the JSON array is cut off
|
||||||
|
JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
|
||||||
|
|
||||||
|
# taken from https://stackoverflow.com/a/3845829/12406158
|
||||||
|
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user