From 19ab6d960f068c1a407ada02b37408c38214ab5d Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 9 Jun 2023 16:07:50 -0400 Subject: [PATCH] enhancement: handling for empty files in `detect_filetype` and `partition` (#710) * add empty filetype * add empty handling to partition * changelog and version --- CHANGELOG.md | 10 ++++++++++ example-docs/empty.txt | 0 test_file.html | 1 + test_unstructured/file_utils/test_filetype.py | 9 +++++++++ test_unstructured/partition/test_auto.py | 17 +++++++++++++++-- unstructured/__version__.py | 2 +- unstructured/file_utils/filetype.py | 5 +++++ unstructured/partition/auto.py | 2 ++ 8 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 example-docs/empty.txt create mode 100644 test_file.html diff --git a/CHANGELOG.md b/CHANGELOG.md index 40dc1a8c0..32dba0e99 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.7.4-dev0 + +### Enhancements + +* Adds handling for empty files in `detect_filetype` and `partition`. + +### Features + +### Fixes + ## 0.7.3 ### Enhancements diff --git a/example-docs/empty.txt b/example-docs/empty.txt new file mode 100644 index 000000000..e69de29bb diff --git a/test_file.html b/test_file.html new file mode 100644 index 000000000..b14c936cf --- /dev/null +++ b/test_file.html @@ -0,0 +1 @@ +

Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020

\ No newline at end of file diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index 02b6a9584..99f05e515 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -408,3 +408,12 @@ def test_csv_json_check_with_file_and_utf_32(filename="example-docs/fake-text-ut with open(filename, "rb") as f: assert _is_text_file_a_json(file=f) is False + + +def test_detect_filetype_detects_empty_filename(filename="example-docs/empty.txt"): + assert detect_filetype(filename=filename) == FileType.EMPTY + + +def test_detect_filetype_detects_empty_file(filename="example-docs/empty.txt"): + with open(filename, "rb") as f: + assert detect_filetype(file=f) == FileType.EMPTY diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index ad1a887c5..88220cf8d 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -582,7 +582,7 @@ FILETYPE_TO_MODULE = { @pytest.mark.parametrize("filetype", supported_filetypes) def test_file_specific_produces_correct_filetype(filetype: FileType): - if filetype in (FileType.JPG, FileType.PNG): + if filetype in (FileType.JPG, FileType.PNG, FileType.EMPTY): pytest.skip() extension = filetype.name.lower() filetype_module = ( @@ -594,7 +594,11 @@ def test_file_specific_produces_correct_filetype(filetype: FileType): for file in pathlib.Path("example-docs").iterdir(): if file.is_file() and file.suffix == f".{extension}": elements = fun(str(file)) - assert all(el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] for el in elements) + assert all( + el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] + for el in elements + if el.metadata.filetype is not None + ) break @@ -790,3 +794,12 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"): assert isinstance(elements[0], Table) assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE assert elements[0].metadata.filetype == "text/csv" + + +def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"): + assert partition(filename=filename) == [] + + +def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"): + with open(filename, "rb") as f: + assert partition(file=f) == [] diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 26d31cdeb..b22b05c4f 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.7.3" # pragma: no cover +__version__ = "0.7.4-dev0" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 2ae58b436..2c2275aa6 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -50,6 +50,7 @@ EXPECTED_PPTX_FILES = [ class FileType(Enum): UNK = 0 + EMPTY = 1 # MS Office Types DOC = 10 @@ -121,6 +122,7 @@ STR_TO_FILETYPE = { "message/rfc822": FileType.EML, "application/x-ole-storage": FileType.MSG, "application/vnd.ms-outlook": FileType.MSG, + "inode/x-empty": FileType.EMPTY, } MIMETYPES_TO_EXCLUDE = [ @@ -300,6 +302,9 @@ def detect_filetype( # later if needed. return FileType.TXT + elif mime_type.endswith("empty"): + return FileType.EMPTY + # For everything else elif mime_type in STR_TO_FILETYPE: return STR_TO_FILETYPE[mime_type] diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index a98cc7957..09bd97f9e 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -204,6 +204,8 @@ def partition( elements = partition_xlsx(filename=filename, file=file) elif filetype == FileType.CSV: elements = partition_csv(filename=filename, file=file) + elif filetype == FileType.EMPTY: + elements = [] else: msg = "Invalid file" if not filename else f"Invalid file {filename}" raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")