mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
enhancement: handling for empty files in detect_filetype and partition (#710)
* add empty filetype
* add empty handling to partition
* changelog and version
This commit is contained in:
parent
80f0b4a132
commit
19ab6d960f
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
||||
## 0.7.4-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Adds handling for empty files in `detect_filetype` and `partition`.
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
|
||||
## 0.7.3
|
||||
|
||||
### Enhancements
|
||||
|
0
example-docs/empty.txt
Normal file
0
example-docs/empty.txt
Normal file
1
test_file.html
Normal file
1
test_file.html
Normal file
@ -0,0 +1 @@
|
||||
<h3 class="l_titel">Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020</h3>
|
@ -408,3 +408,12 @@ def test_csv_json_check_with_file_and_utf_32(filename="example-docs/fake-text-ut
|
||||
|
||||
with open(filename, "rb") as f:
|
||||
assert _is_text_file_a_json(file=f) is False
|
||||
|
||||
|
||||
def test_detect_filetype_detects_empty_filename(filename="example-docs/empty.txt"):
|
||||
assert detect_filetype(filename=filename) == FileType.EMPTY
|
||||
|
||||
|
||||
def test_detect_filetype_detects_empty_file(filename="example-docs/empty.txt"):
|
||||
with open(filename, "rb") as f:
|
||||
assert detect_filetype(file=f) == FileType.EMPTY
|
||||
|
@ -582,7 +582,7 @@ FILETYPE_TO_MODULE = {
|
||||
|
||||
@pytest.mark.parametrize("filetype", supported_filetypes)
|
||||
def test_file_specific_produces_correct_filetype(filetype: FileType):
|
||||
if filetype in (FileType.JPG, FileType.PNG):
|
||||
if filetype in (FileType.JPG, FileType.PNG, FileType.EMPTY):
|
||||
pytest.skip()
|
||||
extension = filetype.name.lower()
|
||||
filetype_module = (
|
||||
@ -594,7 +594,11 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
|
||||
for file in pathlib.Path("example-docs").iterdir():
|
||||
if file.is_file() and file.suffix == f".{extension}":
|
||||
elements = fun(str(file))
|
||||
assert all(el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] for el in elements)
|
||||
assert all(
|
||||
el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
|
||||
for el in elements
|
||||
if el.metadata.filetype is not None
|
||||
)
|
||||
break
|
||||
|
||||
|
||||
@ -790,3 +794,12 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
|
||||
assert isinstance(elements[0], Table)
|
||||
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
|
||||
assert elements[0].metadata.filetype == "text/csv"
|
||||
|
||||
|
||||
def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
|
||||
assert partition(filename=filename) == []
|
||||
|
||||
|
||||
def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
|
||||
with open(filename, "rb") as f:
|
||||
assert partition(file=f) == []
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.7.3" # pragma: no cover
|
||||
__version__ = "0.7.4-dev0" # pragma: no cover
|
||||
|
@ -50,6 +50,7 @@ EXPECTED_PPTX_FILES = [
|
||||
|
||||
class FileType(Enum):
|
||||
UNK = 0
|
||||
EMPTY = 1
|
||||
|
||||
# MS Office Types
|
||||
DOC = 10
|
||||
@ -121,6 +122,7 @@ STR_TO_FILETYPE = {
|
||||
"message/rfc822": FileType.EML,
|
||||
"application/x-ole-storage": FileType.MSG,
|
||||
"application/vnd.ms-outlook": FileType.MSG,
|
||||
"inode/x-empty": FileType.EMPTY,
|
||||
}
|
||||
|
||||
MIMETYPES_TO_EXCLUDE = [
|
||||
@ -300,6 +302,9 @@ def detect_filetype(
|
||||
# later if needed.
|
||||
return FileType.TXT
|
||||
|
||||
elif mime_type.endswith("empty"):
|
||||
return FileType.EMPTY
|
||||
|
||||
# For everything else
|
||||
elif mime_type in STR_TO_FILETYPE:
|
||||
return STR_TO_FILETYPE[mime_type]
|
||||
|
@ -204,6 +204,8 @@ def partition(
|
||||
elements = partition_xlsx(filename=filename, file=file)
|
||||
elif filetype == FileType.CSV:
|
||||
elements = partition_csv(filename=filename, file=file)
|
||||
elif filetype == FileType.EMPTY:
|
||||
elements = []
|
||||
else:
|
||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
||||
|
Loading…
x
Reference in New Issue
Block a user