mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-24 17:41:15 +00:00
enhancement: handling for empty files in detect_filetype and partition (#710)
* add empty filetype
* add empty handling to partition
* changelog and version
This commit is contained in:
parent
80f0b4a132
commit
19ab6d960f
10
CHANGELOG.md
10
CHANGELOG.md
@ -1,3 +1,13 @@
|
|||||||
|
## 0.7.4-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
* Adds handling for empty files in `detect_filetype` and `partition`.
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
## 0.7.3
|
## 0.7.3
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
0
example-docs/empty.txt
Normal file
0
example-docs/empty.txt
Normal file
1
test_file.html
Normal file
1
test_file.html
Normal file
@ -0,0 +1 @@
|
|||||||
|
<h3 class="l_titel">Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020</h3>
|
@ -408,3 +408,12 @@ def test_csv_json_check_with_file_and_utf_32(filename="example-docs/fake-text-ut
|
|||||||
|
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
assert _is_text_file_a_json(file=f) is False
|
assert _is_text_file_a_json(file=f) is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_filetype_detects_empty_filename(filename="example-docs/empty.txt"):
|
||||||
|
assert detect_filetype(filename=filename) == FileType.EMPTY
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_filetype_detects_empty_file(filename="example-docs/empty.txt"):
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
assert detect_filetype(file=f) == FileType.EMPTY
|
||||||
|
@ -582,7 +582,7 @@ FILETYPE_TO_MODULE = {
|
|||||||
|
|
||||||
@pytest.mark.parametrize("filetype", supported_filetypes)
|
@pytest.mark.parametrize("filetype", supported_filetypes)
|
||||||
def test_file_specific_produces_correct_filetype(filetype: FileType):
|
def test_file_specific_produces_correct_filetype(filetype: FileType):
|
||||||
if filetype in (FileType.JPG, FileType.PNG):
|
if filetype in (FileType.JPG, FileType.PNG, FileType.EMPTY):
|
||||||
pytest.skip()
|
pytest.skip()
|
||||||
extension = filetype.name.lower()
|
extension = filetype.name.lower()
|
||||||
filetype_module = (
|
filetype_module = (
|
||||||
@ -594,7 +594,11 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
|
|||||||
for file in pathlib.Path("example-docs").iterdir():
|
for file in pathlib.Path("example-docs").iterdir():
|
||||||
if file.is_file() and file.suffix == f".{extension}":
|
if file.is_file() and file.suffix == f".{extension}":
|
||||||
elements = fun(str(file))
|
elements = fun(str(file))
|
||||||
assert all(el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] for el in elements)
|
assert all(
|
||||||
|
el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
|
||||||
|
for el in elements
|
||||||
|
if el.metadata.filetype is not None
|
||||||
|
)
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
||||||
@ -790,3 +794,12 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
|
|||||||
assert isinstance(elements[0], Table)
|
assert isinstance(elements[0], Table)
|
||||||
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
|
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
|
||||||
assert elements[0].metadata.filetype == "text/csv"
|
assert elements[0].metadata.filetype == "text/csv"
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
|
||||||
|
assert partition(filename=filename) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
|
||||||
|
with open(filename, "rb") as f:
|
||||||
|
assert partition(file=f) == []
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.7.3" # pragma: no cover
|
__version__ = "0.7.4-dev0" # pragma: no cover
|
||||||
|
@ -50,6 +50,7 @@ EXPECTED_PPTX_FILES = [
|
|||||||
|
|
||||||
class FileType(Enum):
|
class FileType(Enum):
|
||||||
UNK = 0
|
UNK = 0
|
||||||
|
EMPTY = 1
|
||||||
|
|
||||||
# MS Office Types
|
# MS Office Types
|
||||||
DOC = 10
|
DOC = 10
|
||||||
@ -121,6 +122,7 @@ STR_TO_FILETYPE = {
|
|||||||
"message/rfc822": FileType.EML,
|
"message/rfc822": FileType.EML,
|
||||||
"application/x-ole-storage": FileType.MSG,
|
"application/x-ole-storage": FileType.MSG,
|
||||||
"application/vnd.ms-outlook": FileType.MSG,
|
"application/vnd.ms-outlook": FileType.MSG,
|
||||||
|
"inode/x-empty": FileType.EMPTY,
|
||||||
}
|
}
|
||||||
|
|
||||||
MIMETYPES_TO_EXCLUDE = [
|
MIMETYPES_TO_EXCLUDE = [
|
||||||
@ -300,6 +302,9 @@ def detect_filetype(
|
|||||||
# later if needed.
|
# later if needed.
|
||||||
return FileType.TXT
|
return FileType.TXT
|
||||||
|
|
||||||
|
elif mime_type.endswith("empty"):
|
||||||
|
return FileType.EMPTY
|
||||||
|
|
||||||
# For everything else
|
# For everything else
|
||||||
elif mime_type in STR_TO_FILETYPE:
|
elif mime_type in STR_TO_FILETYPE:
|
||||||
return STR_TO_FILETYPE[mime_type]
|
return STR_TO_FILETYPE[mime_type]
|
||||||
|
@ -204,6 +204,8 @@ def partition(
|
|||||||
elements = partition_xlsx(filename=filename, file=file)
|
elements = partition_xlsx(filename=filename, file=file)
|
||||||
elif filetype == FileType.CSV:
|
elif filetype == FileType.CSV:
|
||||||
elements = partition_csv(filename=filename, file=file)
|
elements = partition_csv(filename=filename, file=file)
|
||||||
|
elif filetype == FileType.EMPTY:
|
||||||
|
elements = []
|
||||||
else:
|
else:
|
||||||
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
msg = "Invalid file" if not filename else f"Invalid file {filename}"
|
||||||
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user