enhancement: handling for empty files in detect_filetype and partition (#710)

* add empty filetype

* add empty handling to partition

* changelog and version
This commit is contained in:
Matt Robinson 2023-06-09 16:07:50 -04:00 committed by GitHub
parent 80f0b4a132
commit 19ab6d960f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 43 additions and 3 deletions

View File

@ -1,3 +1,13 @@
## 0.7.4-dev0
### Enhancements
* Adds handling for empty files in `detect_filetype` and `partition`.
### Features
### Fixes
## 0.7.3
### Enhancements

0
example-docs/empty.txt Normal file
View File

1
test_file.html Normal file
View File

@ -0,0 +1 @@
<h3 class="l_titel">Jahresabschluss zum Geschäftsjahr vom 01.01.2020 bis zum 31.12.2020</h3>

View File

@ -408,3 +408,12 @@ def test_csv_json_check_with_file_and_utf_32(filename="example-docs/fake-text-ut
with open(filename, "rb") as f:
assert _is_text_file_a_json(file=f) is False
def test_detect_filetype_detects_empty_filename(filename="example-docs/empty.txt"):
    """Check that `detect_filetype` reports `FileType.EMPTY` when given the path of the empty example file."""
    detected = detect_filetype(filename=filename)
    assert detected == FileType.EMPTY
def test_detect_filetype_detects_empty_file(filename="example-docs/empty.txt"):
    """Check that `detect_filetype` reports `FileType.EMPTY` when given the empty example file as an open binary handle."""
    # Opened in binary mode so detection goes through the file-object path rather than the filename path.
    with open(filename, "rb") as empty_file:
        detected = detect_filetype(file=empty_file)
        assert detected == FileType.EMPTY

View File

@ -582,7 +582,7 @@ FILETYPE_TO_MODULE = {
@pytest.mark.parametrize("filetype", supported_filetypes)
def test_file_specific_produces_correct_filetype(filetype: FileType):
if filetype in (FileType.JPG, FileType.PNG):
if filetype in (FileType.JPG, FileType.PNG, FileType.EMPTY):
pytest.skip()
extension = filetype.name.lower()
filetype_module = (
@ -594,7 +594,11 @@ def test_file_specific_produces_correct_filetype(filetype: FileType):
for file in pathlib.Path("example-docs").iterdir():
if file.is_file() and file.suffix == f".{extension}":
elements = fun(str(file))
assert all(el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype] for el in elements)
assert all(
el.metadata.filetype == FILETYPE_TO_MIMETYPE[filetype]
for el in elements
if el.metadata.filetype is not None
)
break
@ -790,3 +794,12 @@ def test_auto_partition_csv_from_file(filename="example-docs/stanley-cups.csv"):
assert isinstance(elements[0], Table)
assert elements[0].metadata.text_as_html == EXPECTED_XLSX_TABLE
assert elements[0].metadata.filetype == "text/csv"
def test_auto_partition_works_on_empty_filename(filename="example-docs/empty.txt"):
    """Check that `partition` returns an empty element list when given the path of the empty example file."""
    elements = partition(filename=filename)
    assert elements == []
def test_auto_partition_works_on_empty_file(filename="example-docs/empty.txt"):
    """Check that `partition` returns an empty element list when given the empty example file as an open binary handle."""
    # Opened in binary mode so partitioning goes through the file-object path rather than the filename path.
    with open(filename, "rb") as empty_file:
        elements = partition(file=empty_file)
        assert elements == []

View File

@ -1 +1 @@
__version__ = "0.7.3" # pragma: no cover
__version__ = "0.7.4-dev0" # pragma: no cover

View File

@ -50,6 +50,7 @@ EXPECTED_PPTX_FILES = [
class FileType(Enum):
UNK = 0
EMPTY = 1
# MS Office Types
DOC = 10
@ -121,6 +122,7 @@ STR_TO_FILETYPE = {
"message/rfc822": FileType.EML,
"application/x-ole-storage": FileType.MSG,
"application/vnd.ms-outlook": FileType.MSG,
"inode/x-empty": FileType.EMPTY,
}
MIMETYPES_TO_EXCLUDE = [
@ -300,6 +302,9 @@ def detect_filetype(
# later if needed.
return FileType.TXT
elif mime_type.endswith("empty"):
return FileType.EMPTY
# For everything else
elif mime_type in STR_TO_FILETYPE:
return STR_TO_FILETYPE[mime_type]

View File

@ -204,6 +204,8 @@ def partition(
elements = partition_xlsx(filename=filename, file=file)
elif filetype == FileType.CSV:
elements = partition_csv(filename=filename, file=file)
elif filetype == FileType.EMPTY:
elements = []
else:
msg = "Invalid file" if not filename else f"Invalid file {filename}"
raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")