fix: update detect_filetype to properly handle older office files (#161)

This commit is contained in:
Matt Robinson 2023-01-18 11:18:20 -05:00 committed by GitHub
parent 08ccee0acb
commit 74ce2ae6e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 41 additions and 2 deletions

View File

@ -1,7 +1,8 @@
## 0.4.3-dev1
## 0.4.3-dev2
* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
* Fix bug in `_parse_received_data`.
* Update `detect_filetype` to properly handle `.doc`, `.xls`, and `.ppt`.
## 0.4.2

View File

@ -88,6 +88,24 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):
assert filetype == FileType.ZIP
def test_detect_doc_file_from_mime_type(monkeypatch):
    """With ``magic.from_file`` patched to report ``application/msword``,
    ``detect_filetype`` must return ``FileType.DOC``."""

    def _fake_from_file(*args, **kwargs):
        # Stand-in for magic.from_file: ignore all inputs and always report
        # the MIME type under test.
        return "application/msword"

    monkeypatch.setattr(magic, "from_file", _fake_from_file)
    assert detect_filetype(filename="fake.doc") == FileType.DOC
def test_detect_ppt_file_from_mime_type(monkeypatch):
    """With ``magic.from_file`` patched to report
    ``application/vnd.ms-powerpoint``, ``detect_filetype`` must return
    ``FileType.PPT``."""

    def _fake_from_file(*args, **kwargs):
        # Stand-in for magic.from_file: ignore all inputs and always report
        # the MIME type under test.
        return "application/vnd.ms-powerpoint"

    monkeypatch.setattr(magic, "from_file", _fake_from_file)
    assert detect_filetype(filename="fake.ppt") == FileType.PPT
def test_detect_xls_file_from_mime_type(monkeypatch):
    """With ``magic.from_file`` patched to report ``application/vnd.ms-excel``,
    ``detect_filetype`` must return ``FileType.XLS``."""

    def _fake_from_file(*args, **kwargs):
        # Stand-in for magic.from_file: ignore all inputs and always report
        # the MIME type under test.
        return "application/vnd.ms-excel"

    monkeypatch.setattr(magic, "from_file", _fake_from_file)
    assert detect_filetype(filename="fake.xls") == FileType.XLS
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")

View File

@ -1 +1 @@
# NOTE(review): these two lines look like the old and new sides of a one-line
# version bump (dev1 -> dev2) — confirm against the actual file. If both
# assignments are executed as written, the second one wins and __version__
# ends up as "0.4.3-dev2".
__version__ = "0.4.3-dev1" # pragma: no cover
__version__ = "0.4.3-dev2" # pragma: no cover

View File

@ -11,14 +11,25 @@ from unstructured.nlp.patterns import EMAIL_HEAD_RE
# MIME-type lookup tables. Each constant is a list of MIME strings; a detected
# MIME type is matched against them with ``in``.

# Word documents: OOXML (.docx) and legacy binary (.doc).
DOCX_MIME_TYPES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
DOC_MIME_TYPES = ["application/msword"]

# Excel workbooks: OOXML (.xlsx) and legacy binary (.xls).
XLSX_MIME_TYPES = [
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
XLS_MIME_TYPES = ["application/vnd.ms-excel"]

# PowerPoint presentations: OOXML (.pptx) and legacy binary (.ppt).
PPTX_MIME_TYPES = [
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
]
PPT_MIME_TYPES = ["application/vnd.ms-powerpoint"]
@ -125,6 +136,9 @@ def detect_filetype(
elif mime_type in DOCX_MIME_TYPES:
return FileType.DOCX
elif mime_type in DOC_MIME_TYPES:
return FileType.DOC
elif mime_type == "image/jpeg":
return FileType.JPG
@ -157,9 +171,15 @@ def detect_filetype(
elif mime_type in XLSX_MIME_TYPES:
return FileType.XLSX
elif mime_type in XLS_MIME_TYPES:
return FileType.XLS
elif mime_type in PPTX_MIME_TYPES:
return FileType.PPTX
elif mime_type in PPT_MIME_TYPES:
return FileType.PPT
elif mime_type == "application/octet-stream":
if file and not extension:
return _detect_filetype_from_octet_stream(file=file)