fix: update detect_filetype to properly handle older office files (#161)

This commit is contained in:
Matt Robinson 2023-01-18 11:18:20 -05:00 committed by GitHub
parent 08ccee0acb
commit 74ce2ae6e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 41 additions and 2 deletions

View File

@ -1,7 +1,8 @@
## 0.4.3-dev1 ## 0.4.3-dev2
* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text * Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
* Fix bug in `_parse_received_data`. * Fix bug in `_parse_received_data`.
* Update `detect_filetype` to properly handle `.doc`, `.xls`, and `.ppt`.
## 0.4.2 ## 0.4.2

View File

@ -88,6 +88,24 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):
assert filetype == FileType.ZIP assert filetype == FileType.ZIP
def test_detect_doc_file_from_mime_type(monkeypatch):
    """Check that `detect_filetype` returns `FileType.DOC` for the msword MIME type.

    `magic.from_file` is replaced with a stub, so the file named below is
    never read; only the MIME-type-to-FileType mapping is exercised.
    """

    def _report_msword(*args, **kwargs):
        # Stand-in for libmagic: always report the legacy Word MIME type.
        return "application/msword"

    monkeypatch.setattr(magic, "from_file", _report_msword)
    assert detect_filetype(filename="fake.doc") == FileType.DOC
def test_detect_ppt_file_from_mime_type(monkeypatch):
    """Check that `detect_filetype` returns `FileType.PPT` for the ms-powerpoint MIME type.

    `magic.from_file` is replaced with a stub, so the file named below is
    never read; only the MIME-type-to-FileType mapping is exercised.
    """

    def _report_ms_powerpoint(*args, **kwargs):
        # Stand-in for libmagic: always report the legacy PowerPoint MIME type.
        return "application/vnd.ms-powerpoint"

    monkeypatch.setattr(magic, "from_file", _report_ms_powerpoint)
    assert detect_filetype(filename="fake.ppt") == FileType.PPT
def test_detect_xls_file_from_mime_type(monkeypatch):
    """Check that `detect_filetype` returns `FileType.XLS` for the ms-excel MIME type.

    `magic.from_file` is replaced with a stub, so the file named below is
    never read; only the MIME-type-to-FileType mapping is exercised.
    """

    def _report_ms_excel(*args, **kwargs):
        # Stand-in for libmagic: always report the legacy Excel MIME type.
        return "application/vnd.ms-excel"

    monkeypatch.setattr(magic, "from_file", _report_ms_excel)
    assert detect_filetype(filename="fake.xls") == FileType.XLS
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch): def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream") monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")

View File

@ -1 +1 @@
__version__ = "0.4.3-dev1" # pragma: no cover __version__ = "0.4.3-dev2" # pragma: no cover

View File

@ -11,14 +11,25 @@ from unstructured.nlp.patterns import EMAIL_HEAD_RE
DOCX_MIME_TYPES = [ DOCX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]
DOC_MIME_TYPES = [
"application/msword", "application/msword",
] ]
XLSX_MIME_TYPES = [ XLSX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
XLS_MIME_TYPES = [
"application/vnd.ms-excel", "application/vnd.ms-excel",
] ]
PPTX_MIME_TYPES = [ PPTX_MIME_TYPES = [
"application/vnd.openxmlformats-officedocument.presentationml.presentation", "application/vnd.openxmlformats-officedocument.presentationml.presentation",
]
PPT_MIME_TYPES = [
"application/vnd.ms-powerpoint", "application/vnd.ms-powerpoint",
] ]
@ -125,6 +136,9 @@ def detect_filetype(
elif mime_type in DOCX_MIME_TYPES: elif mime_type in DOCX_MIME_TYPES:
return FileType.DOCX return FileType.DOCX
elif mime_type in DOC_MIME_TYPES:
return FileType.DOC
elif mime_type == "image/jpeg": elif mime_type == "image/jpeg":
return FileType.JPG return FileType.JPG
@ -157,9 +171,15 @@ def detect_filetype(
elif mime_type in XLSX_MIME_TYPES: elif mime_type in XLSX_MIME_TYPES:
return FileType.XLSX return FileType.XLSX
elif mime_type in XLS_MIME_TYPES:
return FileType.XLS
elif mime_type in PPTX_MIME_TYPES: elif mime_type in PPTX_MIME_TYPES:
return FileType.PPTX return FileType.PPTX
elif mime_type in PPT_MIME_TYPES:
return FileType.PPT
elif mime_type == "application/octet-stream": elif mime_type == "application/octet-stream":
if file and not extension: if file and not extension:
return _detect_filetype_from_octet_stream(file=file) return _detect_filetype_from_octet_stream(file=file)