diff --git a/CHANGELOG.md b/CHANGELOG.md index f48d97142..db22e7e55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,8 @@ -## 0.4.3-dev1 +## 0.4.3-dev2 * Fix in `exceeds_cap_ratio` so the function doesn't break with empty text * Fix bug in `_parse_received_data`. +* Update `detect_filetype` to properly handle `.doc`, `.xls`, and `.ppt`. ## 0.4.2 diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index df4780185..e4a5cffa4 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -88,6 +88,24 @@ def test_detect_application_zip_files(monkeypatch, tmpdir): assert filetype == FileType.ZIP +def test_detect_doc_file_from_mime_type(monkeypatch): + monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/msword") + filetype = detect_filetype(filename="fake.doc") + assert filetype == FileType.DOC + + +def test_detect_ppt_file_from_mime_type(monkeypatch): + monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-powerpoint") + filetype = detect_filetype(filename="fake.ppt") + assert filetype == FileType.PPT + + +def test_detect_xls_file_from_mime_type(monkeypatch): + monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-excel") + filetype = detect_filetype(filename="fake.xls") + assert filetype == FileType.XLS + + def test_detect_xlsx_filetype_application_octet_stream(monkeypatch): monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx") diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 4b2a24857..c5b74f907 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.4.3-dev1" # pragma: no cover +__version__ = "0.4.3-dev2" # pragma: no cover diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index e206315d2..be8b37b7c 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -11,14 +11,25 @@ from unstructured.nlp.patterns import EMAIL_HEAD_RE DOCX_MIME_TYPES = [ "application/vnd.openxmlformats-officedocument.wordprocessingml.document", +] + +DOC_MIME_TYPES = [ "application/msword", ] + XLSX_MIME_TYPES = [ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", +] + +XLS_MIME_TYPES = [ "application/vnd.ms-excel", ] + PPTX_MIME_TYPES = [ "application/vnd.openxmlformats-officedocument.presentationml.presentation", +] + +PPT_MIME_TYPES = [ "application/vnd.ms-powerpoint", ] @@ -125,6 +136,9 @@ def detect_filetype( elif mime_type in DOCX_MIME_TYPES: return FileType.DOCX + elif mime_type in DOC_MIME_TYPES: + return FileType.DOC + elif mime_type == "image/jpeg": return FileType.JPG @@ -157,9 +171,15 @@ def detect_filetype( elif mime_type in XLSX_MIME_TYPES: return FileType.XLSX + elif mime_type in XLS_MIME_TYPES: + return FileType.XLS + elif mime_type in PPTX_MIME_TYPES: return FileType.PPTX + elif mime_type in PPT_MIME_TYPES: + return FileType.PPT + elif mime_type == "application/octet-stream": if file and not extension: return _detect_filetype_from_octet_stream(file=file)