mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-17 21:29:05 +00:00
fix: update detect_filetype to properly handle older office files (#161)
This commit is contained in:
parent
08ccee0acb
commit
74ce2ae6e5
@@ -1,7 +1,8 @@
|
||||
## 0.4.3-dev1
|
||||
## 0.4.3-dev2
|
||||
|
||||
* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
|
||||
* Fix bug in `_parse_received_data`.
|
||||
* Update `detect_filetype` to properly handle `.doc`, `.xls`, and `.ppt`.
|
||||
|
||||
## 0.4.2
|
||||
|
||||
|
@@ -88,6 +88,24 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):
|
||||
assert filetype == FileType.ZIP
|
||||
|
||||
|
||||
def test_detect_doc_file_from_mime_type(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/msword")
|
||||
filetype = detect_filetype(filename="fake.doc")
|
||||
assert filetype == FileType.DOC
|
||||
|
||||
|
||||
def test_detect_ppt_file_from_mime_type(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-powerpoint")
|
||||
filetype = detect_filetype(filename="fake.ppt")
|
||||
assert filetype == FileType.PPT
|
||||
|
||||
|
||||
def test_detect_xls_file_from_mime_type(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-excel")
|
||||
filetype = detect_filetype(filename="fake.xls")
|
||||
assert filetype == FileType.XLS
|
||||
|
||||
|
||||
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
|
||||
|
@@ -1 +1 @@
|
||||
__version__ = "0.4.3-dev1" # pragma: no cover
|
||||
__version__ = "0.4.3-dev2" # pragma: no cover
|
||||
|
@@ -11,14 +11,25 @@ from unstructured.nlp.patterns import EMAIL_HEAD_RE
|
||||
|
||||
DOCX_MIME_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
]
|
||||
|
||||
DOC_MIME_TYPES = [
|
||||
"application/msword",
|
||||
]
|
||||
|
||||
XLSX_MIME_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
]
|
||||
|
||||
XLS_MIME_TYPES = [
|
||||
"application/vnd.ms-excel",
|
||||
]
|
||||
|
||||
PPTX_MIME_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
]
|
||||
|
||||
PPT_MIME_TYPES = [
|
||||
"application/vnd.ms-powerpoint",
|
||||
]
|
||||
|
||||
@@ -125,6 +136,9 @@ def detect_filetype(
|
||||
elif mime_type in DOCX_MIME_TYPES:
|
||||
return FileType.DOCX
|
||||
|
||||
elif mime_type in DOC_MIME_TYPES:
|
||||
return FileType.DOC
|
||||
|
||||
elif mime_type == "image/jpeg":
|
||||
return FileType.JPG
|
||||
|
||||
@@ -157,9 +171,15 @@ def detect_filetype(
|
||||
elif mime_type in XLSX_MIME_TYPES:
|
||||
return FileType.XLSX
|
||||
|
||||
elif mime_type in XLS_MIME_TYPES:
|
||||
return FileType.XLS
|
||||
|
||||
elif mime_type in PPTX_MIME_TYPES:
|
||||
return FileType.PPTX
|
||||
|
||||
elif mime_type in PPT_MIME_TYPES:
|
||||
return FileType.PPT
|
||||
|
||||
elif mime_type == "application/octet-stream":
|
||||
if file and not extension:
|
||||
return _detect_filetype_from_octet_stream(file=file)
|
||||
|
Loading…
x
Reference in New Issue
Block a user