mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-18 13:45:45 +00:00
fix: update detect_filetype
to properly handle older office files (#161)
This commit is contained in:
parent
08ccee0acb
commit
74ce2ae6e5
@ -1,7 +1,8 @@
|
|||||||
## 0.4.3-dev1
|
## 0.4.3-dev2
|
||||||
|
|
||||||
* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
|
* Fix in `exceeds_cap_ratio` so the function doesn't break with empty text
|
||||||
* Fix bug in `_parse_received_data`.
|
* Fix bug in `_parse_received_data`.
|
||||||
|
* Update `detect_filetype` to properly handle `.doc`, `.xls`, and `.ppt`.
|
||||||
|
|
||||||
## 0.4.2
|
## 0.4.2
|
||||||
|
|
||||||
|
@ -88,6 +88,24 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):
|
|||||||
assert filetype == FileType.ZIP
|
assert filetype == FileType.ZIP
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_doc_file_from_mime_type(monkeypatch):
|
||||||
|
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/msword")
|
||||||
|
filetype = detect_filetype(filename="fake.doc")
|
||||||
|
assert filetype == FileType.DOC
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_ppt_file_from_mime_type(monkeypatch):
|
||||||
|
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-powerpoint")
|
||||||
|
filetype = detect_filetype(filename="fake.ppt")
|
||||||
|
assert filetype == FileType.PPT
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_xls_file_from_mime_type(monkeypatch):
|
||||||
|
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-excel")
|
||||||
|
filetype = detect_filetype(filename="fake.xls")
|
||||||
|
assert filetype == FileType.XLS
|
||||||
|
|
||||||
|
|
||||||
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
|
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
|
||||||
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
|
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-excel.xlsx")
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.4.3-dev1" # pragma: no cover
|
__version__ = "0.4.3-dev2" # pragma: no cover
|
||||||
|
@ -11,14 +11,25 @@ from unstructured.nlp.patterns import EMAIL_HEAD_RE
|
|||||||
|
|
||||||
DOCX_MIME_TYPES = [
|
DOCX_MIME_TYPES = [
|
||||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
|
]
|
||||||
|
|
||||||
|
DOC_MIME_TYPES = [
|
||||||
"application/msword",
|
"application/msword",
|
||||||
]
|
]
|
||||||
|
|
||||||
XLSX_MIME_TYPES = [
|
XLSX_MIME_TYPES = [
|
||||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||||
|
]
|
||||||
|
|
||||||
|
XLS_MIME_TYPES = [
|
||||||
"application/vnd.ms-excel",
|
"application/vnd.ms-excel",
|
||||||
]
|
]
|
||||||
|
|
||||||
PPTX_MIME_TYPES = [
|
PPTX_MIME_TYPES = [
|
||||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||||
|
]
|
||||||
|
|
||||||
|
PPT_MIME_TYPES = [
|
||||||
"application/vnd.ms-powerpoint",
|
"application/vnd.ms-powerpoint",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -125,6 +136,9 @@ def detect_filetype(
|
|||||||
elif mime_type in DOCX_MIME_TYPES:
|
elif mime_type in DOCX_MIME_TYPES:
|
||||||
return FileType.DOCX
|
return FileType.DOCX
|
||||||
|
|
||||||
|
elif mime_type in DOC_MIME_TYPES:
|
||||||
|
return FileType.DOC
|
||||||
|
|
||||||
elif mime_type == "image/jpeg":
|
elif mime_type == "image/jpeg":
|
||||||
return FileType.JPG
|
return FileType.JPG
|
||||||
|
|
||||||
@ -157,9 +171,15 @@ def detect_filetype(
|
|||||||
elif mime_type in XLSX_MIME_TYPES:
|
elif mime_type in XLSX_MIME_TYPES:
|
||||||
return FileType.XLSX
|
return FileType.XLSX
|
||||||
|
|
||||||
|
elif mime_type in XLS_MIME_TYPES:
|
||||||
|
return FileType.XLS
|
||||||
|
|
||||||
elif mime_type in PPTX_MIME_TYPES:
|
elif mime_type in PPTX_MIME_TYPES:
|
||||||
return FileType.PPTX
|
return FileType.PPTX
|
||||||
|
|
||||||
|
elif mime_type in PPT_MIME_TYPES:
|
||||||
|
return FileType.PPT
|
||||||
|
|
||||||
elif mime_type == "application/octet-stream":
|
elif mime_type == "application/octet-stream":
|
||||||
if file and not extension:
|
if file and not extension:
|
||||||
return _detect_filetype_from_octet_stream(file=file)
|
return _detect_filetype_from_octet_stream(file=file)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user