mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
refactor: update detect_filetype() to use hashmap for mime type return (#591)
* Update detect_filetype() to use hashmap for mime type return
* fix: text mime type and linting
* fix: declare docx and xlsx mime types locally and also fix linting
* Update CHANGELOG.md
* tweaks for failing tests

---------

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
f4f40f58e3
commit
7eac1f8ca7
@ -7,6 +7,8 @@
|
||||
partition strategy in CLI. For example, `--partition-strategy fast`.
|
||||
* Added metadata for filetype.
|
||||
* Add Discord connector to pull messages from a list of channels
|
||||
* Refactor `unstructured/file_utils/filetype.py` to make better use of a hashmap when returning the MIME type.
|
||||
* Add local declaration of DOCX_MIME_TYPES and XLSX_MIME_TYPES for `test_filetype.py`.
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -7,8 +7,6 @@ import pytest
|
||||
|
||||
from unstructured.file_utils import filetype
|
||||
from unstructured.file_utils.filetype import (
|
||||
DOCX_MIME_TYPES,
|
||||
XLSX_MIME_TYPES,
|
||||
FileType,
|
||||
_is_text_file_a_json,
|
||||
detect_filetype,
|
||||
@ -17,6 +15,14 @@ from unstructured.file_utils.filetype import (
|
||||
FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs")
|
||||
|
||||
DOCX_MIME_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
]
|
||||
|
||||
XLSX_MIME_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("file", "expected"),
|
||||
@ -142,7 +148,11 @@ def test_detect_html_text_xml(monkeypatch):
|
||||
|
||||
|
||||
def test_detect_docx_filetype_application_octet_stream(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_buffer",
|
||||
lambda *args, **kwargs: "application/octet-stream",
|
||||
)
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
|
||||
with open(filename, "rb") as f:
|
||||
filetype = detect_filetype(file=f)
|
||||
@ -150,7 +160,11 @@ def test_detect_docx_filetype_application_octet_stream(monkeypatch):
|
||||
|
||||
|
||||
def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_file",
|
||||
lambda *args, **kwargs: "application/octet-stream",
|
||||
)
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
|
||||
filetype = detect_filetype(filename=filename)
|
||||
assert filetype == FileType.DOCX
|
||||
@ -173,25 +187,41 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):
|
||||
|
||||
|
||||
def test_detect_doc_file_from_mime_type(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/msword")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_file",
|
||||
lambda *args, **kwargs: "application/msword",
|
||||
)
|
||||
filetype = detect_filetype(filename="fake.doc")
|
||||
assert filetype == FileType.DOC
|
||||
|
||||
|
||||
def test_detect_ppt_file_from_mime_type(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-powerpoint")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_file",
|
||||
lambda *args, **kwargs: "application/vnd.ms-powerpoint",
|
||||
)
|
||||
filetype = detect_filetype(filename="fake.ppt")
|
||||
assert filetype == FileType.PPT
|
||||
|
||||
|
||||
def test_detect_xls_file_from_mime_type(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-excel")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_file",
|
||||
lambda *args, **kwargs: "application/vnd.ms-excel",
|
||||
)
|
||||
filetype = detect_filetype(filename="fake.xls")
|
||||
assert filetype == FileType.XLS
|
||||
|
||||
|
||||
def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_buffer",
|
||||
lambda *args, **kwargs: "application/octet-stream",
|
||||
)
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
|
||||
with open(filename, "rb") as f:
|
||||
filetype = detect_filetype(file=f)
|
||||
@ -199,14 +229,22 @@ def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
|
||||
|
||||
|
||||
def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_file",
|
||||
lambda *args, **kwargs: "application/octet-stream",
|
||||
)
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
|
||||
filetype = detect_filetype(filename=filename)
|
||||
assert filetype == FileType.XLSX
|
||||
|
||||
|
||||
def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_buffer",
|
||||
lambda *args, **kwargs: "application/octet-stream",
|
||||
)
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
with open(filename, "rb") as f:
|
||||
filetype = detect_filetype(file=f)
|
||||
@ -214,14 +252,22 @@ def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
|
||||
|
||||
|
||||
def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_file",
|
||||
lambda *args, **kwargs: "application/octet-stream",
|
||||
)
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
|
||||
filetype = detect_filetype(filename=filename)
|
||||
assert filetype == FileType.PPTX
|
||||
|
||||
|
||||
def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch):
|
||||
monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
|
||||
monkeypatch.setattr(
|
||||
magic,
|
||||
"from_buffer",
|
||||
lambda *args, **kwargs: "application/octet-stream",
|
||||
)
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
|
||||
with open(filename, "rb") as f:
|
||||
filetype = detect_filetype(file=f)
|
||||
|
||||
@ -20,54 +20,11 @@ except ImportError: # pragma: nocover
|
||||
from unstructured.logger import logger
|
||||
from unstructured.nlp.patterns import EMAIL_HEAD_RE
|
||||
|
||||
DOCX_MIME_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
]
|
||||
|
||||
DOC_MIME_TYPES = [
|
||||
"application/msword",
|
||||
]
|
||||
|
||||
ODT_MIME_TYPES = [
|
||||
"application/vnd.oasis.opendocument.text",
|
||||
]
|
||||
|
||||
XLSX_MIME_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
]
|
||||
|
||||
XLS_MIME_TYPES = [
|
||||
"application/vnd.ms-excel",
|
||||
]
|
||||
|
||||
PPTX_MIME_TYPES = [
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
]
|
||||
|
||||
PPT_MIME_TYPES = [
|
||||
"application/vnd.ms-powerpoint",
|
||||
]
|
||||
|
||||
MSG_MIME_TYPES = [
|
||||
"application/vnd.ms-outlook",
|
||||
"application/x-ole-storage",
|
||||
]
|
||||
|
||||
TXT_MIME_TYPES = [
|
||||
"text/plain",
|
||||
"message/rfc822", # ref: https://www.rfc-editor.org/rfc/rfc822
|
||||
]
|
||||
|
||||
MD_MIME_TYPES = [
|
||||
"text/markdown",
|
||||
"text/x-markdown",
|
||||
]
|
||||
|
||||
EPUB_MIME_TYPES = [
|
||||
"application/epub",
|
||||
"application/epub+zip",
|
||||
]
|
||||
|
||||
# NOTE(robinson) - .docx/.xlsx files are actually zip files with a .docx/.xlsx extension.
|
||||
# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
|
||||
# looking for expected filenames within the zip file.
|
||||
@ -141,6 +98,7 @@ STR_TO_FILETYPE = {
|
||||
"application/epub+zip": FileType.EPUB,
|
||||
"application/json": FileType.JSON,
|
||||
"application/rtf": FileType.RTF,
|
||||
"text/rtf": FileType.RTF,
|
||||
"text/html": FileType.HTML,
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
|
||||
"application/vnd.ms-excel": FileType.XLS,
|
||||
@ -149,6 +107,7 @@ STR_TO_FILETYPE = {
|
||||
"application/xml": FileType.XML,
|
||||
"application/vnd.oasis.opendocument.text": FileType.ODT,
|
||||
"message/rfc822": FileType.EML,
|
||||
"application/x-ole-storage": FileType.MSG,
|
||||
"application/vnd.ms-outlook": FileType.MSG,
|
||||
}
|
||||
|
||||
@ -206,13 +165,9 @@ def detect_filetype(
|
||||
extension = extension.lower()
|
||||
if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE:
|
||||
mime_type = magic.from_file(filename or file_filename, mime=True) # type: ignore
|
||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||
# rather than "application/json". this corrects for that case.
|
||||
if mime_type == "text/plain" and extension == ".json":
|
||||
return FileType.JSON
|
||||
else:
|
||||
return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
|
||||
|
||||
elif file is not None:
|
||||
extension = None
|
||||
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
|
||||
@ -229,51 +184,20 @@ def detect_filetype(
|
||||
else:
|
||||
raise ValueError("No filename, file, nor file_filename were specified.")
|
||||
|
||||
if mime_type == "application/pdf":
|
||||
return FileType.PDF
|
||||
"""Mime type special cases."""
|
||||
|
||||
elif mime_type == "application/json":
|
||||
# NOTE(crag): for older versions of the OS libmagic package, such as is currently
|
||||
# installed on the Unstructured docker image, .json files resolve to "text/plain"
|
||||
# rather than "application/json". this corrects for that case.
|
||||
if mime_type == "text/plain" and extension == ".json":
|
||||
return FileType.JSON
|
||||
|
||||
elif mime_type in DOCX_MIME_TYPES:
|
||||
return FileType.DOCX
|
||||
|
||||
elif mime_type in DOC_MIME_TYPES:
|
||||
return FileType.DOC
|
||||
|
||||
elif mime_type in ODT_MIME_TYPES:
|
||||
return FileType.ODT
|
||||
|
||||
elif mime_type in MSG_MIME_TYPES:
|
||||
return FileType.MSG
|
||||
|
||||
elif mime_type == "image/jpeg":
|
||||
return FileType.JPG
|
||||
|
||||
elif mime_type == "image/png":
|
||||
return FileType.PNG
|
||||
|
||||
elif mime_type in MD_MIME_TYPES:
|
||||
# NOTE - I am not sure whether libmagic ever returns these mimetypes.
|
||||
return FileType.MD
|
||||
|
||||
elif mime_type in EPUB_MIME_TYPES:
|
||||
return FileType.EPUB
|
||||
|
||||
# NOTE(robinson) - examples are application/rtf or text/rtf.
|
||||
# magic often returns text/plain for RTF files
|
||||
elif mime_type.endswith("rtf"):
|
||||
return FileType.RTF
|
||||
|
||||
elif mime_type.endswith("xml"):
|
||||
if extension and (extension == ".html" or extension == ".htm"):
|
||||
return FileType.HTML
|
||||
else:
|
||||
return FileType.XML
|
||||
|
||||
elif mime_type == "text/html":
|
||||
return FileType.HTML
|
||||
|
||||
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
|
||||
if extension and extension == ".eml":
|
||||
return FileType.EML
|
||||
@ -281,26 +205,21 @@ def detect_filetype(
|
||||
return FileType.MD
|
||||
elif extension and extension == ".rtf":
|
||||
return FileType.RTF
|
||||
elif extension and extension == ".html":
|
||||
return FileType.HTML
|
||||
|
||||
if _is_text_file_a_json(file=file, filename=filename):
|
||||
return FileType.JSON
|
||||
|
||||
if file and not extension and _check_eml_from_buffer(file=file) is True:
|
||||
return FileType.EML
|
||||
|
||||
# Safety catch
|
||||
if mime_type in STR_TO_FILETYPE:
|
||||
return STR_TO_FILETYPE[mime_type]
|
||||
|
||||
return FileType.TXT
|
||||
|
||||
elif mime_type in XLSX_MIME_TYPES:
|
||||
return FileType.XLSX
|
||||
|
||||
elif mime_type in XLS_MIME_TYPES:
|
||||
return FileType.XLS
|
||||
|
||||
elif mime_type in PPTX_MIME_TYPES:
|
||||
return FileType.PPTX
|
||||
|
||||
elif mime_type in PPT_MIME_TYPES:
|
||||
return FileType.PPT
|
||||
|
||||
elif mime_type == "application/octet-stream":
|
||||
if file and not extension:
|
||||
return _detect_filetype_from_octet_stream(file=file)
|
||||
@ -321,6 +240,10 @@ def detect_filetype(
|
||||
else:
|
||||
return EXT_TO_FILETYPE.get(extension.lower(), filetype)
|
||||
|
||||
# For everything else
|
||||
elif mime_type in STR_TO_FILETYPE:
|
||||
return STR_TO_FILETYPE[mime_type]
|
||||
|
||||
logger.warning(
|
||||
f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. "
|
||||
"This file type is not currently supported in unstructured.",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user