refactor: update detect_filetype() to use hashmap for mime type return (#591)

* Update detect_filetype() to use hashmap for mime type return
* fix: text mime type and linting
* fix: declare docx and xlsx mime types locally and also fix linting
* Update CHANGELOG.md
* tweaks for failing tests

---------

Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
2025-12-29 16:17:00 +00:00 · 2023-05-17 21:48:52 +08:00 · 2023-05-17 21:48:52 +08:00 · 7eac1f8ca7
commit 7eac1f8ca7
parent f4f40f58e3
3 changed files with 79 additions and 108 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,8 @@
  partition strategy in CLI. For example, `--partition-strategy fast`.
 * Added metadata for filetype.
 * Add Discord connector to pull messages from a list of channels
+* Refactor `unstructured/file_utils/filetype.py` to better utilise a hashmap to return the mime type.
+* Add local declaration of DOCX_MIME_TYPES and XLSX_MIME_TYPES for `test_filetype.py`.

 ### Features

--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -7,8 +7,6 @@ import pytest

 from unstructured.file_utils import filetype
 from unstructured.file_utils.filetype import (
-    DOCX_MIME_TYPES,
-    XLSX_MIME_TYPES,
    FileType,
    _is_text_file_a_json,
    detect_filetype,
@ -17,6 +15,14 @@ from unstructured.file_utils.filetype import (
 FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
 EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs")

+DOCX_MIME_TYPES = [
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+]
+
+XLSX_MIME_TYPES = [
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+]
+

@pytest.mark.parametrize(
    ("file", "expected"),
@ -142,7 +148,11 @@ def test_detect_html_text_xml(monkeypatch):


 def test_detect_docx_filetype_application_octet_stream(monkeypatch):
-    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_buffer",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
    with open(filename, "rb") as f:
        filetype = detect_filetype(file=f)
@ -150,7 +160,11 @@ def test_detect_docx_filetype_application_octet_stream(monkeypatch):


 def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
    filetype = detect_filetype(filename=filename)
    assert filetype == FileType.DOCX
@ -173,25 +187,41 @@ def test_detect_application_zip_files(monkeypatch, tmpdir):


 def test_detect_doc_file_from_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/msword")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/msword",
+    )
    filetype = detect_filetype(filename="fake.doc")
    assert filetype == FileType.DOC


 def test_detect_ppt_file_from_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-powerpoint")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/vnd.ms-powerpoint",
+    )
    filetype = detect_filetype(filename="fake.ppt")
    assert filetype == FileType.PPT


 def test_detect_xls_file_from_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/vnd.ms-excel")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/vnd.ms-excel",
+    )
    filetype = detect_filetype(filename="fake.xls")
    assert filetype == FileType.XLS


 def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
-    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_buffer",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
    with open(filename, "rb") as f:
        filetype = detect_filetype(file=f)
@ -199,14 +229,22 @@ def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):


 def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
    filetype = detect_filetype(filename=filename)
    assert filetype == FileType.XLSX


 def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
-    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_buffer",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    with open(filename, "rb") as f:
        filetype = detect_filetype(file=f)
@ -214,14 +252,22 @@ def test_detect_pptx_filetype_application_octet_stream(monkeypatch):


 def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_file",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
    filetype = detect_filetype(filename=filename)
    assert filetype == FileType.PPTX


 def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch):
-    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/octet-stream")
+    monkeypatch.setattr(
+        magic,
+        "from_buffer",
+        lambda *args, **kwargs: "application/octet-stream",
+    )
    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
    with open(filename, "rb") as f:
        filetype = detect_filetype(file=f)
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -20,54 +20,11 @@ except ImportError:  # pragma: nocover
 from unstructured.logger import logger
 from unstructured.nlp.patterns import EMAIL_HEAD_RE

-DOCX_MIME_TYPES = [
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-]
-
-DOC_MIME_TYPES = [
-    "application/msword",
-]
-
-ODT_MIME_TYPES = [
-    "application/vnd.oasis.opendocument.text",
-]
-
-XLSX_MIME_TYPES = [
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-]
-
-XLS_MIME_TYPES = [
-    "application/vnd.ms-excel",
-]
-
-PPTX_MIME_TYPES = [
-    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-]
-
-PPT_MIME_TYPES = [
-    "application/vnd.ms-powerpoint",
-]
-
-MSG_MIME_TYPES = [
-    "application/vnd.ms-outlook",
-    "application/x-ole-storage",
-]
-
 TXT_MIME_TYPES = [
    "text/plain",
    "message/rfc822",  # ref: https://www.rfc-editor.org/rfc/rfc822
 ]

-MD_MIME_TYPES = [
-    "text/markdown",
-    "text/x-markdown",
-]
-
-EPUB_MIME_TYPES = [
-    "application/epub",
-    "application/epub+zip",
-]
-
 # NOTE(robinson) - .docx/.xlsx files are actually zip files with a .docx/.xlsx extension.
 # If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
 # looking for expected filenames within the zip file.
@ -141,6 +98,7 @@ STR_TO_FILETYPE = {
    "application/epub+zip": FileType.EPUB,
    "application/json": FileType.JSON,
    "application/rtf": FileType.RTF,
+    "text/rtf": FileType.RTF,
    "text/html": FileType.HTML,
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
    "application/vnd.ms-excel": FileType.XLS,
@ -149,6 +107,7 @@ STR_TO_FILETYPE = {
    "application/xml": FileType.XML,
    "application/vnd.oasis.opendocument.text": FileType.ODT,
    "message/rfc822": FileType.EML,
+    "application/x-ole-storage": FileType.MSG,
    "application/vnd.ms-outlook": FileType.MSG,
 }

@ -206,13 +165,9 @@ def detect_filetype(
        extension = extension.lower()
        if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE:
            mime_type = magic.from_file(filename or file_filename, mime=True)  # type: ignore
-            # NOTE(crag): for older versions of the OS libmagic package, such as is currently
-            # installed on the Unstructured docker image, .json files resolve to "text/plain"
-            # rather than "application/json". this corrects for that case.
-            if mime_type == "text/plain" and extension == ".json":
-                return FileType.JSON
        else:
            return EXT_TO_FILETYPE.get(extension.lower(), FileType.UNK)
+
    elif file is not None:
        extension = None
        # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
@ -229,51 +184,20 @@ def detect_filetype(
    else:
        raise ValueError("No filename, file, nor file_filename were specified.")

-    if mime_type == "application/pdf":
-        return FileType.PDF
+    """Mime type special cases."""

-    elif mime_type == "application/json":
+    # NOTE(crag): for older versions of the OS libmagic package, such as is currently
+    # installed on the Unstructured docker image, .json files resolve to "text/plain"
+    # rather than "application/json". this corrects for that case.
+    if mime_type == "text/plain" and extension == ".json":
        return FileType.JSON

-    elif mime_type in DOCX_MIME_TYPES:
-        return FileType.DOCX
-
-    elif mime_type in DOC_MIME_TYPES:
-        return FileType.DOC
-
-    elif mime_type in ODT_MIME_TYPES:
-        return FileType.ODT
-
-    elif mime_type in MSG_MIME_TYPES:
-        return FileType.MSG
-
-    elif mime_type == "image/jpeg":
-        return FileType.JPG
-
-    elif mime_type == "image/png":
-        return FileType.PNG
-
-    elif mime_type in MD_MIME_TYPES:
-        # NOTE - I am not sure whether libmagic ever returns these mimetypes.
-        return FileType.MD
-
-    elif mime_type in EPUB_MIME_TYPES:
-        return FileType.EPUB
-
-    # NOTE(robinson) - examples are application/rtf or text/rtf.
-    # magic often returns text/plain for RTF files
-    elif mime_type.endswith("rtf"):
-        return FileType.RTF
-
    elif mime_type.endswith("xml"):
        if extension and (extension == ".html" or extension == ".htm"):
            return FileType.HTML
        else:
            return FileType.XML

-    elif mime_type == "text/html":
-        return FileType.HTML
-
    elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
        if extension and extension == ".eml":
            return FileType.EML
@ -281,26 +205,21 @@ def detect_filetype(
            return FileType.MD
        elif extension and extension == ".rtf":
            return FileType.RTF
+        elif extension and extension == ".html":
+            return FileType.HTML

        if _is_text_file_a_json(file=file, filename=filename):
            return FileType.JSON

        if file and not extension and _check_eml_from_buffer(file=file) is True:
            return FileType.EML
+
+        # Safety catch: use the explicit STR_TO_FILETYPE mapping when one exists
+        if mime_type in STR_TO_FILETYPE:
+            return STR_TO_FILETYPE[mime_type]
+
        return FileType.TXT

-    elif mime_type in XLSX_MIME_TYPES:
-        return FileType.XLSX
-
-    elif mime_type in XLS_MIME_TYPES:
-        return FileType.XLS
-
-    elif mime_type in PPTX_MIME_TYPES:
-        return FileType.PPTX
-
-    elif mime_type in PPT_MIME_TYPES:
-        return FileType.PPT
-
    elif mime_type == "application/octet-stream":
        if file and not extension:
            return _detect_filetype_from_octet_stream(file=file)
@ -321,6 +240,10 @@ def detect_filetype(
        else:
            return EXT_TO_FILETYPE.get(extension.lower(), filetype)

+    # For everything else
+    elif mime_type in STR_TO_FILETYPE:
+        return STR_TO_FILETYPE[mime_type]
+
    logger.warning(
        f"The MIME type{f' of {filename!r}' if filename else ''} is {mime_type!r}. "
        "This file type is not currently supported in unstructured.",