rfctr(file): improve file-type auto-detect (#3409)

**Summary** In preparation for further work on auto file-type detection, improve `filetype.py` and related modules: improve docstrings, improve type annotations, and extract the domain model to a new `.model` module.
2026-01-06 04:11:08 +00:00 · 2024-07-16 22:27:31 -07:00 · 2024-07-16 22:27:31 -07:00 · a5c9a3695c
commit a5c9a3695c
parent 48bdf94656
10 changed files with 431 additions and 366 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.15.0-dev13
+## 0.15.0-dev14

 ### Enhancements

--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -8,6 +8,7 @@ import io
 import os
 import pathlib

+import magic
 import pytest

 from test_unstructured.unit_utils import (
@ -17,7 +18,7 @@ from test_unstructured.unit_utils import (
    MonkeyPatch,
    call,
    example_doc_path,
-    function_mock,
+    method_mock,
 )
 from unstructured.file_utils import filetype
 from unstructured.file_utils.filetype import (
@ -44,9 +45,7 @@ is_in_docker = os.path.exists("/.dockerenv")
        ("example-10k.html", FileType.HTML),
        ("fake-html.html", FileType.HTML),
        ("stanley-cups.xlsx", FileType.XLSX),
-        # NOTE(robinson) - currently failing in the docker tests because the detected
-        # MIME type is text/csv
-        # ("stanley-cups.csv", FileType.CSV),
+        ("stanley-cups.csv", FileType.CSV),
        ("stanley-cups.tsv", FileType.TSV),
        ("fake-power-point.pptx", FileType.PPTX),
        ("winter-sports.epub", FileType.EPUB),
@ -111,9 +110,7 @@ def test_detect_filetype_from_filename_with_extension(
        ("example-10k.html", [FileType.HTML, FileType.XML]),
        ("fake-html.html", [FileType.HTML]),
        ("stanley-cups.xlsx", [FileType.XLSX]),
-        # NOTE(robinson]) - currently failing in the docker tests because the detected
-        # MIME type is text/csv
-        # ("stanley-cups.csv", [FileType.CSV]),
+        ("stanley-cups.csv", [FileType.CSV]),
        ("stanley-cups.tsv", [FileType.TSV]),
        ("fake-power-point.pptx", [FileType.PPTX]),
        ("winter-sports.epub", [FileType.EPUB]),
@ -546,11 +543,13 @@ def test_detect_TXT_from_yaml_file(magic_from_buffer_: Mock):
 # ================================================================================================


+# -- `from_buffer()` and `from_file()` are not "methods" on `magic` per se (`magic` is a module),
+# -- but they behave like methods for mocking purposes.
@pytest.fixture()
 def magic_from_buffer_(request: FixtureRequest):
-    return function_mock(request, "unstructured.file_utils.filetype.magic.from_buffer")
+    return method_mock(request, magic, "from_buffer")


@pytest.fixture()
 def magic_from_file_(request: FixtureRequest):
-    return function_mock(request, "unstructured.file_utils.filetype.magic.from_file")
+    return method_mock(request, magic, "from_file")
--- a/typings/filetype/init.pyi
+++ b/typings/filetype/init.pyi
@ -0,0 +1,6 @@
+from __future__ import annotations
+
+import pathlib
+from typing import IO
+
+def guess_mime(obj: bytearray | str | bytes | pathlib.PurePath | IO[bytes]) -> str | None: ...
--- a/typings/magic/init.pyi
+++ b/typings/magic/init.pyi
@ -0,0 +1,6 @@
+from __future__ import annotations
+
+from os import PathLike
+
+def from_buffer(buffer: bytes | str, mime: bool = ...) -> str: ...
+def from_file(filename: bytes | str | PathLike[str], mime: bool = ...) -> str: ...
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.15.0-dev13"  # pragma: no cover
+__version__ = "0.15.0-dev14"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -1,7 +1,7 @@
 from __future__ import annotations

-import enum
 import functools
+import importlib.util
 import json
 import os
 import re
@ -12,7 +12,15 @@ from typing_extensions import ParamSpec

 from unstructured.documents.elements import Element
 from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
-from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
+from unstructured.file_utils.model import (
+    EXT_TO_FILETYPE,
+    FILETYPE_TO_MIMETYPE,
+    PLAIN_TEXT_EXTENSIONS,
+    STR_TO_FILETYPE,
+    FileType,
+)
+from unstructured.logger import logger
+from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
 from unstructured.partition.common import (
    add_element_metadata,
    exactly_one,
@ -21,232 +29,7 @@ from unstructured.partition.common import (
 )
 from unstructured.utils import get_call_args_applying_defaults

-try:
-    import magic
-
-    LIBMAGIC_AVAILABLE = True
-except ImportError:  # pragma: nocover
-    LIBMAGIC_AVAILABLE = False  # pragma: nocover
-
-from unstructured.logger import logger
-from unstructured.nlp.patterns import EMAIL_HEAD_RE
-
-TXT_MIME_TYPES = [
-    "text/plain",
-    "message/rfc822",  # ref: https://www.rfc-editor.org/rfc/rfc822
-]
-
-# NOTE(robinson) - .docx.xlsx files are actually zip file with a .docx/.xslx extension.
-# If the MIME type is application/octet-stream, we check if it's a .docx/.xlsx file by
-# looking for expected filenames within the zip file.
-EXPECTED_DOCX_FILES = [
-    "docProps/core.xml",
-    "word/document.xml",
-]
-
-EXPECTED_XLSX_FILES = [
-    "xl/workbook.xml",
-]
-
-EXPECTED_PPTX_FILES = [
-    "docProps/core.xml",
-    "ppt/presentation.xml",
-]
-
-
-class FileType(enum.Enum):
-    UNK = 0
-    EMPTY = 1
-
-    # MS Office Types
-    DOC = 10
-    DOCX = 11
-    XLS = 12
-    XLSX = 13
-    PPT = 14
-    PPTX = 15
-    MSG = 16
-
-    # Adobe Types
-    PDF = 20
-
-    # Image Types
-    JPG = 30
-    PNG = 31
-    TIFF = 32
-    BMP = 33
-    HEIC = 34
-
-    # Plain Text Types
-    EML = 40
-    RTF = 41
-    TXT = 42
-    JSON = 43
-    CSV = 44
-    TSV = 45
-
-    # Markup Types
-    HTML = 50
-    XML = 51
-    MD = 52
-    EPUB = 53
-    RST = 54
-    ORG = 55
-
-    # Compressed Types
-    ZIP = 60
-
-    # Open Office Types
-    ODT = 70
-
-    # Audio Files
-    WAV = 80
-
-    # NOTE(robinson) - This is to support sorting for pandas groupby functions
-    def __lt__(self, other):
-        return self.name < other.name
-
-
-STR_TO_FILETYPE = {
-    "application/pdf": FileType.PDF,
-    "application/msword": FileType.DOC,
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
-    "image/jpeg": FileType.JPG,
-    "image/png": FileType.PNG,
-    "image/heic": FileType.HEIC,
-    "image/tiff": FileType.TIFF,
-    "image/bmp": FileType.BMP,
-    # NOTE(robinson) - https://mimetype.io/application/yaml
-    # In the future, we may have special processing for YAML
-    # files instead of treating them as plaintext
-    "application/yaml": FileType.TXT,
-    "application/x-yaml": FileType.TXT,
-    "text/x-yaml": FileType.TXT,
-    "text/yaml": FileType.TXT,
-    "text/plain": FileType.TXT,
-    "text/x-csv": FileType.CSV,
-    "application/csv": FileType.CSV,
-    "application/x-csv": FileType.CSV,
-    "text/comma-separated-values": FileType.CSV,
-    "text/x-comma-separated-values": FileType.CSV,
-    "text/csv": FileType.CSV,
-    "text/tsv": FileType.TSV,
-    "text/markdown": FileType.MD,
-    "text/x-markdown": FileType.MD,
-    "text/org": FileType.ORG,
-    "text/x-rst": FileType.RST,
-    "application/epub": FileType.EPUB,
-    "application/epub+zip": FileType.EPUB,
-    "application/json": FileType.JSON,
-    "application/rtf": FileType.RTF,
-    "text/rtf": FileType.RTF,
-    "text/html": FileType.HTML,
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
-    "application/vnd.ms-excel": FileType.XLS,
-    "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
-    "application/vnd.ms-powerpoint": FileType.PPT,
-    "application/xml": FileType.XML,
-    "application/vnd.oasis.opendocument.text": FileType.ODT,
-    "message/rfc822": FileType.EML,
-    "application/x-ole-storage": FileType.MSG,
-    "application/vnd.ms-outlook": FileType.MSG,
-    # NOTE(robinson) - https://mimetype.io/audio/wav
-    "audio/vnd.wav": FileType.WAV,
-    "audio/vnd.wave": FileType.WAV,
-    "audio/wave": FileType.WAV,
-    "audio/x-pn-wav": FileType.WAV,
-    "audio/x-wav": FileType.WAV,
-    "inode/x-empty": FileType.EMPTY,
-}
-
-MIMETYPES_TO_EXCLUDE = [
-    "text/x-markdown",
-    "application/epub+zip",
-    "text/x-csv",
-    "application/csv",
-    "application/x-csv",
-    "text/comma-separated-values",
-    "text/x-comma-separated-values",
-]
-
-FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPES_TO_EXCLUDE}
-
-EXT_TO_FILETYPE = {
-    ".pdf": FileType.PDF,
-    ".docx": FileType.DOCX,
-    ".jpg": FileType.JPG,
-    ".jpeg": FileType.JPG,
-    ".txt": FileType.TXT,
-    ".text": FileType.TXT,
-    ".log": FileType.TXT,
-    ".eml": FileType.EML,
-    ".xml": FileType.XML,
-    ".heic": FileType.HEIC,
-    ".htm": FileType.HTML,
-    ".html": FileType.HTML,
-    ".md": FileType.MD,
-    ".org": FileType.ORG,
-    ".rst": FileType.RST,
-    ".xlsx": FileType.XLSX,
-    ".pptx": FileType.PPTX,
-    ".p7s": FileType.EML,
-    ".png": FileType.PNG,
-    ".doc": FileType.DOC,
-    ".zip": FileType.ZIP,
-    ".xls": FileType.XLS,
-    ".ppt": FileType.PPT,
-    ".rtf": FileType.RTF,
-    ".json": FileType.JSON,
-    ".epub": FileType.EPUB,
-    ".msg": FileType.MSG,
-    ".odt": FileType.ODT,
-    ".csv": FileType.CSV,
-    ".tsv": FileType.TSV,
-    ".tab": FileType.TSV,
-    ".tiff": FileType.TIFF,
-    ".bmp": FileType.BMP,
-    ".wav": FileType.WAV,
-    # NOTE(robinson) - for now we are treating code files as plain text
-    ".js": FileType.TXT,
-    ".py": FileType.TXT,
-    ".java": FileType.TXT,
-    ".cpp": FileType.TXT,
-    ".cc": FileType.TXT,
-    ".cxx": FileType.TXT,
-    ".c": FileType.TXT,
-    ".cs": FileType.TXT,
-    ".php": FileType.TXT,
-    ".rb": FileType.TXT,
-    ".swift": FileType.TXT,
-    ".ts": FileType.TXT,
-    ".go": FileType.TXT,
-    ".yaml": FileType.TXT,
-    ".yml": FileType.TXT,
-    None: FileType.UNK,
-}
-
-PLAIN_TEXT_EXTENSIONS = [
-    ".txt",
-    ".text",
-    ".eml",
-    ".p7s",
-    ".md",
-    ".rtf",
-    ".html",
-    ".rst",
-    ".org",
-    ".csv",
-    ".tsv",
-    ".tab",
-    ".json",
-]
-
-
-def _resolve_symlink(file_path):
-    # Resolve the symlink to get the actual file path
-    if os.path.islink(file_path):
-        file_path = os.path.realpath(file_path)
-    return file_path
+LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))


 def detect_filetype(
@ -256,8 +39,10 @@ def detect_filetype(
    file_filename: Optional[str] = None,
    encoding: Optional[str] = "utf-8",
 ) -> Optional[FileType]:
-    """Use libmagic to determine a file's type. Helps determine which partition brick
-    to use for a given file. A return value of None indicates a non-supported file type.
+    """Use libmagic to determine a file's type.
+
+    Helps determine which partition brick to use for a given file. A return value of None indicates
+    a non-supported file type.
    """
    mime_type = None
    exactly_one(filename=filename, file=file)
@ -275,14 +60,13 @@ def detect_filetype(
        _, extension = os.path.splitext(_filename)
        extension = extension.lower()
        if os.path.isfile(_filename) and LIBMAGIC_AVAILABLE:
-            mime_type = magic.from_file(
-                _resolve_symlink(filename or file_filename),
-                mime=True,
-            )  # type: ignore
+            import magic
+
+            mime_type = magic.from_file(_resolve_symlink(_filename), mime=True)
        elif os.path.isfile(_filename):
            import filetype as ft

-            mime_type = ft.guess_mime(filename)
+            mime_type = ft.guess_mime(_filename)
        if mime_type is None:
            return EXT_TO_FILETYPE.get(extension, FileType.UNK)

@ -296,6 +80,8 @@ def detect_filetype(
        # Increased to 4096 because otherwise .xlsx files get detected as a zip file
        # ref: https://github.com/ahupp/python-magic#usage
        if LIBMAGIC_AVAILABLE:
+            import magic
+
            mime_type = magic.from_buffer(file.read(4096), mime=True)
        else:
            import filetype as ft
@ -324,7 +110,8 @@ def detect_filetype(
        else:
            return FileType.XML

-    elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
+    # -- ref: https://www.rfc-editor.org/rfc/rfc822 --
+    elif mime_type == "message/rfc822" or mime_type.startswith("text"):
        if not encoding:
            encoding = "utf-8"
        formatted_encoding = format_encoding_str(encoding)
@ -414,6 +201,42 @@ def detect_filetype(
    return EXT_TO_FILETYPE.get(extension, FileType.UNK)


+def is_json_processable(
+    filename: Optional[str] = None,
+    file: Optional[IO[bytes]] = None,
+    file_text: Optional[str] = None,
+    encoding: Optional[str] = "utf-8",
+) -> bool:
+    """True when file looks like a JSON array of objects.
+
+    Uses regex on a file prefix, so not entirely reliable but good enough if you already know the
+    file is JSON.
+    """
+    exactly_one(filename=filename, file=file, file_text=file_text)
+    if file_text is None:
+        file_text = _read_file_start_for_type_check(
+            file=file,
+            filename=filename,
+            encoding=encoding,
+        )
+    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
+
+
+def _check_eml_from_buffer(file: IO[bytes] | IO[str]) -> bool:
+    """Checks if a text/plain file is actually a .eml file.
+
+    Uses a regex pattern to see if the start of the file matches the typical pattern for a .eml
+    file.
+    """
+    file.seek(0)
+    file_content = file.read(4096)
+    if isinstance(file_content, bytes):
+        file_head = file_content.decode("utf-8", errors="ignore")
+    else:
+        file_head = file_content
+    return EMAIL_HEAD_RE.match(file_head) is not None
+
+
 def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
    """Detects the filetype, given a file with an application/octet-stream MIME type."""
    file.seek(0)
@ -421,15 +244,20 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
        file.seek(0)
        archive = zipfile.ZipFile(file)

+        # NOTE(robinson) - .docx/.xlsx/.pptx files are actually zip files with an Office extension.
+        # If the MIME type is application/octet-stream, we check whether it's one of those types
+        # by looking for expected filenames within the zip file.
        archive_filenames = [f.filename for f in archive.filelist]
-        if all(f in archive_filenames for f in EXPECTED_DOCX_FILES):
+        if all(f in archive_filenames for f in ("docProps/core.xml", "word/document.xml")):
            return FileType.DOCX
-        elif all(f in archive_filenames for f in EXPECTED_XLSX_FILES):
+        elif all(f in archive_filenames for f in ("xl/workbook.xml",)):
            return FileType.XLSX
-        elif all(f in archive_filenames for f in EXPECTED_PPTX_FILES):
+        elif all(f in archive_filenames for f in ("docProps/core.xml", "ppt/presentation.xml")):
            return FileType.PPTX

    if LIBMAGIC_AVAILABLE:
+        import magic
+
        # Infer mime type using magic if octet-stream is not zip file
        mime_type = magic.from_buffer(file.read(4096), mime=True)
        return STR_TO_FILETYPE.get(mime_type, FileType.UNK)
@ -439,30 +267,55 @@ def _detect_filetype_from_octet_stream(file: IO[bytes]) -> FileType:
    return FileType.UNK


-def _read_file_start_for_type_check(
+def _is_code_mime_type(mime_type: str) -> bool:
+    """True when `mime_type` plausibly indicates a programming language source-code file."""
+    PROGRAMMING_LANGUAGES = [
+        "javascript",
+        "python",
+        "java",
+        "c++",
+        "cpp",
+        "csharp",
+        "c#",
+        "php",
+        "ruby",
+        "swift",
+        "typescript",
+    ]
+    mime_type = mime_type.lower()
+    # NOTE(robinson) - check this one explicitly to avoid conflicts with other
+    # MIME types that contain "go"
+    if mime_type == "text/x-go":
+        return True
+    return any(language in mime_type for language in PROGRAMMING_LANGUAGES)
+
+
+def _is_text_file_a_csv(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    encoding: Optional[str] = "utf-8",
-) -> str:
-    """Reads the start of the file and returns the text content."""
-    exactly_one(filename=filename, file=file)
-    if file is not None:
-        file.seek(0)
-        file_content = file.read(4096)
-        if isinstance(file_content, str):
-            file_text = file_content
-        else:
-            file_text = file_content.decode(errors="ignore")
-        file.seek(0)
-    if filename is not None:
-        try:
-            with open(filename, encoding=encoding) as f:
-                file_text = f.read(4096)
-        except UnicodeDecodeError:
-            formatted_encoding, _ = detect_file_encoding(filename=filename)
-            with open(filename, encoding=formatted_encoding) as f:
-                file_text = f.read(4096)
-    return file_text
+):
+    """Detects if a file that has a text/plain MIME type is a CSV file."""
+
+    def count_commas(text: str):
+        """Counts the number of commas in a line, excluding commas in quotes."""
+        pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$),"
+        matches = re.findall(pattern, text)
+        return len(matches)
+
+    file_text = _read_file_start_for_type_check(
+        file=file,
+        filename=filename,
+        encoding=encoding,
+    )
+    lines = file_text.strip().splitlines()
+    if len(lines) < 2:
+        return False
+    lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
+    header_count = count_commas(lines[0])
+    if any("," not in line for line in lines):
+        return False
+    return all(count_commas(line) == header_count for line in lines[1:])


 def _is_text_file_a_json(
@ -484,93 +337,48 @@ def _is_text_file_a_json(
        # References:
        # https://stackoverflow.com/questions/7487869/is-this-simple-string-considered-valid-json
        # https://www.ietf.org/rfc/rfc4627.txt
-        if isinstance(output, str):
-            return False
-        return True
+        return not isinstance(output, str)
    except json.JSONDecodeError:
        return False


-def is_json_processable(
-    filename: Optional[str] = None,
-    file: Optional[IO[bytes]] = None,
-    file_text: Optional[str] = None,
-    encoding: Optional[str] = "utf-8",
-) -> bool:
-    exactly_one(filename=filename, file=file, file_text=file_text)
-    if file_text is None:
-        file_text = _read_file_start_for_type_check(
-            file=file,
-            filename=filename,
-            encoding=encoding,
-        )
-    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
-
-
-def _count_commas(text: str):
-    """Counts the number of commas in a line, excluding commas in quotes."""
-    pattern = r"(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$),"
-    matches = re.findall(pattern, text)
-    return len(matches)
-
-
-def _is_text_file_a_csv(
+def _read_file_start_for_type_check(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    encoding: Optional[str] = "utf-8",
-):
-    """Detects if a file that has a text/plain MIME type is a CSV file."""
-    file_text = _read_file_start_for_type_check(
-        file=file,
-        filename=filename,
-        encoding=encoding,
-    )
-    lines = file_text.strip().splitlines()
-    if len(lines) < 2:
-        return False
-    lines = lines[: len(lines)] if len(lines) < 10 else lines[:10]
-    header_count = _count_commas(lines[0])
-    if any("," not in line for line in lines):
-        return False
-    return all(_count_commas(line) == header_count for line in lines[1:])
+) -> str:
+    """Reads the start of the file and returns the text content."""
+    exactly_one(filename=filename, file=file)
+
+    if file is not None:
+        file.seek(0)
+        file_content = file.read(4096)
+        if isinstance(file_content, str):
+            file_text = file_content
+        else:
+            file_text = file_content.decode(errors="ignore")
+        file.seek(0)
+        return file_text
+
+    # -- guaranteed by `exactly_one()` call --
+    assert filename is not None
+
+    try:
+        with open(filename, encoding=encoding) as f:
+            file_text = f.read(4096)
+    except UnicodeDecodeError:
+        formatted_encoding, _ = detect_file_encoding(filename=filename)
+        with open(filename, encoding=formatted_encoding) as f:
+            file_text = f.read(4096)
+
+    return file_text


-def _check_eml_from_buffer(file: IO[bytes]) -> bool:
-    """Checks if a text/plain file is actually a .eml file. Uses a regex pattern to see if the
-    start of the file matches the typical pattern for a .eml file."""
-    file.seek(0)
-    file_content = file.read(4096)
-    if isinstance(file_content, bytes):
-        file_head = file_content.decode("utf-8", errors="ignore")
-    else:
-        file_head = file_content
-    return EMAIL_HEAD_RE.match(file_head) is not None
-
-
-PROGRAMMING_LANGUAGES = [
-    "javascript",
-    "python",
-    "java",
-    "c++",
-    "cpp",
-    "csharp",
-    "c#",
-    "php",
-    "ruby",
-    "swift",
-    "typescript",
-]
-
-
-def _is_code_mime_type(mime_type: str) -> bool:
-    """Checks to see if the MIME type is a MIME type that would be used for a code
-    file."""
-    mime_type = mime_type.lower()
-    # NOTE(robinson) - check this one explicitly to avoid conflicts with other
-    # MIME types that contain "go"
-    if mime_type == "text/x-go":
-        return True
-    return any(language in mime_type for language in PROGRAMMING_LANGUAGES)
+def _resolve_symlink(file_path: str) -> str:
+    """Resolve `file_path` containing symlink to the actual file path."""
+    if os.path.islink(file_path):
+        file_path = os.path.realpath(file_path)
+    return file_path


 _P = ParamSpec("_P")
--- a/unstructured/file_utils/model.py
+++ b/unstructured/file_utils/model.py
@ -0,0 +1,252 @@
+"""Domain-model for file-types."""
+
+from __future__ import annotations
+
+import enum
+
+
+class FileType(enum.Enum):
+    UNK = 0
+    EMPTY = 1
+
+    # MS Office Types
+    DOC = 10
+    DOCX = 11
+    XLS = 12
+    XLSX = 13
+    PPT = 14
+    PPTX = 15
+    MSG = 16
+
+    # Adobe Types
+    PDF = 20
+
+    # Image Types
+    JPG = 30
+    PNG = 31
+    TIFF = 32
+    BMP = 33
+    HEIC = 34
+
+    # Plain Text Types
+    EML = 40
+    RTF = 41
+    TXT = 42
+    JSON = 43
+    CSV = 44
+    TSV = 45
+
+    # Markup Types
+    HTML = 50
+    XML = 51
+    MD = 52
+    EPUB = 53
+    RST = 54
+    ORG = 55
+
+    # Compressed Types
+    ZIP = 60
+
+    # Open Office Types
+    ODT = 70
+
+    # Audio Files
+    WAV = 80
+
+    def __lt__(self, other: FileType) -> bool:
+        """Makes `FileType` members comparable with relational operators, at least with `<`.
+
+        This makes them sortable, in particular it supports sorting for pandas groupby functions.
+        """
+        return self.name < other.name
+
+
+STR_TO_FILETYPE = {
+    # -- BMP --
+    "image/bmp": FileType.BMP,
+    # -- CSV --
+    "text/csv": FileType.CSV,
+    "application/csv": FileType.CSV,
+    "application/x-csv": FileType.CSV,
+    "text/comma-separated-values": FileType.CSV,
+    "text/x-comma-separated-values": FileType.CSV,
+    "text/x-csv": FileType.CSV,
+    # -- DOC --
+    "application/msword": FileType.DOC,
+    # -- DOCX --
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": FileType.DOCX,
+    # -- EML --
+    "message/rfc822": FileType.EML,
+    # -- EPUB --
+    "application/epub": FileType.EPUB,
+    "application/epub+zip": FileType.EPUB,
+    # -- HEIC --
+    "image/heic": FileType.HEIC,
+    # -- HTML --
+    "text/html": FileType.HTML,
+    # -- JPG --
+    "image/jpeg": FileType.JPG,
+    # -- JSON --
+    "application/json": FileType.JSON,
+    # -- MD --
+    "text/markdown": FileType.MD,
+    "text/x-markdown": FileType.MD,
+    # -- MSG --
+    "application/vnd.ms-outlook": FileType.MSG,
+    "application/x-ole-storage": FileType.MSG,
+    # -- ODT --
+    "application/vnd.oasis.opendocument.text": FileType.ODT,
+    # -- ORG --
+    "text/org": FileType.ORG,
+    # -- PDF --
+    "application/pdf": FileType.PDF,
+    # -- PNG --
+    "image/png": FileType.PNG,
+    # -- PPT --
+    "application/vnd.ms-powerpoint": FileType.PPT,
+    # -- PPTX --
+    "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
+    # -- RST --
+    "text/x-rst": FileType.RST,
+    # -- RTF --
+    "text/rtf": FileType.RTF,
+    "application/rtf": FileType.RTF,
+    # -- TIFF --
+    "image/tiff": FileType.TIFF,
+    # -- TSV --
+    "text/tsv": FileType.TSV,
+    # -- TXT --
+    "text/plain": FileType.TXT,
+    # NOTE(robinson) - https://mimetype.io/application/yaml
+    # In the future, we may have special processing for YAML
+    # files instead of treating them as plaintext
+    "text/yaml": FileType.TXT,
+    "application/x-yaml": FileType.TXT,
+    "application/yaml": FileType.TXT,
+    "text/x-yaml": FileType.TXT,
+    # -- WAV --
+    # NOTE(robinson) - https://mimetype.io/audio/wav
+    "audio/wav": FileType.WAV,
+    "audio/vnd.wav": FileType.WAV,
+    "audio/vnd.wave": FileType.WAV,
+    "audio/wave": FileType.WAV,
+    "audio/x-pn-wav": FileType.WAV,
+    "audio/x-wav": FileType.WAV,
+    # -- XLS --
+    "application/vnd.ms-excel": FileType.XLS,
+    # -- XLSX --
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": FileType.XLSX,
+    # -- XML --
+    "application/xml": FileType.XML,
+    # -- EMPTY --
+    "inode/x-empty": FileType.EMPTY,
+}
+
+# -- MIME-types in STR_TO_FILETYPE that are not the canonical MIME-type for that file-type --
+MIMETYPE_ALIASES = (
+    "application/csv",
+    "application/epub+zip",
+    "application/rtf",
+    "application/x-csv",
+    "application/x-ole-storage",
+    "application/x-yaml",
+    "application/yaml",
+    "audio/vnd.wav",
+    "audio/vnd.wave",
+    "audio/wave",
+    "audio/x-pn-wav",
+    "audio/x-wav",
+    "text/comma-separated-values",
+    "text/x-comma-separated-values",
+    "text/x-csv",
+    "text/x-markdown",
+    "text/x-yaml",
+    "text/yaml",
+)
+
+FILETYPE_TO_MIMETYPE = {v: k for k, v in STR_TO_FILETYPE.items() if k not in MIMETYPE_ALIASES}
+
+EXT_TO_FILETYPE = {
+    # -- BMP --
+    ".bmp": FileType.BMP,
+    # -- CSV --
+    ".csv": FileType.CSV,
+    # -- DOC --
+    ".doc": FileType.DOC,
+    # -- DOCX --
+    ".docx": FileType.DOCX,
+    # -- EML --
+    ".eml": FileType.EML,
+    ".p7s": FileType.EML,
+    # -- EPUB --
+    ".epub": FileType.EPUB,
+    # -- HEIC --
+    ".heic": FileType.HEIC,
+    # -- HTML --
+    ".htm": FileType.HTML,
+    ".html": FileType.HTML,
+    # -- JPG --
+    ".jpeg": FileType.JPG,
+    ".jpg": FileType.JPG,
+    # -- JSON --
+    ".json": FileType.JSON,
+    # -- MD --
+    ".md": FileType.MD,
+    # -- MSG --
+    ".msg": FileType.MSG,
+    # -- ODT --
+    ".odt": FileType.ODT,
+    # -- ORG --
+    ".org": FileType.ORG,
+    # -- PDF --
+    ".pdf": FileType.PDF,
+    # -- PNG --
+    ".png": FileType.PNG,
+    # -- PPT --
+    ".ppt": FileType.PPT,
+    # -- PPTX --
+    ".pptx": FileType.PPTX,
+    # -- RST --
+    ".rst": FileType.RST,
+    # -- RTF --
+    ".rtf": FileType.RTF,
+    # -- TIFF --
+    ".tiff": FileType.TIFF,
+    # -- TSV --
+    ".tab": FileType.TSV,
+    ".tsv": FileType.TSV,
+    # -- TXT --
+    ".text": FileType.TXT,
+    ".txt": FileType.TXT,
+    # NOTE(robinson) - for now we are treating code files as plain text
+    ".c": FileType.TXT,
+    ".cc": FileType.TXT,
+    ".cpp": FileType.TXT,
+    ".cs": FileType.TXT,
+    ".cxx": FileType.TXT,
+    ".go": FileType.TXT,
+    ".java": FileType.TXT,
+    ".js": FileType.TXT,
+    ".log": FileType.TXT,
+    ".php": FileType.TXT,
+    ".py": FileType.TXT,
+    ".rb": FileType.TXT,
+    ".swift": FileType.TXT,
+    ".ts": FileType.TXT,
+    ".yaml": FileType.TXT,
+    ".yml": FileType.TXT,
+    # -- WAV --
+    ".wav": FileType.WAV,
+    # -- XLS --
+    ".xls": FileType.XLS,
+    # -- XLSX --
+    ".xlsx": FileType.XLSX,
+    # -- XML --
+    ".xml": FileType.XML,
+    # -- ZIP --
+    ".zip": FileType.ZIP,
+    # -- UNK --
+    None: FileType.UNK,
+}
+
+PLAIN_TEXT_EXTENSIONS = ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .text .tsv .txt".split()
--- a/unstructured/partition/auto.py
+++ b/unstructured/partition/auto.py
@ -8,13 +8,8 @@ from typing import IO, Any, Callable, Literal, Optional
 import requests

 from unstructured.documents.elements import DataSourceMetadata, Element
-from unstructured.file_utils.filetype import (
-    FILETYPE_TO_MIMETYPE,
-    STR_TO_FILETYPE,
-    FileType,
-    detect_filetype,
-    is_json_processable,
-)
+from unstructured.file_utils.filetype import detect_filetype, is_json_processable
+from unstructured.file_utils.model import FILETYPE_TO_MIMETYPE, STR_TO_FILETYPE, FileType
 from unstructured.logger import logger
 from unstructured.partition.common import exactly_one
 from unstructured.partition.email import partition_email
--- a/unstructured/partition/common.py
+++ b/unstructured/partition/common.py
@ -348,7 +348,7 @@ def add_element_metadata(
    return element


-def remove_element_metadata(layout_elements) -> list[Element]:
+def remove_element_metadata(layout_elements: list[Element]) -> list[Element]:
    """Removes document metadata from the document element.

    Document metadata includes information like the filename, source url, and page number.
--- a/unstructured/partition/msg.py
+++ b/unstructured/partition/msg.py
@ -10,7 +10,8 @@ from oxmsg.attachment import Attachment

 from unstructured.chunking import add_chunking_strategy
 from unstructured.documents.elements import Element, ElementMetadata, process_metadata
-from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
+from unstructured.file_utils.filetype import add_metadata_with_filetype
+from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
 from unstructured.partition.common import (
    get_last_modified_date,
@ -98,14 +99,12 @@ class MsgPartitionerOptions:
        """True when message is encrypted."""
        # NOTE(robinson) - Per RFC 2015, the content type for emails with PGP encrypted content
        # is multipart/encrypted (ref: https://www.ietf.org/rfc/rfc2015.txt)
-        if "encrypted" in self.msg.message_headers.get("Content-Type", ""):
-            return True
-        # -- pretty sure we're going to want to dig deeper to discover messages that are encrypted
-        # -- with something other than PGP.
-        #    - might be able to distinguish based on PID_MESSAGE_CLASS = 'IPM.Note.Signed'
-        #    - Content-Type header might include "application/pkcs7-mime" for Microsoft S/MIME
-        #      encryption.
-        return False
+        # NOTE(scanny) - pretty sure we're going to want to dig deeper to discover messages that are
+        # encrypted with something other than PGP.
+        #   - might be able to distinguish based on PID_MESSAGE_CLASS = 'IPM.Note.Signed'
+        #   - Content-Type header might include "application/pkcs7-mime" for Microsoft S/MIME
+        #     encryption.
+        return "encrypted" in self.msg.message_headers.get("Content-Type", "")

    @lazyproperty
    def metadata_file_path(self) -> str | None: