rfctr(file): improve filetype tests (#3402)

**Summary** Improve file-detection tests in preparation for additional work and bug fixes. **Additional Context** - Add type annotations. - Use mocks instead of `monkeypatch` in most cases and verify the calls made to each mock. This revealed a dozen broken tests: their mocks were never being called, so a different code path than intended was being exercised. - Use `example_doc_path()` instead of hard-coded paths. - Add actual test files for cases where they were being constructed in temporary directories. - Make test names consistent and more descriptive of the behavior under test.
2025-12-04 11:10:22 +00:00 · 2024-07-15 21:04:34 -07:00 · 2024-07-15 21:04:34 -07:00 · 56ca39ca7f
commit 56ca39ca7f
parent 0057f9dea8
9 changed files with 381 additions and 293 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.15.0-dev11
+## 0.15.0-dev12

 ### Enhancements

--- a/example-docs/bmp_24.bmp
+++ b/example-docs/bmp_24.bmp
--- a/example-docs/csv-with-escaped-commas.csv
+++ b/example-docs/csv-with-escaped-commas.csv
@ -0,0 +1,3 @@
+A,A,A,A,A
+A,A,A,"A,A",A
+A,A,A,"A,A",A
--- a/example-docs/fake.go
+++ b/example-docs/fake.go
@ -0,0 +1,7 @@
+package main
+
+import "fmt"
+
+func main() {
+	fmt.Println("Hello Go!")
+}
--- a/example-docs/logger.py
+++ b/example-docs/logger.py
@ -0,0 +1,19 @@
+import logging
+from typing import Any
+
+logger = logging.getLogger("unstructured")
+trace_logger = logging.getLogger("unstructured.trace")
+
+# Create a custom logging level
+DETAIL = 15
+logging.addLevelName(DETAIL, "DETAIL")
+
+
+# Create a custom log method for the "DETAIL" level
+def detail(self: logging.Logger, message: str, *args: Any, **kwargs: Any):
+    if self.isEnabledFor(DETAIL):
+        self._log(DETAIL, message, args, **kwargs)
+
+
+# Add the custom log method to the logging.Logger class
+logging.Logger.detail = detail  # type: ignore
--- a/example-docs/simple.yaml
+++ b/example-docs/simple.yaml
@ -0,0 +1,19 @@
+---
+ doe: "a deer, a female deer"
+ ray: "a drop of golden sun"
+ pi: 3.14159
+ xmas: true
+ french-hens: 3
+ calling-birds:
+   - huey
+   - dewey
+   - louie
+   - fred
+ xmas-fifth-day:
+   calling-birds: four
+   french-hens: 3
+   golden-rings: 5
+   partridges:
+     count: 1
+     location: "a pear tree"
+   turtle-doves: two
--- a/example-docs/simple.zip
+++ b/example-docs/simple.zip
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -1,12 +1,24 @@
+# pyright: reportPrivateUsage=false
+
+"""Test suite for `unstructured.file_utils.filetype`."""
+
+from __future__ import annotations
+
+import io
 import os
 import pathlib
-import zipfile

-import magic
 import pytest
-import yaml
-from PIL import Image

+from test_unstructured.unit_utils import (
+    FixtureRequest,
+    LogCaptureFixture,
+    Mock,
+    MonkeyPatch,
+    call,
+    example_doc_path,
+    function_mock,
+)
 from unstructured.file_utils import filetype
 from unstructured.file_utils.filetype import (
    FileType,
@ -17,22 +29,11 @@ from unstructured.file_utils.filetype import (
    detect_filetype,
 )

-FILE_DIRECTORY = pathlib.Path(__file__).parent.resolve()
-EXAMPLE_DOCS_DIRECTORY = os.path.join(FILE_DIRECTORY, "..", "..", "example-docs")
-
 is_in_docker = os.path.exists("/.dockerenv")

-DOCX_MIME_TYPES = [
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-]
-
-XLSX_MIME_TYPES = [
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-]
-

@pytest.mark.parametrize(
-    ("file", "expected"),
+    ("file_name", "expected_value"),
    [
        ("layout-parser-paper-fast.pdf", FileType.PDF),
        ("fake.docx", FileType.DOCX),
@ -57,13 +58,12 @@ XLSX_MIME_TYPES = [
        ("fake-incomplete-json.txt", FileType.TXT),
    ],
 )
-def test_detect_filetype_from_filename(file, expected):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
-    assert detect_filetype(filename) == expected
+def test_detect_filetype_from_filename(file_name: str, expected_value: FileType):
+    assert detect_filetype(example_doc_path(file_name)) == expected_value


@pytest.mark.parametrize(
-    ("file", "expected"),
+    ("file_name", "expected_value"),
    [
        ("layout-parser-paper-fast.pdf", FileType.PDF),
        ("fake.docx", FileType.DOCX),
@ -84,340 +84,375 @@ def test_detect_filetype_from_filename(file, expected):
        ("fake-incomplete-json.txt", FileType.TXT),
    ],
 )
-def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
-    """Test that we detect the filetype from the filename extension when libmagic is not available
-    or the file does not exist."""
-    # Test when libmagic is not available
+def test_detect_filetype_from_filename_with_extension(
+    file_name: str, expected_value: FileType, monkeypatch: MonkeyPatch
+):
+    """File-type is detected from extension when libmagic is unavailable or file does not exist."""
+    # -- when libmagic is not available --
    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
-    assert detect_filetype(filename) == expected
-    # Test when the file does not exist
+    assert detect_filetype(example_doc_path(file_name)) == expected_value
+    # -- when file does not exist --
    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", True)
-    extension = pathlib.Path(file).suffix
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "not-on-disk" + extension)
-    assert detect_filetype(filename) == expected
+    extension = pathlib.Path(file_name).suffix
+    assert detect_filetype(example_doc_path("not-on-disk" + extension)) == expected_value


@pytest.mark.parametrize(
-    ("file", "expected"),
+    ("file_name", "expected_value"),
    [
-        ("layout-parser-paper-fast.pdf", FileType.PDF),
-        ("fake.docx", FileType.DOCX),
-        ("example.jpg", FileType.JPG),
-        ("fake-text.txt", FileType.TXT),
-        ("eml/fake-email.eml", FileType.EML),
-        ("factbook.xml", FileType.XML),
-        # NOTE(robinson) - For the document, some operating systems return
+        ("layout-parser-paper-fast.pdf", [FileType.PDF]),
+        ("fake.docx", [FileType.DOCX]),
+        ("example.jpg", [FileType.JPG]),
+        ("fake-text.txt", [FileType.TXT]),
+        ("eml/fake-email.eml", [FileType.EML]),
+        ("factbook.xml", [FileType.XML]),
+        # NOTE(robinson) - For the document, some operating systems return
        # */xml and some return */html. Either could be acceptable depending on the OS
        ("example-10k.html", [FileType.HTML, FileType.XML]),
-        ("fake-html.html", FileType.HTML),
-        ("stanley-cups.xlsx", FileType.XLSX),
-        # NOTE(robinson) - currently failing in the docker tests because the detected
+        ("fake-html.html", [FileType.HTML]),
+        ("stanley-cups.xlsx", [FileType.XLSX]),
+        # NOTE(robinson) - currently failing in the docker tests because the detected
        # MIME type is text/csv
-        # ("stanley-cups.csv", FileType.CSV),
-        ("stanley-cups.tsv", FileType.TSV),
-        ("fake-power-point.pptx", FileType.PPTX),
-        ("winter-sports.epub", FileType.EPUB),
-        ("fake-incomplete-json.txt", FileType.TXT),
+        # ("stanley-cups.csv", [FileType.CSV]),
+        ("stanley-cups.tsv", [FileType.TSV]),
+        ("fake-power-point.pptx", [FileType.PPTX]),
+        ("winter-sports.epub", [FileType.EPUB]),
+        ("fake-incomplete-json.txt", [FileType.TXT]),
    ],
 )
-def test_detect_filetype_from_file(file, expected):
-    expected = expected if isinstance(expected, list) else [expected]
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
-    with open(filename, "rb") as f:
-        assert detect_filetype(file=f) in expected
+def test_detect_filetype_from_file(file_name: str, expected_value: list[FileType]):
+    with open(example_doc_path(file_name), "rb") as f:
+        assert detect_filetype(file=f) in expected_value


-def test_detect_filetype_from_file_warning_without_libmagic(monkeypatch, caplog):
+def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
+    monkeypatch: MonkeyPatch, caplog: LogCaptureFixture
+):
    monkeypatch.setattr(filetype, "LIBMAGIC_AVAILABLE", False)
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-    with open(filename, "rb") as f:
+    with open(example_doc_path("fake-text.txt"), "rb") as f:
        detect_filetype(file=f)
-        assert "WARNING" in caplog.text
+
+    assert "WARNING" in caplog.text


-def test_detect_xml_application_xml(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
-    filetype = detect_filetype(filename=filename)
+def test_detect_XML_from_application_xml_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/xml"
+    file_path = example_doc_path("factbook.xml")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.XML


-def test_detect_text_csv(monkeypatch, filename="example-docs/stanley-cup.csv"):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/csv")
-    filetype = detect_filetype(filename=filename)
+def test_detect_CSV_from_text_csv_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "text/csv"
+    file_path = example_doc_path("stanley-cups.csv")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.CSV


-def test_detect_text_python_from_filename(monkeypatch, filename="unstructured/logger.py"):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-script.python")
-    filetype = detect_filetype(filename=filename)
+def test_detect_TXT_from_text_x_script_python_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "text/x-script.python"
+    file_path = example_doc_path("logger.py")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.TXT


-def test_detect_text_python_from_file(monkeypatch, filename="unstructured/logger.py"):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-script.python")
-    with open(filename, "rb") as f:
+def test_detect_TXT_from_text_x_script_python_file(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = "text/x-script.python"
+    file_path = example_doc_path("logger.py")
+
+    with open(file_path, "rb") as f:
+        head = f.read(4096)
+        f.seek(0)
        filetype = detect_filetype(file=f)
+
+    magic_from_buffer_.assert_called_once_with(head, mime=True)
    assert filetype == FileType.TXT


-def test_detects_go_mime_type():
+def test_is_code_mime_type_for_Go():
    assert _is_code_mime_type("text/x-go") is True


-def test_detect_xml_application_go(monkeypatch, tmpdir):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/x-go")
+def test_detect_TXT_from_text_go_file(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = "text/x-go"
+    file_path = example_doc_path("fake.go")

-    filename = os.path.join(tmpdir, "fake.go")
-    with open(filename, "w") as f:
-        f.write("")
+    with open(file_path, "rb") as f:
+        head = f.read(4096)
+        f.seek(0)
+        filetype = detect_filetype(file=f)

-    with open(filename, "rb") as f:
-        assert detect_filetype(filename=filename) == FileType.TXT
+    magic_from_buffer_.assert_called_once_with(head, mime=True)
+    assert filetype == FileType.TXT


-def test_detect_xml_application_rtf(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/rtf")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.rtf")
-    filetype = detect_filetype(filename=filename)
+def test_detect_RTF_from_application_rtf_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/rtf"
+    file_path = example_doc_path("fake-doc.rtf")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.RTF


-def test_detect_xml_text_xml(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/xml")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
-    filetype = detect_filetype(filename=filename)
+def test_detect_XML_from_text_xml_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "text/xml"
+    file_path = example_doc_path("factbook.xml")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.XML


-def test_detect_html_application_xml(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.html")
-    filetype = detect_filetype(filename=filename)
+def test_detect_HTML_from_application_xml_file_path_with_html_extension(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/xml"
+    file_path = example_doc_path("fake-html.html")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.HTML


-def test_detect_html_text_xml(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/xml")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.html")
-    filetype = detect_filetype(filename=filename)
+def test_detect_HTML_from_text_xml_file_path_with_html_extension(magic_from_file_: Mock):
+    magic_from_file_.return_value = "text/xml"
+    file_path = example_doc_path("fake-html.html")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.HTML


-def test_detect_docx_filetype_application_octet_stream(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_buffer",
-        lambda *args, **kwargs: "application/octet-stream",
-    )
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
-    with open(filename, "rb") as f:
-        filetype = detect_filetype(file=f)
+def test_detect_DOCX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = "application/octet-stream"
+    with open(example_doc_path("simple.docx"), "rb") as f:
+        file = io.BytesIO(f.read())
+
+    filetype = detect_filetype(file=file)
+
+    magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
    assert filetype == FileType.DOCX


-def test_detect_docx_filetype_application_octet_stream_with_filename(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_file",
-        lambda *args, **kwargs: "application/octet-stream",
-    )
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
-    filetype = detect_filetype(filename=filename)
+def test_detect_DOCX_from_application_octet_stream_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/octet-stream"
+    file_path = example_doc_path("simple.docx")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.DOCX


-def test_detect_docx_filetype_application_zip(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/zip")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
-    filetype = detect_filetype(filename=filename)
+def test_detect_DOCX_from_application_zip_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/zip"
+    file_path = example_doc_path("simple.docx")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.DOCX


-def test_detect_application_zip_files(monkeypatch, tmpdir):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/zip")
-    filename = os.path.join(tmpdir, "test.zip")
-    zf = zipfile.ZipFile(filename, "w")
-    zf.close()
-    filetype = detect_filetype(filename=filename)
+def test_detect_ZIP_from_application_zip_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/zip"
+    file_path = example_doc_path("simple.zip")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.ZIP


-def test_detect_doc_file_from_mime_type(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_file",
-        lambda *args, **kwargs: "application/msword",
-    )
-    filetype = detect_filetype(filename="fake.doc")
+def test_detect_DOC_from_application_msword_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/msword"
+    file_path = example_doc_path("fake.doc")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.DOC


-def test_detect_ppt_file_from_mime_type(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_file",
-        lambda *args, **kwargs: "application/vnd.ms-powerpoint",
-    )
-    filetype = detect_filetype(filename="fake.ppt")
+def test_detect_PPT_from_application_vnd_ms_powerpoint_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/vnd.ms-powerpoint"
+    file_path = example_doc_path("fake-power-point.ppt")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.PPT


-def test_detect_xls_file_from_mime_type(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_file",
-        lambda *args, **kwargs: "application/vnd.ms-excel",
-    )
-    filetype = detect_filetype(filename="fake.xls")
+def test_detect_XLS_from_application_vnd_ms_excel_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/vnd.ms-excel"
+    file_path = example_doc_path("tests-example.xls")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.XLS


-def test_detect_xlsx_filetype_application_octet_stream(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_buffer",
-        lambda *args, **kwargs: "application/octet-stream",
-    )
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
-    with open(filename, "rb") as f:
-        filetype = detect_filetype(file=f)
+def test_detect_XLSX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = "application/octet-stream"
+    with open(example_doc_path("stanley-cups.xlsx"), "rb") as f:
+        file = io.BytesIO(f.read())
+
+    filetype = detect_filetype(file=file)
+
+    magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
    assert filetype == FileType.XLSX


-def test_detect_xlsx_filetype_application_octet_stream_with_filename(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_file",
-        lambda *args, **kwargs: "application/octet-stream",
-    )
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
-    filetype = detect_filetype(filename=filename)
+def test_detect_XLSX_from_application_octet_stream_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/octet-stream"
+    file_path = example_doc_path("stanley-cups.xlsx")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.XLSX


-def test_detect_pptx_filetype_application_octet_stream(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_buffer",
-        lambda *args, **kwargs: "application/octet-stream",
-    )
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
-    with open(filename, "rb") as f:
-        filetype = detect_filetype(file=f)
+def test_detect_PPTX_from_application_octet_stream_file_no_extension(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = "application/octet-stream"
+    with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
+        file = io.BytesIO(f.read())
+
+    filetype = detect_filetype(file=file)
+
+    magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
    assert filetype == FileType.PPTX


-def test_detect_pptx_filetype_application_octet_stream_with_filename(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_file",
-        lambda *args, **kwargs: "application/octet-stream",
-    )
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.pptx")
-    filetype = detect_filetype(filename=filename)
+def test_detect_PPTX_from_application_octet_stream_file_path(magic_from_file_: Mock):
+    magic_from_file_.return_value = "application/octet-stream"
+    file_path = example_doc_path("fake-power-point.pptx")
+
+    filetype = detect_filetype(file_path)
+
+    magic_from_file_.assert_called_once_with(file_path, mime=True)
    assert filetype == FileType.PPTX


-def test_detect_application_octet_stream_returns_none_with_unknown(monkeypatch):
-    monkeypatch.setattr(
-        magic,
-        "from_buffer",
-        lambda *args, **kwargs: "application/octet-stream",
-    )
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-    with open(filename, "rb") as f:
-        filetype = detect_filetype(file=f)
+def test_detect_UNK_from_application_octet_stream_text_file_no_extension(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = "application/octet-stream"
+    with open(example_doc_path("fake-text.txt"), "rb") as f:
+        file = io.BytesIO(f.read())
+
+    filetype = detect_filetype(file=file)
+
+    assert magic_from_buffer_.call_args_list == [
+        call(file.getvalue()[:4096], mime=True),
+        call(b"", mime=True),
+    ]
    assert filetype == FileType.UNK


-def test_detect_application_zip_returns_zip_with_unknown(monkeypatch):
-    monkeypatch.setattr(magic, "from_buffer", lambda *args, **kwargs: "application/zip")
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-text.txt")
-    with open(filename, "rb") as f:
+def test_detect_ZIP_from_application_zip_not_a_zip_file(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = "application/zip"
+
+    with open(example_doc_path("fake-text.txt"), "rb") as f:
+        head = f.read(4096)
+        f.seek(0)
        filetype = detect_filetype(file=f)
+
+    assert magic_from_buffer_.call_args_list == [
+        call(head, mime=True),
+        call(b"", mime=True),
+    ]
    assert filetype == FileType.ZIP


-def test_detect_docx_filetype_word_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: DOCX_MIME_TYPES[0])
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.docx")
-    with open(filename, "rb") as f:
-        filetype = detect_filetype(file=f)
+def test_detect_DOCX_from_docx_mime_type_file_no_extension(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = (
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    )
+    with open(example_doc_path("simple.docx"), "rb") as f:
+        file = io.BytesIO(f.read())
+
+    filetype = detect_filetype(file=file)
+
+    magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
    assert filetype == FileType.DOCX


-def test_detect_xlsx_filetype_word_mime_type(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: XLSX_MIME_TYPES[0])
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "stanley-cups.xlsx")
-    with open(filename, "rb") as f:
-        filetype = detect_filetype(file=f)
+def test_detect_XLSX_from_xlsx_mime_type_file_no_extension(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = (
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+    )
+    with open(example_doc_path("stanley-cups.xlsx"), "rb") as f:
+        file = io.BytesIO(f.read())
+
+    filetype = detect_filetype(file=file)
+
+    magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
    assert filetype == FileType.XLSX


-def test_detect_filetype_returns_none_with_unknown(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/fake")
-    assert detect_filetype(filename="made_up.fake") == FileType.UNK
+def test_detect_UNK_from_extension_of_non_existent_file_path():
+    assert detect_filetype(example_doc_path("made_up.fake")) == FileType.UNK


-def test_detect_filetype_detects_png(monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "image/png")
-    assert detect_filetype(filename="made_up.png") == FileType.PNG
+def test_detect_PNG_from_extension_of_non_existent_file_path():
+    assert detect_filetype(example_doc_path("made_up.png")) == FileType.PNG


-def test_detect_filetype_detects_unknown_text_types_as_txt(monkeypatch, tmpdir):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/new-type")
-    monkeypatch.setattr(os.path, "isfile", lambda *args, **kwargs: True)
+def test_detect_TXT_from_unknown_text_subtype_file_no_extension(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = "text/new-type"
+    with open(example_doc_path("fake-text.txt"), "rb") as f:
+        file = io.BytesIO(f.read())

-    filename = os.path.join(tmpdir.dirname, "made_up.png")
-    with open(filename, "w") as f:
-        f.write("here is a fake file!")
+    filetype = detect_filetype(file=file)

-    assert detect_filetype(filename=filename) == FileType.TXT
+    magic_from_buffer_.assert_called_once_with(file.getvalue()[:4096], mime=True)
+    assert filetype == FileType.TXT


-def test_detect_filetype_detects_bmp_from_filename(
-    tmpdir,
-    filename="example-docs/layout-parser-paper-with-table.jpg",
-):
-    bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
-    img = Image.open(filename)
-    img.save(bmp_filename)
-
-    detect_filetype(filename=bmp_filename) == FileType.BMP
+def test_detect_BMP_from_file_path():
+    assert detect_filetype(example_doc_path("bmp_24.bmp")) == FileType.BMP


-@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
-def test_detect_filetype_detects_bmp_from_file(
-    tmpdir,
-    filename="example-docs/layout-parser-paper-with-table.jpg",
-):
-    bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
-    img = Image.open(filename)
-    img.save(bmp_filename)
-
-    with open(bmp_filename, "rb") as f:
-        assert detect_filetype(file=f) == FileType.BMP
+def test_detect_BMP_from_file_no_extension():
+    with open(example_doc_path("bmp_24.bmp"), "rb") as f:
+        file = io.BytesIO(f.read())
+    assert detect_filetype(file=file) == FileType.BMP


-def test_detect_filetype_raises_with_both_specified():
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "eml/fake-email.eml")
-    with open(filename, "rb") as f, pytest.raises(ValueError):
-        detect_filetype(filename=filename, file=f)
+def test_detect_filetype_raises_when_both_path_and_file_like_object_are_specified():
+    file_path = example_doc_path("fake-email.eml")
+    with open(example_doc_path(file_path), "rb") as f:
+        file = io.BytesIO(f.read())
+
+    with pytest.raises(ValueError, match="Exactly one of filename and file must be specified."):
+        detect_filetype(filename=file_path, file=file)


-def test_detect_filetype_raises_with_none_specified():
-    with pytest.raises(ValueError):
+def test_detect_filetype_raises_with_neither_path_or_file_like_object_specified():
+    with pytest.raises(ValueError, match="Exactly one of filename and file must be specified."):
        detect_filetype()


-def test_filetype_order():
-    assert FileType.HTML < FileType.XML
+def test_FileType_is_ordererd_by_name():
+    """FileType is a total order on name, e.g. FileType.A < FileType.B."""
+    assert FileType.EML < FileType.HTML < FileType.XML


@pytest.mark.parametrize(
-    ("content", "expected"),
+    ("content", "expected_value"),
    [
        (b"d\xe2\x80", False),  # Invalid JSON
        (b'[{"key": "value"}]', True),  # Valid JSON
@ -425,15 +460,13 @@ def test_filetype_order():
        (b'"This is not a JSON"', False),  # Serializable as JSON, but we want to treat it as txt
    ],
 )
-def test_is_text_file_a_json(content, expected):
-    from io import BytesIO
-
-    with BytesIO(content) as f:
-        assert _is_text_file_a_json(file=f) == expected
+def test_is_text_file_a_json_distinguishes_JSON_from_text(content: bytes, expected_value: bool):
+    with io.BytesIO(content) as f:
+        assert _is_text_file_a_json(file=f) == expected_value


@pytest.mark.parametrize(
-    ("content", "expected"),
+    ("content", "expected_value"),
    [
        (b"d\xe2\x80", False),  # Invalid CSV
        (b'[{"key": "value"}]', False),  # Invalid CSV
@ -441,76 +474,83 @@ def test_is_text_file_a_json(content, expected):
        (b"", False),  # Empty content
    ],
 )
-def test_is_text_file_a_csv(content, expected):
-    from io import BytesIO
-
-    with BytesIO(content) as f:
-        assert _is_text_file_a_csv(file=f) == expected
+def test_is_text_file_a_csv_distinguishes_CSV_from_text(content: bytes, expected_value: bool):
+    with io.BytesIO(content) as f:
+        assert _is_text_file_a_csv(file=f) == expected_value


-def test_csv_json_check_with_filename_and_utf_32(filename="example-docs/fake-text-utf-32.txt"):
-    assert _is_text_file_a_csv(filename=filename) is False
-    assert _is_text_file_a_json(filename=filename) is False
+def test_csv_and_json_checks_with_filename_accommodate_utf_32_encoded_file():
+    file_path = example_doc_path("fake-text-utf-32.txt")
+    assert _is_text_file_a_csv(filename=file_path) is False
+    assert _is_text_file_a_json(filename=file_path) is False


-def test_csv_json_check_with_file_and_utf_32(filename="example-docs/fake-text-utf-32.txt"):
-    with open(filename, "rb") as f:
-        assert _is_text_file_a_csv(file=f) is False
+def test_csv_and_json_checks_with_file_accommodate_utf_32_encoded_content():
+    with open(example_doc_path("fake-text-utf-32.txt"), "rb") as f:
+        file = io.BytesIO(f.read())

-    with open(filename, "rb") as f:
-        assert _is_text_file_a_json(file=f) is False
+    assert _is_text_file_a_csv(file=file) is False
+    file.seek(0)
+    assert _is_text_file_a_json(file=file) is False


-def test_detect_filetype_detects_empty_filename(filename="example-docs/empty.txt"):
-    assert detect_filetype(filename=filename) == FileType.EMPTY
+def test_detect_EMPTY_from_file_path_to_empty_file():
+    assert detect_filetype(example_doc_path("empty.txt")) == FileType.EMPTY


-def test_detect_filetype_detects_empty_file(filename="example-docs/empty.txt"):
-    with open(filename, "rb") as f:
+def test_detect_EMPTY_from_file_that_is_empty():
+    with open(example_doc_path("empty.txt"), "rb") as f:
        assert detect_filetype(file=f) == FileType.EMPTY


-def test_detect_filetype_skips_escape_commas_for_csv(tmpdir):
-    text = 'A,A,A,A,A\nA,A,A,"A,A",A\nA,A,A,"A,A",A'
-    filename = os.path.join(tmpdir.dirname, "csv-with-escaped-commas.csv")
-    with open(filename, "w") as f:
-        f.write(text)
+def test_detect_CSV_from_path_and_file_when_content_contains_escaped_commas():
+    file_path = example_doc_path("csv-with-escaped-commas.csv")

-    assert detect_filetype(filename=filename) == FileType.CSV
-
-    with open(filename, "rb") as f:
+    assert detect_filetype(filename=file_path) == FileType.CSV
+    with open(file_path, "rb") as f:
        assert detect_filetype(file=f) == FileType.CSV


-def test_detect_filetype_from_octet_stream(filename="example-docs/emoji.xlsx"):
-    with open(filename, "rb") as f:
+def test_detect_filetype_from_octet_stream():
+    with open(example_doc_path("emoji.xlsx"), "rb") as f:
        assert _detect_filetype_from_octet_stream(file=f) == FileType.XLSX


-def test_detect_wav_from_filename(filename="example-docs/CantinaBand3.wav"):
-    assert detect_filetype(filename=filename) == FileType.WAV
+def test_detect_WAV_from_filename():
+    assert detect_filetype(example_doc_path("CantinaBand3.wav")) == FileType.WAV


-def test_detect_wav_from_file(filename="example-docs/CantinaBand3.wav"):
-    with open(filename, "rb") as f:
+def test_detect_wav_from_file():
+    with open(example_doc_path("CantinaBand3.wav"), "rb") as f:
        assert detect_filetype(file=f) == FileType.WAV


-def test_detect_yaml_as_text_from_filename(tmpdir):
-    data = {"hi": "there", "this is": "yaml"}
-    filename = os.path.join(tmpdir.dirname, "test.yaml")
-    with open(filename, "w") as f:
-        yaml.dump(data, f)
-
-    assert detect_filetype(filename=filename) == FileType.TXT
+def test_detect_TXT_from_file_path_to_yaml():
+    assert detect_filetype(example_doc_path("simple.yaml")) == FileType.TXT


-def test_detect_yaml_as_text_from_file(tmpdir, monkeypatch):
-    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/yaml")
-    data = {"hi": "there", "this is": "yaml"}
-    filename = os.path.join(tmpdir.dirname, "test.yaml")
-    with open(filename, "w") as f:
-        yaml.dump(data, f)
+def test_detect_TXT_from_yaml_file(magic_from_buffer_: Mock):
+    magic_from_buffer_.return_value = "text/yaml"

-    with open(filename, "rb") as f:
-        assert detect_filetype(file=f) == FileType.TXT
+    with open(example_doc_path("simple.yaml"), "rb") as f:
+        head = f.read(4096)
+        f.seek(0)
+        file_type = detect_filetype(file=f)
+
+    magic_from_buffer_.assert_called_once_with(head, mime=True)
+    assert file_type == FileType.TXT
+
+
+# ================================================================================================
+# MODULE-LEVEL FIXTURES
+# ================================================================================================
+
+
+@pytest.fixture()
+def magic_from_buffer_(request: FixtureRequest):
+    return function_mock(request, "unstructured.file_utils.filetype.magic.from_buffer")
+
+
+@pytest.fixture()
+def magic_from_file_(request: FixtureRequest):
+    return function_mock(request, "unstructured.file_utils.filetype.magic.from_file")
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.15.0-dev11"  # pragma: no cover
+__version__ = "0.15.0-dev12"  # pragma: no cover