fix: refine filetype detection (#3828)

**Summary**

Fixes a bug where a CSV file with asserted content-type `application/vnd.ms-excel` was incorrectly identified as an XLS file and failed partitioning.

**Additional Context**

The `content_type` argument to partitioning is often authored by the client system (e.g. Unstructured SDK) and is both unreliable and outside the control of the user. In this case the `.csv -> XLS` mapping is correct for certain purposes (Excel is often used to load and edit CSV files) but not for partitioning, and the user has no readily available way to override the mapping.

XLS files as well as seven other common binary file types can be efficiently detected essentially 100% of the time (at least 99.999%) using code we already have in the file detector.

- Promote this direct-inspection strategy to be tried first.
- When DOC, DOCX, EPUB, ODT, PPT, PPTX, XLS, or XLSX is detected, use that file-type.
- When one of those types is NOT detected, clear the asserted `content_type` when it matches any of those types. This prevents the problem seen in the bug, where the asserted content type was used to determine the file-type.
- The remaining strategies (asserted content-type, guessed MIME-type, and filename-extension mapping) are tried, in that order, only when direct inspection fails. This is largely the same as it was before.
- Fix #3781 while we were in the neighborhood.
- Fix #3596 as well, essentially an earlier report of #3781.
2025-10-30 09:23:45 +00:00 · 2024-12-16 16:56:21 -08:00 · 2024-12-16 16:56:21 -08:00 · b5ff79d8db
commit b5ff79d8db
parent 10f0d54ac2
4 changed files with 224 additions and 479 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.16.12-dev2
+## 0.16.12-dev3
 ### Enhancements
@ -9,6 +9,7 @@
 ### Fixes
 - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
 - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
 ## 0.16.11
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -14,15 +14,14 @@ from test_unstructured.unit_utils import (
    LogCaptureFixture,
    Mock,
    example_doc_path,
    function_mock,
    patch,
    property_mock,
 )
 from unstructured.file_utils.filetype import (
    _FileTypeDetectionContext,
-    _OleFileDifferentiator,
+    _OleFileDetector,
    _TextFileDifferentiator,
-    _ZipFileDifferentiator,
+    _ZipFileDetector,
    detect_filetype,
    is_json_processable,
 )
@ -31,7 +30,41 @@ from unstructured.file_utils.model import FileType
 is_in_docker = os.path.exists("/.dockerenv")
 # ================================================================================================
-# STRATEGY #1 - CONTENT-TYPE ASSERTED IN CALL
+# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
 # ================================================================================================
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.DOC, "simple.doc"),
        (FileType.DOCX, "simple.docx"),
        (FileType.EPUB, "winter-sports.epub"),
        (FileType.ODT, "simple.odt"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.PPTX, "fake-power-point.pptx"),
        (FileType.XLS, "tests-example.xls"),
        (FileType.XLSX, "stanley-cups.xlsx"),
    ],
)
def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direct_inspection(
    file_name: str, expected_value: FileType, ctx_mime_type_: Mock
) -> None:
    """Each of the eight CFB/ZIP-based types is detected from the file bytes alone.

    The call asserts no content-type and passes an in-memory stream, and the mocked MIME-type
    guess is `None`, so only direct inspection of the file can produce the expected file-type.
    """
    # -- a guessed MIME-type of `None` means the guessing strategy could not help if consulted --
    ctx_mime_type_.return_value = None
    # -- load the example document into an in-memory stream and pass only that --
    with open(example_doc_path(file_name), "rb") as doc_file:
        payload = doc_file.read()

    detected = detect_filetype(file=io.BytesIO(payload))

    # -- direct inspection must resolve these types without ever consulting the guessed
    # -- MIME-type, i.e. without falling back to MIME-type guessing.
    ctx_mime_type_.assert_not_called()
    assert detected == expected_value
 # ================================================================================================
 # STRATEGY #2 - CONTENT-TYPE ASSERTED IN CALL
 # ================================================================================================
@ -40,41 +73,21 @@ is_in_docker = os.path.exists("/.dockerenv")
    [
        (FileType.BMP, "img/bmp_24.bmp", "image/bmp"),
        (FileType.CSV, "stanley-cups.csv", "text/csv"),
        (FileType.DOC, "simple.doc", "application/msword"),
        (
            FileType.DOCX,
            "simple.docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
        (FileType.EPUB, "winter-sports.epub", "application/epub+zip"),
        (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
        (FileType.HTML, "example-10k-1p.html", "text/html"),
        (FileType.JPG, "img/example.jpg", "image/jpeg"),
        (FileType.JSON, "spring-weather.html.json", "application/json"),
        (FileType.MD, "README.md", "text/markdown"),
        (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
        (FileType.ORG, "README.org", "text/org"),
        (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
        (FileType.PNG, "img/DA-1p.png", "image/png"),
        (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
        (
            FileType.PPTX,
            "fake-power-point.pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        (FileType.RST, "README.rst", "text/x-rst"),
        (FileType.RTF, "fake-doc.rtf", "text/rtf"),
        (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"),
        (FileType.TSV, "stanley-cups.tsv", "text/tsv"),
        (FileType.TXT, "norwich-city.txt", "text/plain"),
        (FileType.WAV, "CantinaBand3.wav", "audio/wav"),
        (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"),
        (
            FileType.XLSX,
            "stanley-cups.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (FileType.XML, "factbook.xml", "application/xml"),
        (FileType.ZIP, "simple.zip", "application/zip"),
    ],
@ -82,13 +95,13 @@ is_in_docker = os.path.exists("/.dockerenv")
 def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type(
    file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
-    # -- disable strategy #2, leaving only asserted content-type and extension --
+    # -- disable mime-guessing leaving only asserted content-type and extension --
    ctx_mime_type_.return_value = None
    file_type = detect_filetype(example_doc_path(file_name), content_type=content_type)
-    # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
+    # -- Content-type strategy should not need to refer to guessed MIME-type and detection should
-    # -- fall back to strategy 2 for any of these test cases.
+    # -- not fall back to MIME-type guessing for any of these test cases.
    ctx_mime_type_.assert_not_called()
    assert file_type == expected_value
@ -98,41 +111,21 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
    [
        (FileType.BMP, "img/bmp_24.bmp", "image/bmp"),
        (FileType.CSV, "stanley-cups.csv", "text/csv"),
        (FileType.DOC, "simple.doc", "application/msword"),
        (
            FileType.DOCX,
            "simple.docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
        (FileType.EPUB, "winter-sports.epub", "application/epub+zip"),
        (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
        (FileType.HTML, "example-10k-1p.html", "text/html"),
        (FileType.JPG, "img/example.jpg", "image/jpeg"),
        (FileType.JSON, "spring-weather.html.json", "application/json"),
        (FileType.MD, "README.md", "text/markdown"),
        (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
        (FileType.ORG, "README.org", "text/org"),
        (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
        (FileType.PNG, "img/DA-1p.png", "image/png"),
        (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
        (
            FileType.PPTX,
            "fake-power-point.pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        (FileType.RST, "README.rst", "text/x-rst"),
        (FileType.RTF, "fake-doc.rtf", "text/rtf"),
        (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"),
        (FileType.TSV, "stanley-cups.tsv", "text/tsv"),
        (FileType.TXT, "norwich-city.txt", "text/plain"),
        (FileType.WAV, "CantinaBand3.wav", "audio/wav"),
        (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"),
        (
            FileType.XLSX,
            "stanley-cups.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (FileType.XML, "factbook.xml", "application/xml"),
        (FileType.ZIP, "simple.zip", "application/zip"),
    ],
@ -140,93 +133,22 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
 def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_content_type(
    file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
-    # -- disable strategy #2 (guessed mime-type) --
+    # -- disable mime-guessing --
    ctx_mime_type_.return_value = None
-    # -- disable strategy #3 (filename extension) by supplying no source of file name --
+    # -- disable filename extension mapping by supplying no source of file name --
    with open(example_doc_path(file_name), "rb") as f:
        file = io.BytesIO(f.read())
    file_type = detect_filetype(file=file, content_type=content_type)
-    # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
+    # -- Content-type strategy should not need to refer to guessed MIME-type and detection should
-    # -- fall-back to strategy 2 for any of these test cases.
+    # -- not fall back to MIME-type guessing for any of these test cases.
    ctx_mime_type_.assert_not_called()
    assert file_type is expected_value
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.DOCX, "simple.docx"),
        (FileType.PPTX, "fake-power-point.pptx"),
        (FileType.XLSX, "stanley-cups.xlsx"),
    ],
)
@pytest.mark.parametrize(
    "content_type",
    [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ],
)
def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_content_type(
    file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
) -> None:
    """The right modern MS-Office type is detected whichever Office content-type is asserted.

    Every example file is paired with every one of the three OpenXML content-types, so two of
    each three cases assert a content-type belonging to a different Office format.
    """
    # -- the mocked MIME-type guess is `None` and the file is passed as an in-memory stream --
    ctx_mime_type_.return_value = None
    with open(example_doc_path(file_name), "rb") as doc_file:
        stream = io.BytesIO(doc_file.read())

    detected = detect_filetype(file=stream, content_type=content_type)

    # -- the result must be reached without consulting the guessed MIME-type at all --
    ctx_mime_type_.assert_not_called()
    assert detected is expected_value
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.DOC, "simple.doc"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.XLS, "tests-example.xls"),
    ],
)
@pytest.mark.parametrize(
    "content_type",
    [
        "application/msword",
        "application/vnd.ms-outlook",
        "application/vnd.ms-powerpoint",
        "application/vnd.ms-excel",
        "anything/else",
    ],
)
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type(
    file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
) -> None:
    """A DOC, PPT, or XLS file is identified correctly despite a wrong asserted content-type.

    For example, an XLS file asserted to be a DOC or PPT is still reported as XLS. The asserted
    content-type may be any value other than `None`.
    """
    # -- the mocked MIME-type guess is `None` and the file is passed as an in-memory stream --
    ctx_mime_type_.return_value = None
    with open(example_doc_path(file_name), "rb") as doc_file:
        stream = io.BytesIO(doc_file.read())

    detected = detect_filetype(file=stream, content_type=content_type)

    # -- the result must be reached without consulting the guessed MIME-type at all --
    ctx_mime_type_.assert_not_called()
    assert detected is expected_value
 # ================================================================================================
-# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC
+# STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY
 # ================================================================================================
@ -237,31 +159,16 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_
        (FileType.CSV, "stanley-cups.csv", "text/csv"),
        (FileType.CSV, "stanley-cups.csv", "application/csv"),
        (FileType.CSV, "stanley-cups.csv", "application/x-csv"),
        (FileType.DOC, "simple.doc", "application/msword"),
        (
            FileType.DOCX,
            "simple.docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
        (FileType.EPUB, "winter-sports.epub", "application/epub"),
        (FileType.EPUB, "winter-sports.epub", "application/epub+zip"),
        (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
        (FileType.HTML, "example-10k-1p.html", "text/html"),
        (FileType.JPG, "img/example.jpg", "image/jpeg"),
        (FileType.JSON, "spring-weather.html.json", "application/json"),
        (FileType.MD, "README.md", "text/markdown"),
        (FileType.MD, "README.md", "text/x-markdown"),
        (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
        (FileType.ORG, "README.org", "text/org"),
        (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
        (FileType.PNG, "img/DA-1p.png", "image/png"),
        (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
        (
            FileType.PPTX,
            "fake-power-point.pptx",
            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        ),
        (FileType.RST, "README.rst", "text/x-rst"),
        (FileType.RTF, "fake-doc.rtf", "text/rtf"),
        (FileType.RTF, "fake-doc.rtf", "application/rtf"),
@ -270,18 +177,11 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_
        (FileType.TXT, "norwich-city.txt", "text/plain"),
        (FileType.TXT, "simple.yaml", "text/yaml"),
        (FileType.WAV, "CantinaBand3.wav", "audio/wav"),
        (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"),
        (
            FileType.XLSX,
            "stanley-cups.xlsx",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        ),
        (FileType.XML, "factbook.xml", "application/xml"),
        (FileType.XML, "factbook.xml", "text/xml"),
        (FileType.ZIP, "simple.zip", "application/zip"),
    ],
 )
-def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_recognized_mime_type(
+def test_it_detects_correct_file_type_by_guessed_MIME_when_libmagic_guesses_recognized_mime_type(
    file_name: str, mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
    # -- libmagic guesses a MIME-type mapped to a `FileType` --
@ -290,7 +190,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
    with open(example_doc_path(file_name), "rb") as f:
        file = io.BytesIO(f.read())
-    # -- disable strategy #1 by not asserting a content_type in the call --
+    # -- disable content-type strategy by not asserting a content_type in the call --
    file_type = detect_filetype(file=file)
    # -- ctx.mime_type may be referenced multiple times, but at least once --
@ -303,30 +203,22 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
    [
        (FileType.BMP, "img/bmp_24.bmp"),
        (FileType.CSV, "stanley-cups.csv"),
        (FileType.DOC, "simple.doc"),
        (FileType.DOCX, "simple.docx"),
        (FileType.EML, "eml/fake-email.eml"),
        (FileType.EPUB, "winter-sports.epub"),
        (FileType.HEIC, "img/DA-1p.heic"),
        (FileType.HTML, "ideas-page.html"),
        (FileType.JPG, "img/example.jpg"),
        (FileType.JSON, "spring-weather.html.json"),
        (FileType.ODT, "simple.odt"),
        (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
        (FileType.PNG, "img/DA-1p.png"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.PPTX, "fake-power-point.pptx"),
        (FileType.RTF, "fake-doc.rtf"),
        (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
        (FileType.TXT, "norwich-city.txt"),
        (FileType.WAV, "CantinaBand3.wav"),
        (FileType.XLS, "tests-example.xls"),
        (FileType.XLSX, "stanley-cups.xlsx"),
        (FileType.XML, "factbook.xml"),
        (FileType.ZIP, "simple.zip"),
    ],
 )
-def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_type_for_itself(
+def test_it_detects_most_file_types_using_mime_guessing_when_libmagic_guesses_mime_type_for_itself(
    file_name: str, expected_value: FileType
 ):
    """Does not work for all types, in particular:
@ -339,90 +231,26 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
    - ORG is identified as TXT
    - RST is identified as TXT
    """
-    # -- disable strategy #1 by not asserting a content_type in the call --
+    # -- disable content-type strategy by not asserting a content_type in the call --
-    # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute --
+    # -- disable extension-mapping strategy by passing file-like object with no `.name` attribute --
    with open(example_doc_path(file_name), "rb") as f:
        file = io.BytesIO(f.read())
    assert detect_filetype(file=file) is expected_value
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.DOC, "simple.doc"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.XLS, "tests-example.xls"),
    ],
)
@pytest.mark.parametrize(
    "guessed_mime_type",
    [
        "application/msword",
        "application/vnd.ms-excel",
        "application/vnd.ms-outlook",
        "application/vnd.ms-powerpoint",
        "application/x-ole-storage",
        "anything/else",
    ],
)
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type(
    file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
) -> None:
    """A DOC, PPT, or XLS file is identified correctly whatever MIME-type is guessed for it.

    This covers e.g. an XLS file wrongly guessed as DOC, PPT, or "application/x-ole-storage".
    """
    # -- the mocked context reports the (possibly wrong) guessed MIME-type --
    ctx_mime_type_.return_value = guessed_mime_type
    # -- an in-memory stream is passed and no content-type is asserted in the call --
    with open(example_doc_path(file_name), "rb") as doc_file:
        stream = io.BytesIO(doc_file.read())

    detected = detect_filetype(file=stream)

    ctx_mime_type_.assert_called_with()
    assert detected is expected_value
@pytest.mark.parametrize(
    ("filename", "mime_type", "expected"),
    [
        ("fake.doc", "application/vnd.ms-excel", FileType.DOC),
        ("fake-power-point.ppt", "application/vnd.ms-excel", FileType.PPT),
        ("tests-example.xls", "application/msword", FileType.XLS),
        ("fake-email.msg", "application/vnd.ms-excel", FileType.MSG),
    ],
)
def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, expected):
    """The file-type detected for an OLE file wins over a contradicting MIME-type guess.

    `filetype.guess_mime` is patched to return `mime_type`, which names a different OLE format
    than the example file actually is; detection must still report `expected`.
    """

    def _guess_mime(*args, **kwargs):
        # -- stand-in for `filetype.guess_mime` that ignores its input --
        return mime_type

    with patch("filetype.guess_mime", _guess_mime):
        # -- the original compared without `assert`, so the comparison result was discarded
        # -- and this test could never fail.
        assert detect_filetype(example_doc_path(filename)) == expected
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        # -- `filetype` lib recognizes all these binary file-types --
        (FileType.BMP, "img/bmp_24.bmp"),
        (FileType.DOC, "simple.doc"),
        (FileType.DOCX, "simple.docx"),
        (FileType.EPUB, "winter-sports.epub"),
        (FileType.HEIC, "img/DA-1p.heic"),
        (FileType.JPG, "img/example.jpg"),
        (FileType.ODT, "simple.odt"),
        (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
        (FileType.PNG, "img/DA-1p.png"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.PPTX, "fake-power-point.pptx"),
        (FileType.RTF, "fake-doc.rtf"),
        (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
        (FileType.WAV, "CantinaBand3.wav"),
        (FileType.XLS, "tests-example.xls"),
        (FileType.XLSX, "stanley-cups.xlsx"),
        (FileType.ZIP, "simple.zip"),
        # -- but it doesn't recognize textual file-types at all --
        (FileType.UNK, "stanley-cups.csv"),
@ -435,11 +263,9 @@ def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, ex
        (FileType.UNK, "stanley-cups.tsv"),
        (FileType.UNK, "norwich-city.txt"),
        (FileType.UNK, "factbook.xml"),
        # -- and it doesn't recognize MSG files --
        (FileType.UNK, "fake-email.msg"),
    ],
 )
-def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailable(
+def test_strategy_mime_guessing_can_detect_only_binary_file_types_when_libmagic_is_unavailable(
    file_name: str, expected_value: FileType, LIBMAGIC_AVAILABLE_False: bool
 ):
    """File-type is detected using `filetype` library when libmagic is not available.
@ -447,7 +273,7 @@ def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailab
    `filetype.guess_mime()` does a good job on binary file types (PDF, images, legacy MS-Office),
    but doesn't even try to guess textual file-types.
    """
-    # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute --
+    # -- disable detection by extension by passing file-like object with no `.name` attribute --
    with open(example_doc_path(file_name), "rb") as f:
        file = io.BytesIO(f.read())
    # -- simulate libmagic is not available --
@ -470,7 +296,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
 # ================================================================================================
-# STRATEGY #3 - MAP FILENAME EXTENSION TO FILETYPE
+# STRATEGY #4 - MAP FILENAME EXTENSION TO FILETYPE
 # ================================================================================================
@ -479,35 +305,25 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
    [
        (FileType.BMP, "img/bmp_24.bmp"),
        (FileType.CSV, "stanley-cups.csv"),
        (FileType.DOC, "simple.doc"),
        (FileType.DOCX, "simple.docx"),
        (FileType.EML, "eml/fake-email.eml"),
        (FileType.EPUB, "winter-sports.epub"),
        (FileType.HEIC, "img/DA-1p.heic"),
        (FileType.HTML, "example-10k-1p.html"),
        (FileType.JPG, "img/example.jpg"),
        (FileType.JSON, "spring-weather.html.json"),
        (FileType.MD, "README.md"),
        (FileType.MSG, "fake-email.msg"),
        (FileType.ODT, "simple.odt"),
        (FileType.ORG, "README.org"),
        (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
        (FileType.PNG, "img/DA-1p.png"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.PPTX, "fake-power-point.pptx"),
        (FileType.RST, "README.rst"),
        (FileType.RTF, "fake-doc.rtf"),
        (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
        (FileType.TSV, "stanley-cups.tsv"),
        (FileType.TXT, "norwich-city.txt"),
        (FileType.WAV, "CantinaBand3.wav"),
        (FileType.XLS, "tests-example.xls"),
        (FileType.XLSX, "stanley-cups.xlsx"),
        (FileType.XML, "factbook.xml"),
        (FileType.ZIP, "simple.zip"),
    ],
 )
-def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_file_type(
+def test_it_detects_correct_file_type_from_extension_when_that_maps_to_a_file_type(
    file_name: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
    # -- disable MIME-type guessing by making libmagic always guess `None` --
@ -525,10 +341,8 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil
@pytest.mark.parametrize(
    ("expected_value", "file_name", "mime_type"),
    [
-        (FileType.BMP, "img/bmp_24.bmp", "application/zip"),
+        (FileType.BMP, "img/bmp_24.bmp", "application/octet-stream"),
-        (FileType.DOC, "simple.doc", None),
+        (FileType.HEIC, "img/DA-1p.heic", "application/octet-stream"),
        (FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"),
        (FileType.MSG, "fake-email.msg", "application/octet-stream"),
    ],
 )
 def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail(
@ -547,6 +361,12 @@ def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail(
 # ================================================================================================
@pytest.mark.parametrize("mime_type", [FileType.XLS.mime_type, FileType.XLSX.mime_type])
def test_it_ignores_asserted_XLS_content_type_when_file_is_CSV(mime_type: str) -> None:
    """A CSV file is reported as CSV even when an XLS or XLSX content-type is asserted."""
    csv_path = example_doc_path("stanley-cups.csv")

    detected = detect_filetype(csv_path, content_type=mime_type)

    assert detected == FileType.CSV
@pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"])
@pytest.mark.parametrize("extension", [".html", ".htm"])
 def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension(
@ -563,39 +383,6 @@ def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extensi
    assert file_type is FileType.HTML
@pytest.mark.parametrize(
    "mime_type",
    [
        "application/octet-stream",
        "application/zip",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ],
)
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.DOCX, "simple.docx"),
        (FileType.PPTX, "fake-power-point.pptx"),
        (FileType.XLSX, "stanley-cups.xlsx"),
        (FileType.ZIP, "simple.zip"),
    ],
)
def test_it_differentiates_files_when_libmagic_guesses_octet_stream_zip_or_modern_ms_office(
    mime_type: str, file_name: str, expected_value: FileType, ctx_mime_type_: Mock
) -> None:
    """Zip-based files are told apart for each of the listed guessed MIME-types.

    Every example file is paired with every guessed MIME-type in the list, whether or not that
    MIME-type matches the file's real format.
    """
    ctx_mime_type_.return_value = mime_type
    # -- an in-memory stream is passed and no content-type is asserted in the call --
    with open(example_doc_path(file_name), "rb") as doc_file:
        stream = io.BytesIO(doc_file.read())

    detected = detect_filetype(file=stream)

    ctx_mime_type_.assert_called_with()
    assert detected is expected_value
@pytest.mark.parametrize(
    ("mime_type", "file_name"),
    [
@ -1000,29 +787,8 @@ class Describe_FileTypeDetectionContext:
        return property_mock(request, _FileTypeDetectionContext, "mime_type")
-class Describe_OleFileDifferentiator:
+class Describe_OleFileDetector:
-    """Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`."""
+    """Unit-test suite for `unstructured.file_utils.filetype._OleFileDetector`."""
    # -- .applies() ---------------------------------------------
    def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
        """`.applies()` returns a differentiator instance when it is applicable.

        An instance is returned only when a CFBF file-type needs differentiating, which is judged
        by inspecting the initial bytes of the file for the CFBF magic-bytes.
        """
        doc_path = example_doc_path("simple.doc")
        ctx = _FileTypeDetectionContext(doc_path)

        result = _OleFileDifferentiator.applies(ctx, "foo/bar")

        assert result is not None
        assert isinstance(result, _OleFileDifferentiator)
    def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self):
        """`.applies()` returns `None` for the EPUB example file."""
        epub_path = example_doc_path("winter-sports.epub")
        ctx = _FileTypeDetectionContext(epub_path)

        result = _OleFileDifferentiator.applies(ctx, "application/epub")

        assert result is None
    # -- .file_type ---------------------------------------------
    @pytest.mark.parametrize(
        ("file_name", "expected_value"),
@ -1034,59 +800,15 @@ class Describe_OleFileDifferentiator:
            ("README.org", None),
        ],
    )
-    def it_distinguishes_the_file_type_of_applicable_OLE_files(
+    def it_distinguishes_the_file_type_of_applicable_CFB_files(
        self, file_name: str, expected_value: FileType | None
    ):
        # -- no file-name available, just to make sure we're not relying on an extension --
        with open(example_doc_path(file_name), "rb") as f:
            file = io.BytesIO(f.read())
        ctx = _FileTypeDetectionContext(file=file)
        differentiator = _OleFileDifferentiator(ctx)
-        assert differentiator.file_type is expected_value
+        assert _OleFileDetector.file_type(ctx) is expected_value
    @pytest.mark.parametrize(
        ("file_name", "expected_value"),
        [
            ("simple.doc", FileType.DOC),
            ("fake-power-point.ppt", FileType.PPT),
            ("tests-example.xls", FileType.XLS),
            ("fake-email.msg", FileType.MSG),
        ],
    )
    def it_distinguishes_the_file_type_of_applicable_OLE_files_from_storage_content(
        self, file_name: str, expected_value: FileType | None
    ):
        """`._check_ole_file_type()` reports the expected type for each OLE example file."""
        # -- an in-memory stream carries no file-name, so an extension cannot be relied on --
        with open(example_doc_path(file_name), "rb") as doc_file:
            stream = io.BytesIO(doc_file.read())
        ctx = _FileTypeDetectionContext(file=stream)
        differentiator = _OleFileDifferentiator(ctx)

        detected = differentiator._check_ole_file_type(ctx)

        assert detected is expected_value
    def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime(
        self, guess_mime_: Mock
    ):
        """`.file_type` is `None` when OLE inspection and `guess_mime` both yield nothing."""

        def _no_ole_file_type(ctx):
            # -- stand-in for `._check_ole_file_type()` that never identifies a type --
            return None

        guess_mime_.return_value = None
        # -- an in-memory stream carries no file-name, so an extension cannot be relied on --
        with open(example_doc_path("fake-email.msg"), "rb") as msg_file:
            file = io.BytesIO(msg_file.read())
        ctx = _FileTypeDetectionContext(file=file)
        differentiator = _OleFileDifferentiator(ctx)
        # -- force the storage check to return `None` so the MIME-type guess gets triggered --
        differentiator._check_ole_file_type = _no_ole_file_type

        file_type = differentiator.file_type

        guess_mime_.assert_called_once_with(file)
        assert file_type is None
    # -- fixtures --------------------------------------------------------------------------------
    @pytest.fixture
    def guess_mime_(self, request: FixtureRequest):
        """Mock replacing `unstructured.file_utils.filetype.ft.guess_mime` for the test."""
        target = "unstructured.file_utils.filetype.ft.guess_mime"
        return function_mock(request, target)
 class Describe_TextFileDifferentiator:
@ -1164,33 +886,15 @@ class Describe_TextFileDifferentiator:
        assert differentiator._is_json is expected_value
-class Describe_ZipFileDifferentiator:
+class Describe_ZipFileDetector:
-    """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDifferentiator`."""
+    """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDetector`."""
    # -- .applies() ---------------------------------------------
    def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
        """The constructor determines whether this differentiator is applicable.
        It returns an instance only when differentiating a zip file-type is required, which it can
        judge from the mime-type provided by the context (`ctx`).
        """
        ctx = _FileTypeDetectionContext(example_doc_path("simple.docx"))
        differentiator = _ZipFileDifferentiator.applies(ctx, "application/zip")
        assert isinstance(differentiator, _ZipFileDifferentiator)
    def and_it_returns_None_when_zip_differentiation_does_not_apply_to_the_detection_context(self):
        ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt"))
        assert _ZipFileDifferentiator.applies(ctx, "application/epub") is None
    # -- .file_type ---------------------------------------------
    @pytest.mark.parametrize(
        ("file_name", "expected_value"),
        [
            ("simple.docx", FileType.DOCX),
            ("winter-sports.epub", FileType.EPUB),
            ("simple.odt", FileType.ODT),
            ("picture.pptx", FileType.PPTX),
            ("vodafone.xlsx", FileType.XLSX),
            ("simple.zip", FileType.ZIP),
@ -1201,6 +905,4 @@ class Describe_ZipFileDifferentiator:
        self, file_name: str, expected_value: FileType | None
    ):
        ctx = _FileTypeDetectionContext(example_doc_path(file_name))
-        differentiator = _ZipFileDifferentiator(ctx)
+        assert _ZipFileDetector.file_type(ctx) is expected_value
        assert differentiator.file_type is expected_value
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.16.12-dev2"  # pragma: no cover
+__version__ = "0.16.12-dev3"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -51,7 +51,11 @@ from unstructured.partition.common.common import add_element_metadata, exactly_o
 from unstructured.partition.common.metadata import set_element_hierarchy
 from unstructured.utils import get_call_args_applying_defaults, lazyproperty
-LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
+try:
    importlib.import_module("magic")
    LIBMAGIC_AVAILABLE = True
 except ImportError:
    LIBMAGIC_AVAILABLE = False  # pyright: ignore[reportConstantRedefinition]
 def detect_filetype(
@ -133,43 +137,57 @@ class _FileTypeDetector:
    @property
    def _file_type(self) -> FileType:
        """FileType member corresponding to this document source."""
-        # -- strategy 1: use content-type asserted by caller --
+        # -- An explicit content-type is most commonly asserted by the client/SDK and is therefore
        # -- inherently unreliable. On the other hand, binary file-types can be detected with 100%
        # -- accuracy. So start with binary types and only then consider an asserted content-type,
        # -- generally as a last resort.
        # -- strategy 1: most binary types can be detected with 100% accuracy --
        if file_type := self._known_binary_file_type:
            return file_type
        # -- strategy 2: use content-type asserted by caller --
        if file_type := self._file_type_from_content_type:
            return file_type
-        # -- strategy 2: guess MIME-type using libmagic and use that --
+        # -- strategy 3: guess MIME-type using libmagic and use that --
        if file_type := self._file_type_from_guessed_mime_type:
            return file_type
-        # -- strategy 3: use filename-extension, like ".docx" -> FileType.DOCX --
+        # -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX --
        if file_type := self._file_type_from_file_extension:
            return file_type
-        # -- strategy 4: give up and report FileType.UNK --
+        # -- strategy 5: give up and report FileType.UNK --
        return FileType.UNK
    # == STRATEGIES ============================================================
    @property
-    def _file_type_from_content_type(self) -> FileType | None:
+    def _known_binary_file_type(self) -> FileType | None:
-        """Map passed content-type argument to a file-type, subject to certain rules."""
+        """Detect file-type for binary types we can positively detect."""
-        content_type = self._ctx.content_type
+        if file_type := _OleFileDetector.file_type(self._ctx):
            return file_type
        self._ctx.rule_out_cfb_content_types()
        if file_type := _ZipFileDetector.file_type(self._ctx):
            return file_type
        self._ctx.rule_out_zip_content_types()
        # -- when no content-type was asserted by caller, this strategy is not applicable --
        if not content_type:
        return None
-        # -- OLE-based file-format content_type values are sometimes unreliable. These are
+    @property
-        # -- DOC, PPT, XLS, and MSG.
+    def _file_type_from_content_type(self) -> FileType | None:
-        if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type):
+        """Map passed content-type argument to a file-type, subject to certain rules."""
            return differentiator.file_type
-        # -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable --
+        # -- when no content-type was asserted by caller, this strategy is not applicable --
-        if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type):
+        if not self._ctx.content_type:
-            return differentiator.file_type
+            return None
        # -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it --
-        return FileType.from_mime_type(content_type)
+        return FileType.from_mime_type(self._ctx.content_type)
    @property
    def _file_type_from_guessed_mime_type(self) -> FileType | None:
@ -188,24 +206,12 @@ class _FileTypeDetector:
        if mime_type is None:
            return None
        if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type):
            return differentiator.file_type
        if mime_type.endswith("xml"):
            return FileType.HTML if extension in (".html", ".htm") else FileType.XML
        if differentiator := _TextFileDifferentiator.applies(self._ctx):
            return differentiator.file_type
        # -- applicable to "application/octet-stream", "application/zip", and all Office 2007+
        # -- document MIME-types, i.e. those for DOCX, PPTX, and XLSX. Note however it does NOT
        # -- apply to EPUB or ODT documents, even though those are also Zip archives. The zip and
        # -- octet-stream MIME-types are fed in because they are ambiguous. The MS-Office types are
        # -- differentiated because they are sometimes mistaken for each other, like DOCX mime-type
        # -- is actually a PPTX file etc.
        if differentiator := _ZipFileDifferentiator.applies(self._ctx, mime_type):
            return differentiator.file_type
        # -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment --
        if self._ctx.has_code_mime_type:
            return FileType.TXT
@ -214,14 +220,8 @@ class _FileTypeDetector:
            return FileType.EMPTY
        # -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present --
-        if file_type := FileType.from_mime_type(mime_type):
+        file_type = FileType.from_mime_type(mime_type)
-            return file_type
+        return file_type if file_type != FileType.UNK else None
        logger.warning(
            f"The MIME type{f' of {self._ctx.file_path!r}' if self._ctx.file_path else ''} is"
            f" {mime_type!r}. This file type is not currently supported in unstructured.",
        )
        return None
    @lazyproperty
    def _file_type_from_file_extension(self) -> FileType | None:
@ -236,6 +236,9 @@ class _FileTypeDetector:
 class _FileTypeDetectionContext:
    """Provides all arguments to auto-file detection and values derived from them.
    NOTE that `._content_type` is mutable via `.rule_out_*_content_types()` methods, so it should
    not be assumed to be a constant value across those calls.
    This keeps computation of derived values out of the file-detection code but more importantly
    allows the main filetype-detector to pass the full context to any delegates without coupling
    itself to which values it might need.
@ -276,7 +279,7 @@ class _FileTypeDetectionContext:
        self._validate()
        return self
-    @lazyproperty
+    @property
    def content_type(self) -> str | None:
        """MIME-type asserted by caller; not based on inspection of file by this process.
@ -284,6 +287,8 @@ class _FileTypeDetectionContext:
        present on the response. These are often ambiguous and sometimes just wrong so get some
        further verification. All lower-case when not `None`.
        """
        # -- Note `._content_type` is mutable via the `.rule_out_*_content_types()` methods so this
        # -- cannot be a `@lazyproperty`.
        return self._content_type.lower() if self._content_type else None
    @lazyproperty
@ -327,12 +332,6 @@ class _FileTypeDetectionContext:
        return os.path.realpath(file_path) if os.path.islink(file_path) else file_path
    @lazyproperty
    def is_zipfile(self) -> bool:
        """True when file is a Zip archive."""
        with self.open() as file:
            return zipfile.is_zipfile(file)
    @lazyproperty
    def has_code_mime_type(self) -> bool:
        """True when `mime_type` plausibly indicates a programming language source-code file."""
@ -347,9 +346,27 @@ class _FileTypeDetectionContext:
        return any(
            lang in mime_type
-            for lang in "c# c++ cpp csharp java javascript php python ruby swift typescript".split()
+            for lang in [
                "c#",
                "c++",
                "cpp",
                "csharp",
                "java",
                "javascript",
                "php",
                "python",
                "ruby",
                "swift",
                "typescript",
            ]
        )
    @lazyproperty
    def is_zipfile(self) -> bool:
        """Whether the document source is a Zip archive.

        Delegates the decision to `zipfile.is_zipfile()`, applied to the file-like object provided
        by `.open()`.
        """
        with self.open() as source:
            is_zip = zipfile.is_zipfile(source)
        return is_zip
    @lazyproperty
    def mime_type(self) -> str | None:
        """The best MIME-type we can get from `magic` (or `filetype` package).
@ -401,6 +418,38 @@ class _FileTypeDetectionContext:
            file.seek(0)
            yield file
    def rule_out_cfb_content_types(self) -> None:
        """Discard the asserted content-type when it names a CFB-based (legacy MS-Office) format.

        Used before returning `None` from CFB detection; at that point we know the file is not one
        of these formats, so an asserted content-type of DOC, MSG, PPT, or XLS is known to be wrong
        and must not be used as a fallback later in the detection process.

        Clears `._content_type` in place; `.content_type` reports `None` afterward.
        """
        # -- Test the normalized (lower-cased) `.content_type`, the same value a later content-type
        # -- lookup maps to a file-type, so a mixed-case assertion cannot slip past this check.
        # -- `.content_type` is a plain property (not cached) so reading it here is safe.
        if FileType.from_mime_type(self.content_type) in (
            FileType.DOC,
            FileType.MSG,
            FileType.PPT,
            FileType.XLS,
        ):
            self._content_type = None
    def rule_out_zip_content_types(self) -> None:
        """Discard the asserted content-type when it names a Zip-based format.

        Used before returning `None` from Zip detection; at that point we know the file is not one
        of these formats, so an asserted content-type of DOCX, EPUB, ODT, PPTX, XLSX, or ZIP is
        known to be wrong and must not be used as a fallback later in the detection process.

        Clears `._content_type` in place; `.content_type` reports `None` afterward.
        """
        # -- Test the normalized (lower-cased) `.content_type`, the same value a later content-type
        # -- lookup maps to a file-type, so a mixed-case assertion cannot slip past this check.
        # -- `.content_type` is a plain property (not cached) so reading it here is safe.
        if FileType.from_mime_type(self.content_type) in (
            FileType.DOCX,
            FileType.EPUB,
            FileType.ODT,
            FileType.PPTX,
            FileType.XLSX,
            FileType.ZIP,
        ):
            self._content_type = None
    @lazyproperty
    def text_head(self) -> str:
        """The initial characters of the text file for use with text-format differentiation.
@ -440,27 +489,23 @@ class _FileTypeDetectionContext:
            raise ValueError("either `file_path` or `file` argument must be provided")
-class _OleFileDifferentiator:
+class _OleFileDetector:
-    """Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be.
+    """Detect and differentiate a CFB file, aka. "OLE" file.
-    Compound File Binary Format (CFBF), aka. OLE file, is use by Microsoft for legacy MS Office
+    Compound File Binary Format (CFB), aka. OLE file, is used by Microsoft for legacy MS Office
-    files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as
+    files (DOC, PPT, XLS) as well as for Outlook MSG files.
    `"application/x-ole-storage"` which is true but too not specific enough for partitioning
    purposes.
    """
    def __init__(self, ctx: _FileTypeDetectionContext):
        self._ctx = ctx
    @classmethod
-    def applies(
+    def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
-        cls, ctx: _FileTypeDetectionContext, mime_type: str
+        """Specific file-type when file is a CFB file, `None` otherwise."""
-    ) -> _OleFileDifferentiator | None:
+        return cls(ctx)._file_type
        """Constructs an instance, but only if this differentiator applies for `mime_type`."""
        return cls(ctx) if cls._is_ole_file(ctx) else None
    @property
-    def file_type(self) -> FileType | None:
+    def _file_type(self) -> FileType | None:
        """Differentiated file-type for Microsoft Compound File Binary Format (CFBF).
        Returns one of:
@ -468,34 +513,27 @@ class _OleFileDifferentiator:
        - `FileType.PPT`
        - `FileType.XLS`
        - `FileType.MSG`
        - `None` when the file is not one of these.
        """
-        # -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return
+        # -- all CFB files share common magic number, start with that --
-        # -- `None` to trigger fall-back to next strategy.
+        if not self._is_ole_file:
        if not self._is_ole_file(self._ctx):
            return None
-        # -- check storage contents of the ole file for file type markers
+        # -- check storage contents of the ole file for file-type specific stream names --
-        if (ole_file_type := self._check_ole_file_type(self._ctx)) is not None:
+        if (ole_file_type := self._ole_file_type) is not None:
            return ole_file_type
-        # -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so we rely on it
+        return None
-        # -- to differentiate those. Note `filetype` doesn't detect MSG type and won't always
+
-        # -- detect DOC, PPT, or XLS, returning `None` instead. We let those fall through and we
+    @lazyproperty
-        # -- rely on filename-extension to identify those.
+    def _is_ole_file(self) -> bool:
        """True when file has CFB magic first 8 bytes."""
        with self._ctx.open() as file:
            mime_type = ft.guess_mime(file)
        return FileType.from_mime_type(mime_type) if mime_type else None
    @staticmethod
    def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
        """True when file has CFBF magic first 8 bytes."""
        with ctx.open() as file:
            return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
-    @staticmethod
+    @lazyproperty
-    def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
+    def _ole_file_type(self) -> FileType | None:
-        with ctx.open() as f:
+        with self._ctx.open() as f:
            ole = OleFileIO(f)  # pyright: ignore[reportUnknownVariableType]
            root_storage = Storage.from_ole(ole)  # pyright: ignore[reportUnknownMemberType]
@ -537,7 +575,20 @@ class _TextFileDifferentiator:
        """
        extension = self._ctx.extension
-        if extension in ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .tsv".split():
+        if extension in [
            ".csv",
            ".eml",
            ".html",
            ".json",
            ".markdown",
            ".md",
            ".org",
            ".p7s",
            ".rst",
            ".rtf",
            ".tab",
            ".tsv",
        ]:
            return FileType.from_extension(extension) or FileType.TXT
        # NOTE(crag): for older versions of the OS libmagic package, such as is currently
@ -616,40 +667,28 @@ class _TextFileDifferentiator:
            return False
-class _ZipFileDifferentiator:
+class _ZipFileDetector:
-    """Refine a Zip-packaged file-type that may be ambiguous or swapped."""
+    """Detect and differentiate a Zip-archive file."""
    def __init__(self, ctx: _FileTypeDetectionContext):
        self._ctx = ctx
    @classmethod
-    def applies(
+    def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
-        cls, ctx: _FileTypeDetectionContext, mime_type: str
+        """Most specific file-type available when file is a Zip file, `None` otherwise.
    ) -> _ZipFileDifferentiator | None:
        """Constructs an instance, but only if this differentiator applies for `mime_type`.
-        Separate `mime_type` argument allows it to be applied to either asserted content-type or
+        MS-Office 2007+ files are detected with 100% accuracy. ODT and EPUB files are identified
-        guessed mime-type.
+        by the `mimetype` member in the archive root. Any Zip archive with none of these
        markers is reported as `FileType.ZIP`.
        """
-        return (
+        return cls(ctx)._file_type
            cls(ctx)
            if mime_type
            in (
                "application/octet-stream",
                "application/zip",
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )
            else None
        )
    @lazyproperty
-    def file_type(self) -> FileType | None:
+    def _file_type(self) -> FileType | None:
        """Differentiated file-type for a Zip archive.
-        Returns `None` if the file is not a Zip archive. Otherwise it returns `FileType.DOCX`,
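+        Returns `None` when the file is not a Zip archive. Otherwise returns `FileType.DOCX`,
-        `FileType.PPTX`, or `FileType.XLSX` when one of those applies and `FileType.ZIP` otherwise.
+        `FileType.PPTX`, or `FileType.XLSX` when one of those applies, the file-type named by
+        the archive's `mimetype` member when present (ODT, EPUB), and `FileType.ZIP` otherwise.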
        """
        if not self._ctx.is_zipfile:
            return None
@ -657,20 +696,23 @@ class _ZipFileDifferentiator:
        with self._ctx.open() as file:
            zip = zipfile.ZipFile(file)
-            # NOTE(robinson) - .docx and .xlsx files are actually a zip file with a .docx/.xslx
+            filenames = zip.namelist()
            # extension. If the MIME type is application/octet-stream, we check if it's a
            # .docx/.xlsx file by looking for expected filenames within the zip file.
            filenames = [f.filename for f in zip.filelist]
-            if all(f in filenames for f in ("word/document.xml",)):
+            if "word/document.xml" in filenames:
                return FileType.DOCX
-            if all(f in filenames for f in ("xl/workbook.xml",)):
+            if "xl/workbook.xml" in filenames:
                return FileType.XLSX
-            if all(f in filenames for f in ("ppt/presentation.xml",)):
+            if "ppt/presentation.xml" in filenames:
                return FileType.PPTX
            # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --
            if "mimetype" in filenames:
                with zip.open("mimetype") as f:
                    mime_type = f.read().decode("utf-8").strip()
                    return FileType.from_mime_type(mime_type)
        return FileType.ZIP
`@ -1 +1 @@`
	`__version__ = "0.16.12-dev2" # pragma: no cover`	`__version__ = "0.16.12-dev3" # pragma: no cover`