fix(file): fix OLE-based file-type auto-detection (#3437)

**Summary** A DOC, PPT, or XLS file sent to partition() as a file-like object is misidentified as a MSG file and raises an exception in python-oxmsg (which is used to process MSG files). **Fix** DOC, PPT, XLS, and MSG are all Microsoft OLE-based files, aka. Compound File Binary Format (CFBF). These can be reliably distinguished by inspecting magic bytes in certain locations. `libmagic` is unreliable at this or doesn't try, reporting the generic `"application/x-ole-storage"` which corresponds to the "container" CFBF format (vaguely like a Microsoft Zip format) that all these document types are stored in. Unconditionally use `filetype.guess_mime()` provided by the `filetype` package that is part of the base unstructured install. Unlike `libmagic`, this package reliably detects the distinguished MIME-type (e.g. `"application/msword"`) for OLE file subtypes. Fixes #3364
2025-11-09 23:17:21 +00:00 · 2024-07-25 10:25:41 -07:00 · 2024-07-25 10:25:41 -07:00 · 4e61acc1c6
commit 4e61acc1c6
parent 432d209c36
7 changed files with 262 additions and 90 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.15.1-dev4
+## 0.15.1-dev5
 ### Enhancements
@ -14,6 +14,7 @@
 * **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object.
 * **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server.
 * **A DOCX, PPTX, or XLSX content-type asserted on partition is confirmed or fixed.** Resolves a bug where calling `partition()` with a swapped MS-Office `content_type` would cause the file-type to be misidentified. A DOCX, PPTX, or XLSX MIME-type received by `partition()` is now checked for accuracy and corrected if the file is for a different MS-Office 2007+ type.
 * **DOC, PPT, XLS, and MSG files are now auto-detected correctly.** Resolves a bug where DOC, PPT, and XLS files were auto-detected as MSG files under certain circumstances.
 ## 0.15.0
--- a/example-docs/not-unstructured-payload.json
+++ b/example-docs/not-unstructured-payload.json
@ -0,0 +1,5 @@
 {
    "id": "Sample-1",
    "name": "Sample 1",
    "description": "This is sample data #1"
 }
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -19,9 +19,11 @@ from test_unstructured.unit_utils import (
 )
 from unstructured.file_utils.filetype import (
    _FileTypeDetectionContext,
    _OleFileDifferentiator,
    _TextFileDifferentiator,
    _ZipFileDifferentiator,
    detect_filetype,
    is_json_processable,
 )
 from unstructured.file_utils.model import FileType
@ -185,6 +187,46 @@ def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_c
    assert file_type is expected_value
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.DOC, "simple.doc"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.XLS, "tests-example.xls"),
        (FileType.MSG, "fake-email-multiple-attachments.msg"),
    ],
 )
@pytest.mark.parametrize(
    "content_type",
    [
        "application/msword",
        "application/vnd.ms-outlook",
        "application/vnd.ms-powerpoint",
        "application/vnd.ms-excel",
        "anything/else",
    ],
 )
 def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type(
    file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
    """Fixes wrong XLS asserted as DOC, PPT, etc.
    Asserted content-type can be anything except `None` and differentiator will fix it if the file
    is DOC, PPT, XLS, or MSG type.
    """
    # -- disable strategies 2 & 3, content-type strategy should get this on its own --
    ctx_mime_type_.return_value = None
    with open(example_doc_path(file_name), "rb") as f:
        file = io.BytesIO(f.read())
    file_type = detect_filetype(file=file, content_type=content_type)
    # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
    # -- fall-back to strategy 2 for any of these test cases.
    ctx_mime_type_.assert_not_called()
    assert file_type is expected_value
 # ================================================================================================
 # STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC
 # ================================================================================================
@ -264,6 +306,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
    [
        (FileType.BMP, "img/bmp_24.bmp"),
        (FileType.CSV, "stanley-cups.csv"),
        (FileType.DOC, "simple.doc"),
        (FileType.DOCX, "simple.docx"),
        (FileType.EML, "eml/fake-email.eml"),
        (FileType.EPUB, "winter-sports.epub"),
@ -271,14 +314,17 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
        (FileType.HTML, "ideas-page.html"),
        (FileType.JPG, "img/example.jpg"),
        (FileType.JSON, "spring-weather.html.json"),
        (FileType.MSG, "fake-email.msg"),
        (FileType.ODT, "simple.odt"),
        (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
        (FileType.PNG, "img/DA-1p.png"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.PPTX, "fake-power-point.pptx"),
        (FileType.RTF, "fake-doc.rtf"),
        (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
        (FileType.TXT, "norwich-city.txt"),
        (FileType.WAV, "CantinaBand3.wav"),
        (FileType.XLS, "tests-example.xls"),
        (FileType.XLSX, "stanley-cups.xlsx"),
        (FileType.XML, "factbook.xml"),
        (FileType.ZIP, "simple.zip"),
@ -290,11 +336,7 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
    """Does not work for all types, in particular:
    TODOs:
    - DOC is misidentified as MSG, TODO on that below.
    - MSG is misidentified as UNK, but only on CI.
    - PPT is misidentified as MSG, same fix as DOC.
    - TSV is identified as TXT, maybe need an `.is_tsv` predicate in `_TextFileDifferentiator`
    - XLS is misidentified as MSG, same fix as DOC.
    NOCANDOs: w/o an extension I think these are the best we can do.
    - MD is identified as TXT
@ -309,25 +351,44 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
    assert detect_filetype(file=file) is expected_value
 # NOTE(scanny): magic gets this wrong ("application/x-ole-storage") but filetype lib gets it right
 # ("application/msword"). Need a differentiator for "application/x-ole-storage".
@pytest.mark.xfail(reason="TODO: FIX", raises=AssertionError, strict=True)
@pytest.mark.parametrize(
    ("expected_value", "file_name"),
    [
        (FileType.DOC, "simple.doc"),
        (FileType.PPT, "fake-power-point.ppt"),
        (FileType.XLS, "tests-example.xls"),
-        # -- only fails on CI, maybe different libmagic version or "magic-files" --
+        (FileType.MSG, "fake-email-multiple-attachments.msg"),
        # (FileType.MSG, "fake-email.msg"),
    ],
 )
-def test_it_detects_MS_Office_file_types_using_strategy_2_when_libmagic_guesses_mime_type(
+@pytest.mark.parametrize(
-    file_name: str, expected_value: FileType
+    "guessed_mime_type",
    [
        "application/msword",
        "application/vnd.ms-excel",
        "application/vnd.ms-outlook",
        "application/vnd.ms-powerpoint",
        "application/x-ole-storage",
        "anything/else",
    ],
 )
 def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type(
    file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
    """Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc.
    It's better than that actually, the OLE differentiator will get the right file-type for any DOC,
    PPT, XLS, or MSG file, regardless of guessed MIME-type.
    """
    ctx_mime_type_.return_value = guessed_mime_type
    # -- disable strategy 3 by not providing a file-name source --
    with open(example_doc_path(file_name), "rb") as f:
        file = io.BytesIO(f.read())
-    assert detect_filetype(file=file) is expected_value
+
    # -- disable strategy 1 by not asserting a content-type --
    file_type = detect_filetype(file=file)
    ctx_mime_type_.assert_called_with()
    assert file_type is expected_value
@pytest.mark.parametrize(
@ -454,6 +515,7 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil
    [
        (FileType.BMP, "img/bmp_24.bmp", "application/zip"),
        (FileType.DOC, "simple.doc", None),
        (FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"),
        (FileType.MSG, "fake-email.msg", "application/octet-stream"),
    ],
 )
@ -575,6 +637,41 @@ def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas()
        assert detect_filetype(file=f) == FileType.CSV
 # ================================================================================================
 # Describe `is_json_processable()`
 # ================================================================================================
 def it_affirms_JSON_is_array_of_objects_from_a_file_path():
    assert is_json_processable(example_doc_path("simple.json")) is True
 def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_path():
    assert is_json_processable(example_doc_path("not-unstructured-payload.json")) is False
 def it_affirms_JSON_is_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
    with open(example_doc_path("simple.json"), "rb") as f:
        assert is_json_processable(file=f) is True
 def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
    with open(example_doc_path("not-unstructured-payload.json"), "rb") as f:
        assert is_json_processable(file=f) is False
 def it_affirms_JSON_is_array_of_objects_from_text():
    with open(example_doc_path("simple.json")) as f:
        text = f.read()
    assert is_json_processable(file_text=text) is True
 def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_text():
    with open(example_doc_path("not-unstructured-payload.json")) as f:
        text = f.read()
    assert is_json_processable(file_text=text) is False
 # ================================================================================================
 # MODULE-LEVEL FIXTURES
 # ================================================================================================
@ -891,6 +988,52 @@ class Describe_FileTypeDetectionContext:
        return property_mock(request, _FileTypeDetectionContext, "mime_type")
 class Describe_OleFileDifferentiator:
    """Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`."""
    # -- .applies() ---------------------------------------------
    def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
        """The constructor determines whether this differentiator is applicable.
        It returns an instance only when differentiating a CFBF file-type is required, which it
        judges by inspecting the initial bytes of the file for the CFBF magic-bytes.
        """
        ctx = _FileTypeDetectionContext(example_doc_path("simple.doc"))
        differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar")
        assert differentiator is not None
        assert isinstance(differentiator, _OleFileDifferentiator)
    def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self):
        ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub"))
        assert _OleFileDifferentiator.applies(ctx, "application/epub") is None
    # -- .file_type ---------------------------------------------
    @pytest.mark.parametrize(
        ("file_name", "expected_value"),
        [
            ("simple.doc", FileType.DOC),
            ("fake-power-point.ppt", FileType.PPT),
            ("tests-example.xls", FileType.XLS),
            ("fake-email.msg", FileType.MSG),
            ("README.org", None),
        ],
    )
    def it_distinguishes_the_file_type_of_applicable_zip_files(
        self, file_name: str, expected_value: FileType | None
    ):
        # -- no file-name available, just to make sure we're not relying on an extension --
        with open(example_doc_path(file_name), "rb") as f:
            file = io.BytesIO(f.read())
        ctx = _FileTypeDetectionContext(file=file)
        differentiator = _OleFileDifferentiator(ctx)
        assert differentiator.file_type is expected_value
 class Describe_TextFileDifferentiator:
    """Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`."""
--- a/test_unstructured/partition/test_auto.py
+++ b/test_unstructured/partition/test_auto.py
@ -1221,33 +1221,39 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti
@pytest.mark.parametrize(
-    "filetype",
+    "file_type",
    [
        t
        for t in FileType
-        if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
+        if t
-        and t.partitioner_function_name != "partition_image"
+        not in (
            FileType.EMPTY,
            FileType.JSON,
            FileType.UNK,
            FileType.WAV,
            FileType.XLS,
            FileType.ZIP,
        )
        and t.partitioner_shortname != "image"
    ],
 )
-def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
+def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type: FileType):
-    extension = filetype.name.lower()
+    partition_fn_name = file_type.partitioner_function_name
-    # -- except for two oddballs, the shortname is the extension --
+    module = import_module(file_type.partitioner_module_qname)
    partitioner_shortname = {FileType.TXT: "text", FileType.EML: "email"}.get(filetype, extension)
    partition_fn_name = f"partition_{partitioner_shortname}"
    module = import_module(f"unstructured.partition.{partitioner_shortname}")
    partition_fn = getattr(module, partition_fn_name)
    # -- partition the first example-doc with the extension for this filetype --
    elements: list[Element] = []
-    doc_path = example_doc_path("pdf") if filetype == FileType.PDF else example_doc_path("")
+    doc_path = example_doc_path("pdf") if file_type == FileType.PDF else example_doc_path("")
    extensions = file_type._extensions
    for file in pathlib.Path(doc_path).iterdir():
-        if file.is_file() and file.suffix == f".{extension}":
+        if file.is_file() and file.suffix in extensions:
            elements = partition_fn(str(file))
            break
    assert elements
    assert all(
-        e.metadata.filetype == filetype.mime_type
+        e.metadata.filetype == file_type.mime_type
        for e in elements
        if e.metadata.filetype is not None
    )
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.15.1-dev4"  # pragma: no cover
+__version__ = "0.15.1-dev5"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -112,12 +112,12 @@ def is_json_processable(
    file is JSON.
    """
    exactly_one(filename=filename, file=file, file_text=file_text)
    if file_text is None:
-        file_text = _read_file_start_for_type_check(
+        file_text = _FileTypeDetectionContext.new(
-            file=file,
+            file_path=filename, file=file, encoding=encoding
-            filename=filename,
+        ).text_head
-            encoding=encoding,
+
        )
    return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
@ -161,6 +161,11 @@ class _FileTypeDetector:
        if not content_type:
            return None
        # -- OLE-based file-format content_type values are sometimes unreliable. These are
        # -- DOC, PPT, XLS, and MSG.
        if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type):
            return differentiator.file_type
        # -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable --
        if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type):
            return differentiator.file_type
@ -185,9 +190,8 @@ class _FileTypeDetector:
        if mime_type is None:
            return None
-        # NOTE(Crag): older magic lib does not differentiate between xls and doc
+        if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type):
-        if mime_type == "application/msword" and extension == ".xls":
+            return differentiator.file_type
            return FileType.XLS
        if mime_type.endswith("xml"):
            return FileType.HTML if extension in (".html", ".htm") else FileType.XML
@ -248,7 +252,7 @@ class _FileTypeDetectionContext:
        content_type: str | None = None,
        metadata_file_path: str | None = None,
    ):
-        self._file_path = file_path
+        self._file_path_arg = file_path
        self._file_arg = file
        self._encoding_arg = encoding
        self._content_type = content_type
@ -261,9 +265,9 @@ class _FileTypeDetectionContext:
        file_path: str | None,
        file: IO[bytes] | None,
        encoding: str | None,
-        content_type: str | None,
+        content_type: str | None = None,
-        metadata_file_path: str | None,
+        metadata_file_path: str | None = None,
-    ):
+    ) -> _FileTypeDetectionContext:
        self = cls(
            file_path=file_path,
            file=file,
@ -320,7 +324,10 @@ class _FileTypeDetectionContext:
        None when the caller specified the source as a file-like object instead. Useful for user
        feedback on an error, but users of context should have little use for it otherwise.
        """
-        return self._file_path
+        if (file_path := self._file_path_arg) is None:
            return None
        return os.path.realpath(file_path) if os.path.islink(file_path) else file_path
    @lazyproperty
    def is_zipfile(self) -> bool:
@ -351,19 +358,19 @@ class _FileTypeDetectionContext:
        A `str` return value is always in lower-case.
        """
        file_path = self.file_path
        if LIBMAGIC_AVAILABLE:
            import magic
            mime_type = (
-                magic.from_file(_resolve_symlink(self._file_path), mime=True)
+                magic.from_file(file_path, mime=True)
-                if self._file_path
+                if file_path
                else magic.from_buffer(self.file_head, mime=True)
            )
            return mime_type.lower() if mime_type else None
-        mime_type = (
+        mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head)
            ft.guess_mime(self._file_path) if self._file_path else ft.guess_mime(self.file_head)
        )
        if mime_type is None:
            logger.warning(
@ -387,8 +394,8 @@ class _FileTypeDetectionContext:
        File is guaranteed to be at read position 0 when called.
        """
-        if self._file_path:
+        if self.file_path:
-            with open(self._file_path, "rb") as f:
+            with open(self.file_path, "rb") as f:
                yield f
        else:
            file = self._file_arg
@ -416,7 +423,7 @@ class _FileTypeDetectionContext:
                else content.decode(encoding=self.encoding, errors="ignore")
            )
-        file_path = self._file_path
+        file_path = self.file_path
        assert file_path is not None  # -- guaranteed by `._validate` --
        try:
@ -429,12 +436,61 @@ class _FileTypeDetectionContext:
    def _validate(self) -> None:
        """Raise if the context is invalid."""
-        if self._file_path and not os.path.isfile(self._file_path):
+        if self.file_path and not os.path.isfile(self.file_path):
-            raise FileNotFoundError(f"no such file {self._file_path}")
+            raise FileNotFoundError(f"no such file {self._file_path_arg}")
-        if not self._file_path and not self._file_arg:
+        if not self.file_path and not self._file_arg:
            raise ValueError("either `file_path` or `file` argument must be provided")
 class _OleFileDifferentiator:
    """Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be.
    Compound File Binary Format (CFBF), aka. OLE file, is use by Microsoft for legacy MS Office
    files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as
    `"application/x-ole-storage"` which is true but too not specific enough for partitioning
    purposes.
    """
    def __init__(self, ctx: _FileTypeDetectionContext):
        self._ctx = ctx
    @classmethod
    def applies(
        cls, ctx: _FileTypeDetectionContext, mime_type: str
    ) -> _OleFileDifferentiator | None:
        """Constructs an instance, but only if this differentiator applies for `mime_type`."""
        return cls(ctx) if cls._is_ole_file(ctx) else None
    @property
    def file_type(self) -> FileType | None:
        """Differentiated file-type for Microsoft Compound File Binary Format (CFBF).
        Returns one of:
        - `FileType.DOC`
        - `FileType.PPT`
        - `FileType.XLS`
        - `FileType.MSG`
        """
        # -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return
        # -- `None` to trigger fall-back to next strategy.
        if not self._is_ole_file(self._ctx):
            return None
        # -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so rely on it to
        # -- differentiate those. Note it doesn't detect MSG type though, so we assume any OLE file
        # -- that is not a legacy MS-Office type to be a MSG file.
        with self._ctx.open() as file:
            mime_type = ft.guess_mime(file)
        return FileType.from_mime_type(mime_type or "application/vnd.ms-outlook")
    @staticmethod
    def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
        """True when file has CFBF magic first 8 bytes."""
        with ctx.open() as file:
            return file.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
 class _TextFileDifferentiator:
    """Refine a textual file-type that may not be as specific as it could be."""
@ -597,45 +653,6 @@ class _ZipFileDifferentiator:
        return FileType.ZIP
 def _read_file_start_for_type_check(
    filename: Optional[str] = None,
    file: Optional[IO[bytes]] = None,
    encoding: Optional[str] = "utf-8",
 ) -> str:
    """Reads the start of the file and returns the text content."""
    exactly_one(filename=filename, file=file)
    if file is not None:
        file.seek(0)
        file_content = file.read(4096)
        if isinstance(file_content, str):
            file_text = file_content
        else:
            file_text = file_content.decode(errors="ignore")
        file.seek(0)
        return file_text
    # -- guaranteed by `exactly_one()` call --
    assert filename is not None
    try:
        with open(filename, encoding=encoding) as f:
            file_text = f.read(4096)
    except UnicodeDecodeError:
        formatted_encoding, _ = detect_file_encoding(filename=filename)
        with open(filename, encoding=formatted_encoding) as f:
            file_text = f.read(4096)
    return file_text
 def _resolve_symlink(file_path: str) -> str:
    """Resolve `file_path` containing symlink to the actual file path."""
    if os.path.islink(file_path):
        file_path = os.path.realpath(file_path)
    return file_path
 _P = ParamSpec("_P")
--- a/unstructured/file_utils/model.py
+++ b/unstructured/file_utils/model.py
@ -286,7 +286,7 @@ class FileType(enum.Enum):
        "msg",
        [".msg"],
        "application/vnd.ms-outlook",
-        ["application/x-ole-storage"],
+        cast(list[str], []),
    )
    ODT = (
        "odt",
`@ -1 +1 @@`
	`__version__ = "0.15.1-dev4" # pragma: no cover`	`__version__ = "0.15.1-dev5" # pragma: no cover`