fix(file): fix OLE-based file-type auto-detection (#3437)

**Summary**
A DOC, PPT, or XLS file sent to partition() as a file-like object is
misidentified as a MSG file and raises an exception in python-oxmsg
(which is used to process MSG files).

**Fix**
DOC, PPT, XLS, and MSG are all Microsoft OLE-based files, aka. Compound
File Binary Format (CFBF). These can be reliably distinguished by
inspecting magic bytes in certain locations. `libmagic` is unreliable at
this or doesn't try, reporting the generic `"application/x-ole-storage"`
which corresponds to the "container" CFBF format (vaguely like a
Microsoft Zip format) that all these document types are stored in.

Unconditionally use `filetype.guess_mime()` provided by the `filetype`
package that is part of the base unstructured install. Unlike
`libmagic`, this package reliably detects the distinguished MIME-type
(e.g. `"application/msword"`) for OLE file subtypes.

Fixes #3364
This commit is contained in:
Steve Canny 2024-07-25 10:25:41 -07:00 committed by GitHub
parent 432d209c36
commit 4e61acc1c6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 262 additions and 90 deletions

View File

@ -1,4 +1,4 @@
## 0.15.1-dev4
## 0.15.1-dev5
### Enhancements
@ -14,6 +14,7 @@
* **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object.
* **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server.
* **A DOCX, PPTX, or XLSX content-type asserted on partition is confirmed or fixed.** Resolves a bug where calling `partition()` with a swapped MS-Office `content_type` would cause the file-type to be misidentified. A DOCX, PPTX, or XLSX MIME-type received by `partition()` is now checked for accuracy and corrected if the file is for a different MS-Office 2007+ type.
* **DOC, PPT, XLS, and MSG files are now auto-detected correctly.** Resolves a bug where DOC, PPT, and XLS files were auto-detected as MSG files under certain circumstances.
## 0.15.0

View File

@ -0,0 +1,5 @@
{
"id": "Sample-1",
"name": "Sample 1",
"description": "This is sample data #1"
}

View File

@ -19,9 +19,11 @@ from test_unstructured.unit_utils import (
)
from unstructured.file_utils.filetype import (
_FileTypeDetectionContext,
_OleFileDifferentiator,
_TextFileDifferentiator,
_ZipFileDifferentiator,
detect_filetype,
is_json_processable,
)
from unstructured.file_utils.model import FileType
@ -185,6 +187,46 @@ def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_c
assert file_type is expected_value
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
(FileType.DOC, "simple.doc"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.XLS, "tests-example.xls"),
(FileType.MSG, "fake-email-multiple-attachments.msg"),
],
)
@pytest.mark.parametrize(
"content_type",
[
"application/msword",
"application/vnd.ms-outlook",
"application/vnd.ms-powerpoint",
"application/vnd.ms-excel",
"anything/else",
],
)
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type(
file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
):
"""Fixes wrong XLS asserted as DOC, PPT, etc.
Asserted content-type can be anything except `None` and differentiator will fix it if the file
is DOC, PPT, XLS, or MSG type.
"""
# -- disable strategies 2 & 3, content-type strategy should get this on its own --
ctx_mime_type_.return_value = None
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
file_type = detect_filetype(file=file, content_type=content_type)
# -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
# -- fall-back to strategy 2 for any of these test cases.
ctx_mime_type_.assert_not_called()
assert file_type is expected_value
# ================================================================================================
# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC
# ================================================================================================
@ -264,6 +306,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
[
(FileType.BMP, "img/bmp_24.bmp"),
(FileType.CSV, "stanley-cups.csv"),
(FileType.DOC, "simple.doc"),
(FileType.DOCX, "simple.docx"),
(FileType.EML, "eml/fake-email.eml"),
(FileType.EPUB, "winter-sports.epub"),
@ -271,14 +314,17 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
(FileType.HTML, "ideas-page.html"),
(FileType.JPG, "img/example.jpg"),
(FileType.JSON, "spring-weather.html.json"),
(FileType.MSG, "fake-email.msg"),
(FileType.ODT, "simple.odt"),
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
(FileType.PNG, "img/DA-1p.png"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.PPTX, "fake-power-point.pptx"),
(FileType.RTF, "fake-doc.rtf"),
(FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
(FileType.TXT, "norwich-city.txt"),
(FileType.WAV, "CantinaBand3.wav"),
(FileType.XLS, "tests-example.xls"),
(FileType.XLSX, "stanley-cups.xlsx"),
(FileType.XML, "factbook.xml"),
(FileType.ZIP, "simple.zip"),
@ -290,11 +336,7 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
"""Does not work for all types, in particular:
TODOs:
- DOC is misidentified as MSG, TODO on that below.
- MSG is misidentified as UNK, but only on CI.
- PPT is misidentified as MSG, same fix as DOC.
- TSV is identified as TXT, maybe need an `.is_tsv` predicate in `_TextFileDifferentiator`
- XLS is misidentified as MSG, same fix as DOC.
NOCANDOs: w/o an extension I think these are the best we can do.
- MD is identified as TXT
@ -309,25 +351,44 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
assert detect_filetype(file=file) is expected_value
# NOTE(scanny): magic gets this wrong ("application/x-ole-storage") but filetype lib gets it right
# ("application/msword"). Need a differentiator for "application/x-ole-storage".
@pytest.mark.xfail(reason="TODO: FIX", raises=AssertionError, strict=True)
@pytest.mark.parametrize(
("expected_value", "file_name"),
[
(FileType.DOC, "simple.doc"),
(FileType.PPT, "fake-power-point.ppt"),
(FileType.XLS, "tests-example.xls"),
# -- only fails on CI, maybe different libmagic version or "magic-files" --
# (FileType.MSG, "fake-email.msg"),
(FileType.MSG, "fake-email-multiple-attachments.msg"),
],
)
def test_it_detects_MS_Office_file_types_using_strategy_2_when_libmagic_guesses_mime_type(
file_name: str, expected_value: FileType
@pytest.mark.parametrize(
"guessed_mime_type",
[
"application/msword",
"application/vnd.ms-excel",
"application/vnd.ms-outlook",
"application/vnd.ms-powerpoint",
"application/x-ole-storage",
"anything/else",
],
)
def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type(
file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
):
"""Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc.
It's better than that actually, the OLE differentiator will get the right file-type for any DOC,
PPT, XLS, or MSG file, regardless of guessed MIME-type.
"""
ctx_mime_type_.return_value = guessed_mime_type
# -- disable strategy 3 by not providing a file-name source --
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
assert detect_filetype(file=file) is expected_value
# -- disable strategy 1 by not asserting a content-type --
file_type = detect_filetype(file=file)
ctx_mime_type_.assert_called_with()
assert file_type is expected_value
@pytest.mark.parametrize(
@ -454,6 +515,7 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil
[
(FileType.BMP, "img/bmp_24.bmp", "application/zip"),
(FileType.DOC, "simple.doc", None),
(FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"),
(FileType.MSG, "fake-email.msg", "application/octet-stream"),
],
)
@ -575,6 +637,41 @@ def test_it_detect_CSV_from_path_and_file_when_content_contains_escaped_commas()
assert detect_filetype(file=f) == FileType.CSV
# ================================================================================================
# Describe `is_json_processable()`
# ================================================================================================
def it_affirms_JSON_is_array_of_objects_from_a_file_path():
assert is_json_processable(example_doc_path("simple.json")) is True
def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_path():
assert is_json_processable(example_doc_path("not-unstructured-payload.json")) is False
def it_affirms_JSON_is_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
with open(example_doc_path("simple.json"), "rb") as f:
assert is_json_processable(file=f) is True
def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_a_file_like_object_open_for_reading_bytes():
with open(example_doc_path("not-unstructured-payload.json"), "rb") as f:
assert is_json_processable(file=f) is False
def it_affirms_JSON_is_array_of_objects_from_text():
with open(example_doc_path("simple.json")) as f:
text = f.read()
assert is_json_processable(file_text=text) is True
def and_it_affirms_JSON_is_NOT_an_array_of_objects_from_text():
with open(example_doc_path("not-unstructured-payload.json")) as f:
text = f.read()
assert is_json_processable(file_text=text) is False
# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
@ -891,6 +988,52 @@ class Describe_FileTypeDetectionContext:
return property_mock(request, _FileTypeDetectionContext, "mime_type")
class Describe_OleFileDifferentiator:
"""Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`."""
# -- .applies() ---------------------------------------------
def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
"""The constructor determines whether this differentiator is applicable.
It returns an instance only when differentiating a CFBF file-type is required, which it
judges by inspecting the initial bytes of the file for the CFBF magic-bytes.
"""
ctx = _FileTypeDetectionContext(example_doc_path("simple.doc"))
differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar")
assert differentiator is not None
assert isinstance(differentiator, _OleFileDifferentiator)
def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self):
ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub"))
assert _OleFileDifferentiator.applies(ctx, "application/epub") is None
# -- .file_type ---------------------------------------------
@pytest.mark.parametrize(
("file_name", "expected_value"),
[
("simple.doc", FileType.DOC),
("fake-power-point.ppt", FileType.PPT),
("tests-example.xls", FileType.XLS),
("fake-email.msg", FileType.MSG),
("README.org", None),
],
)
def it_distinguishes_the_file_type_of_applicable_zip_files(
self, file_name: str, expected_value: FileType | None
):
# -- no file-name available, just to make sure we're not relying on an extension --
with open(example_doc_path(file_name), "rb") as f:
file = io.BytesIO(f.read())
ctx = _FileTypeDetectionContext(file=file)
differentiator = _OleFileDifferentiator(ctx)
assert differentiator.file_type is expected_value
class Describe_TextFileDifferentiator:
"""Unit-test suite for `unstructured.file_utils.filetype._TextFileDifferentiator`."""

View File

@ -1221,33 +1221,39 @@ def test_auto_partition_overwrites_any_filetype_applied_by_file_specific_partiti
@pytest.mark.parametrize(
"filetype",
"file_type",
[
t
for t in FileType
if t not in (FileType.EMPTY, FileType.UNK, FileType.WAV, FileType.XLS, FileType.ZIP)
and t.partitioner_function_name != "partition_image"
if t
not in (
FileType.EMPTY,
FileType.JSON,
FileType.UNK,
FileType.WAV,
FileType.XLS,
FileType.ZIP,
)
and t.partitioner_shortname != "image"
],
)
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(filetype: FileType):
extension = filetype.name.lower()
# -- except for two oddballs, the shortname is the extension --
partitioner_shortname = {FileType.TXT: "text", FileType.EML: "email"}.get(filetype, extension)
partition_fn_name = f"partition_{partitioner_shortname}"
module = import_module(f"unstructured.partition.{partitioner_shortname}")
def test_auto_partition_applies_the_correct_filetype_for_all_filetypes(file_type: FileType):
partition_fn_name = file_type.partitioner_function_name
module = import_module(file_type.partitioner_module_qname)
partition_fn = getattr(module, partition_fn_name)
# -- partition the first example-doc with the extension for this filetype --
elements: list[Element] = []
doc_path = example_doc_path("pdf") if filetype == FileType.PDF else example_doc_path("")
doc_path = example_doc_path("pdf") if file_type == FileType.PDF else example_doc_path("")
extensions = file_type._extensions
for file in pathlib.Path(doc_path).iterdir():
if file.is_file() and file.suffix == f".{extension}":
if file.is_file() and file.suffix in extensions:
elements = partition_fn(str(file))
break
assert elements
assert all(
e.metadata.filetype == filetype.mime_type
e.metadata.filetype == file_type.mime_type
for e in elements
if e.metadata.filetype is not None
)

View File

@ -1 +1 @@
__version__ = "0.15.1-dev4" # pragma: no cover
__version__ = "0.15.1-dev5" # pragma: no cover

View File

@ -112,12 +112,12 @@ def is_json_processable(
file is JSON.
"""
exactly_one(filename=filename, file=file, file_text=file_text)
if file_text is None:
file_text = _read_file_start_for_type_check(
file=file,
filename=filename,
encoding=encoding,
)
file_text = _FileTypeDetectionContext.new(
file_path=filename, file=file, encoding=encoding
).text_head
return re.match(LIST_OF_DICTS_PATTERN, file_text) is not None
@ -161,6 +161,11 @@ class _FileTypeDetector:
if not content_type:
return None
# -- OLE-based file-format content_type values are sometimes unreliable. These are
# -- DOC, PPT, XLS, and MSG.
if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type):
return differentiator.file_type
# -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable --
if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type):
return differentiator.file_type
@ -185,9 +190,8 @@ class _FileTypeDetector:
if mime_type is None:
return None
# NOTE(Crag): older magic lib does not differentiate between xls and doc
if mime_type == "application/msword" and extension == ".xls":
return FileType.XLS
if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type):
return differentiator.file_type
if mime_type.endswith("xml"):
return FileType.HTML if extension in (".html", ".htm") else FileType.XML
@ -248,7 +252,7 @@ class _FileTypeDetectionContext:
content_type: str | None = None,
metadata_file_path: str | None = None,
):
self._file_path = file_path
self._file_path_arg = file_path
self._file_arg = file
self._encoding_arg = encoding
self._content_type = content_type
@ -261,9 +265,9 @@ class _FileTypeDetectionContext:
file_path: str | None,
file: IO[bytes] | None,
encoding: str | None,
content_type: str | None,
metadata_file_path: str | None,
):
content_type: str | None = None,
metadata_file_path: str | None = None,
) -> _FileTypeDetectionContext:
self = cls(
file_path=file_path,
file=file,
@ -320,7 +324,10 @@ class _FileTypeDetectionContext:
None when the caller specified the source as a file-like object instead. Useful for user
feedback on an error, but users of context should have little use for it otherwise.
"""
return self._file_path
if (file_path := self._file_path_arg) is None:
return None
return os.path.realpath(file_path) if os.path.islink(file_path) else file_path
@lazyproperty
def is_zipfile(self) -> bool:
@ -351,19 +358,19 @@ class _FileTypeDetectionContext:
A `str` return value is always in lower-case.
"""
file_path = self.file_path
if LIBMAGIC_AVAILABLE:
import magic
mime_type = (
magic.from_file(_resolve_symlink(self._file_path), mime=True)
if self._file_path
magic.from_file(file_path, mime=True)
if file_path
else magic.from_buffer(self.file_head, mime=True)
)
return mime_type.lower() if mime_type else None
mime_type = (
ft.guess_mime(self._file_path) if self._file_path else ft.guess_mime(self.file_head)
)
mime_type = ft.guess_mime(file_path) if file_path else ft.guess_mime(self.file_head)
if mime_type is None:
logger.warning(
@ -387,8 +394,8 @@ class _FileTypeDetectionContext:
File is guaranteed to be at read position 0 when called.
"""
if self._file_path:
with open(self._file_path, "rb") as f:
if self.file_path:
with open(self.file_path, "rb") as f:
yield f
else:
file = self._file_arg
@ -416,7 +423,7 @@ class _FileTypeDetectionContext:
else content.decode(encoding=self.encoding, errors="ignore")
)
file_path = self._file_path
file_path = self.file_path
assert file_path is not None # -- guaranteed by `._validate` --
try:
@ -429,12 +436,61 @@ class _FileTypeDetectionContext:
def _validate(self) -> None:
"""Raise if the context is invalid."""
if self._file_path and not os.path.isfile(self._file_path):
raise FileNotFoundError(f"no such file {self._file_path}")
if not self._file_path and not self._file_arg:
if self.file_path and not os.path.isfile(self.file_path):
raise FileNotFoundError(f"no such file {self._file_path_arg}")
if not self.file_path and not self._file_arg:
raise ValueError("either `file_path` or `file` argument must be provided")
class _OleFileDifferentiator:
"""Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be.
Compound File Binary Format (CFBF), aka. OLE file, is use by Microsoft for legacy MS Office
files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as
`"application/x-ole-storage"` which is true but too not specific enough for partitioning
purposes.
"""
def __init__(self, ctx: _FileTypeDetectionContext):
self._ctx = ctx
@classmethod
def applies(
cls, ctx: _FileTypeDetectionContext, mime_type: str
) -> _OleFileDifferentiator | None:
"""Constructs an instance, but only if this differentiator applies for `mime_type`."""
return cls(ctx) if cls._is_ole_file(ctx) else None
@property
def file_type(self) -> FileType | None:
"""Differentiated file-type for Microsoft Compound File Binary Format (CFBF).
Returns one of:
- `FileType.DOC`
- `FileType.PPT`
- `FileType.XLS`
- `FileType.MSG`
"""
# -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return
# -- `None` to trigger fall-back to next strategy.
if not self._is_ole_file(self._ctx):
return None
# -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so rely on it to
# -- differentiate those. Note it doesn't detect MSG type though, so we assume any OLE file
# -- that is not a legacy MS-Office type to be a MSG file.
with self._ctx.open() as file:
mime_type = ft.guess_mime(file)
return FileType.from_mime_type(mime_type or "application/vnd.ms-outlook")
@staticmethod
def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
"""True when file has CFBF magic first 8 bytes."""
with ctx.open() as file:
return file.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
class _TextFileDifferentiator:
"""Refine a textual file-type that may not be as specific as it could be."""
@ -597,45 +653,6 @@ class _ZipFileDifferentiator:
return FileType.ZIP
def _read_file_start_for_type_check(
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
encoding: Optional[str] = "utf-8",
) -> str:
"""Reads the start of the file and returns the text content."""
exactly_one(filename=filename, file=file)
if file is not None:
file.seek(0)
file_content = file.read(4096)
if isinstance(file_content, str):
file_text = file_content
else:
file_text = file_content.decode(errors="ignore")
file.seek(0)
return file_text
# -- guaranteed by `exactly_one()` call --
assert filename is not None
try:
with open(filename, encoding=encoding) as f:
file_text = f.read(4096)
except UnicodeDecodeError:
formatted_encoding, _ = detect_file_encoding(filename=filename)
with open(filename, encoding=formatted_encoding) as f:
file_text = f.read(4096)
return file_text
def _resolve_symlink(file_path: str) -> str:
"""Resolve `file_path` containing symlink to the actual file path."""
if os.path.islink(file_path):
file_path = os.path.realpath(file_path)
return file_path
_P = ParamSpec("_P")

View File

@ -286,7 +286,7 @@ class FileType(enum.Enum):
"msg",
[".msg"],
"application/vnd.ms-outlook",
["application/x-ole-storage"],
cast(list[str], []),
)
ODT = (
"odt",