mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-01 18:43:04 +00:00
fix(file): confirm or correct asserted DOCX, PPTX, and XLSX content types (#3434)
**Summary** The `content_type` argument received by `partition()` from the API is sometimes unreliable for MS-Office 2007+ MIME-types. What we've observed is that it gets the MS-Office bit right but falls down on distinguishing PPTX from DOCX or XLSX. Confirmation of these types is simple, fast, and reliable. Confirm all MS-Office `content_type` argument values asserted by callers of `detect_filetype()` and correct swapped values.
This commit is contained in:
parent
560cc0e975
commit
432d209c36
@ -13,6 +13,7 @@
|
||||
* **EML files specified as a file-path are detected correctly.** Resolved a bug where an EML file submitted to `partition()` as a file-path was identified as TXT and partitioned using `partition_text()`. EML files specified by path are now identified and processed correctly, including processing any attachments.
|
||||
* **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension-based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object.
|
||||
* **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server.
|
||||
* **A DOCX, PPTX, or XLSX content-type asserted on partition is confirmed or fixed.** Resolves a bug where calling `partition()` with a swapped MS-Office `content_type` would cause the file-type to be misidentified. A DOCX, PPTX, or XLSX MIME-type received by `partition()` is now checked for accuracy and corrected if the file is actually a different MS-Office 2007+ type.
|
||||
|
||||
## 0.15.0
|
||||
|
||||
|
||||
@ -153,6 +153,38 @@ def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_co
|
||||
assert file_type is expected_value
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
("expected_value", "file_name"),
|
||||
[
|
||||
(FileType.DOCX, "simple.docx"),
|
||||
(FileType.PPTX, "fake-power-point.pptx"),
|
||||
(FileType.XLSX, "stanley-cups.xlsx"),
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"content_type",
|
||||
[
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
],
|
||||
)
|
||||
def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_content_type(
|
||||
file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
|
||||
):
|
||||
# -- disable strategies 2 & 3, content-type strategy should get this on its own --
|
||||
ctx_mime_type_.return_value = None
|
||||
with open(example_doc_path(file_name), "rb") as f:
|
||||
file = io.BytesIO(f.read())
|
||||
|
||||
file_type = detect_filetype(file=file, content_type=content_type)
|
||||
|
||||
# -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
|
||||
# -- fall back to strategy 2 for any of these test cases.
|
||||
ctx_mime_type_.assert_not_called()
|
||||
assert file_type is expected_value
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC
|
||||
# ================================================================================================
|
||||
@ -664,13 +696,13 @@ class Describe_FileTypeDetectionContext:
|
||||
|
||||
# -- .file_head ---------------------------------------------
|
||||
|
||||
def it_grabs_the_first_4k_bytes_of_the_file_for_use_by_magic(self):
|
||||
def it_grabs_the_first_8k_bytes_of_the_file_for_use_by_magic(self):
|
||||
ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt"))
|
||||
|
||||
head = ctx.file_head
|
||||
|
||||
assert isinstance(head, bytes)
|
||||
assert len(head) == 4096
|
||||
assert len(head) == 8192
|
||||
assert head.startswith(b"Iwan Roberts\nRoberts celebrating after")
|
||||
|
||||
# -- .file_path ---------------------------------------------
|
||||
|
||||
@ -161,6 +161,10 @@ class _FileTypeDetector:
|
||||
if not content_type:
|
||||
return None
|
||||
|
||||
# -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable --
|
||||
if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type):
|
||||
return differentiator.file_type
|
||||
|
||||
# -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it --
|
||||
return FileType.from_mime_type(content_type)
|
||||
|
||||
@ -307,7 +311,7 @@ class _FileTypeDetectionContext:
|
||||
def file_head(self) -> bytes:
|
||||
"""The initial bytes of the file to be recognized, for use with libmagic detection."""
|
||||
with self.open() as file:
|
||||
return file.read(4096)
|
||||
return file.read(8192)
|
||||
|
||||
@lazyproperty
|
||||
def file_path(self) -> str | None:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user