diff --git a/CHANGELOG.md b/CHANGELOG.md index 456f6e4c7..575bda24a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ * **EML files specified as a file-path are detected correctly.** Resolved a bug where an EML file submitted to `partition()` as a file-path was identified as TXT and partitioned using `partition_text()`. EML files specified by path are now identified and processed correctly, including processing any attachments. * **A DOCX, PPTX, or XLSX file specified by path and ambiguously identified as MIME-type "application/octet-stream" is identified correctly.** Resolves a shortcoming where a file specified by path immediately fell back to filename-extension based identification when misidentified as "application/octet-stream", either by asserted content type or a mis-guess by libmagic. An MS Office file misidentified in this way is now correctly identified regardless of its filename and whether it is specified by path or file-like object. * **Textual content retrieved from a URL with gzip transport compression now partitions correctly.** Resolves a bug where a textual file-type (such as Markdown) retrieved by passing a URL to `partition()` would raise when `gzip` compression was used for transport by the server. +* **A DOCX, PPTX, or XLSX content-type asserted on partition is confirmed or fixed.** Resolves a bug where calling `partition()` with a swapped MS-Office `content_type` (for example, the PPTX MIME-type asserted for a DOCX file) would cause the file-type to be misidentified. A DOCX, PPTX, or XLSX MIME-type received by `partition()` is now checked for accuracy and corrected if the file is actually a different MS-Office 2007+ type. 
## 0.15.0 diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py index fac88ab2d..ef66a2297 100644 --- a/test_unstructured/file_utils/test_filetype.py +++ b/test_unstructured/file_utils/test_filetype.py @@ -153,6 +153,38 @@ def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_co assert file_type is expected_value +@pytest.mark.parametrize( + ("expected_value", "file_name"), + [ + (FileType.DOCX, "simple.docx"), + (FileType.PPTX, "fake-power-point.pptx"), + (FileType.XLSX, "stanley-cups.xlsx"), + ], +) +@pytest.mark.parametrize( + "content_type", + [ + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ], +) +def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_content_type( + file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock +): + # -- disable strategies 2 & 3, content-type strategy should get this on its own -- + ctx_mime_type_.return_value = None + with open(example_doc_path(file_name), "rb") as f: + file = io.BytesIO(f.read()) + + file_type = detect_filetype(file=file, content_type=content_type) + + # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not + # -- fall-back to strategy 2 for any of these test cases. 
+ ctx_mime_type_.assert_not_called() + assert file_type is expected_value + + # ================================================================================================ # STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC # ================================================================================================ @@ -664,13 +696,13 @@ class Describe_FileTypeDetectionContext: # -- .file_head --------------------------------------------- - def it_grabs_the_first_4k_bytes_of_the_file_for_use_by_magic(self): + def it_grabs_the_first_8k_bytes_of_the_file_for_use_by_magic(self): ctx = _FileTypeDetectionContext(file_path=example_doc_path("norwich-city.txt")) head = ctx.file_head assert isinstance(head, bytes) - assert len(head) == 4096 + assert len(head) == 8192 assert head.startswith(b"Iwan Roberts\nRoberts celebrating after") # -- .file_path --------------------------------------------- diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 5e930c366..251f33361 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -161,6 +161,10 @@ class _FileTypeDetector: if not content_type: return None + # -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable -- + if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type): + return differentiator.file_type + # -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it -- return FileType.from_mime_type(content_type) @@ -307,7 +311,7 @@ class _FileTypeDetectionContext: def file_head(self) -> bytes: """The initial bytes of the file to be recognized, for use with libmagic detection.""" with self.open() as file: - return file.read(4096) + return file.read(8192) @lazyproperty def file_path(self) -> str | None: