Match file-name prefixes to detect DOCX, PPTX and XLSX files instead of requiring the exact standard file names (#3959)

Instead of looking for the presence of exactly `word/document.xml`, `ppt/presentation.xml` and `xl/workbook.xml` to identify DOCX, PPTX and XLSX files, we now look for names matching the prefixes `word/document*.xml`, `ppt/presentation*.xml` and `xl/workbook*.xml`, because certain files generated by Office 365 contain parts with different names. Fixes https://github.com/Unstructured-IO/unstructured/issues/3937 --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
2025-06-27 02:30:08 +00:00 · 2025-03-21 21:57:13 +05:30 · 2025-03-21 21:57:13 +05:30 · 349728162e
commit 349728162e
parent 0fa5174bd7
5 changed files with 23 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,12 @@
+## 0.17.3-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+- **Fixes wrong detection of office files** Certain office files (.docx, .xlsx and .pptx) were wrongly identified as .ZIP when their archives contained main parts named something other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively. They are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml.
+
 ## 0.17.2

 * Fix Image in a <div> tag is "UncategorizedText" with no .text
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@ -15,6 +15,7 @@ from test_unstructured.unit_utils import (
    LogCaptureFixture,
    Mock,
    example_doc_path,
+    input_path,
    patch,
    property_mock,
 )
@ -30,6 +31,7 @@ from unstructured.file_utils.model import FileType, create_file_type

 is_in_docker = os.path.exists("/.dockerenv")

+
 # ================================================================================================
 # STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
 # ================================================================================================
@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson():
    file_buffer.name = "filename.pdf"
    predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
    assert predicted_type == FileType.NDJSON
+
+
+def test_office_files_when_document_archive_has_non_standard_prefix():
+    """The Office365-generated DOCX fixture must be detected as `FileType.DOCX`."""
+    predicted_type = detect_filetype(
+        file_path=input_path("file_type/test_document_from_office365.docx")
+    )
+    assert predicted_type == FileType.DOCX
--- a/test_unstructured/testfiles/file_type/test_document_from_office365.docx
+++ b/test_unstructured/testfiles/file_type/test_document_from_office365.docx
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.17.2"  # pragma: no cover
+__version__ = "0.17.3-dev0"  # pragma: no cover
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@ -747,13 +747,13 @@ class _ZipFileDetector:

            filenames = zip.namelist()

-            if "word/document.xml" in filenames:
+            if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
                return FileType.DOCX

-            if "xl/workbook.xml" in filenames:
+            if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
                return FileType.XLSX

-            if "ppt/presentation.xml" in filenames:
+            if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
                return FileType.PPTX

            # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --