Matches prefix to verify presence of DOCX, PPTX and XLSX files instead of standard file names (#3959)

Instead of looking for the presence of `word/document.xml`,
`ppt/presentation.xml` and `xl/workbook.xml` to identify DOCX, PPTX and
XLSX files, we look for the prefixes `word/document*.xml`,
`ppt/presentation*.xml` and `xl/workbook*.xml`, as certain files
generated by Office 365 contain files with different names.
Fixes https://github.com/Unstructured-IO/unstructured/issues/3937

---------

Co-authored-by: Yao You <theyaoyou@gmail.com>
This commit is contained in:
Sri Sudarsan 2025-03-21 21:57:13 +05:30 committed by GitHub
parent 0fa5174bd7
commit 349728162e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 23 additions and 4 deletions

View File

@ -1,3 +1,12 @@
## 0.17.3-dev0
### Enhancements
### Features
### Fixes
- **Fixes wrong detection of office files** Certain office files (.docx, .xlsx and .pptx) were wrongly identified as .ZIP when they contained files named differently from word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively. They are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml
## 0.17.2 ## 0.17.2
* Fix Image in a <div> tag is "UncategorizedText" with no .text * Fix Image in a <div> tag is "UncategorizedText" with no .text

View File

@ -15,6 +15,7 @@ from test_unstructured.unit_utils import (
LogCaptureFixture, LogCaptureFixture,
Mock, Mock,
example_doc_path, example_doc_path,
input_path,
patch, patch,
property_mock, property_mock,
) )
@ -30,6 +31,7 @@ from unstructured.file_utils.model import FileType, create_file_type
is_in_docker = os.path.exists("/.dockerenv") is_in_docker = os.path.exists("/.dockerenv")
# ================================================================================================ # ================================================================================================
# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES) # STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
# ================================================================================================ # ================================================================================================
@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson():
file_buffer.name = "filename.pdf" file_buffer.name = "filename.pdf"
predicted_type = detect_filetype(file=file_buffer, content_type="application/json") predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
assert predicted_type == FileType.NDJSON assert predicted_type == FileType.NDJSON
def test_office_files_when_document_archive_has_non_standard_prefix():
    """A DOCX fixture exported from Office 365 is detected as DOCX, not as a generic ZIP."""
    docx_path = input_path("file_type/test_document_from_office365.docx")

    assert detect_filetype(file_path=docx_path) == FileType.DOCX

View File

@ -1 +1 @@
__version__ = "0.17.2" # pragma: no cover __version__ = "0.17.3-dev0" # pragma: no cover

View File

@ -747,13 +747,13 @@ class _ZipFileDetector:
filenames = zip.namelist() filenames = zip.namelist()
if "word/document.xml" in filenames: if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
return FileType.DOCX return FileType.DOCX
if "xl/workbook.xml" in filenames: if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
return FileType.XLSX return FileType.XLSX
if "ppt/presentation.xml" in filenames: if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
return FileType.PPTX return FileType.PPTX
# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root -- # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --