mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Matches prefix to verify presence of DOCX,PPTX,XLSX files instead of standard file names (#3959)
Instead of looking for the presence of `word/document.xml`, `ppt/presentation.xml` and `xl/workbook.xml` to identify DOCX, PPTX and XLSX files, we look for the prefixes `word/document*.xml`, `ppt/presentation*.xml` and `xl/workbook*.xml`, as certain files generated by Office 365 contain parts with different names. Fixes https://github.com/Unstructured-IO/unstructured/issues/3937 --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
This commit is contained in:
parent
0fa5174bd7
commit
349728162e
@ -1,3 +1,12 @@
|
||||
## 0.17.3-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
### Features
|
||||
|
||||
### Fixes
|
||||
- **Fixes wrong detection of office files** Certain office files (.docx, .xlsx and .pptx) were wrongly identified as .ZIP when their archives contained files named something other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively. These files are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml
|
||||
|
||||
## 0.17.2
|
||||
|
||||
* Fix Image in a <div> tag is "UncategorizedText" with no .text
|
||||
|
@ -15,6 +15,7 @@ from test_unstructured.unit_utils import (
|
||||
LogCaptureFixture,
|
||||
Mock,
|
||||
example_doc_path,
|
||||
input_path,
|
||||
patch,
|
||||
property_mock,
|
||||
)
|
||||
@ -30,6 +31,7 @@ from unstructured.file_utils.model import FileType, create_file_type
|
||||
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
|
||||
# ================================================================================================
|
||||
@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson():
|
||||
file_buffer.name = "filename.pdf"
|
||||
predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
|
||||
assert predicted_type == FileType.NDJSON
|
||||
|
||||
|
||||
def test_office_files_when_document_archive_has_non_standard_prefix():
|
||||
|
||||
predicted_type = detect_filetype(
|
||||
file_path=input_path("file_type/test_document_from_office365.docx")
|
||||
)
|
||||
assert predicted_type == FileType.DOCX
|
||||
|
Binary file not shown.
@ -1 +1 @@
|
||||
__version__ = "0.17.2" # pragma: no cover
|
||||
__version__ = "0.17.3-dev0" # pragma: no cover
|
||||
|
@ -747,13 +747,13 @@ class _ZipFileDetector:
|
||||
|
||||
filenames = zip.namelist()
|
||||
|
||||
if "word/document.xml" in filenames:
|
||||
if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
|
||||
return FileType.DOCX
|
||||
|
||||
if "xl/workbook.xml" in filenames:
|
||||
if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
|
||||
return FileType.XLSX
|
||||
|
||||
if "ppt/presentation.xml" in filenames:
|
||||
if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
|
||||
return FileType.PPTX
|
||||
|
||||
# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --
|
||||
|
Loading…
x
Reference in New Issue
Block a user