mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Match file-name prefixes to detect DOCX, PPTX, and XLSX files instead of exact standard file names (#3959)
Instead of looking for the presence of `word/document.xml`, `ppt/presentation.xml` and `xl/workbook.xml` to identify DOCX, PPTX and XLSX files, we now look for names matching `word/document*.xml`, `ppt/presentation*.xml` and `xl/workbook*.xml`, because certain files generated by Office 365 contain these files under different names. Fixes https://github.com/Unstructured-IO/unstructured/issues/3937 --------- Co-authored-by: Yao You <theyaoyou@gmail.com>
This commit is contained in:
parent
0fa5174bd7
commit
349728162e
@ -1,3 +1,12 @@
|
|||||||
|
## 0.17.3-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
- **Fixes wrong detection of office files** Certain office files (.docx, .xlsx and .pptx) were wrongly identified as .ZIP when they contained files other than word/document.xml, xl/workbook.xml and ppt/presentation.xml respectively. They are now identified correctly by looking for word/document\*.xml, xl/workbook\*.xml and ppt/presentation\*.xml
|
||||||
|
|
||||||
## 0.17.2
|
## 0.17.2
|
||||||
|
|
||||||
* Fix Image in a <div> tag is "UncategorizedText" with no .text
|
* Fix Image in a <div> tag is "UncategorizedText" with no .text
|
||||||
|
@ -15,6 +15,7 @@ from test_unstructured.unit_utils import (
|
|||||||
LogCaptureFixture,
|
LogCaptureFixture,
|
||||||
Mock,
|
Mock,
|
||||||
example_doc_path,
|
example_doc_path,
|
||||||
|
input_path,
|
||||||
patch,
|
patch,
|
||||||
property_mock,
|
property_mock,
|
||||||
)
|
)
|
||||||
@ -30,6 +31,7 @@ from unstructured.file_utils.model import FileType, create_file_type
|
|||||||
|
|
||||||
is_in_docker = os.path.exists("/.dockerenv")
|
is_in_docker = os.path.exists("/.dockerenv")
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
|
# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
@ -987,3 +989,11 @@ def test_json_content_type_is_disambiguated_for_ndjson():
|
|||||||
file_buffer.name = "filename.pdf"
|
file_buffer.name = "filename.pdf"
|
||||||
predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
|
predicted_type = detect_filetype(file=file_buffer, content_type="application/json")
|
||||||
assert predicted_type == FileType.NDJSON
|
assert predicted_type == FileType.NDJSON
|
||||||
|
|
||||||
|
|
||||||
|
def test_office_files_when_document_archive_has_non_standard_prefix():
|
||||||
|
|
||||||
|
predicted_type = detect_filetype(
|
||||||
|
file_path=input_path("file_type/test_document_from_office365.docx")
|
||||||
|
)
|
||||||
|
assert predicted_type == FileType.DOCX
|
||||||
|
Binary file not shown.
@ -1 +1 @@
|
|||||||
__version__ = "0.17.2" # pragma: no cover
|
__version__ = "0.17.3-dev0" # pragma: no cover
|
||||||
|
@ -747,13 +747,13 @@ class _ZipFileDetector:
|
|||||||
|
|
||||||
filenames = zip.namelist()
|
filenames = zip.namelist()
|
||||||
|
|
||||||
if "word/document.xml" in filenames:
|
if any(re.match(r"word/document.*\.xml$", filename) for filename in filenames):
|
||||||
return FileType.DOCX
|
return FileType.DOCX
|
||||||
|
|
||||||
if "xl/workbook.xml" in filenames:
|
if any(re.match(r"xl/workbook.*\.xml$", filename) for filename in filenames):
|
||||||
return FileType.XLSX
|
return FileType.XLSX
|
||||||
|
|
||||||
if "ppt/presentation.xml" in filenames:
|
if any(re.match(r"ppt/presentation.*\.xml$", filename) for filename in filenames):
|
||||||
return FileType.PPTX
|
return FileType.PPTX
|
||||||
|
|
||||||
# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --
|
# -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --
|
||||||
|
Loading…
x
Reference in New Issue
Block a user