mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix(pptx): accommodate invalid image/jpg MIME-type (#3475)
As described in #3381, some clients, perhaps including Adobe PDF Converter, map JPEG images to the invalid `image/jpg` MIME-type. Prior to v1.0.0, `python-pptx` would not load these images, which caused image extraction to fail. Update the `python-pptx` dependency to `v1.0.1` or above to ensure this upstream fix is always available. Fixes: #3381
This commit is contained in:
parent
a468b2de3b
commit
73bef27ef1
@ -1,4 +1,4 @@
|
||||
## 0.15.2-dev1
|
||||
## 0.15.2-dev2
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
|
||||
* **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
|
||||
* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
|
||||
* **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive.
|
||||
|
||||
## 0.15.1
|
||||
|
||||
|
BIN
example-docs/test-image-jpg-mime.pptx
Normal file
BIN
example-docs/test-image-jpg-mime.pptx
Normal file
Binary file not shown.
@ -1,3 +1,3 @@
|
||||
-c ./deps/constraints.txt
|
||||
|
||||
python-pptx<=0.6.23
|
||||
python-pptx>=1.0.1
|
||||
|
@ -8,7 +8,7 @@ lxml==5.2.2
|
||||
# via python-pptx
|
||||
pillow==10.4.0
|
||||
# via python-pptx
|
||||
python-pptx==0.6.23
|
||||
python-pptx==1.0.1
|
||||
# via -r ./extra-pptx.in
|
||||
xlsxwriter==3.2.0
|
||||
# via python-pptx
|
||||
|
@ -8,7 +8,7 @@ import hashlib
|
||||
import io
|
||||
import pathlib
|
||||
import tempfile
|
||||
from typing import Any, Iterator
|
||||
from typing import Any, Iterator, cast
|
||||
|
||||
import pptx
|
||||
import pytest
|
||||
@ -36,6 +36,7 @@ from unstructured.documents.elements import (
|
||||
)
|
||||
from unstructured.partition.pptx import (
|
||||
PptxPartitionerOptions,
|
||||
_PptxPartitioner,
|
||||
partition_pptx,
|
||||
register_picture_partitioner,
|
||||
)
|
||||
@ -85,8 +86,7 @@ def test_partition_pptx_from_file():
|
||||
with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
|
||||
elements = partition_pptx(file=f)
|
||||
assert elements == EXPECTED_PPTX_OUTPUT
|
||||
for element in elements:
|
||||
assert element.metadata.filename is None
|
||||
assert all(e.metadata.filename is None for e in elements)
|
||||
|
||||
|
||||
def test_partition_pptx_from_file_with_metadata_filename():
|
||||
@ -107,6 +107,18 @@ def test_partition_pptx_recurses_into_group_shapes():
|
||||
assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
|
||||
|
||||
|
||||
def test_it_loads_a_PPTX_with_a_JPEG_misidentified_as_image_jpg(opts_args: dict[str, Any]):
    """A picture whose JPEG part is declared as `image/jpg` still exposes its image.

    Regression test for PPTX files whose `[Content_Types].xml` maps JPEG images to the invalid
    `image/jpg` MIME-type (should be `image/jpeg`). The example document is expected to have
    such a picture as the first shape on its first slide.
    """
    opts_args["file_path"] = example_doc_path("test-image-jpg-mime.pptx")
    opts = PptxPartitionerOptions(**opts_args)
    prs = _PptxPartitioner(opts)._presentation
    picture = cast(Picture, prs.slides[0].shapes[0])

    try:
        # -- accessing the property is the behavior under test; the value itself is not used.
        # -- NOTE(review): presumably an unfixed `python-pptx` loads the part as a generic
        # -- (non-image) part, which is why the failure surfaces as AttributeError - confirm.
        _ = picture.image
    except AttributeError as e:
        # -- chain the original error so the underlying cause stays visible in the traceback --
        raise AssertionError("JPEG image not recognized, needs `python-pptx>=1.0.1`") from e
|
||||
|
||||
|
||||
# == page-break behaviors ========================================================================
|
||||
|
||||
|
||||
@ -539,6 +551,31 @@ def test_partition_pptx_hierarchy_sample_document():
|
||||
assert element.id == expected_id
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# MODULE-LEVEL FIXTURES
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
@pytest.fixture()
def opts_args() -> dict[str, Any]:
    """All default arguments for `PptxPartitionerOptions`.

    Individual argument values can be changed to suit each test. Makes construction of opts more
    compact for testing purposes.

    Returns a fresh dict on each use, so a test may mutate it (e.g. set "file_path") and then
    splat it into the options constructor: `PptxPartitionerOptions(**opts_args)`.
    """
    return {
        "date_from_file_object": False,
        "file": None,
        "file_path": None,
        "include_page_breaks": True,
        "include_slide_notes": False,
        "infer_table_structure": True,
        "metadata_file_path": None,
        "metadata_last_modified": None,
        "strategy": "fast",
    }
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# ISOLATED UNIT TESTS
|
||||
# ================================================================================================
|
||||
@ -817,22 +854,3 @@ class DescribePptxPartitionerOptions:
|
||||
return function_mock(
|
||||
request, "unstructured.partition.pptx.get_last_modified_date_from_file"
|
||||
)
|
||||
|
||||
@pytest.fixture()
|
||||
def opts_args(self) -> dict[str, Any]:
|
||||
"""All default arguments for `_XlsxPartitionerOptions`.
|
||||
|
||||
Individual argument values can be changed to suit each test. Makes construction of opts more
|
||||
compact for testing purposes.
|
||||
"""
|
||||
return {
|
||||
"date_from_file_object": False,
|
||||
"file": None,
|
||||
"file_path": None,
|
||||
"include_page_breaks": True,
|
||||
"include_slide_notes": False,
|
||||
"infer_table_structure": True,
|
||||
"metadata_file_path": None,
|
||||
"metadata_last_modified": None,
|
||||
"strategy": "fast",
|
||||
}
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.15.2-dev1" # pragma: no cover
|
||||
__version__ = "0.15.2-dev2" # pragma: no cover
|
||||
|
Loading…
x
Reference in New Issue
Block a user