fix(pptx): accommodate invalid image/jpg MIME-type (#3475)

As described in #3381, some clients, perhaps including Adobe PDF
Converter, map JPEG images to the invalid `image/jpg` MIME-type. Prior
to v1.0.0, `python-pptx` would not load these images, which caused image
extraction to fail.

Update the `python-pptx` dependency to `v1.0.1` or above to ensure this
upstream fix is always available.

Fixes: #3381
This commit is contained in:
Steve Canny 2024-08-06 11:48:15 -07:00 committed by GitHub
parent a468b2de3b
commit 73bef27ef1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 45 additions and 26 deletions

View File

@ -1,4 +1,4 @@
## 0.15.2-dev1
## 0.15.2-dev2
### Enhancements
@ -8,6 +8,7 @@
* **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
* **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive.
## 0.15.1

Binary file not shown.

View File

@ -1,3 +1,3 @@
-c ./deps/constraints.txt
python-pptx<=0.6.23
python-pptx>=1.0.1

View File

@ -8,7 +8,7 @@ lxml==5.2.2
# via python-pptx
pillow==10.4.0
# via python-pptx
python-pptx==0.6.23
python-pptx==1.0.1
# via -r ./extra-pptx.in
xlsxwriter==3.2.0
# via python-pptx

View File

@ -8,7 +8,7 @@ import hashlib
import io
import pathlib
import tempfile
from typing import Any, Iterator
from typing import Any, Iterator, cast
import pptx
import pytest
@ -36,6 +36,7 @@ from unstructured.documents.elements import (
)
from unstructured.partition.pptx import (
PptxPartitionerOptions,
_PptxPartitioner,
partition_pptx,
register_picture_partitioner,
)
@ -85,8 +86,7 @@ def test_partition_pptx_from_file():
with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
elements = partition_pptx(file=f)
assert elements == EXPECTED_PPTX_OUTPUT
for element in elements:
assert element.metadata.filename is None
assert all(e.metadata.filename is None for e in elements)
def test_partition_pptx_from_file_with_metadata_filename():
@ -107,6 +107,18 @@ def test_partition_pptx_recurses_into_group_shapes():
assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
def test_it_loads_a_PPTX_with_a_JPEG_misidentified_as_image_jpg(opts_args: dict[str, Any]):
    """The picture in `test-image-jpg-mime.pptx` exposes its `.image` without error.

    Loads the example presentation through `_PptxPartitioner`, takes the first shape on the
    first slide as a `Picture` and accesses its `.image` property. An `AttributeError` on that
    access is reported as a test failure naming the `python-pptx` version that is required.
    """
    opts_args["file_path"] = example_doc_path("test-image-jpg-mime.pptx")
    opts = PptxPartitionerOptions(**opts_args)
    prs = _PptxPartitioner(opts)._presentation
    picture = cast(Picture, prs.slides[0].shapes[0])

    try:
        # -- property access is the behavior under test; the value itself is not needed --
        picture.image
    except AttributeError as e:
        # -- chain explicitly so the original error is reported as the direct cause of the
        # -- failure rather than as an incidental exception raised while handling another
        raise AssertionError("JPEG image not recognized, needs `python-pptx>=1.0.1`") from e
# == page-break behaviors ========================================================================
@ -539,6 +551,31 @@ def test_partition_pptx_hierarchy_sample_document():
assert element.id == expected_id
# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
@pytest.fixture()
def opts_args() -> dict[str, Any]:
    """All default arguments for `PptxPartitionerOptions`.

    Individual argument values can be changed to suit each test. Makes construction of opts more
    compact for testing purposes.

    Returns a new dict on each call, so a test may mutate it freely before unpacking it with
    `PptxPartitionerOptions(**opts_args)`.
    """
    return {
        "date_from_file_object": False,
        "file": None,
        "file_path": None,
        "include_page_breaks": True,
        "include_slide_notes": False,
        "infer_table_structure": True,
        "metadata_file_path": None,
        "metadata_last_modified": None,
        "strategy": "fast",
    }
# ================================================================================================
# ISOLATED UNIT TESTS
# ================================================================================================
@ -817,22 +854,3 @@ class DescribePptxPartitionerOptions:
return function_mock(
request, "unstructured.partition.pptx.get_last_modified_date_from_file"
)
@pytest.fixture()
def opts_args(self) -> dict[str, Any]:
    """All default arguments for `PptxPartitionerOptions`.

    Individual argument values can be changed to suit each test. Makes construction of opts more
    compact for testing purposes.
    """
    return {
        "date_from_file_object": False,
        "file": None,
        "file_path": None,
        "include_page_breaks": True,
        "include_slide_notes": False,
        "infer_table_structure": True,
        "metadata_file_path": None,
        "metadata_last_modified": None,
        "strategy": "fast",
    }

View File

@ -1 +1 @@
__version__ = "0.15.2-dev1" # pragma: no cover
__version__ = "0.15.2-dev2" # pragma: no cover