fix(pptx): accommodate invalid image/jpg MIME-type (#3475)

As described in #3381, some clients, perhaps including Adobe PDF
Converter, map JPEG images to the invalid `image/jpg` MIME-type. Prior
to v1.0.0, `python-pptx` would not load these images, which caused image
extraction to fail.

Update the `python-pptx` dependency to `v1.0.1` or above to ensure this
upstream fix is always available.

Fixes: #3381
This commit is contained in:
Steve Canny 2024-08-06 11:48:15 -07:00 committed by GitHub
parent a468b2de3b
commit 73bef27ef1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 45 additions and 26 deletions

View File

@ -1,4 +1,4 @@
## 0.15.2-dev1 ## 0.15.2-dev2
### Enhancements ### Enhancements
@ -8,6 +8,7 @@
* **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions. * **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters). * **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
* **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves problem partitioning PPTX files having an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive.
## 0.15.1 ## 0.15.1

Binary file not shown.

View File

@ -1,3 +1,3 @@
-c ./deps/constraints.txt -c ./deps/constraints.txt
python-pptx<=0.6.23 python-pptx>=1.0.1

View File

@ -8,7 +8,7 @@ lxml==5.2.2
# via python-pptx # via python-pptx
pillow==10.4.0 pillow==10.4.0
# via python-pptx # via python-pptx
python-pptx==0.6.23 python-pptx==1.0.1
# via -r ./extra-pptx.in # via -r ./extra-pptx.in
xlsxwriter==3.2.0 xlsxwriter==3.2.0
# via python-pptx # via python-pptx

View File

@ -8,7 +8,7 @@ import hashlib
import io import io
import pathlib import pathlib
import tempfile import tempfile
from typing import Any, Iterator from typing import Any, Iterator, cast
import pptx import pptx
import pytest import pytest
@ -36,6 +36,7 @@ from unstructured.documents.elements import (
) )
from unstructured.partition.pptx import ( from unstructured.partition.pptx import (
PptxPartitionerOptions, PptxPartitionerOptions,
_PptxPartitioner,
partition_pptx, partition_pptx,
register_picture_partitioner, register_picture_partitioner,
) )
@ -85,8 +86,7 @@ def test_partition_pptx_from_file():
with open(example_doc_path("fake-power-point.pptx"), "rb") as f: with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
elements = partition_pptx(file=f) elements = partition_pptx(file=f)
assert elements == EXPECTED_PPTX_OUTPUT assert elements == EXPECTED_PPTX_OUTPUT
for element in elements: assert all(e.metadata.filename is None for e in elements)
assert element.metadata.filename is None
def test_partition_pptx_from_file_with_metadata_filename(): def test_partition_pptx_from_file_with_metadata_filename():
@ -107,6 +107,18 @@ def test_partition_pptx_recurses_into_group_shapes():
assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
def test_it_loads_a_PPTX_with_a_JPEG_misidentified_as_image_jpg(opts_args: dict[str, Any]):
    """A picture in a PPTX that labels its JPEG as `image/jpg` still exposes `.image`.

    Loads `test-image-jpg-mime.pptx` through `_PptxPartitioner` and touches the `.image`
    attribute of the first shape on the first slide. An `AttributeError` on that access is
    reported as a test failure that names the required `python-pptx` version.
    """
    opts_args["file_path"] = example_doc_path("test-image-jpg-mime.pptx")
    opts = PptxPartitionerOptions(**opts_args)
    prs = _PptxPartitioner(opts)._presentation
    picture = cast(Picture, prs.slides[0].shapes[0])

    try:
        # -- the attribute access itself is the behavior under test; its value is not used --
        picture.image
    except AttributeError as e:
        # -- chain explicitly so the original error is preserved as the failure's cause --
        raise AssertionError("JPEG image not recognized, needs `python-pptx>=1.0.1`") from e
# == page-break behaviors ======================================================================== # == page-break behaviors ========================================================================
@ -539,6 +551,31 @@ def test_partition_pptx_hierarchy_sample_document():
assert element.id == expected_id assert element.id == expected_id
# ================================================================================================
# MODULE-LEVEL FIXTURES
# ================================================================================================
@pytest.fixture()
def opts_args() -> dict[str, Any]:
    """Baseline keyword arguments for constructing `PptxPartitionerOptions`.

    A new dict is built on every call, so a test can overwrite individual entries before
    unpacking it into the options constructor. This keeps per-test setup short.
    """
    return dict(
        date_from_file_object=False,
        file=None,
        file_path=None,
        include_page_breaks=True,
        include_slide_notes=False,
        infer_table_structure=True,
        metadata_file_path=None,
        metadata_last_modified=None,
        strategy="fast",
    )
# ================================================================================================ # ================================================================================================
# ISOLATED UNIT TESTS # ISOLATED UNIT TESTS
# ================================================================================================ # ================================================================================================
@ -817,22 +854,3 @@ class DescribePptxPartitionerOptions:
return function_mock( return function_mock(
request, "unstructured.partition.pptx.get_last_modified_date_from_file" request, "unstructured.partition.pptx.get_last_modified_date_from_file"
) )
@pytest.fixture()
def opts_args(self) -> dict[str, Any]:
    """Baseline keyword arguments for constructing `PptxPartitionerOptions`.

    A new dict is built on every call, so a test can overwrite individual entries before
    unpacking it into the options constructor. This keeps per-test setup short.
    """
    defaults: dict[str, Any] = {
        "date_from_file_object": False,
        "file": None,
        "file_path": None,
        "include_page_breaks": True,
        "include_slide_notes": False,
        "infer_table_structure": True,
        "metadata_file_path": None,
        "metadata_last_modified": None,
        "strategy": "fast",
    }
    return defaults

View File

@ -1 +1 @@
__version__ = "0.15.2-dev1" # pragma: no cover __version__ = "0.15.2-dev2" # pragma: no cover