mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
fix(pptx): accommodate invalid image/jpg MIME-type (#3475)
As described in #3381, some clients, perhaps including Adobe PDF Converter, map JPEG images to the invalid `image/jpg` MIME-type. Prior to v1.0.0, `python-pptx` would not load these images, which caused image extraction to fail. Update the `python-pptx` dependency to `v1.0.1` or above to ensure this upstream fix is always available. Fixes: #3381
This commit is contained in:
parent
a468b2de3b
commit
73bef27ef1
@ -1,4 +1,4 @@
|
|||||||
## 0.15.2-dev1
|
## 0.15.2-dev2
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -8,6 +8,7 @@
|
|||||||
|
|
||||||
* **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
|
* **Renames Astra to Astra DB** Conforms with DataStax internal naming conventions.
|
||||||
* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
|
* **Accommodate single-column CSV files.** Resolves a limitation of `partition_csv()` where delimiter detection would fail on a single-column CSV file (which naturally has no delimiters).
|
||||||
|
* **Accommodate `image/jpg` in PPTX as alias for `image/jpeg`.** Resolves a problem partitioning PPTX files that have an invalid `image/jpg` (should be `image/jpeg`) MIME-type in the `[Content_Types].xml` member of the PPTX Zip archive.
|
||||||
|
|
||||||
## 0.15.1
|
## 0.15.1
|
||||||
|
|
||||||
|
BIN
example-docs/test-image-jpg-mime.pptx
Normal file
BIN
example-docs/test-image-jpg-mime.pptx
Normal file
Binary file not shown.
@ -1,3 +1,3 @@
|
|||||||
-c ./deps/constraints.txt
|
-c ./deps/constraints.txt
|
||||||
|
|
||||||
python-pptx<=0.6.23
|
python-pptx>=1.0.1
|
||||||
|
@ -8,7 +8,7 @@ lxml==5.2.2
|
|||||||
# via python-pptx
|
# via python-pptx
|
||||||
pillow==10.4.0
|
pillow==10.4.0
|
||||||
# via python-pptx
|
# via python-pptx
|
||||||
python-pptx==0.6.23
|
python-pptx==1.0.1
|
||||||
# via -r ./extra-pptx.in
|
# via -r ./extra-pptx.in
|
||||||
xlsxwriter==3.2.0
|
xlsxwriter==3.2.0
|
||||||
# via python-pptx
|
# via python-pptx
|
||||||
|
@ -8,7 +8,7 @@ import hashlib
|
|||||||
import io
|
import io
|
||||||
import pathlib
|
import pathlib
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Any, Iterator
|
from typing import Any, Iterator, cast
|
||||||
|
|
||||||
import pptx
|
import pptx
|
||||||
import pytest
|
import pytest
|
||||||
@ -36,6 +36,7 @@ from unstructured.documents.elements import (
|
|||||||
)
|
)
|
||||||
from unstructured.partition.pptx import (
|
from unstructured.partition.pptx import (
|
||||||
PptxPartitionerOptions,
|
PptxPartitionerOptions,
|
||||||
|
_PptxPartitioner,
|
||||||
partition_pptx,
|
partition_pptx,
|
||||||
register_picture_partitioner,
|
register_picture_partitioner,
|
||||||
)
|
)
|
||||||
@ -85,8 +86,7 @@ def test_partition_pptx_from_file():
|
|||||||
with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
|
with open(example_doc_path("fake-power-point.pptx"), "rb") as f:
|
||||||
elements = partition_pptx(file=f)
|
elements = partition_pptx(file=f)
|
||||||
assert elements == EXPECTED_PPTX_OUTPUT
|
assert elements == EXPECTED_PPTX_OUTPUT
|
||||||
for element in elements:
|
assert all(e.metadata.filename is None for e in elements)
|
||||||
assert element.metadata.filename is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_partition_pptx_from_file_with_metadata_filename():
|
def test_partition_pptx_from_file_with_metadata_filename():
|
||||||
@ -107,6 +107,18 @@ def test_partition_pptx_recurses_into_group_shapes():
|
|||||||
assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
|
assert [e.text for e in elements] == ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_it_loads_a_PPTX_with_a_JPEG_misidentified_as_image_jpg(opts_args: dict[str, Any]):
|
||||||
|
opts_args["file_path"] = example_doc_path("test-image-jpg-mime.pptx")
|
||||||
|
opts = PptxPartitionerOptions(**opts_args)
|
||||||
|
prs = _PptxPartitioner(opts)._presentation
|
||||||
|
picture = cast(Picture, prs.slides[0].shapes[0])
|
||||||
|
|
||||||
|
try:
|
||||||
|
picture.image
|
||||||
|
except AttributeError:
|
||||||
|
raise AssertionError("JPEG image not recognized, needs `python-pptx>=1.0.1`")
|
||||||
|
|
||||||
|
|
||||||
# == page-break behaviors ========================================================================
|
# == page-break behaviors ========================================================================
|
||||||
|
|
||||||
|
|
||||||
@ -539,6 +551,31 @@ def test_partition_pptx_hierarchy_sample_document():
|
|||||||
assert element.id == expected_id
|
assert element.id == expected_id
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# MODULE-LEVEL FIXTURES
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def opts_args() -> dict[str, Any]:
|
||||||
|
"""All default arguments for `PptxPartitionerOptions`.
|
||||||
|
|
||||||
|
Individual argument values can be changed to suit each test. Makes construction of opts more
|
||||||
|
compact for testing purposes.
|
||||||
|
"""
|
||||||
|
return {
|
||||||
|
"date_from_file_object": False,
|
||||||
|
"file": None,
|
||||||
|
"file_path": None,
|
||||||
|
"include_page_breaks": True,
|
||||||
|
"include_slide_notes": False,
|
||||||
|
"infer_table_structure": True,
|
||||||
|
"metadata_file_path": None,
|
||||||
|
"metadata_last_modified": None,
|
||||||
|
"strategy": "fast",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
# ISOLATED UNIT TESTS
|
# ISOLATED UNIT TESTS
|
||||||
# ================================================================================================
|
# ================================================================================================
|
||||||
@ -817,22 +854,3 @@ class DescribePptxPartitionerOptions:
|
|||||||
return function_mock(
|
return function_mock(
|
||||||
request, "unstructured.partition.pptx.get_last_modified_date_from_file"
|
request, "unstructured.partition.pptx.get_last_modified_date_from_file"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.fixture()
|
|
||||||
def opts_args(self) -> dict[str, Any]:
|
|
||||||
"""All default arguments for `_XlsxPartitionerOptions`.
|
|
||||||
|
|
||||||
Individual argument values can be changed to suit each test. Makes construction of opts more
|
|
||||||
compact for testing purposes.
|
|
||||||
"""
|
|
||||||
return {
|
|
||||||
"date_from_file_object": False,
|
|
||||||
"file": None,
|
|
||||||
"file_path": None,
|
|
||||||
"include_page_breaks": True,
|
|
||||||
"include_slide_notes": False,
|
|
||||||
"infer_table_structure": True,
|
|
||||||
"metadata_file_path": None,
|
|
||||||
"metadata_last_modified": None,
|
|
||||||
"strategy": "fast",
|
|
||||||
}
|
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.15.2-dev1" # pragma: no cover
|
__version__ = "0.15.2-dev2" # pragma: no cover
|
||||||
|
Loading…
x
Reference in New Issue
Block a user