mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-28 11:31:08 +00:00
fix: Fix for Pillow error when extracting PNG images (#3998)
When I tried to partition a PNG file and extract images, I got an error from Pillow:

```
WARNING unstructured:pdf_image_utils.py:230 Image Extraction Error: Skipping the failed image
Traceback (most recent call last):
  File "/Users/austin/.pyenv/versions/unstructured/lib/python3.10/site-packages/PIL/JpegImagePlugin.py", line 666, in _save
    rawmode = RAWMODE[im.mode]
KeyError: 'RGBA'
```

The issue is that a PNG with transparency is opened in RGBA mode, and its additional alpha channel cannot be saved in JPEG format. We can fix this by converting the cropped image to RGB before saving. I added a PNG test case that now passes with this fix.
This commit is contained in:
parent
b814ece39f
commit
e3417d7e98
@ -1,3 +1,12 @@
|
|||||||
|
## 0.17.7-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
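- **Fix image extraction for PNG files.** When `extract_image_block_to_payload` is True and the image is a PNG with transparency (RGBA mode), saving the extracted image as JPEG raised a Pillow error (`KeyError: 'RGBA'`). The cropped image is now converted to RGB, removing the transparency layer, before it is saved.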
|
||||||
|
|
||||||
## 0.17.6
|
## 0.17.6
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -73,6 +73,7 @@ def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-i
|
|||||||
[
|
[
|
||||||
(example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
|
(example_doc_path("pdf/layout-parser-paper-fast.pdf"), False),
|
||||||
(example_doc_path("img/layout-parser-paper-fast.jpg"), True),
|
(example_doc_path("img/layout-parser-paper-fast.jpg"), True),
|
||||||
|
(example_doc_path("img/english-and-korean.png"), True),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
|
@pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE])
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.17.6" # pragma: no cover
|
__version__ = "0.17.7-dev0" # pragma: no cover
|
||||||
|
@ -204,6 +204,11 @@ def save_elements(
|
|||||||
image_path = image_paths[page_index]
|
image_path = image_paths[page_index]
|
||||||
image = Image.open(image_path)
|
image = Image.open(image_path)
|
||||||
cropped_image = image.crop(padded_bbox)
|
cropped_image = image.crop(padded_bbox)
|
||||||
|
|
||||||
|
# PNG images with transparency need to be converted before saving
|
||||||
|
if cropped_image.mode == "RGBA":
|
||||||
|
cropped_image = cropped_image.convert("RGB")
|
||||||
|
|
||||||
if extract_image_block_to_payload:
|
if extract_image_block_to_payload:
|
||||||
buffered = BytesIO()
|
buffered = BytesIO()
|
||||||
cropped_image.save(buffered, format="JPEG")
|
cropped_image.save(buffered, format="JPEG")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user