From e3417d7e98b8ffba47ed75c65be6cff3fc465764 Mon Sep 17 00:00:00 2001 From: Austin Walker Date: Thu, 8 May 2025 17:57:05 -0400 Subject: [PATCH] fix: Fix for Pillow error when extracting PNG images (#3998) When I tried to partition a PNG file and extract images, I got an error from Pillow: ``` WARNING unstructured:pdf_image_utils.py:230 Image Extraction Error: Skipping the failed image Traceback (most recent call last): File "/Users/austin/.pyenv/versions/unstructured/lib/python3.10/site-packages/PIL/JpegImagePlugin.py", line 666, in _save rawmode = RAWMODE[im.mode] KeyError: 'RGBA' ``` The issue is that a PNG can carry an alpha (transparency) channel, which Pillow exposes as mode `RGBA`, and the JPEG format cannot store it. We can fix this by converting such images to `RGB` before saving. I added a PNG test case that now passes with this fix. --- CHANGELOG.md | 9 +++++++++ .../partition/pdf_image/test_pdf_image_utils.py | 1 + unstructured/__version__.py | 2 +- unstructured/partition/pdf_image/pdf_image_utils.py | 5 +++++ 4 files changed, 16 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9b4c3ca5..20a4bcaf7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +## 0.17.7-dev0 + +### Enhancements + +### Features + +### Fixes +- **Fix image extraction for PNG files.** When `extract_image_block_to_payload` is True, and the image is a PNG, we get a Pillow error. We need to remove the PNG transparency layer before saving the image. 
+ ## 0.17.6 ### Enhancements diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py index bfb09b762..1be79e92a 100644 --- a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py +++ b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py @@ -73,6 +73,7 @@ def test_convert_pdf_to_image_raises_error(filename=example_doc_path("embedded-i [ (example_doc_path("pdf/layout-parser-paper-fast.pdf"), False), (example_doc_path("img/layout-parser-paper-fast.jpg"), True), + (example_doc_path("img/english-and-korean.png"), True), ], ) @pytest.mark.parametrize("element_category_to_save", [ElementType.IMAGE, ElementType.TABLE]) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 29149d154..d53993104 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6" # pragma: no cover +__version__ = "0.17.7-dev0" # pragma: no cover diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py index a7e98aa2f..4365b8dba 100644 --- a/unstructured/partition/pdf_image/pdf_image_utils.py +++ b/unstructured/partition/pdf_image/pdf_image_utils.py @@ -204,6 +204,11 @@ def save_elements( image_path = image_paths[page_index] image = Image.open(image_path) cropped_image = image.crop(padded_bbox) + + # PNG images with transparency need to be converted before saving + if cropped_image.mode == "RGBA": + cropped_image = cropped_image.convert("RGB") + if extract_image_block_to_payload: buffered = BytesIO() cropped_image.save(buffered, format="JPEG")