Mirror of https://github.com/Unstructured-IO/unstructured.git, synced 2025-11-08 14:39:27 +00:00
Refactor: support image extraction (#2201)
### Summary

This PR is the second part of the "image extraction" refactor, which moves the image extraction code from the `unstructured-inference` repo to the `unstructured` repo; the first part was done in https://github.com/Unstructured-IO/unstructured-inference/pull/299. This PR adds the logic to support extracting images.

### Testing

`git clone -b refactor/remove_image_extraction_code --single-branch https://github.com/Unstructured-IO/unstructured-inference.git && cd unstructured-inference && pip install -e . && cd ../`

```
elements = partition_pdf(
    filename="example-docs/embedded-images.pdf",
    strategy="hi_res",
    extract_images_in_pdf=True,
)
print("\n\n".join([str(el) for el in elements]))
```
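As a quick follow-up check (an illustrative sketch, not part of the original PR description), the extracted figure locations can be read back from the element metadata; the `figures/` default directory and the `metadata.image_path` field come from the `extract_images_from_elements` helper added in this PR:

```python
from unstructured.documents.elements import Image
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images.pdf",
    strategy="hi_res",
    extract_images_in_pdf=True,
)

# Each detected figure is cropped from the page rendering, written to disk
# (./figures by default), and its output path is recorded on the element.
for el in elements:
    if isinstance(el, Image):
        print(el.metadata.page_number, el.metadata.image_path)
```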
This commit is contained in:
parent c5cb216ac8
commit ed76b11b1a
@@ -2,6 +2,7 @@

 ### Enhancements

+* **Refactor image extraction code.** The image extraction code is moved from `unstructured-inference` to `unstructured`.
 * **Refactor pdfminer code.** The pdfminer code is moved from `unstructured-inference` to `unstructured`.

 ### Features

@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via sphinx
 jinja2==3.1.2
     # via

@@ -36,7 +36,7 @@ idna==3.6
     #   requests
 imagesize==1.4.1
     # via sphinx
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via sphinx
 jinja2==3.1.2
     # via

@@ -91,7 +91,7 @@ idna==3.6
     #   anyio
     #   jsonschema
     #   requests
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via
     #   build
     #   jupyter-client
@@ -167,7 +167,7 @@ jupyter-events==0.9.0
     # via jupyter-server
 jupyter-lsp==2.2.1
     # via jupyterlab
-jupyter-server==2.11.1
+jupyter-server==2.11.2
     # via
     #   jupyter-lsp
     #   jupyterlab
@@ -198,7 +198,7 @@ mistune==3.0.2
     # via nbconvert
 nbclient==0.9.0
     # via nbconvert
-nbconvert==7.11.0
+nbconvert==7.12.0
     # via
     #   jupyter
     #   jupyter-server
@@ -290,7 +290,7 @@ pyyaml==6.0.1
     #   -c test.txt
     #   jupyter-events
     #   pre-commit
-pyzmq==25.1.1
+pyzmq==25.1.2
     # via
     #   ipykernel
     #   jupyter-client
@@ -405,7 +405,7 @@ webencodings==0.5.1
     # via
     #   bleach
     #   tinycss2
-websocket-client==1.6.4
+websocket-client==1.7.0
     # via jupyter-server
 wheel==0.42.0
     # via

@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=extra-markdown.txt extra-markdown.in
 #
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via markdown
 markdown==3.5.1
     # via -r extra-markdown.in

@@ -45,7 +45,7 @@ flask==3.0.0
     #   visualdl
 flask-babel==4.0.0
     # via visualdl
-fonttools==4.45.1
+fonttools==4.46.0
     # via matplotlib
 future==0.18.3
     # via bce-python-sdk
@@ -59,7 +59,7 @@ imageio==2.33.0
     #   scikit-image
 imgaug==0.4.0
     # via unstructured-paddleocr
-importlib-metadata==6.9.0
+importlib-metadata==7.0.0
     # via flask
 importlib-resources==6.1.1
     # via matplotlib

@@ -8,7 +8,7 @@ pikepdf
 pypdf
 # Do not move to contsraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.17
+unstructured-inference==0.7.18
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12

@@ -37,7 +37,7 @@ filelock==3.13.1
     #   transformers
 flatbuffers==23.5.26
     # via onnxruntime
-fonttools==4.45.1
+fonttools==4.46.0
     # via matplotlib
 fsspec==2023.9.1
     # via
@@ -134,7 +134,7 @@ pdfminer-six==20221105
     #   pdfplumber
 pdfplumber==0.10.3
     # via layoutparser
-pikepdf==8.7.1
+pikepdf==8.8.0
     # via -r extra-pdf-image.in
 pillow==10.0.1
     # via
@@ -250,7 +250,7 @@ typing-extensions==4.8.0
     #   torch
 tzdata==2023.3
     # via pandas
-unstructured-inference==0.7.17
+unstructured-inference==0.7.18
     # via -r extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via

@@ -60,7 +60,7 @@ idna==3.6
     #   yarl
 isodate==0.6.1
     # via azure-storage-blob
-msal==1.25.0
+msal==1.26.0
     # via
     #   azure-datalake-store
     #   azure-identity

@@ -4,7 +4,7 @@
 #
 # pip-compile --output-file=ingest/delta-table.txt ingest/delta-table.in
 #
-deltalake==0.13.0
+deltalake==0.14.0
     # via -r ingest/delta-table.in
 fsspec==2023.9.1
     # via

@@ -64,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
     # via -r ingest/embed-aws-bedrock.in
-langchain-core==0.0.8
+langchain-core==0.0.9
     # via langchain
-langsmith==0.0.68
+langsmith==0.0.69
     # via
     #   langchain
     #   langchain-core

@@ -79,11 +79,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
     # via -r ingest/embed-huggingface.in
-langchain-core==0.0.8
+langchain-core==0.0.9
     # via langchain
-langsmith==0.0.68
+langsmith==0.0.69
     # via
     #   langchain
     #   langchain-core

@@ -64,11 +64,11 @@ jsonpatch==1.33
     #   langchain-core
 jsonpointer==2.4
     # via jsonpatch
-langchain==0.0.344
+langchain==0.0.345
     # via -r ingest/embed-openai.in
-langchain-core==0.0.8
+langchain-core==0.0.9
     # via langchain
-langsmith==0.0.68
+langsmith==0.0.69
     # via
     #   langchain
     #   langchain-core
@@ -125,7 +125,7 @@ tenacity==8.2.3
     # via
     #   langchain
     #   langchain-core
-tiktoken==0.5.1
+tiktoken==0.5.2
     # via -r ingest/embed-openai.in
 tqdm==4.66.1
     # via

@@ -29,7 +29,7 @@ idna==3.6
     # via
     #   -c ingest/../base.txt
     #   requests
-msal==1.25.0
+msal==1.26.0
     # via
     #   -r ingest/onedrive.in
     #   office365-rest-python-client

@@ -23,7 +23,7 @@ idna==3.6
     # via
     #   -c ingest/../base.txt
     #   requests
-msal==1.25.0
+msal==1.26.0
     # via
     #   -r ingest/outlook.in
     #   office365-rest-python-client

@@ -33,5 +33,5 @@ urllib3==1.26.18
     #   -c ingest/../base.txt
     #   -c ingest/../constraints.in
     #   requests
-websocket-client==1.6.4
+websocket-client==1.7.0
     # via praw

@@ -23,7 +23,7 @@ idna==3.6
     # via
     #   -c ingest/../base.txt
     #   requests
-msal==1.25.0
+msal==1.26.0
     # via
     #   -r ingest/sharepoint.in
     #   office365-rest-python-client

@@ -4,5 +4,5 @@
 #
 # pip-compile --output-file=ingest/slack.txt ingest/slack.in
 #
-slack-sdk==3.26.0
+slack-sdk==3.26.1
     # via -r ingest/slack.in

@@ -2,44 +2,41 @@
 # This file is autogenerated by pip-compile with Python 3.8
 # by the following command:
 #
-# pip-compile --constraint=requirements/constraints.in requirements/ingest/weaviate.in
+# pip-compile --output-file=ingest/weaviate.txt ingest/weaviate.in
 #
 authlib==1.2.1
     # via weaviate-client
 certifi==2023.11.17
     # via
-    #   -c requirements/constraints.in
-    #   -c requirements/ingest/../base.txt
-    #   -c requirements/ingest/../constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   requests
 cffi==1.16.0
     # via cryptography
 charset-normalizer==3.3.2
     # via
-    #   -c requirements/ingest/../base.txt
+    #   -c ingest/../base.txt
     #   requests
-cryptography==41.0.5
+cryptography==41.0.7
     # via authlib
-idna==3.4
+idna==3.6
     # via
-    #   -c requirements/ingest/../base.txt
+    #   -c ingest/../base.txt
     #   requests
 pycparser==2.21
     # via cffi
 requests==2.31.0
     # via
-    #   -c requirements/ingest/../base.txt
+    #   -c ingest/../base.txt
     #   weaviate-client
 urllib3==1.26.18
     # via
-    #   -c requirements/constraints.in
-    #   -c requirements/ingest/../base.txt
-    #   -c requirements/ingest/../constraints.in
+    #   -c ingest/../base.txt
+    #   -c ingest/../constraints.in
     #   requests
 validators==0.22.0
     # via weaviate-client
 weaviate-client==3.25.3
     # via
-    #   -c requirements/constraints.in
-    #   -c requirements/ingest/../constraints.in
-    #   -r requirements/ingest/weaviate.in
+    #   -c ingest/../constraints.in
+    #   -r ingest/weaviate.in

@@ -36,7 +36,7 @@ flake8==6.1.0
     #   flake8-print
 flake8-print==5.0.0
     # via -r test.in
-freezegun==1.2.2
+freezegun==1.3.1
     # via -r test.in
 grpcio==1.59.3
     # via -r test.in
@@ -111,7 +111,7 @@ requests==2.31.0
     # via
     #   -c base.txt
     #   label-studio-sdk
-ruff==0.1.6
+ruff==0.1.7
     # via -r test.in
 six==1.16.0
     # via

test_unstructured/partition/pdf_image/test_pdf_image_utils.py (new file, 115 lines)
@@ -0,0 +1,115 @@
+import os
+import tempfile
+
+import numpy as np
+import pytest
+from PIL import Image as PILImg
+
+from test_unstructured.unit_utils import example_doc_path
+from unstructured.documents.coordinates import PixelSpace
+from unstructured.documents.elements import ElementMetadata, Image
+from unstructured.partition.pdf_image import pdf_image_utils
+
+
+@pytest.mark.parametrize("image_type", ["pil", "numpy_array"])
+def test_write_image(image_type):
+    mock_pil_image = PILImg.new("RGB", (50, 50))
+    mock_numpy_image = np.zeros((50, 50, 3), np.uint8)
+
+    image_map = {
+        "pil": mock_pil_image,
+        "numpy_array": mock_numpy_image,
+    }
+    image = image_map[image_type]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        output_image_path = os.path.join(tmpdir, "test_image.jpg")
+        pdf_image_utils.write_image(image, output_image_path)
+        assert os.path.exists(output_image_path)
+
+        # Additional check to see if the written image can be read
+        read_image = PILImg.open(output_image_path)
+        assert read_image is not None
+
+
+@pytest.mark.parametrize("file_mode", ["filename", "rb"])
+@pytest.mark.parametrize("path_only", [True, False])
+def test_convert_pdf_to_image(
+    file_mode, path_only, filename=example_doc_path("embedded-images.pdf")
+):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        if file_mode == "filename":
+            images = pdf_image_utils.convert_pdf_to_image(
+                filename=filename,
+                file=None,
+                output_folder=tmpdir,
+                path_only=path_only,
+            )
+        else:
+            with open(filename, "rb") as f:
+                images = pdf_image_utils.convert_pdf_to_image(
+                    filename="",
+                    file=f,
+                    output_folder=tmpdir,
+                    path_only=path_only,
+                )
+
+        if path_only:
+            assert isinstance(images[0], str)
+        else:
+            assert isinstance(images[0], PILImg.Image)
+
+
+def test_extract_images_from_elements(filename=example_doc_path("embedded-images.pdf")):
+    with tempfile.TemporaryDirectory() as tmpdir:
+        elements = [
+            Image(
+                text="3",
+                coordinates=(
+                    (78.7401411111111, 86.61545694444455),
+                    (78.7401411111111, 519.9487805555556),
+                    (512.0734647222223, 519.9487805555556),
+                    (512.0734647222223, 86.61545694444455),
+                ),
+                coordinate_system=PixelSpace(width=1575, height=1166),
+                metadata=ElementMetadata(page_number=1),
+            ),
+            Image(
+                text="4",
+                coordinates=(
+                    (570.8661397222222, 86.6154566666667),
+                    (570.8661397222222, 519.6862825000001),
+                    (1003.9369655555556, 519.6862825000001),
+                    (1003.9369655555556, 86.6154566666667),
+                ),
+                coordinate_system=PixelSpace(width=1575, height=1166),
+                metadata=ElementMetadata(page_number=1),
+            ),
+            Image(
+                text="5",
+                coordinates=(
+                    (1062.9921808333331, 86.61545694444455),
+                    (1062.9921808333331, 519.9487805555556),
+                    (1496.3255044444445, 519.9487805555556),
+                    (1496.3255044444445, 86.61545694444455),
+                ),
+                coordinate_system=PixelSpace(width=1575, height=1166),
+                metadata=ElementMetadata(page_number=1),
+            ),
+        ]
+
+        pdf_image_utils.extract_images_from_elements(
+            elements=elements, pdf_image_dpi=200, filename=filename, output_dir_path=str(tmpdir)
+        )
+
+        for i, el in enumerate(elements):
+            expected_image_path = os.path.join(
+                str(tmpdir), f"figure-{el.metadata.page_number}-{i + 1}.jpg"
+            )
+            assert os.path.isfile(el.metadata.image_path)
+            assert el.metadata.image_path == expected_image_path
+
+
+def test_write_image_raises_error():
+    with pytest.raises(ValueError):
+        pdf_image_utils.write_image("invalid_type", "test_image.jpg")

@@ -555,6 +555,44 @@ def _add_regex_metadata(
     return elements


+class ElementType:
+    TITLE = "Title"
+    TEXT = "Text"
+    UNCATEGORIZED_TEXT = "UncategorizedText"
+    NARRATIVE_TEXT = "NarrativeText"
+    BULLETED_TEXT = "BulletedText"
+    ABSTRACT = "Abstract"
+    THREADING = "Threading"
+    FORM = "Form"
+    FIELD_NAME = "Field-Name"
+    VALUE = "Value"
+    LINK = "Link"
+    COMPOSITE_ELEMENT = "CompositeElement"
+    IMAGE = "Image"
+    PICTURE = "Picture"
+    FIGURE_CAPTION = "FigureCaption"
+    FIGURE = "Figure"
+    CAPTION = "Caption"
+    LIST = "List"
+    LIST_ITEM = "ListItem"
+    LIST_ITEM_OTHER = "List-item"
+    CHECKED = "Checked"
+    UNCHECKED = "Unchecked"
+    ADDRESS = "Address"
+    EMAIL_ADDRESS = "EmailAddress"
+    PAGE_BREAK = "PageBreak"
+    FORMULA = "Formula"
+    TABLE = "Table"
+    HEADER = "Header"
+    HEADLINE = "Headline"
+    SUB_HEADLINE = "Subheadline"
+    PAGE_HEADER = "Page-header"  # Title?
+    SECTION_HEADER = "Section-header"
+    FOOTER = "Footer"
+    FOOTNOTE = "Footnote"
+    PAGE_FOOTER = "Page-footer"
+
+
 class Element(abc.ABC):
     """An element is a section of a page in the document."""

@@ -764,7 +802,7 @@ class EmailAddress(Text):
 class Image(Text):
     """A text element for capturing image metadata."""

-    category = "Image"
+    category = ElementType.IMAGE


 class PageBreak(Text):
@@ -797,44 +835,6 @@ class Footer(Text):
     category = "Footer"


-class ElementType:
-    TITLE = "Title"
-    TEXT = "Text"
-    UNCATEGORIZED_TEXT = "UncategorizedText"
-    NARRATIVE_TEXT = "NarrativeText"
-    BULLETED_TEXT = "BulletedText"
-    ABSTRACT = "Abstract"
-    THREADING = "Threading"
-    FORM = "Form"
-    FIELD_NAME = "Field-Name"
-    VALUE = "Value"
-    LINK = "Link"
-    COMPOSITE_ELEMENT = "CompositeElement"
-    IMAGE = "Image"
-    PICTURE = "Picture"
-    FIGURE_CAPTION = "FigureCaption"
-    FIGURE = "Figure"
-    CAPTION = "Caption"
-    LIST = "List"
-    LIST_ITEM = "ListItem"
-    LIST_ITEM_OTHER = "List-item"
-    CHECKED = "Checked"
-    UNCHECKED = "Unchecked"
-    ADDRESS = "Address"
-    EMAIL_ADDRESS = "EmailAddress"
-    PAGE_BREAK = "PageBreak"
-    FORMULA = "Formula"
-    TABLE = "Table"
-    HEADER = "Header"
-    HEADLINE = "Headline"
-    SUB_HEADLINE = "Subheadline"
-    PAGE_HEADER = "Page-header"  # Title?
-    SECTION_HEADER = "Section-header"
-    FOOTER = "Footer"
-    FOOTNOTE = "Footnote"
-    PAGE_FOOTER = "Page-footer"
-
-
 TYPE_TO_TEXT_ELEMENT_MAP: Dict[str, Any] = {
     ElementType.TITLE: Title,
     ElementType.SECTION_HEADER: Title,
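Aside (an illustrative sketch, not part of the diff): with `ElementType` now defined ahead of the element classes, category checks can rely on the shared constants instead of bare string literals:

```python
from unstructured.documents.elements import ElementType, Image

el = Image(text="logo")

# Comparing against the constant avoids typo-prone string literals like "Image".
assert el.category == ElementType.IMAGE

# The constants are plain strings, so existing string-based comparisons and
# serialized element dictionaries keep working unchanged.
assert ElementType.IMAGE == "Image"
```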

@@ -70,6 +70,7 @@ from unstructured.partition.lang import (
     check_languages,
     prepare_languages_for_tesseract,
 )
+from unstructured.partition.pdf_image.pdf_image_utils import extract_images_from_elements
 from unstructured.partition.pdf_image.pdfminer_utils import (
     open_pdfminer_pages_generator,
     rect_to_bbox,
@@ -381,8 +382,6 @@ def _partition_pdf_or_image_local(
             is_image=is_image,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
         )

         # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
@@ -411,8 +410,6 @@ def _partition_pdf_or_image_local(
             is_image=is_image,
             model_name=model_name,
             pdf_image_dpi=pdf_image_dpi,
-            extract_images_in_pdf=extract_images_in_pdf,
-            image_output_dir_path=image_output_dir_path,
         )
         if hasattr(file, "seek"):
             file.seek(0)
@@ -458,6 +455,15 @@ def _partition_pdf_or_image_local(
         **kwargs,
     )

+    if extract_images_in_pdf:
+        extract_images_from_elements(
+            elements=elements,
+            filename=filename,
+            file=file,
+            pdf_image_dpi=pdf_image_dpi,
+            output_dir_path=image_output_dir_path,
+        )
+
     out_elements = []
     for el in elements:
         if isinstance(el, PageBreak) and not include_page_breaks:

unstructured/partition/pdf_image/pdf_image_utils.py (new file, 124 lines)
@@ -0,0 +1,124 @@
+import os
+import tempfile
+from pathlib import PurePath
+from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
+
+import cv2
+import numpy as np
+import pdf2image
+from PIL import Image
+
+from unstructured.documents.elements import ElementType
+from unstructured.logger import logger
+from unstructured.partition.common import convert_to_bytes
+
+if TYPE_CHECKING:
+    from unstructured.documents.elements import Element
+
+
+def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str):
+    """
+    Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats.
+
+    Parameters:
+    - image (Union[Image.Image, np.ndarray]): The image to be written, which can be in PIL Image
+      format or a numpy ndarray format.
+    - output_image_path (str): The path to which the image will be written.
+
+    Raises:
+    - ValueError: If the provided image type is neither PIL Image nor numpy ndarray.
+
+    Returns:
+    - None: The function writes the image to the specified path but does not return any value.
+    """
+
+    if isinstance(image, Image.Image):
+        image.save(output_image_path)
+    elif isinstance(image, np.ndarray):
+        cv2.imwrite(output_image_path, image)
+    else:
+        raise ValueError("Unsupported Image Type")
+
+
+def convert_pdf_to_image(
+    filename: str,
+    file: Optional[Union[bytes, BinaryIO]] = None,
+    dpi: int = 200,
+    output_folder: Optional[Union[str, PurePath]] = None,
+    path_only: bool = False,
+) -> Union[List[Image.Image], List[str]]:
+    """Get the image renderings of the pdf pages using pdf2image"""
+
+    if path_only and not output_folder:
+        raise ValueError("output_folder must be specified if path_only is true")
+
+    if file is not None:
+        f_bytes = convert_to_bytes(file)
+        images = pdf2image.convert_from_bytes(
+            f_bytes,
+            dpi=dpi,
+            output_folder=output_folder,
+            paths_only=path_only,
+        )
+    else:
+        images = pdf2image.convert_from_path(
+            filename,
+            dpi=dpi,
+            output_folder=output_folder,
+            paths_only=path_only,
+        )
+
+    return images
+
+
+def extract_images_from_elements(
+    elements: List["Element"],
+    pdf_image_dpi: int,
+    filename: str = "",
+    file: Optional[Union[bytes, BinaryIO]] = None,
+    output_dir_path: Optional[str] = None,
+):
+    """
+    Extract and save images from the page. This method iterates through the layout elements
+    of the page, identifies image regions, and extracts and saves them as separate image files.
+    """
+
+    if not output_dir_path:
+        output_dir_path = os.path.join(os.getcwd(), "figures")
+    os.makedirs(output_dir_path, exist_ok=True)
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        _image_paths = convert_pdf_to_image(
+            filename,
+            file,
+            pdf_image_dpi,
+            output_folder=temp_dir,
+            path_only=True,
+        )
+        image_paths = cast(List[str], _image_paths)
+
+        figure_number = 0
+        for el in elements:
+            coordinates = el.metadata.coordinates
+            if not coordinates or not coordinates.points or el.category != ElementType.IMAGE:
+                continue
+
+            points = coordinates.points
+            x1, y1 = points[0]
+            x2, y2 = points[2]
+            page_number = el.metadata.page_number
+
+            figure_number += 1
+            try:
+                output_f_path = os.path.join(
+                    output_dir_path,
+                    f"figure-{page_number}-{figure_number}.jpg",
+                )
+                image_path = image_paths[page_number - 1]
+                image = Image.open(image_path)
+                cropped_image = image.crop((x1, y1, x2, y2))
+                write_image(cropped_image, output_f_path)
+                # add image path to element metadata
+                el.metadata.image_path = output_f_path
+            except (ValueError, IOError):
+                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
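Aside (an illustrative sketch, not part of the diff): the helpers in `pdf_image_utils.py` can also be used on their own, here with the `example-docs/embedded-images.pdf` sample referenced in the PR summary:

```python
import tempfile

from PIL import Image

from unstructured.partition.pdf_image import pdf_image_utils

with tempfile.TemporaryDirectory() as tmpdir:
    # Render each PDF page to a temporary image file and collect the paths.
    page_image_paths = pdf_image_utils.convert_pdf_to_image(
        filename="example-docs/embedded-images.pdf",
        dpi=200,
        output_folder=tmpdir,
        path_only=True,
    )
    print(f"rendered {len(page_image_paths)} page image(s)")

    # write_image accepts either a PIL image or a numpy array.
    first_page = Image.open(page_image_paths[0])
    pdf_image_utils.write_image(first_page, "first-page.jpg")
```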