feat(docx): add pluggable picture sub-partitioner (#3081)

**Summary** Allow registration of a custom sub-partitioner that extracts images from a DOCX paragraph. **Additional Context** - A custom image sub-partitioner must implement the `PicturePartitionerT` interface defined in this PR. In short, it must provide an `.iter_elements()` classmethod that takes the paragraph and the partitioner options and generates zero or more `Image` elements from that paragraph. - The custom image sub-partitioner must be registered by passing the class to `register_picture_partitioner()`. - The default image sub-partitioner is `_NullPicturePartitioner`, which does nothing. - The registered picture partitioner is called once for each paragraph.
2025-12-12 23:51:47 +00:00 · 2024-05-23 11:46:30 -07:00 · 2024-05-23 11:46:30 -07:00 · 47d28612f7
commit 47d28612f7
parent 171b5df09f
4 changed files with 137 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,7 @@
 ### Enhancements

 * **Move `category` field from Text class to Element class.**
+* **`partition_docx()` now supports pluggable picture sub-partitioners.** A sub-partitioner that accepts a DOCX `Paragraph` and generates `Image` elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization to each image.

 ### Features

--- a/example-docs/contains-pictures.docx
+++ b/example-docs/contains-pictures.docx
--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@ -4,15 +4,17 @@

 from __future__ import annotations

+import hashlib
 import io
 import pathlib
 import re
 import tempfile
-from typing import Any
+from typing import Any, Iterator

 import docx
 import pytest
 from docx.document import Document
+from docx.text.paragraph import Paragraph
 from pytest_mock import MockFixture

 from test_unstructured.unit_utils import (
@ -31,6 +33,7 @@ from unstructured.documents.elements import (
    Element,
    Footer,
    Header,
+    Image,
    ListItem,
    NarrativeText,
    PageBreak,
@ -39,7 +42,12 @@ from unstructured.documents.elements import (
    Text,
    Title,
 )
-from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
+from unstructured.partition.docx import (
+    DocxPartitionerOptions,
+    _DocxPartitioner,
+    partition_docx,
+    register_picture_partitioner,
+)
 from unstructured.partition.utils.constants import (
    UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
    PartitionStrategy,
@ -622,6 +630,45 @@ def test_it_considers_text_inside_shapes():
    ]


+# -- image sub-partitioning behaviors ------------------------------------------------------------
+
+
+def test_partition_docx_generates_no_Image_elements_by_default():
+    assert not any(
+        isinstance(e, Image) for e in partition_docx(example_doc_path("contains-pictures.docx"))
+    )
+
+
+def test_partition_docx_uses_registered_picture_partitioner():
+    class FakeParagraphPicturePartitioner:
+        @classmethod
+        def iter_elements(
+            cls, paragraph: Paragraph, opts: DocxPartitionerOptions
+        ) -> Iterator[Image]:
+            call_hash = hashlib.sha1(f"{paragraph.text}{opts.strategy}".encode()).hexdigest()
+            yield Image(f"Image with hash {call_hash}, strategy: {opts.strategy}")
+
+    register_picture_partitioner(FakeParagraphPicturePartitioner)
+
+    elements = partition_docx(example_doc_path("contains-pictures.docx"))
+
+    # -- picture-partitioner registration has module-lifetime, so need to de-register this fake
+    # -- so other tests in same test-run don't use it
+    DocxPartitionerOptions._PicturePartitionerCls = None
+
+    assert len(elements) == 11
+    image_elements = [e for e in elements if isinstance(e, Image)]
+    assert len(image_elements) == 6
+    assert [e.text for e in image_elements] == [
+        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
+        "Image with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_res",
+        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
+        "Image with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_res",
+        "Image with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_res",
+        "Image with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res",
+    ]
+
+
 # -- module-level fixtures -----------------------------------------------------------------------


--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -6,7 +6,7 @@ import html
 import io
 import itertools
 import tempfile
-from typing import IO, Any, Iterator, Optional, Type
+from typing import IO, Any, Iterator, Optional, Protocol, Type

 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
 import docx
@ -33,6 +33,7 @@ from unstructured.documents.elements import (
    EmailAddress,
    Footer,
    Header,
+    Image,
    Link,
    ListItem,
    NarrativeText,
@ -63,6 +64,43 @@ BlockElement: TypeAlias = "CT_P | CT_Tbl"
 BlockItem: TypeAlias = "Paragraph | DocxTable"


+def register_picture_partitioner(picture_partitioner: PicturePartitionerT) -> None:
+    """Specify a pluggable sub-partitioner to be used for partitioning DOCX images."""
+    DocxPartitionerOptions.register_picture_partitioner(picture_partitioner)
+
+
+# ================================================================================================
+# DOCX DOMAIN MODEL DEFINITIONS
+# ================================================================================================
+
+
+class PicturePartitionerT(Protocol):
+    """Defines the interface for a pluggable sub-partitioner for DOCX Picture objects.
+
+    In Microsoft Word parlance, an image is a "picture". We use that term here for an image in a
+    DOCX file both for domain consistency and because it conveniently avoids confusion with an
+    `unstructured` `Image` element.
+
+    A picture can be either *inline* or *floating*. An inline picture is treated like a big
+    character in the text of a paragraph, moving with the text. A floating picture can be moved
+    freely and text flows around it.
+
+    Both inline and floating pictures are defined inside a paragraph in the DOCX file. A paragraph
+    can have zero or more pictures. A DOCX picture partitioner takes a `docx` `Paragraph` object
+    and generates an `Image` element for each picture found in that paragraph.
+    """
+
+    @classmethod
+    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
+        """Generate an `Image` element for each picture in `paragraph`."""
+        ...
+
+
+# ================================================================================================
+# PARTITIONER
+# ================================================================================================
+
+
@process_metadata()
@add_metadata_with_filetype(FileType.DOCX)
@add_chunking_strategy
@ -142,6 +180,16 @@ def partition_docx(
 class DocxPartitionerOptions:
    """Encapsulates partitioning option validation, computation, and application of defaults."""

+    _PicturePartitionerCls = None
+    """Sub-partitioner used to extract pictures from a paragraph as `Image` elements.
+
+    This value has module lifetime and is updated by calling the `register_picture_partitioner()`
+    function defined in this module. The value sent to `register_picture_partitioner()` must be a
+    pluggable sub-partitioner implementing the `PicturePartitionerT` interface. After
+    registration, all paragraphs in subsequently partitioned DOCX documents will be sent to this
+    sub-partitioner to extract images when so configured.
+    """
+
    def __init__(
        self,
        *,
@ -166,6 +214,11 @@ class DocxPartitionerOptions:
        # -- options object maintains page-number state --
        self._page_counter = starting_page_number

+    @classmethod
+    def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT):
+        """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
+        cls._PicturePartitionerCls = picture_partitioner
+
    @lazyproperty
    def document(self) -> Document:
        """The python-docx `Document` object loaded from file or filename."""
@ -248,6 +301,16 @@ class DocxPartitionerOptions:
        """
        return self._page_counter

+    @lazyproperty
+    def picture_partitioner(self) -> PicturePartitionerT:
+        """The sub-partitioner to use for DOCX image extraction."""
+        # -- Note this value has partitioning-run scope. An instance of this options class is
+        # -- instantiated once per partitioning run (each document can have different options).
+        # -- Because this is a lazyproperty, it is computed only on the first reference. All
+        # -- subsequent references during the same partitioning run will get the same value. This
+        # -- ensures image extraction is processed consistently within a single document.
+        return self._PicturePartitionerCls or _NullPicturePartitioner
+
    @lazyproperty
    def strategy(self) -> str:
        """The partitioning strategy for this document.
@ -569,6 +632,7 @@ class _DocxPartitioner:
        for item in iter_paragraph_items(paragraph):
            if isinstance(item, Paragraph):
                yield from self._classify_paragraph_to_element(item)
+                yield from self._iter_paragraph_images(item)
            else:
                yield from self._opts.increment_page_number()

@ -583,6 +647,13 @@ class _DocxPartitioner:
            if run.italic:
                yield {"text": text, "tag": "i"}

+    def _iter_paragraph_images(self, paragraph: Paragraph) -> Iterator[Image]:
+        """Generate `Image` element for each picture shape in `paragraph` when so configured."""
+        # -- Delegate this job to the pluggable Picture partitioner. Note the default picture
+        # -- partitioner does not extract images.
+        PicturePartitionerCls = self._opts.picture_partitioner
+        yield from PicturePartitionerCls.iter_elements(paragraph, self._opts)
+
    def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
        """Generate any `Footer` elements defined for this section.

@ -925,3 +996,18 @@ class _DocxPartitioner:
        """[contents, tags] pair describing emphasized text in `table`."""
        iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
        return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
+
+
+# ================================================================================================
+# SUB-PARTITIONERS
+# ================================================================================================
+
+
+class _NullPicturePartitioner:
+    """Does not parse the provided paragraph for pictures and generates zero `Image` elements."""
+
+    @classmethod
+    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
+        """No-op picture partitioner."""
+        return
+        yield