feat(docx): add pluggable picture sub-partitioner (#3081)

**Summary** Allow registration of a custom sub-partitioner that extracts images from a DOCX paragraph. **Additional Context** - A custom image sub-partitioner must implement the `PicturePartitionerT` interface defined in this PR. That is, it must provide an `.iter_elements()` classmethod that takes the paragraph and the partitioner options and generates zero or more `Image` elements from that paragraph. - The custom image sub-partitioner must be registered by passing the class to `register_picture_partitioner()`. - The default image sub-partitioner is `_NullPicturePartitioner`, which does nothing. - The registered picture partitioner is called once for each paragraph.
2025-12-17 18:26:08 +00:00 · 2024-05-23 11:46:30 -07:00 · 2024-05-23 11:46:30 -07:00 · 47d28612f7
commit 47d28612f7
parent 171b5df09f
4 changed files with 137 additions and 3 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -3,6 +3,7 @@
 ### Enhancements
 * **Move `category` field from Text class to Element class.**
 * **`partition_docx()` now supports pluggable picture sub-partitioners.** A sub-partitioner that accepts a DOCX `Paragraph` and generates `Image` elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization to each image.
 ### Features
--- a/example-docs/contains-pictures.docx
+++ b/example-docs/contains-pictures.docx
--- a/test_unstructured/partition/test_docx.py
+++ b/test_unstructured/partition/test_docx.py
@ -4,15 +4,17 @@
 from __future__ import annotations
 import hashlib
 import io
 import pathlib
 import re
 import tempfile
-from typing import Any
+from typing import Any, Iterator
 import docx
 import pytest
 from docx.document import Document
 from docx.text.paragraph import Paragraph
 from pytest_mock import MockFixture
 from test_unstructured.unit_utils import (
@ -31,6 +33,7 @@ from unstructured.documents.elements import (
    Element,
    Footer,
    Header,
    Image,
    ListItem,
    NarrativeText,
    PageBreak,
@ -39,7 +42,12 @@ from unstructured.documents.elements import (
    Text,
    Title,
 )
-from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
+from unstructured.partition.docx import (
    DocxPartitionerOptions,
    _DocxPartitioner,
    partition_docx,
    register_picture_partitioner,
 )
 from unstructured.partition.utils.constants import (
    UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
    PartitionStrategy,
@ -622,6 +630,45 @@ def test_it_considers_text_inside_shapes():
    ]
 # -- image sub-partitioning behaviors ------------------------------------------------------------
 def test_partition_docx_generates_no_Image_elements_by_default():
    """Partitioning a DOCX that contains pictures emits no `Image` element by default."""
    elements = partition_docx(example_doc_path("contains-pictures.docx"))
    image_elements = [e for e in elements if isinstance(e, Image)]
    assert not image_elements
 def test_partition_docx_uses_registered_picture_partitioner():
    """Elements produced by a registered picture partitioner appear in the partitioned output.

    The fake partitioner yields one `Image` per call, whose text is derived from the paragraph
    text and the partitioning strategy, so the expected hashes below pin both the paragraphs it
    was handed and the options it received.
    """
    class FakeParagraphPicturePartitioner:
        """Stand-in picture partitioner that emits exactly one `Image` per call."""

        @classmethod
        def iter_elements(
            cls, paragraph: Paragraph, opts: DocxPartitionerOptions
        ) -> Iterator[Image]:
            call_hash = hashlib.sha1(f"{paragraph.text}{opts.strategy}".encode()).hexdigest()
            yield Image(f"Image with hash {call_hash}, strategy: {opts.strategy}")

    register_picture_partitioner(FakeParagraphPicturePartitioner)
    try:
        elements = partition_docx(example_doc_path("contains-pictures.docx"))
    finally:
        # -- picture-partitioner registration has module-lifetime, so de-register this fake even
        # -- when partitioning raises; otherwise other tests in the same test-run would use it
        DocxPartitionerOptions._PicturePartitionerCls = None

    assert len(elements) == 11
    image_elements = [e for e in elements if isinstance(e, Image)]
    assert len(image_elements) == 6
    assert [e.text for e in image_elements] == [
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_res",
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_res",
        "Image with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_res",
        "Image with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res",
    ]
 # -- module-level fixtures -----------------------------------------------------------------------
--- a/unstructured/partition/docx.py
+++ b/unstructured/partition/docx.py
@ -6,7 +6,7 @@ import html
 import io
 import itertools
 import tempfile
-from typing import IO, Any, Iterator, Optional, Type
+from typing import IO, Any, Iterator, Optional, Protocol, Type
 # -- CT_* stands for "complex-type", an XML element type in docx parlance --
 import docx
@ -33,6 +33,7 @@ from unstructured.documents.elements import (
    EmailAddress,
    Footer,
    Header,
    Image,
    Link,
    ListItem,
    NarrativeText,
@ -63,6 +64,43 @@ BlockElement: TypeAlias = "CT_P | CT_Tbl"
 BlockItem: TypeAlias = "Paragraph | DocxTable"
 def register_picture_partitioner(picture_partitioner: PicturePartitionerT) -> None:
    """Install `picture_partitioner` as the pluggable sub-partitioner for DOCX images.

    This is a module-level convenience that forwards to the classmethod of the same name on
    `DocxPartitionerOptions`.
    """
    register = DocxPartitionerOptions.register_picture_partitioner
    register(picture_partitioner)
 # ================================================================================================
 # DOCX DOMAIN MODEL DEFINITIONS
 # ================================================================================================
 class PicturePartitionerT(Protocol):
    """Interface a pluggable DOCX picture sub-partitioner must implement.

    Microsoft Word calls an image a "picture". That term is used here for an image in a DOCX
    file, both for consistency with the domain and to avoid confusion with the `unstructured`
    `Image` element.

    A picture is either *inline* or *floating*. An inline picture behaves like one large
    character in the paragraph text and moves with that text. A floating picture can be
    positioned freely, with text flowing around it.

    Both kinds are defined inside a paragraph in the DOCX file, and a paragraph can contain zero
    or more pictures. A DOCX picture partitioner receives a `docx` `Paragraph` object and
    generates one `Image` element for each picture found in it.
    """
    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """Generate one `Image` element per picture in `paragraph`.

        `opts` is the options object for the current partitioning run.
        """
        ...
 # ================================================================================================
 # PARTITIONER
 # ================================================================================================
@process_metadata()
@add_metadata_with_filetype(FileType.DOCX)
@add_chunking_strategy
@ -142,6 +180,16 @@ def partition_docx(
 class DocxPartitionerOptions:
    """Encapsulates partitioning option validation, computation, and application of defaults."""
    _PicturePartitionerCls = None
    """Sub-partitioner used to extract pictures from a paragraph as `Image` elements.
    This value has module lifetime and is updated by calling the `register_picture_partitioner()`
    function defined in this module. The value sent to `register_picture_partitioner()` must be a
    pluggable sub-partitioner implementing the `PicturePartitionerT` interface. After
    registration, all paragraphs in subsequently partitioned DOCX documents will be sent to this
    sub-partitioner to extract images when so configured.
    """
    def __init__(
        self,
        *,
@ -166,6 +214,11 @@ class DocxPartitionerOptions:
        # -- options object maintains page-number state --
        self._page_counter = starting_page_number
    @classmethod
    def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT):
        """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
        cls._PicturePartitionerCls = picture_partitioner
    @lazyproperty
    def document(self) -> Document:
        """The python-docx `Document` object loaded from file or filename."""
@ -248,6 +301,16 @@ class DocxPartitionerOptions:
        """
        return self._page_counter
    @lazyproperty
    def picture_partitioner(self) -> PicturePartitionerT:
        """Sub-partitioner used for DOCX image extraction during this partitioning run.

        This is the registered picture partitioner when there is one, otherwise
        `_NullPicturePartitioner`.
        """
        # -- This value has partitioning-run scope. One instance of this options class is
        # -- created per partitioning run (each document can have different options), and
        # -- because this is a lazyproperty the lookup below happens only on first access.
        # -- Every later reference in the same run gets that same value, so image extraction
        # -- is handled consistently within a single document.
        registered_cls = self._PicturePartitionerCls
        if registered_cls:
            return registered_cls
        return _NullPicturePartitioner
    @lazyproperty
    def strategy(self) -> str:
        """The partitioning strategy for this document.
@ -569,6 +632,7 @@ class _DocxPartitioner:
        for item in iter_paragraph_items(paragraph):
            if isinstance(item, Paragraph):
                yield from self._classify_paragraph_to_element(item)
                yield from self._iter_paragraph_images(item)
            else:
                yield from self._opts.increment_page_number()
@ -583,6 +647,13 @@ class _DocxPartitioner:
            if run.italic:
                yield {"text": text, "tag": "i"}
    def _iter_paragraph_images(self, paragraph: Paragraph) -> Iterator[Image]:
        """Generate `Image` element for each picture shape in `paragraph` when so configured."""
        # -- Delegate this job to the pluggable Picture partitioner. Note the default picture
        # -- partitioner does not extract images.
        PicturePartitionerCls = self._opts.picture_partitioner
        yield from PicturePartitionerCls.iter_elements(paragraph, self._opts)
    def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
        """Generate any `Footer` elements defined for this section.
@ -925,3 +996,18 @@ class _DocxPartitioner:
        """[contents, tags] pair describing emphasized text in `table`."""
        iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
        return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
 # ================================================================================================
 # SUB-PARTITIONERS
 # ================================================================================================
 class _NullPicturePartitioner:
    """Picture partitioner that ignores the paragraph and generates zero `Image` elements."""
    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """Generate nothing; neither `paragraph` nor `opts` is inspected."""
        # -- delegating to an empty iterable keeps this a generator function that yields nothing
        yield from ()