feat(docx): add pluggable picture sub-partitioner (#3081)

**Summary**
Allow registration of a custom sub-partitioner that extracts images from
a DOCX paragraph.

**Additional Context**
- A custom image sub-partitioner must implement the
`PicturePartitionerT` interface defined in this PR. In short, it must
provide an `.iter_elements()` classmethod that takes the paragraph (and
the partitioner options) and generates zero or more `Image` elements from it.
- The custom image sub-partitioner must be registered by passing the
class to `register_picture_partitioner()`.
- The default image sub-partitioner is `_NullPicturePartitioner`, which
does nothing (it generates no `Image` elements).
- The registered picture partitioner is called once for each paragraph.
This commit is contained in:
Steve Canny 2024-05-23 11:46:30 -07:00 committed by GitHub
parent 171b5df09f
commit 47d28612f7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 137 additions and 3 deletions

View File

@ -3,6 +3,7 @@
### Enhancements ### Enhancements
* **Move `category` field from Text class to Element class.** * **Move `category` field from Text class to Element class.**
* **`partition_docx()` now supports pluggable picture sub-partitioners.** A sub-partitioner that accepts a DOCX `Paragraph` and generates `Image` elements from it is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization to each image.
### Features ### Features

Binary file not shown.

View File

@ -4,15 +4,17 @@
from __future__ import annotations from __future__ import annotations
import hashlib
import io import io
import pathlib import pathlib
import re import re
import tempfile import tempfile
from typing import Any from typing import Any, Iterator
import docx import docx
import pytest import pytest
from docx.document import Document from docx.document import Document
from docx.text.paragraph import Paragraph
from pytest_mock import MockFixture from pytest_mock import MockFixture
from test_unstructured.unit_utils import ( from test_unstructured.unit_utils import (
@ -31,6 +33,7 @@ from unstructured.documents.elements import (
Element, Element,
Footer, Footer,
Header, Header,
Image,
ListItem, ListItem,
NarrativeText, NarrativeText,
PageBreak, PageBreak,
@ -39,7 +42,12 @@ from unstructured.documents.elements import (
Text, Text,
Title, Title,
) )
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx from unstructured.partition.docx import (
DocxPartitionerOptions,
_DocxPartitioner,
partition_docx,
register_picture_partitioner,
)
from unstructured.partition.utils.constants import ( from unstructured.partition.utils.constants import (
UNSTRUCTURED_INCLUDE_DEBUG_METADATA, UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
PartitionStrategy, PartitionStrategy,
@ -622,6 +630,45 @@ def test_it_considers_text_inside_shapes():
] ]
# -- image sub-partitioning behaviors ------------------------------------------------------------
def test_partition_docx_generates_no_Image_elements_by_default():
    """With no picture partitioner registered, partitioning emits no `Image` elements."""
    elements = partition_docx(example_doc_path("contains-pictures.docx"))
    image_elements = [e for e in elements if isinstance(e, Image)]
    assert not image_elements
def test_partition_docx_uses_registered_picture_partitioner():
    """A registered picture partitioner is called and its `Image` elements appear in the output."""

    class FakeParagraphPicturePartitioner:
        """Stand-in partitioner that emits one `Image` per call, tagged with a hash of its inputs."""

        @classmethod
        def iter_elements(
            cls, paragraph: Paragraph, opts: DocxPartitionerOptions
        ) -> Iterator[Image]:
            # -- the hash identifies which paragraph text and strategy this call received --
            call_hash = hashlib.sha1(f"{paragraph.text}{opts.strategy}".encode()).hexdigest()
            yield Image(f"Image with hash {call_hash}, strategy: {opts.strategy}")

    register_picture_partitioner(FakeParagraphPicturePartitioner)
    try:
        elements = partition_docx(example_doc_path("contains-pictures.docx"))
    finally:
        # -- picture-partitioner registration has module-lifetime, so need to de-register this
        # -- fake so other tests in same test-run don't use it. Doing so in `finally` ensures
        # -- the fake is removed even when partitioning raises.
        DocxPartitionerOptions._PicturePartitionerCls = None

    assert len(elements) == 11
    image_elements = [e for e in elements if isinstance(e, Image)]
    assert len(image_elements) == 6
    assert [e.text for e in image_elements] == [
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_res",
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_res",
        "Image with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_res",
        "Image with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res",
    ]
# -- module-level fixtures ----------------------------------------------------------------------- # -- module-level fixtures -----------------------------------------------------------------------

View File

@ -6,7 +6,7 @@ import html
import io import io
import itertools import itertools
import tempfile import tempfile
from typing import IO, Any, Iterator, Optional, Type from typing import IO, Any, Iterator, Optional, Protocol, Type
# -- CT_* stands for "complex-type", an XML element type in docx parlance -- # -- CT_* stands for "complex-type", an XML element type in docx parlance --
import docx import docx
@ -33,6 +33,7 @@ from unstructured.documents.elements import (
EmailAddress, EmailAddress,
Footer, Footer,
Header, Header,
Image,
Link, Link,
ListItem, ListItem,
NarrativeText, NarrativeText,
@ -63,6 +64,43 @@ BlockElement: TypeAlias = "CT_P | CT_Tbl"
BlockItem: TypeAlias = "Paragraph | DocxTable" BlockItem: TypeAlias = "Paragraph | DocxTable"
def register_picture_partitioner(picture_partitioner: PicturePartitionerT) -> None:
    """Install `picture_partitioner` as the sub-partitioner used for pictures in DOCX files.

    This is the module-level entry point; the registration itself is stored by
    `DocxPartitionerOptions`.
    """
    DocxPartitionerOptions.register_picture_partitioner(picture_partitioner)
# ================================================================================================
# DOCX DOMAIN MODEL DEFINITIONS
# ================================================================================================
class PicturePartitionerT(Protocol):
    """Interface a pluggable DOCX picture sub-partitioner must implement.

    Microsoft Word calls an image a "picture". That term is used here for an image stored in a
    DOCX file, both to stay consistent with the domain and to keep it distinct from the
    `unstructured` `Image` element.

    Pictures come in two kinds. An *inline* picture behaves like one large character in the
    paragraph text and moves along with that text. A *floating* picture can be positioned freely
    and text flows around it.

    Either kind is defined inside a paragraph in the DOCX file, and a paragraph can contain zero
    or more pictures. A picture partitioner receives a `docx` `Paragraph` object and generates one
    `Image` element for each picture it finds in that paragraph.
    """

    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """Generate one `Image` element per picture found in `paragraph`."""
        ...
# ================================================================================================
# PARTITIONER
# ================================================================================================
@process_metadata() @process_metadata()
@add_metadata_with_filetype(FileType.DOCX) @add_metadata_with_filetype(FileType.DOCX)
@add_chunking_strategy @add_chunking_strategy
@ -142,6 +180,16 @@ def partition_docx(
class DocxPartitionerOptions: class DocxPartitionerOptions:
"""Encapsulates partitioning option validation, computation, and application of defaults.""" """Encapsulates partitioning option validation, computation, and application of defaults."""
# -- Registration slot stored on the class, so it is shared by every options instance. `None`
# -- means no custom picture partitioner has been registered.
_PicturePartitionerCls = None
"""Sub-partitioner used to extract pictures from a paragraph as `Image` elements.
This value has module lifetime and is updated by calling the `register_picture_partitioner()`
function defined in this module. The value sent to `register_picture_partitioner()` must be a
pluggable sub-partitioner implementing the `PicturePartitionerT` interface. After
registration, all paragraphs in subsequently partitioned DOCX documents will be sent to this
sub-partitioner to extract images when so configured.
"""
def __init__( def __init__(
self, self,
*, *,
@ -166,6 +214,11 @@ class DocxPartitionerOptions:
# -- options object maintains page-number state -- # -- options object maintains page-number state --
self._page_counter = starting_page_number self._page_counter = starting_page_number
@classmethod
def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT) -> None:
    """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs.

    The value is stored on the class rather than on an instance, so it replaces any previously
    registered picture partitioner and is seen by options objects created afterward.
    """
    cls._PicturePartitionerCls = picture_partitioner
@lazyproperty @lazyproperty
def document(self) -> Document: def document(self) -> Document:
"""The python-docx `Document` object loaded from file or filename.""" """The python-docx `Document` object loaded from file or filename."""
@ -248,6 +301,16 @@ class DocxPartitionerOptions:
""" """
return self._page_counter return self._page_counter
@lazyproperty
def picture_partitioner(self) -> PicturePartitionerT:
    """The sub-partitioner this partitioning run uses to extract images from DOCX paragraphs."""
    # -- This value has partitioning-run scope: one options instance is created per
    # -- partitioning run, and as a lazyproperty this is computed on first reference only.
    # -- Every later reference in the same run gets the same value, so a document is
    # -- processed with one picture partitioner throughout.
    registered = self._PicturePartitionerCls
    return registered if registered else _NullPicturePartitioner
@lazyproperty @lazyproperty
def strategy(self) -> str: def strategy(self) -> str:
"""The partitioning strategy for this document. """The partitioning strategy for this document.
@ -569,6 +632,7 @@ class _DocxPartitioner:
for item in iter_paragraph_items(paragraph): for item in iter_paragraph_items(paragraph):
if isinstance(item, Paragraph): if isinstance(item, Paragraph):
yield from self._classify_paragraph_to_element(item) yield from self._classify_paragraph_to_element(item)
yield from self._iter_paragraph_images(item)
else: else:
yield from self._opts.increment_page_number() yield from self._opts.increment_page_number()
@ -583,6 +647,13 @@ class _DocxPartitioner:
if run.italic: if run.italic:
yield {"text": text, "tag": "i"} yield {"text": text, "tag": "i"}
def _iter_paragraph_images(self, paragraph: Paragraph) -> Iterator[Image]:
"""Generate `Image` element for each picture shape in `paragraph` when so configured."""
# -- Delegate this job to the pluggable Picture partitioner. Note the default picture
# -- partitioner does not extract images.
PicturePartitionerCls = self._opts.picture_partitioner
yield from PicturePartitionerCls.iter_elements(paragraph, self._opts)
def _iter_section_footers(self, section: Section) -> Iterator[Footer]: def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
"""Generate any `Footer` elements defined for this section. """Generate any `Footer` elements defined for this section.
@ -925,3 +996,18 @@ class _DocxPartitioner:
"""[contents, tags] pair describing emphasized text in `table`.""" """[contents, tags] pair describing emphasized text in `table`."""
iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table)) iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2]) return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
# ================================================================================================
# SUB-PARTITIONERS
# ================================================================================================
class _NullPicturePartitioner:
"""Does not parse the provided paragraph for pictures and generates zero `Image` elements."""
@classmethod
def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
"""No-op picture partitioner."""
return
yield