diff --git a/CHANGELOG.md b/CHANGELOG.md index 70aa36856..c93d2bb3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ### Enhancements * **Move `category` field from Text class to Element class.** +* **`partition_docx()` now supports pluggable picture sub-partitioners.** A subpartitioner that accepts a DOCX `Paragraph` and generates elements is now supported. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization for the image. ### Features diff --git a/example-docs/contains-pictures.docx b/example-docs/contains-pictures.docx new file mode 100644 index 000000000..ee5cbede2 Binary files /dev/null and b/example-docs/contains-pictures.docx differ diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index bab761d4a..c5b4462c4 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -4,15 +4,17 @@ from __future__ import annotations +import hashlib import io import pathlib import re import tempfile -from typing import Any +from typing import Any, Iterator import docx import pytest from docx.document import Document +from docx.text.paragraph import Paragraph from pytest_mock import MockFixture from test_unstructured.unit_utils import ( @@ -31,6 +33,7 @@ from unstructured.documents.elements import ( Element, Footer, Header, + Image, ListItem, NarrativeText, PageBreak, @@ -39,7 +42,12 @@ from unstructured.documents.elements import ( Text, Title, ) -from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx +from unstructured.partition.docx import ( + DocxPartitionerOptions, + _DocxPartitioner, + partition_docx, + register_picture_partitioner, +) from unstructured.partition.utils.constants import ( UNSTRUCTURED_INCLUDE_DEBUG_METADATA, PartitionStrategy, @@ -622,6 +630,45 @@ def test_it_considers_text_inside_shapes(): ] +# -- image sub-partitioning behaviors 
def test_partition_docx_generates_no_Image_elements_by_default():
    """Without a registered picture partitioner, no `Image` elements are produced."""
    assert not any(
        isinstance(e, Image) for e in partition_docx(example_doc_path("contains-pictures.docx"))
    )


def test_partition_docx_uses_registered_picture_partitioner():
    """Elements generated by a registered picture partitioner appear in the partitioner output."""

    class FakeParagraphPicturePartitioner:
        @classmethod
        def iter_elements(
            cls, paragraph: Paragraph, opts: DocxPartitionerOptions
        ) -> Iterator[Image]:
            # -- the hash ties each fake `Image` to the paragraph text and strategy it was
            # -- called with, so the assertions below verify the call arguments too
            call_hash = hashlib.sha1(f"{paragraph.text}{opts.strategy}".encode()).hexdigest()
            yield Image(f"Image with hash {call_hash}, strategy: {opts.strategy}")

    register_picture_partitioner(FakeParagraphPicturePartitioner)
    try:
        elements = partition_docx(example_doc_path("contains-pictures.docx"))
    finally:
        # -- picture-partitioner registration has module-lifetime, so need to de-register this
        # -- fake so other tests in same test-run don't use it. Doing so in `finally` ensures
        # -- the fake is removed even when partitioning raises.
        DocxPartitionerOptions._PicturePartitionerCls = None

    assert len(elements) == 11
    image_elements = [e for e in elements if isinstance(e, Image)]
    assert len(image_elements) == 6
    assert [e.text for e in image_elements] == [
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_res",
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_res",
        "Image with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_res",
        "Image with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res",
    ]
def register_picture_partitioner(picture_partitioner: PicturePartitionerT) -> None:
    """Specify a pluggable sub-partitioner to be used for partitioning DOCX images.

    `picture_partitioner` must implement the `PicturePartitionerT` interface, i.e. provide an
    `iter_elements(paragraph, opts)` classmethod. This function only forwards the value to
    `DocxPartitionerOptions.register_picture_partitioner()`.
    """
    DocxPartitionerOptions.register_picture_partitioner(picture_partitioner)


# ================================================================================================
# DOCX DOMAIN MODEL DEFINITIONS
# ================================================================================================


class PicturePartitionerT(Protocol):
    """Defines the interface for a pluggable sub-partitioner for DOCX Picture objects.

    In Microsoft Word parlance, an image is a "picture". We use that term here for an image in a
    DOCX file both for domain consistency and because it conveniently avoids confusion with an
    `unstructured` `Image` element.

    A picture can be either *inline* or *floating*. An inline picture is treated like a big
    character in the text of a paragraph, moving with the text. A floating picture can be moved
    freely and text flows around it.

    Both inline and floating pictures are defined inside a paragraph in the DOCX file. A paragraph
    can have zero or more pictures. A DOCX picture partitioner takes a `docx` `Paragraph` object
    and generates an `Image` element for each picture found in that paragraph.
    """

    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """Generate an `Image` element for each picture in `paragraph`.

        `opts` is the options object for the current partitioning run. Implementations may
        generate zero elements when `paragraph` contains no pictures.
        """
        ...
+ + +# ================================================================================================ +# PARTITIONER +# ================================================================================================ + + @process_metadata() @add_metadata_with_filetype(FileType.DOCX) @add_chunking_strategy @@ -142,6 +180,16 @@ def partition_docx( class DocxPartitionerOptions: """Encapsulates partitioning option validation, computation, and application of defaults.""" + _PicturePartitionerCls = None + """Sub-partitioner used to extract pictures from a paragraph as `Image` elements. + + This value has module lifetime and is updated by calling the `register_picture_partitioner()` + function defined in this module. The value sent to `register_picture_partitioner()` must be a + pluggable sub-partitioner implementing the `PicturePartitionerT` interface. After + registration, all paragraphs in subsequently partitioned DOCX documents will be sent to this + sub-partitioner to extract images when so configured. + """ + def __init__( self, *, @@ -166,6 +214,11 @@ class DocxPartitionerOptions: # -- options object maintains page-number state -- self._page_counter = starting_page_number + @classmethod + def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT): + """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs.""" + cls._PicturePartitionerCls = picture_partitioner + @lazyproperty def document(self) -> Document: """The python-docx `Document` object loaded from file or filename.""" @@ -248,6 +301,16 @@ class DocxPartitionerOptions: """ return self._page_counter + @lazyproperty + def picture_partitioner(self) -> PicturePartitionerT: + """The sub-partitioner to use for DOCX image extraction.""" + # -- Note this value has partitioning-run scope. An instance of this options class is + # -- instantiated once per partitioning run (each document can have different options). 
+ # -- Because this is a lazyproperty, it is computed only on the first reference. All + # -- subsequent references during the same partitioning run will get the same value. This + # -- ensures image extraction is processed consistently within a single document. + return self._PicturePartitionerCls or _NullPicturePartitioner + @lazyproperty def strategy(self) -> str: """The partitioning strategy for this document. @@ -569,6 +632,7 @@ class _DocxPartitioner: for item in iter_paragraph_items(paragraph): if isinstance(item, Paragraph): yield from self._classify_paragraph_to_element(item) + yield from self._iter_paragraph_images(item) else: yield from self._opts.increment_page_number() @@ -583,6 +647,13 @@ class _DocxPartitioner: if run.italic: yield {"text": text, "tag": "i"} + def _iter_paragraph_images(self, paragraph: Paragraph) -> Iterator[Image]: + """Generate `Image` element for each picture shape in `paragraph` when so configured.""" + # -- Delegate this job to the pluggable Picture partitioner. Note the default picture + # -- partitioner does not extract images. + PicturePartitionerCls = self._opts.picture_partitioner + yield from PicturePartitionerCls.iter_elements(paragraph, self._opts) + def _iter_section_footers(self, section: Section) -> Iterator[Footer]: """Generate any `Footer` elements defined for this section. 
@@ -925,3 +996,18 @@ class _DocxPartitioner: """[contents, tags] pair describing emphasized text in `table`.""" iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table)) return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2]) + + +# ================================================================================================ +# SUB-PARTITIONERS +# ================================================================================================ + + +class _NullPicturePartitioner: + """Does not parse the provided paragraph for pictures and generates zero `Image` elements.""" + + @classmethod + def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]: + """No-op picture partitioner.""" + return + yield