mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-17 05:02:25 +00:00
feat(docx): add pluggable picture sub-partitioner (#3081)
**Summary** Allow registration of a custom sub-partitioner that extracts images from a DOCX paragraph. **Additional Context** - A custom image sub-partitioner must implement the `PicturePartitionerT` interface defined in this PR. In practice, that means providing an `.iter_elements()` classmethod that takes the paragraph and the partitioning options and generates zero or more `Image` elements from it. - The custom image sub-partitioner must be registered by passing the class to `register_picture_partitioner()`. - The default image sub-partitioner is `_NullPicturePartitioner`, which does nothing. - The registered picture partitioner is called once for each paragraph.
This commit is contained in:
parent
171b5df09f
commit
47d28612f7
@ -3,6 +3,7 @@
|
||||
### Enhancements
|
||||
|
||||
* **Move `category` field from Text class to Element class.**
|
||||
* **`partition_docx()` now supports pluggable picture sub-partitioners.** A custom sub-partitioner that accepts a DOCX `Paragraph` and generates `Image` elements can now be registered. This makes it possible to extract images and apply OCR or summarization to each image.
|
||||
|
||||
### Features
|
||||
|
||||
|
BIN
example-docs/contains-pictures.docx
Normal file
BIN
example-docs/contains-pictures.docx
Normal file
Binary file not shown.
@ -4,15 +4,17 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import io
|
||||
import pathlib
|
||||
import re
|
||||
import tempfile
|
||||
from typing import Any
|
||||
from typing import Any, Iterator
|
||||
|
||||
import docx
|
||||
import pytest
|
||||
from docx.document import Document
|
||||
from docx.text.paragraph import Paragraph
|
||||
from pytest_mock import MockFixture
|
||||
|
||||
from test_unstructured.unit_utils import (
|
||||
@ -31,6 +33,7 @@ from unstructured.documents.elements import (
|
||||
Element,
|
||||
Footer,
|
||||
Header,
|
||||
Image,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
PageBreak,
|
||||
@ -39,7 +42,12 @@ from unstructured.documents.elements import (
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
|
||||
from unstructured.partition.docx import (
|
||||
DocxPartitionerOptions,
|
||||
_DocxPartitioner,
|
||||
partition_docx,
|
||||
register_picture_partitioner,
|
||||
)
|
||||
from unstructured.partition.utils.constants import (
|
||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
|
||||
PartitionStrategy,
|
||||
@ -622,6 +630,45 @@ def test_it_considers_text_inside_shapes():
|
||||
]
|
||||
|
||||
|
||||
# -- image sub-partitioning behaviors ------------------------------------------------------------
|
||||
|
||||
|
||||
def test_partition_docx_generates_no_Image_elements_by_default():
    """With no picture partitioner registered, partitioning emits no `Image` elements."""
    elements = partition_docx(example_doc_path("contains-pictures.docx"))

    image_elements = [e for e in elements if isinstance(e, Image)]

    assert not image_elements
|
||||
|
||||
|
||||
def test_partition_docx_uses_registered_picture_partitioner():
    """A registered picture partitioner is called per paragraph and its elements are emitted."""

    class FakeParagraphPicturePartitioner:
        """Yields exactly one `Image` per call, identifying the paragraph and strategy it got."""

        @classmethod
        def iter_elements(
            cls, paragraph: Paragraph, opts: DocxPartitionerOptions
        ) -> Iterator[Image]:
            # -- hash of paragraph text + strategy lets the assertions below pin which
            # -- paragraph and options each call received
            call_hash = hashlib.sha1(f"{paragraph.text}{opts.strategy}".encode()).hexdigest()
            yield Image(f"Image with hash {call_hash}, strategy: {opts.strategy}")

    register_picture_partitioner(FakeParagraphPicturePartitioner)

    try:
        elements = partition_docx(example_doc_path("contains-pictures.docx"))
    finally:
        # -- picture-partitioner registration has module-lifetime, so need to de-register this
        # -- fake so other tests in same test-run don't use it. Doing so in `finally` ensures
        # -- the fake is removed even when partitioning raises.
        DocxPartitionerOptions._PicturePartitionerCls = None

    assert len(elements) == 11
    image_elements = [e for e in elements if isinstance(e, Image)]
    assert len(image_elements) == 6
    assert [e.text for e in image_elements] == [
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_res",
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_res",
        "Image with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_res",
        "Image with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res",
    ]
|
||||
|
||||
|
||||
# -- module-level fixtures -----------------------------------------------------------------------
|
||||
|
||||
|
||||
|
@ -6,7 +6,7 @@ import html
|
||||
import io
|
||||
import itertools
|
||||
import tempfile
|
||||
from typing import IO, Any, Iterator, Optional, Type
|
||||
from typing import IO, Any, Iterator, Optional, Protocol, Type
|
||||
|
||||
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
|
||||
import docx
|
||||
@ -33,6 +33,7 @@ from unstructured.documents.elements import (
|
||||
EmailAddress,
|
||||
Footer,
|
||||
Header,
|
||||
Image,
|
||||
Link,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
@ -63,6 +64,43 @@ BlockElement: TypeAlias = "CT_P | CT_Tbl"
|
||||
BlockItem: TypeAlias = "Paragraph | DocxTable"
|
||||
|
||||
|
||||
def register_picture_partitioner(picture_partitioner: PicturePartitionerT) -> None:
    """Register `picture_partitioner` as the sub-partitioner used for pictures in DOCX files.

    This is the module-level entry point for registration; the actual registration is performed
    by `DocxPartitionerOptions.register_picture_partitioner()`.
    """
    DocxPartitionerOptions.register_picture_partitioner(picture_partitioner)
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# DOCX DOMAIN MODEL DEFINITIONS
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class PicturePartitionerT(Protocol):
    """Interface a pluggable DOCX picture sub-partitioner must implement.

    "Picture" is the Microsoft Word term for an image. It is used here for an image in a DOCX file
    both for domain consistency and to avoid confusion with the `unstructured` `Image` element.

    A picture is either *inline* or *floating*. An inline picture behaves like a big character in
    the text of its paragraph and moves with that text. A floating picture can be positioned
    freely and text flows around it.

    Either kind of picture is defined inside a paragraph in the DOCX file, and a paragraph can
    contain zero or more pictures. A picture partitioner is therefore given a `docx` `Paragraph`
    object and generates an `Image` element for each picture it finds in that paragraph.
    """

    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """Generate an `Image` element for each picture in `paragraph`.

        `opts` is the options object of the partitioning run `paragraph` belongs to.
        """
        ...
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# PARTITIONER
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.DOCX)
|
||||
@add_chunking_strategy
|
||||
@ -142,6 +180,16 @@ def partition_docx(
|
||||
class DocxPartitionerOptions:
|
||||
"""Encapsulates partitioning option validation, computation, and application of defaults."""
|
||||
|
||||
_PicturePartitionerCls = None
|
||||
"""Sub-partitioner used to extract pictures from a paragraph as `Image` elements.
|
||||
|
||||
This value has module lifetime and is updated by calling the `register_picture_partitioner()`
|
||||
function defined in this module. The value sent to `register_picture_partitioner()` must be a
|
||||
pluggable sub-partitioner implementing the `PicturePartitionerT` interface. After
|
||||
registration, all paragraphs in subsequently partitioned DOCX documents will be sent to this
|
||||
sub-partitioner to extract images when so configured.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
@ -166,6 +214,11 @@ class DocxPartitionerOptions:
|
||||
# -- options object maintains page-number state --
|
||||
self._page_counter = starting_page_number
|
||||
|
||||
@classmethod
|
||||
def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT):
|
||||
"""Specify a pluggable sub-partitioner to extract images from DOCX paragraphs."""
|
||||
cls._PicturePartitionerCls = picture_partitioner
|
||||
|
||||
@lazyproperty
|
||||
def document(self) -> Document:
|
||||
"""The python-docx `Document` object loaded from file or filename."""
|
||||
@ -248,6 +301,16 @@ class DocxPartitionerOptions:
|
||||
"""
|
||||
return self._page_counter
|
||||
|
||||
@lazyproperty
def picture_partitioner(self) -> PicturePartitionerT:
    """The sub-partitioner this partitioning run uses to extract DOCX images."""
    # -- This value is scoped to a single partitioning run. One options object is created per
    # -- run (each document can have different options) and, this being a lazyproperty, the
    # -- lookup below happens only on first access. Every later access in the same run gets
    # -- that same value, so image extraction is handled consistently within one document.
    registered_cls = self._PicturePartitionerCls
    if registered_cls:
        return registered_cls
    # -- nothing registered; fall back to the partitioner that generates no images --
    return _NullPicturePartitioner
|
||||
|
||||
@lazyproperty
|
||||
def strategy(self) -> str:
|
||||
"""The partitioning strategy for this document.
|
||||
@ -569,6 +632,7 @@ class _DocxPartitioner:
|
||||
for item in iter_paragraph_items(paragraph):
|
||||
if isinstance(item, Paragraph):
|
||||
yield from self._classify_paragraph_to_element(item)
|
||||
yield from self._iter_paragraph_images(item)
|
||||
else:
|
||||
yield from self._opts.increment_page_number()
|
||||
|
||||
@ -583,6 +647,13 @@ class _DocxPartitioner:
|
||||
if run.italic:
|
||||
yield {"text": text, "tag": "i"}
|
||||
|
||||
def _iter_paragraph_images(self, paragraph: Paragraph) -> Iterator[Image]:
|
||||
"""Generate `Image` element for each picture shape in `paragraph` when so configured."""
|
||||
# -- Delegate this job to the pluggable Picture partitioner. Note the default picture
|
||||
# -- partitioner does not extract images.
|
||||
PicturePartitionerCls = self._opts.picture_partitioner
|
||||
yield from PicturePartitionerCls.iter_elements(paragraph, self._opts)
|
||||
|
||||
def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
|
||||
"""Generate any `Footer` elements defined for this section.
|
||||
|
||||
@ -925,3 +996,18 @@ class _DocxPartitioner:
|
||||
"""[contents, tags] pair describing emphasized text in `table`."""
|
||||
iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
|
||||
return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
|
||||
|
||||
|
||||
# ================================================================================================
|
||||
# SUB-PARTITIONERS
|
||||
# ================================================================================================
|
||||
|
||||
|
||||
class _NullPicturePartitioner:
|
||||
"""Does not parse the provided paragraph for pictures and generates zero `Image` elements."""
|
||||
|
||||
@classmethod
|
||||
def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
|
||||
"""No-op picture partitioner."""
|
||||
return
|
||||
yield
|
||||
|
Loading…
x
Reference in New Issue
Block a user