mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-17 13:16:47 +00:00
feat(docx): add pluggable picture sub-partitioner (#3081)
**Summary** Allow registration of a custom sub-partitioner that extracts images from a DOCX paragraph. **Additional Context** - A custom image sub-partitioner must implement the `PicturePartitionerT` interface defined in this PR. In short, it must provide an `.iter_elements()` classmethod that takes the paragraph and the partitioner options and generates zero or more `Image` elements from the paragraph. - The custom image sub-partitioner must be registered by passing the class to `register_picture_partitioner()`. - The default image sub-partitioner is `_NullPicturePartitioner`, which does nothing. - The registered picture partitioner is called once for each paragraph.
This commit is contained in:
parent
171b5df09f
commit
47d28612f7
@ -3,6 +3,7 @@
|
|||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* **Move `category` field from Text class to Element class.**
|
* **Move `category` field from Text class to Element class.**
|
||||||
|
* **`partition_docx()` now supports pluggable picture sub-partitioners.** A sub-partitioner that accepts a DOCX `Paragraph` and generates elements can now be registered. This allows adding a custom sub-partitioner that extracts images and applies OCR or summarization to each image.
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
BIN
example-docs/contains-pictures.docx
Normal file
BIN
example-docs/contains-pictures.docx
Normal file
Binary file not shown.
@ -4,15 +4,17 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
import io
|
import io
|
||||||
import pathlib
|
import pathlib
|
||||||
import re
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Any
|
from typing import Any, Iterator
|
||||||
|
|
||||||
import docx
|
import docx
|
||||||
import pytest
|
import pytest
|
||||||
from docx.document import Document
|
from docx.document import Document
|
||||||
|
from docx.text.paragraph import Paragraph
|
||||||
from pytest_mock import MockFixture
|
from pytest_mock import MockFixture
|
||||||
|
|
||||||
from test_unstructured.unit_utils import (
|
from test_unstructured.unit_utils import (
|
||||||
@ -31,6 +33,7 @@ from unstructured.documents.elements import (
|
|||||||
Element,
|
Element,
|
||||||
Footer,
|
Footer,
|
||||||
Header,
|
Header,
|
||||||
|
Image,
|
||||||
ListItem,
|
ListItem,
|
||||||
NarrativeText,
|
NarrativeText,
|
||||||
PageBreak,
|
PageBreak,
|
||||||
@ -39,7 +42,12 @@ from unstructured.documents.elements import (
|
|||||||
Text,
|
Text,
|
||||||
Title,
|
Title,
|
||||||
)
|
)
|
||||||
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
|
from unstructured.partition.docx import (
|
||||||
|
DocxPartitionerOptions,
|
||||||
|
_DocxPartitioner,
|
||||||
|
partition_docx,
|
||||||
|
register_picture_partitioner,
|
||||||
|
)
|
||||||
from unstructured.partition.utils.constants import (
|
from unstructured.partition.utils.constants import (
|
||||||
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
|
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
|
||||||
PartitionStrategy,
|
PartitionStrategy,
|
||||||
@ -622,6 +630,45 @@ def test_it_considers_text_inside_shapes():
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
# -- image sub-partitioning behaviors ------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_docx_generates_no_Image_elements_by_default():
    # -- no picture partitioner is registered by this test, so none of the partitioned elements
    # -- should be an `Image`
    elements = partition_docx(example_doc_path("contains-pictures.docx"))

    image_elements = [e for e in elements if isinstance(e, Image)]

    assert not image_elements
|
||||||
|
|
||||||
|
|
||||||
|
def test_partition_docx_uses_registered_picture_partitioner():
    class FakeParagraphPicturePartitioner:
        """Yields one `Image` per call whose text is derived from the paragraph text and strategy."""

        @classmethod
        def iter_elements(
            cls, paragraph: Paragraph, opts: DocxPartitionerOptions
        ) -> Iterator[Image]:
            call_hash = hashlib.sha1(f"{paragraph.text}{opts.strategy}".encode()).hexdigest()
            yield Image(f"Image with hash {call_hash}, strategy: {opts.strategy}")

    register_picture_partitioner(FakeParagraphPicturePartitioner)

    try:
        elements = partition_docx(example_doc_path("contains-pictures.docx"))
    finally:
        # -- picture-partitioner registration has module-lifetime, so need to de-register this
        # -- fake so other tests in same test-run don't use it. Doing so in a `finally` clause
        # -- guarantees the reset happens even when `partition_docx()` raises.
        DocxPartitionerOptions._PicturePartitionerCls = None

    assert len(elements) == 11
    image_elements = [e for e in elements if isinstance(e, Image)]
    assert len(image_elements) == 6
    assert [e.text for e in image_elements] == [
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash 5e0cd2c62809377d8ce7422d8ca6b0cf5f4453bc, strategy: hi_res",
        "Image with hash 429de54e71f1f0fb395b6f6191961a3ea1b64dc0, strategy: hi_res",
        "Image with hash ccbd34be6096544babc391890cb0849c24cc046c, strategy: hi_res",
        "Image with hash a41b819c7b4a9750ec0f9198c59c2057d39c653c, strategy: hi_res",
        "Image with hash ba0dc2a1205af8f6d9e06c8d415df096b0a9c428, strategy: hi_res",
    ]
|
||||||
|
|
||||||
|
|
||||||
# -- module-level fixtures -----------------------------------------------------------------------
|
# -- module-level fixtures -----------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import html
|
|||||||
import io
|
import io
|
||||||
import itertools
|
import itertools
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import IO, Any, Iterator, Optional, Type
|
from typing import IO, Any, Iterator, Optional, Protocol, Type
|
||||||
|
|
||||||
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
|
# -- CT_* stands for "complex-type", an XML element type in docx parlance --
|
||||||
import docx
|
import docx
|
||||||
@ -33,6 +33,7 @@ from unstructured.documents.elements import (
|
|||||||
EmailAddress,
|
EmailAddress,
|
||||||
Footer,
|
Footer,
|
||||||
Header,
|
Header,
|
||||||
|
Image,
|
||||||
Link,
|
Link,
|
||||||
ListItem,
|
ListItem,
|
||||||
NarrativeText,
|
NarrativeText,
|
||||||
@ -63,6 +64,43 @@ BlockElement: TypeAlias = "CT_P | CT_Tbl"
|
|||||||
BlockItem: TypeAlias = "Paragraph | DocxTable"
|
BlockItem: TypeAlias = "Paragraph | DocxTable"
|
||||||
|
|
||||||
|
|
||||||
|
def register_picture_partitioner(picture_partitioner: PicturePartitionerT) -> None:
    """Install `picture_partitioner` as the sub-partitioner used for DOCX pictures.

    This is a module-level convenience that forwards to
    `DocxPartitionerOptions.register_picture_partitioner()`.
    """
    options_cls = DocxPartitionerOptions
    options_cls.register_picture_partitioner(picture_partitioner)
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# DOCX DOMAIN MODEL DEFINITIONS
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class PicturePartitionerT(Protocol):
    """Interface a pluggable DOCX picture sub-partitioner must implement.

    Microsoft Word calls an image a "picture". That term is used here for an image stored in a
    DOCX file, both to stay consistent with the domain and to keep it distinct from the
    `unstructured` `Image` element.

    Pictures come in two flavors: an *inline* picture behaves like one large character in the
    paragraph text and moves along with it, while a *floating* picture can be positioned freely
    with text flowing around it.

    Either kind is defined inside a paragraph in the DOCX file, and a paragraph may contain zero
    or more pictures. A DOCX picture partitioner receives a `docx` `Paragraph` object and
    generates an `Image` element for each picture it finds there.
    """

    @classmethod
    def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
        """Generate one `Image` element per picture found in `paragraph`."""
        ...
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# PARTITIONER
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
@process_metadata()
|
@process_metadata()
|
||||||
@add_metadata_with_filetype(FileType.DOCX)
|
@add_metadata_with_filetype(FileType.DOCX)
|
||||||
@add_chunking_strategy
|
@add_chunking_strategy
|
||||||
@ -142,6 +180,16 @@ def partition_docx(
|
|||||||
class DocxPartitionerOptions:
|
class DocxPartitionerOptions:
|
||||||
"""Encapsulates partitioning option validation, computation, and application of defaults."""
|
"""Encapsulates partitioning option validation, computation, and application of defaults."""
|
||||||
|
|
||||||
|
_PicturePartitionerCls = None
|
||||||
|
"""Sub-partitioner used to extract pictures from a paragraph as `Image` elements.
|
||||||
|
|
||||||
|
This value has module lifetime and is updated by calling the `register_picture_partitioner()`
|
||||||
|
function defined in this module. The value sent to `register_picture_partitioner()` must be a
|
||||||
|
pluggable sub-partitioner implementing the `PicturePartitionerT` interface. After
|
||||||
|
registration, all paragraphs in subsequently partitioned DOCX documents will be sent to this
|
||||||
|
sub-partitioner to extract images when so configured.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
@ -166,6 +214,11 @@ class DocxPartitionerOptions:
|
|||||||
# -- options object maintains page-number state --
|
# -- options object maintains page-number state --
|
||||||
self._page_counter = starting_page_number
|
self._page_counter = starting_page_number
|
||||||
|
|
||||||
|
@classmethod
def register_picture_partitioner(cls, picture_partitioner: PicturePartitionerT) -> None:
    """Specify a pluggable sub-partitioner to extract images from DOCX paragraphs.

    `picture_partitioner` is stored as the `_PicturePartitionerCls` attribute of `cls`, replacing
    any previously registered value.
    """
    cls._PicturePartitionerCls = picture_partitioner
|
||||||
|
|
||||||
@lazyproperty
|
@lazyproperty
|
||||||
def document(self) -> Document:
|
def document(self) -> Document:
|
||||||
"""The python-docx `Document` object loaded from file or filename."""
|
"""The python-docx `Document` object loaded from file or filename."""
|
||||||
@ -248,6 +301,16 @@ class DocxPartitionerOptions:
|
|||||||
"""
|
"""
|
||||||
return self._page_counter
|
return self._page_counter
|
||||||
|
|
||||||
|
@lazyproperty
def picture_partitioner(self) -> PicturePartitionerT:
    """The sub-partitioner to use for DOCX image extraction.

    Falls back to `_NullPicturePartitioner` when no picture partitioner has been registered.
    """
    # -- This value has partitioning-run scope: one instance of this options class is created
    # -- per partitioning run (each document can have different options). As a lazyproperty it
    # -- is computed on first reference only and every later reference in the same run gets
    # -- that same value, so image extraction is handled consistently within a single document.
    registered = self._PicturePartitionerCls
    if registered:
        return registered
    return _NullPicturePartitioner
|
||||||
|
|
||||||
@lazyproperty
|
@lazyproperty
|
||||||
def strategy(self) -> str:
|
def strategy(self) -> str:
|
||||||
"""The partitioning strategy for this document.
|
"""The partitioning strategy for this document.
|
||||||
@ -569,6 +632,7 @@ class _DocxPartitioner:
|
|||||||
for item in iter_paragraph_items(paragraph):
|
for item in iter_paragraph_items(paragraph):
|
||||||
if isinstance(item, Paragraph):
|
if isinstance(item, Paragraph):
|
||||||
yield from self._classify_paragraph_to_element(item)
|
yield from self._classify_paragraph_to_element(item)
|
||||||
|
yield from self._iter_paragraph_images(item)
|
||||||
else:
|
else:
|
||||||
yield from self._opts.increment_page_number()
|
yield from self._opts.increment_page_number()
|
||||||
|
|
||||||
@ -583,6 +647,13 @@ class _DocxPartitioner:
|
|||||||
if run.italic:
|
if run.italic:
|
||||||
yield {"text": text, "tag": "i"}
|
yield {"text": text, "tag": "i"}
|
||||||
|
|
||||||
|
def _iter_paragraph_images(self, paragraph: Paragraph) -> Iterator[Image]:
|
||||||
|
"""Generate `Image` element for each picture shape in `paragraph` when so configured."""
|
||||||
|
# -- Delegate this job to the pluggable Picture partitioner. Note the default picture
|
||||||
|
# -- partitioner does not extract images.
|
||||||
|
PicturePartitionerCls = self._opts.picture_partitioner
|
||||||
|
yield from PicturePartitionerCls.iter_elements(paragraph, self._opts)
|
||||||
|
|
||||||
def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
|
def _iter_section_footers(self, section: Section) -> Iterator[Footer]:
|
||||||
"""Generate any `Footer` elements defined for this section.
|
"""Generate any `Footer` elements defined for this section.
|
||||||
|
|
||||||
@ -925,3 +996,18 @@ class _DocxPartitioner:
|
|||||||
"""[contents, tags] pair describing emphasized text in `table`."""
|
"""[contents, tags] pair describing emphasized text in `table`."""
|
||||||
iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
|
iter_tbl_emph, iter_tbl_emph_2 = itertools.tee(self._iter_table_emphasis(table))
|
||||||
return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
|
return ([e["text"] for e in iter_tbl_emph], [e["tag"] for e in iter_tbl_emph_2])
|
||||||
|
|
||||||
|
|
||||||
|
# ================================================================================================
|
||||||
|
# SUB-PARTITIONERS
|
||||||
|
# ================================================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class _NullPicturePartitioner:
|
||||||
|
"""Does not parse the provided paragraph for pictures and generates zero `Image` elements."""
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def iter_elements(cls, paragraph: Paragraph, opts: DocxPartitionerOptions) -> Iterator[Image]:
|
||||||
|
"""No-op picture partitioner."""
|
||||||
|
return
|
||||||
|
yield
|
||||||
|
Loading…
x
Reference in New Issue
Block a user