rfctr(docx): extract DocxPartitionerOptions (#3018)

**Reviewers:** It is probably easier to review the first and second commits
separately: the first one adds all the new code and tests (without
installing it), and the second one installs it into the partitioner
along with the required changes to code and tests.

**Summary**
Enable communication of partitioning options to sub-partitioners, in
particular to the pluggable `PicturePartitioner` coming in a
closely-following PR to implement image-extraction and OCR for the DOCX,
DOC, and ODT formats.

**Additional Context**
In general, validation of partitioning options as well as assigning
default values and computing derived partitioning settings can be
extracted from partitioners into a neatly encapsulated separate object.
This simplifies the core partitioning code by removing the noise
associated with computing metadata values and deciding how to access the
source document, etc.

However, better factoring aside, having the partition-time "settings"
available in a single object allows partitioning of certain document
features, for example images, to be readily _delegated_ to a
sub-partitioner while still giving it access to all the relevant
partitioning settings for the current document. This is particularly
important when a sub-partitioner is "pluggable" at runtime and must rely
on a clearly-defined (and as simple as possible) interface to operate
smoothly.
This commit is contained in:
Steve Canny 2024-05-14 17:50:31 -07:00 committed by GitHub
parent db186dc23b
commit 12b30d2810
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 543 additions and 200 deletions

View File

@ -1,8 +1,8 @@
## 0.13.8-dev7
## 0.13.8-dev8
### Enhancements
**Faster evaluation** Support for concurrent processing of documents during evaluation
* **Faster evaluation** Support for concurrent processing of documents during evaluation
### Features

View File

@ -4,16 +4,26 @@
from __future__ import annotations
import io
import pathlib
import re
import tempfile
from typing import Any
import docx
import pytest
from docx.document import Document
from pytest_mock import MockFixture
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from test_unstructured.unit_utils import (
FixtureRequest,
Mock,
assert_round_trips_through_JSON,
example_doc_path,
function_mock,
instance_mock,
property_mock,
)
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
Address,
@ -29,7 +39,7 @@ from unstructured.documents.elements import (
Text,
Title,
)
from unstructured.partition.docx import _DocxPartitioner, partition_docx
from unstructured.partition.docx import DocxPartitionerOptions, _DocxPartitioner, partition_docx
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
# -- docx-file loading behaviors -----------------------------------------------------------------
@ -89,14 +99,16 @@ def test_partition_docx_from_file_with_metadata_filename(
assert element.metadata.filename == "test"
def test_partition_docx_raises_with_both_specified(mock_document_file_path: str):
with open(mock_document_file_path, "rb") as f:
with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
partition_docx(filename=mock_document_file_path, file=f)
def test_partition_docx_uses_file_path_when_both_are_specified(
    mock_document_file_path: str, expected_elements: list[Text]
):
    """The `filename` argument is the one used when `file` is provided as well."""
    # -- these bytes are not a DOCX archive, so matching elements can only have come from the
    # -- document at `mock_document_file_path`
    not_a_docx_file = io.BytesIO(b"abcde")

    partitioned = partition_docx(filename=mock_document_file_path, file=not_a_docx_file)

    assert partitioned == expected_elements
def test_partition_docx_raises_with_neither():
with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
with pytest.raises(ValueError, match="either `filename` or `file` argument must be provided"):
partition_docx()
@ -292,15 +304,13 @@ def test_partition_docx_from_file_without_metadata_date():
assert elements[0].metadata.last_modified is None
def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dict[str, str]]):
partitioner = _DocxPartitioner(
example_doc_path("fake-doc-emphasized-text.docx"),
None,
None,
False,
True,
None,
)
def test_get_emphasized_texts_from_paragraph(
opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]]
):
opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx")
opts = DocxPartitionerOptions(**opts_args)
partitioner = _DocxPartitioner(opts)
paragraph = partitioner._document.paragraphs[1]
emphasized_texts = list(partitioner._iter_paragraph_emphasis(paragraph))
assert paragraph.text == "I am a bold italic bold-italic text."
@ -317,34 +327,31 @@ def test_get_emphasized_texts_from_paragraph(expected_emphasized_texts: list[dic
assert emphasized_texts == []
def test_iter_table_emphasis(expected_emphasized_texts: list[dict[str, str]]):
partitioner = _DocxPartitioner(
example_doc_path("fake-doc-emphasized-text.docx"),
None,
None,
False,
True,
None,
)
def test_iter_table_emphasis(
    opts_args: dict[str, Any], expected_emphasized_texts: list[dict[str, str]]
):
    """`._iter_table_emphasis()` yields the expected emphasis items for the first table."""
    opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx")
    partitioner = _DocxPartitioner(DocxPartitionerOptions(**opts_args))
    first_table = partitioner._document.tables[0]

    actual = list(partitioner._iter_table_emphasis(first_table))

    assert actual == expected_emphasized_texts
def test_table_emphasis(
opts_args: dict[str, Any],
expected_emphasized_text_contents: list[str],
expected_emphasized_text_tags: list[str],
):
partitioner = _DocxPartitioner(
example_doc_path("fake-doc-emphasized-text.docx"),
None,
None,
False,
True,
None,
)
opts_args["file_path"] = example_doc_path("fake-doc-emphasized-text.docx")
opts = DocxPartitionerOptions(**opts_args)
partitioner = _DocxPartitioner(opts)
table = partitioner._document.tables[0]
emphasized_text_contents, emphasized_text_tags = partitioner._table_emphasis(table)
assert emphasized_text_contents == expected_emphasized_text_contents
assert emphasized_text_tags == expected_emphasized_text_tags
@ -373,15 +380,10 @@ def test_partition_docx_with_json(mock_document_file_path: str):
assert_round_trips_through_JSON(elements)
def test_parse_category_depth_by_style():
partitioner = _DocxPartitioner(
example_doc_path("category-level.docx"),
None,
None,
False,
True,
None,
)
def test_parse_category_depth_by_style(opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("category-level.docx")
opts = DocxPartitionerOptions(**opts_args)
partitioner = _DocxPartitioner(opts)
# Category depths are 0-indexed and relative to the category type
# Title, list item, bullet, narrative text, etc.
@ -411,9 +413,9 @@ def test_parse_category_depth_by_style():
), f"expected paragraph[{idx}] to have depth=={depth}, got {actual_depth}"
def test_parse_category_depth_by_style_name():
partitioner = _DocxPartitioner(None, None, None, False, True, None)
def test_parse_category_depth_by_style_name(opts_args: dict[str, Any]):
opts = DocxPartitionerOptions(**opts_args)
partitioner = _DocxPartitioner(opts)
test_cases = [
(0, "Heading 1"),
(1, "Heading 2"),
@ -436,8 +438,9 @@ def test_parse_category_depth_by_style_name():
), f"test case {test_cases[idx]} failed"
def test_parse_category_depth_by_style_ilvl():
partitioner = _DocxPartitioner(None, None, None, False, True, None)
def test_parse_category_depth_by_style_ilvl(opts_args: dict[str, Any]):
opts = DocxPartitionerOptions(**opts_args)
partitioner = _DocxPartitioner(opts)
assert partitioner._parse_category_depth_by_style_ilvl() == 0
@ -683,6 +686,24 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) ->
return filename
@pytest.fixture()
def opts_args() -> dict[str, Any]:
    """Keyword arguments for constructing a `DocxPartitionerOptions` object.

    A test overrides only the entries it cares about before constructing the options object,
    which keeps construction of opts compact.
    """
    return dict(
        date_from_file_object=False,
        file=None,
        file_path=None,
        include_page_breaks=True,
        infer_table_structure=True,
        metadata_file_path=None,
        metadata_last_modified=None,
    )
# ================================================================================================
# ISOLATED UNIT TESTS
# ================================================================================================
@ -691,14 +712,280 @@ def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) ->
# ================================================================================================
class DescribeDocxPartitionerOptions:
    """Unit-test suite for `unstructured.partition.docx.DocxPartitionerOptions` objects."""

    # -- .document -------------------------------

    def it_loads_the_docx_document(
        self,
        request: FixtureRequest,
        opts_args: dict[str, Any],
    ):
        document_ = instance_mock(request, Document)
        docx_Document_ = function_mock(
            request, "unstructured.partition.docx.docx.Document", return_value=document_
        )
        _docx_file_prop_ = property_mock(
            request, DocxPartitionerOptions, "_docx_file", return_value="abcde.docx"
        )
        opts = DocxPartitionerOptions(**opts_args)

        document = opts.document

        _docx_file_prop_.assert_called_once_with()
        docx_Document_.assert_called_once_with("abcde.docx")
        assert document is document_

    # -- .include_page_breaks --------------------

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_emit_PageBreak_elements_as_part_of_the_output_element_stream(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["include_page_breaks"] = arg_value
        opts = DocxPartitionerOptions(**opts_args)

        assert opts.include_page_breaks is arg_value

    # -- .infer_table_structure ------------------

    @pytest.mark.parametrize("arg_value", [True, False])
    def it_knows_whether_to_include_text_as_html_in_Table_metadata(
        self, arg_value: bool, opts_args: dict[str, Any]
    ):
        opts_args["infer_table_structure"] = arg_value
        opts = DocxPartitionerOptions(**opts_args)

        assert opts.infer_table_structure is arg_value

    # -- .increment_page_number() ----------------

    def it_generates_a_PageBreak_element_when_the_page_number_is_incremented(
        self, opts_args: dict[str, Any]
    ):
        opts = DocxPartitionerOptions(**opts_args)

        page_break_iter = opts.increment_page_number()

        assert isinstance(next(page_break_iter, None), PageBreak)
        assert opts.page_number == 2
        with pytest.raises(StopIteration):
            next(page_break_iter)

    def but_it_does_not_generate_a_PageBreak_element_when_include_page_breaks_option_is_off(
        self, opts_args: dict[str, Any]
    ):
        opts_args["include_page_breaks"] = False
        opts = DocxPartitionerOptions(**opts_args)

        page_break_iter = opts.increment_page_number()

        with pytest.raises(StopIteration):
            next(page_break_iter)
        assert opts.page_number == 2

    # -- .last_modified --------------------------

    def it_gets_the_last_modified_date_of_the_document_from_the_caller_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["metadata_last_modified"] = "2024-03-05T17:02:53"
        opts = DocxPartitionerOptions(**opts_args)

        assert opts.last_modified == "2024-03-05T17:02:53"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_path_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_: Mock
    ):
        opts_args["file_path"] = "a/b/document.docx"
        get_last_modified_date_.return_value = "2024-04-02T20:32:35"
        opts = DocxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_.assert_called_once_with("a/b/document.docx")
        assert last_modified == "2024-04-02T20:32:35"

    def and_it_falls_back_to_the_last_modified_date_of_the_file_when_a_file_like_object_is_provided(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts_args["date_from_file_object"] = True
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        opts = DocxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_from_file_.assert_called_once_with(file)
        assert last_modified == "2024-04-02T20:42:07"

    def but_it_falls_back_to_None_for_the_last_modified_date_when_date_from_file_object_is_False(
        self, opts_args: dict[str, Any], get_last_modified_date_from_file_: Mock
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts_args["date_from_file_object"] = False
        get_last_modified_date_from_file_.return_value = "2024-04-02T20:42:07"
        opts = DocxPartitionerOptions(**opts_args)

        last_modified = opts.last_modified

        get_last_modified_date_from_file_.assert_not_called()
        assert last_modified is None

    # -- .metadata_file_path ---------------------

    def it_uses_the_user_provided_file_path_in_the_metadata_when_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = "x/y/z.docx"
        opts_args["metadata_file_path"] = "a/b/c.docx"
        opts = DocxPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == "a/b/c.docx"

    @pytest.mark.parametrize("file_path", ["u/v/w.docx", None])
    def and_it_falls_back_to_the_document_file_path_otherwise(
        self, file_path: str | None, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = file_path
        opts_args["metadata_file_path"] = None
        opts = DocxPartitionerOptions(**opts_args)

        assert opts.metadata_file_path == file_path

    # -- .metadata_page_number -------------------

    @pytest.mark.parametrize(
        ("page_count", "document_contains_pagebreaks", "expected_value"),
        [(7, True, 7), (1, False, None)],
    )
    def it_reports_None_when_no_rendered_page_breaks_are_found_in_document(
        self,
        request: FixtureRequest,
        opts_args: dict[str, Any],
        page_count: int,
        document_contains_pagebreaks: bool,
        expected_value: int | None,
    ):
        _document_contains_pagebreaks_prop_ = property_mock(
            request,
            DocxPartitionerOptions,
            "_document_contains_pagebreaks",
            return_value=document_contains_pagebreaks,
        )
        opts = DocxPartitionerOptions(**opts_args)
        opts._page_counter = page_count

        metadata_page_number = opts.metadata_page_number

        _document_contains_pagebreaks_prop_.assert_called_once_with()
        # -- compare by value; identity comparison of ints depends on interpreter caching --
        assert metadata_page_number == expected_value

    # -- .page_number ----------------------------

    def it_keeps_track_of_the_page_number(self, opts_args: dict[str, Any]):
        """Page-number starts at 1 and advances by one each time it is incremented."""
        opts = DocxPartitionerOptions(**opts_args)

        assert opts.page_number == 1
        list(opts.increment_page_number())
        assert opts.page_number == 2
        list(opts.increment_page_number())
        assert opts.page_number == 3

    def it_assigns_the_correct_page_number_when_starting_page_number_is_given(
        self, opts_args: dict[str, Any]
    ):
        opts = DocxPartitionerOptions(**opts_args, starting_page_number=3)

        assert opts.page_number == 3
        list(opts.increment_page_number())
        assert opts.page_number == 4

    # -- ._document_contains_pagebreaks ----------

    @pytest.mark.parametrize(
        ("file_name", "expected_value"), [("page-breaks.docx", True), ("teams_chat.docx", False)]
    )
    def it_knows_whether_the_document_contains_page_breaks(
        self, opts_args: dict[str, Any], file_name: str, expected_value: bool
    ):
        opts_args["file_path"] = example_doc_path(file_name)
        opts = DocxPartitionerOptions(**opts_args)

        assert opts._document_contains_pagebreaks is expected_value

    # -- ._docx_file -----------------------------

    def it_uses_the_path_to_open_the_presentation_when_file_path_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts_args["file_path"] = "l/m/n.docx"
        opts = DocxPartitionerOptions(**opts_args)

        assert opts._docx_file == "l/m/n.docx"

    def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
        self, opts_args: dict[str, Any]
    ):
        # -- the context manager closes the temp file even when an assertion fails --
        with tempfile.SpooledTemporaryFile() as spooled_temp_file:
            spooled_temp_file.write(b"abcdefg")
            opts_args["file"] = spooled_temp_file
            opts = DocxPartitionerOptions(**opts_args)

            docx_file = opts._docx_file

            assert docx_file is not spooled_temp_file
            assert isinstance(docx_file, io.BytesIO)
            assert docx_file.getvalue() == b"abcdefg"

    def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
        self, opts_args: dict[str, Any]
    ):
        file = io.BytesIO(b"abcdefg")
        opts_args["file"] = file
        opts = DocxPartitionerOptions(**opts_args)

        docx_file = opts._docx_file

        assert docx_file is file
        assert isinstance(docx_file, io.BytesIO)
        assert docx_file.getvalue() == b"abcdefg"

    def but_it_raises_ValueError_when_neither_a_file_path_or_file_is_provided(
        self, opts_args: dict[str, Any]
    ):
        opts = DocxPartitionerOptions(**opts_args)

        with pytest.raises(ValueError, match="No DOCX document specified, either `filename` or "):
            opts._docx_file

    # -- fixtures --------------------------------------------------------------------------------

    @pytest.fixture()
    def get_last_modified_date_(self, request: FixtureRequest) -> Mock:
        return function_mock(request, "unstructured.partition.docx.get_last_modified_date")

    @pytest.fixture()
    def get_last_modified_date_from_file_(self, request: FixtureRequest) -> Mock:
        return function_mock(
            request, "unstructured.partition.docx.get_last_modified_date_from_file"
        )
class Describe_DocxPartitioner:
"""Unit-test suite for `unstructured.partition.docx._DocxPartitioner`."""
# -- table behaviors -------------------------------------------------------------------------
def it_can_convert_a_table_to_html(self):
def it_can_convert_a_table_to_html(self, opts_args: dict[str, Any]):
opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
assert _DocxPartitioner()._convert_table_to_html(table) == (
assert _DocxPartitioner(opts)._convert_table_to_html(table) == (
"<table>\n"
"<thead>\n"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
@ -709,7 +996,7 @@ class Describe_DocxPartitioner:
"</table>"
)
def and_it_can_convert_a_nested_table_to_html(self):
def and_it_can_convert_a_nested_table_to_html(self, opts_args: dict[str, Any]):
"""
Fixture table is:
@ -725,10 +1012,11 @@ class Describe_DocxPartitioner:
| j | k | l |
+---+-------------+---+
"""
opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[1]
# -- re.sub() strips out the extra padding inserted by tabulate --
html = re.sub(r" +<", "<", _DocxPartitioner()._convert_table_to_html(table))
html = re.sub(r" +<", "<", _DocxPartitioner(opts)._convert_table_to_html(table))
expected_lines = [
"<table>",
@ -750,13 +1038,15 @@ class Describe_DocxPartitioner:
for expected, actual in zip(expected_lines, actual_lines):
assert actual == expected, f"\nexpected: {repr(expected)}\nactual: {repr(actual)}"
def it_can_convert_a_table_to_plain_text(self):
def it_can_convert_a_table_to_plain_text(self, opts_args: dict[str, Any]):
opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[0]
assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == (
assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == (
"Header Col 1 Header Col 2 Lorem ipsum A link example"
)
def and_it_can_convert_a_nested_table_to_plain_text(self):
def and_it_can_convert_a_nested_table_to_plain_text(self, opts_args: dict[str, Any]):
"""
Fixture table is:
@ -772,12 +1062,14 @@ class Describe_DocxPartitioner:
| j | k | l |
+---+-------------+---+
"""
opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[1]
assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == (
assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == (
"a >b< c d e f g&t h i j k l"
)
def but_the_text_of_a_merged_cell_appears_only_once(self):
def but_the_text_of_a_merged_cell_appears_only_once(self, opts_args: dict[str, Any]):
"""
Fixture table is:
@ -789,8 +1081,9 @@ class Describe_DocxPartitioner:
| e | |
+-------+---+
"""
opts = DocxPartitionerOptions(**opts_args)
table = docx.Document(example_doc_path("docx-tables.docx")).tables[2]
assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e"
assert " ".join(_DocxPartitioner(opts)._iter_table_texts(table)) == "a b c d e"
def it_can_partition_tables_with_incomplete_rows(self):
"""DOCX permits table rows to start late and end early.
@ -921,7 +1214,7 @@ class Describe_DocxPartitioner:
# -- page-break behaviors --------------------------------------------------------------------
def it_places_page_breaks_precisely_where_they_occur(self):
def it_places_page_breaks_precisely_where_they_occur(self, opts_args: dict[str, Any]):
"""Page-break behavior has some subtleties.
* A hard page-break does not generate a PageBreak element (because that would double-count
@ -940,6 +1233,8 @@ class Describe_DocxPartitioner:
"""A more detailed `repr()` to aid debugging when assertion fails."""
return f"{e.__class__.__name__}('{e}')"
opts_args["file_path"] = example_doc_path("page-breaks.docx")
opts = DocxPartitionerOptions(**opts_args)
expected = [
# NOTE(scanny) - -- page 1 --
NarrativeText(
@ -975,7 +1270,7 @@ class Describe_DocxPartitioner:
Title("<<and then more text proceeds."),
]
elements = _DocxPartitioner.iter_document_elements(example_doc_path("page-breaks.docx"))
elements = _DocxPartitioner.iter_document_elements(opts)
for idx, e in enumerate(elements):
assert e == expected[idx], (
@ -986,8 +1281,10 @@ class Describe_DocxPartitioner:
# -- header/footer behaviors -----------------------------------------------------------------
def it_includes_table_cell_text_in_Header_text(self):
partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx"))
def it_includes_table_cell_text_in_Header_text(self, opts_args: dict[str, Any]):
opts_args["file_path"] = example_doc_path("docx-hdrftr.docx")
opts = DocxPartitionerOptions(**opts_args)
partitioner = _DocxPartitioner(opts)
section = partitioner._document.sections[0]
header_iter = partitioner._iter_section_headers(section)
@ -995,9 +1292,11 @@ class Describe_DocxPartitioner:
element = next(header_iter)
assert element.text == "First header para\nTable cell1 Table cell2\nLast header para"
def it_includes_table_cell_text_in_Footer_text(self):
def it_includes_table_cell_text_in_Footer_text(self, opts_args: dict[str, Any]):
"""This case also verifies nested-table and merged-cell behaviors."""
partitioner = _DocxPartitioner(example_doc_path("docx-hdrftr.docx"))
opts_args["file_path"] = example_doc_path("docx-hdrftr.docx")
opts = DocxPartitionerOptions(**opts_args)
partitioner = _DocxPartitioner(opts)
section = partitioner._document.sections[0]
footer_iter = partitioner._iter_section_footers(section)

View File

@ -1 +1 @@
__version__ = "0.13.8-dev7" # pragma: no cover
__version__ = "0.13.8-dev8" # pragma: no cover

View File

@ -217,19 +217,19 @@ def partition_docx(
Assign this number to the first page of this document and increment the page number from
there.
"""
# -- verify that only one file-specifier argument was provided --
exactly_one(filename=filename, file=file)
elements = _DocxPartitioner.iter_document_elements(
filename,
file,
metadata_filename,
include_page_breaks,
infer_table_structure,
metadata_last_modified,
date_from_file_object,
opts = DocxPartitionerOptions(
date_from_file_object=date_from_file_object,
file=file,
file_path=filename,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
metadata_file_path=metadata_filename,
metadata_last_modified=metadata_last_modified,
starting_page_number=starting_page_number,
)
elements = _DocxPartitioner.iter_document_elements(opts)
elements = apply_lang_metadata(
elements=elements,
languages=languages,
@ -238,56 +238,169 @@ def partition_docx(
return list(elements)
class _DocxPartitioner:
"""Provides `.partition()` for MS-Word 2007+ (.docx) files."""
class DocxPartitionerOptions:
"""Encapsulates partitioning option validation, computation, and application of defaults."""
def __init__(
self,
# -- NOTE(scanny): default values here are unnecessary for production use because
# -- `.iter_document_elements()` is the only interface method and always calls with all
# -- args. However, providing defaults eases unit-testing and decouples unit-tests from
# -- future changes to args.
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
include_page_breaks: bool = True,
infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None,
date_from_file_object: bool = False,
*,
date_from_file_object: bool,
file: IO[bytes] | None,
file_path: str | None,
include_page_breaks: bool,
infer_table_structure: bool,
metadata_file_path: Optional[str],
metadata_last_modified: Optional[str],
starting_page_number: int = 1,
) -> None:
self._filename = filename
):
self._date_from_file_object = date_from_file_object
self._file = file
self._metadata_filename = metadata_filename
self._file_path = file_path
self._include_page_breaks = include_page_breaks
self._infer_table_structure = infer_table_structure
self._metadata_file_path = metadata_file_path
self._metadata_last_modified = metadata_last_modified
# -- options object maintains page-number state --
self._page_counter = starting_page_number
self._date_from_file_object = date_from_file_object
@lazyproperty
def document(self) -> Document:
    """The python-docx `Document` object for the DOCX file being partitioned.

    Opened from the path or file-like object provided by `._docx_file`.
    """
    docx_file = self._docx_file
    return docx.Document(docx_file)
@lazyproperty
def include_page_breaks(self) -> bool:
    """True when `PageBreak` elements should appear in the element-stream.

    This setting only governs whether distinct `PageBreak` elements (which contain no text) are
    emitted. Page-breaks are detected and the page-number is tracked and included in element
    metadata regardless of its value.
    """
    emit_page_breaks = self._include_page_breaks
    return emit_page_breaks
def increment_page_number(self) -> Iterator[PageBreak]:
    """Advance the page-number by one, generating a `PageBreak` element when enabled.

    This is a generator, so the page counter is only advanced once iteration begins.
    """
    self._page_counter = self._page_counter + 1

    if not self._include_page_breaks:
        # -- page-break elements are switched off; advancing the counter is the only effect --
        return

    yield PageBreak("", detection_origin=DETECTION_ORIGIN)
@lazyproperty
def infer_table_structure(self) -> bool:
    """True when `text_as_html` metadata should be computed and applied for tables."""
    compute_table_html = self._infer_table_structure
    return compute_table_html
@lazyproperty
def last_modified(self) -> Optional[str]:
    """The best last-modified date available, None if no sources are available.

    Sources are consulted in order: the caller-specified value, then the file at the document
    path, then the file-like object (only when `date_from_file_object` was requested).
    """
    # -- A value given explicitly by the caller wins. This is used for example when this file
    # -- was converted from another format, in which case any last-modified date on the file
    # -- itself would be just now.
    explicit_value = self._metadata_last_modified
    if explicit_value:
        return explicit_value

    file_path = self._file_path
    if file_path:
        # -- no date is reported for a temp-file path --
        if is_temp_file_path(file_path):
            return None
        return get_last_modified_date(file_path)

    file = self._file
    if file and self._date_from_file_object:
        return get_last_modified_date_from_file(file)

    return None
@lazyproperty
def metadata_file_path(self) -> str | None:
    """The file-path to report in metadata, `None` when no path is available.

    The caller-specified metadata path is preferred; the document path is the fallback.
    """
    caller_specified = self._metadata_file_path
    if caller_specified:
        return caller_specified
    return self._file_path
@property
def metadata_page_number(self) -> Optional[int]:
    """The current page number to report in metadata, or None if we can't really tell.

    No page number is reported when no page-breaks can be found in the document (which may be a
    common case).

    Page numbers in DOCX are strictly best-efforts because actual page-breaks are determined at
    rendering time (e.g. printing) from the font-metrics of the target device. Explicit (hard)
    page-breaks are always recorded in the docx file, but rendered page-breaks are only added
    optionally.
    """
    if not self._document_contains_pagebreaks:
        return None
    return self._page_counter
@property
def page_number(self) -> int:
    """The current value of the page counter.

    This may differ from the actual rendered page number when rendered page-break indicators
    are absent from the document (not uncommon). For metadata purposes use
    `.metadata_page_number`, which is `None` when rendered page-breaks are not present in this
    document.
    """
    current_page = self._page_counter
    return current_page
@lazyproperty
def _document_contains_pagebreaks(self) -> bool:
    """True when at least one rendered page-break is found in the document.

    `w:lastRenderedPageBreak` elements are the only reliable page-break indicator. Microsoft
    Word inserts them reliably, but they probably don't appear in documents converted to .docx
    from another format, .odt for example.
    """
    # NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can
    # appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which
    # is w:p inner-content and both of these can occur inside a table-cell as well as the
    # document body
    lrpb_xpath = (
        "./w:body/w:p/w:r/w:lastRenderedPageBreak"
        " | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
        " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak"
        " | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
    )
    rendered_page_breaks = self.document.element.xpath(lrpb_xpath)
    return bool(rendered_page_breaks)
@lazyproperty
def _docx_file(self) -> str | IO[bytes]:
    """The Word 2007+ document file to be partitioned.

    Either a `str` path or a file-like object; `python-docx` accepts either one for opening a
    document file. The path is preferred when both were provided.

    Raises `ValueError` when neither a file-path nor a file-like object is available.
    """
    file_path, file = self._file_path, self._file

    if file_path:
        return file_path

    # -- In Python <3.11 SpooledTemporaryFile does not implement ".seekable" which triggers an
    # -- exception when Zipfile tries to open it. The docx format is a zip archive so we need
    # -- to work around that bug here by copying the content into a BytesIO object.
    if isinstance(file, tempfile.SpooledTemporaryFile):
        file.seek(0)
        return io.BytesIO(file.read())

    if not file:
        raise ValueError(
            "No DOCX document specified, either `filename` or `file` argument must be provided"
        )

    return file
class _DocxPartitioner:
"""Provides `.partition()` for MS-Word 2007+ (.docx) files."""
def __init__(self, opts: DocxPartitionerOptions) -> None:
self._opts = opts
@classmethod
def iter_document_elements(
cls,
filename: Optional[str] = None,
file: Optional[IO[bytes]] = None,
metadata_filename: Optional[str] = None,
include_page_breaks: bool = True,
infer_table_structure: bool = True,
metadata_last_modified: Optional[str] = None,
date_from_file_object: bool = False,
starting_page_number: int = 1,
) -> Iterator[Element]:
def iter_document_elements(cls, opts: DocxPartitionerOptions) -> Iterator[Element]:
"""Partition MS Word documents (.docx format) into its document elements."""
self = cls(
filename=filename,
file=file,
metadata_filename=metadata_filename,
include_page_breaks=include_page_breaks,
infer_table_structure=infer_table_structure,
metadata_last_modified=metadata_last_modified,
date_from_file_object=date_from_file_object,
starting_page_number=starting_page_number,
)
self = cls(opts)
# NOTE(scanny): It's possible for a Word document to have no sections. In particular, a
# Microsoft Teams chat transcript exported to DOCX contains no sections. Such a
# "section-less" document has to be iterated differently and has no headers or footers and
@ -452,37 +565,7 @@ class _DocxPartitioner:
@lazyproperty
def _document(self) -> Document:
"""The python-docx `Document` object loaded from file or filename."""
filename, file = self._filename, self._file
if filename is not None:
return docx.Document(filename)
assert file is not None
if isinstance(file, tempfile.SpooledTemporaryFile):
file.seek(0)
file = io.BytesIO(file.read())
return docx.Document(file)
@lazyproperty
def _document_contains_pagebreaks(self) -> bool:
"""True when there is at least one page-break detected in the document.
Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably
inserted by Microsoft Word, but probably don't appear in documents converted into .docx
format from for example .odt format.
"""
xpath = (
# NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can
# appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which
# is w:p inner-content and both of these can occur inside a table-cell as well as the
# document body
"./w:body/w:p/w:r/w:lastRenderedPageBreak"
" | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
" | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak"
" | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
)
return bool(self._document.element.xpath(xpath))
return self._opts.document
@lazyproperty
def _document_contains_sections(self) -> bool:
@ -524,12 +607,6 @@ class _DocxPartitioner:
return "\n".join(text for text in iter_hdrftr_texts(hdrftr) if text)
def _increment_page_number(self) -> Iterator[PageBreak]:
"""Increment page-number by 1 and generate a PageBreak element if enabled."""
self._page_counter += 1
if self._include_page_breaks:
yield PageBreak("", detection_origin=DETECTION_ORIGIN)
def _is_list_item(self, paragraph: Paragraph) -> bool:
"""True when `paragraph` can be identified as a list-item."""
if is_bulleted_text(paragraph.text):
@ -581,7 +658,7 @@ class _DocxPartitioner:
if isinstance(item, Paragraph):
yield from self._classify_paragraph_to_element(item)
else:
yield from self._increment_page_number()
yield from self._opts.increment_page_number()
def _iter_paragraph_emphasis(self, paragraph: Paragraph) -> Iterator[dict[str, str]]:
"""Generate e.g. {"text": "MUST", "tag": "b"} for each emphasis in `paragraph`."""
@ -616,7 +693,7 @@ class _DocxPartitioner:
text=text,
detection_origin=DETECTION_ORIGIN,
metadata=ElementMetadata(
filename=self._metadata_filename,
filename=self._opts.metadata_file_path,
header_footer_type=header_footer_type,
category_depth=0,
),
@ -645,7 +722,7 @@ class _DocxPartitioner:
text=text,
detection_origin=DETECTION_ORIGIN,
metadata=ElementMetadata(
filename=self._metadata_filename,
filename=self._opts.metadata_file_path,
header_footer_type=header_footer_type,
category_depth=0, # -- headers are always at the root level --
),
@ -668,7 +745,7 @@ class _DocxPartitioner:
"""
def page_is_odd() -> bool:
return self._page_counter % 2 == 1
return self._opts.page_number % 2 == 1
start_type = section.start_type
@ -682,14 +759,14 @@ class _DocxPartitioner:
# -- on an even page we need two total, add one to supplement the rendered page break
# -- to follow. There is no "first-document-page" special case because 1 is odd.
if not page_is_odd():
yield from self._increment_page_number()
yield from self._opts.increment_page_number()
elif start_type == WD_SECTION_START.ODD_PAGE:
# -- the first page of the document is an implicit "new" odd-page, so no page-break --
if section_idx == 0:
return
if page_is_odd():
yield from self._increment_page_number()
yield from self._opts.increment_page_number()
# -- otherwise, start-type is one of "continuous", "new-column", or "next-page", none of
# -- which need our help to get the page-breaks right.
@ -699,7 +776,9 @@ class _DocxPartitioner:
"""Generate zero-or-one Table element for a DOCX `w:tbl` XML element."""
# -- at present, we always generate exactly one Table element, but we might want
# -- to skip, for example, an empty table.
html_table = self._convert_table_to_html(table) if self._infer_table_structure else None
html_table = (
self._convert_table_to_html(table) if self._opts.infer_table_structure else None
)
text_table = " ".join(self._iter_table_texts(table))
emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)
@ -708,9 +787,9 @@ class _DocxPartitioner:
detection_origin=DETECTION_ORIGIN,
metadata=ElementMetadata(
text_as_html=html_table,
filename=self._metadata_filename,
page_number=self._page_number,
last_modified=self._last_modified,
filename=self._opts.metadata_file_path,
page_number=self._opts.metadata_page_number,
last_modified=self._opts.last_modified,
emphasized_text_contents=emphasized_text_contents or None,
emphasized_text_tags=emphasized_text_tags or None,
),
@ -753,41 +832,6 @@ class _DocxPartitioner:
# -- do not generate empty strings --
yield from (text for text in iter_cell_texts(_Cell(tc, table)) if text)
@lazyproperty
def _last_modified(self) -> Optional[str]:
"""Last-modified date suitable for use in element metadata."""
# -- if this file was converted from another format, any last-modified date for the file
# -- will be today, so we get it from the conversion step in `._metadata_last_modified`.
if self._metadata_last_modified:
return self._metadata_last_modified
file_path, file = self._filename, self._file
# -- if the file is on the filesystem, get its date from there --
if file_path is not None:
return None if is_temp_file_path(file_path) else get_last_modified_date(file_path)
# -- otherwise, as long as user explicitly requested it, try getting it from the file-like
# -- object (unlikely since BytesIO and its brethren have no such metadata).
assert file is not None
if self._date_from_file_object:
return get_last_modified_date_from_file(file)
return None
@property
def _page_number(self) -> Optional[int]:
"""The current page number, or None if we can't really tell.
Page numbers are not added to element metadata if we can't find any page-breaks in the
document (which may be a common case).
In the DOCX format, determining page numbers is strictly a best-efforts attempt since actual
page-breaks are determined at rendering time (e.g. printing) based on the fontmetrics of the
target device. Explicit (hard) page-breaks are always recorded in the docx file but the
rendered page-breaks are only added optionally.
"""
return self._page_counter if self._document_contains_pagebreaks else None
def _paragraph_emphasis(self, paragraph: Paragraph) -> tuple[list[str], list[str]]:
"""[contents, tags] pair describing emphasized text in `paragraph`."""
iter_p_emph, iter_p_emph_2 = itertools.tee(self._iter_paragraph_emphasis(paragraph))
@ -842,12 +886,12 @@ class _DocxPartitioner:
category_depth=category_depth,
emphasized_text_contents=emphasized_text_contents or None,
emphasized_text_tags=emphasized_text_tags or None,
filename=self._metadata_filename,
last_modified=self._last_modified,
filename=self._opts.metadata_file_path,
last_modified=self._opts.last_modified,
link_texts=link_texts or None,
link_urls=link_urls or None,
links=links or None,
page_number=self._page_number,
page_number=self._opts.metadata_page_number,
)
element_metadata.detection_origin = "docx"
return element_metadata