rfctr: prepare docx partitioner and tests for nested tables PR to follow (#1978)

*Reviewer:* It may be quicker to review commit by commit, as the commits are
quite distinct and each is groomed to focus on a single clean-up task.

Clean up odds-and-ends in the docx partitioner in preparation for adding
nested-tables support in a closely following PR.

1. Remove obsolete TODOs that are now tracked as GitHub issues, which is
probably where they belong in the future anyway.
2. Remove local DOCX "workaround" code that has been implemented
upstream and is now obsolete.
3. "Clean" the docx tests, introducing strict typing, extracting a
fixture or two, and generally tightening things up.
4. Extract docx-local versions of
`unstructured.partition.common.convert_ms_office_table_to_text()`, which
will be the base for adding nested-table support. More information on
why this is required is given in that commit.
This commit is contained in:
Steve Canny 2023-11-01 22:22:17 -07:00 committed by GitHub
parent 51d07b6434
commit 4e40999070
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 188 additions and 362 deletions

View File

@ -1,4 +1,4 @@
## 0.10.29-dev7
## 0.10.29-dev8
### Enhancements
@ -16,7 +16,7 @@
* **Handle empty string for `ocr_languages` with values for `languages`** Some API users ran into an issue with sending `languages` params because the API defaulted to also using an empty string for `ocr_languages`. This update handles situations where `languages` is defined and `ocr_languages` is an empty string.
* **Fix PDF tried to loop through None** Previously the PDF annotation extraction tried to loop through `annots` that resolved to None. A check was added to avoid this error.
* **Ingest session handler not being shared correctly** All ingest docs that leverage the session handler should only need to set it once per process. It was recreating it each time because the right values weren't being set nor available given how dataclasses work in python.
* **Ingest download-only fix** Previously the download only flag was being checked after the doc factory pipeline step, which occurs before the files are actually downloaded by the source node. This check was moved after the source node to allow for the files to be downloaded first before exiting the pipeline.
* **Ingest download-only fix.** Previously the download only flag was being checked after the doc factory pipeline step, which occurs before the files are actually downloaded by the source node. This check was moved after the source node to allow for the files to be downloaded first before exiting the pipeline.
* **Fix flaky chunk-metadata.** Prior implementation was sensitive to element order in the section resulting in metadata values sometimes being dropped. Also, not all metadata items can be consolidated across multiple elements (e.g. coordinates) and so are now dropped from consolidated metadata.
## 0.10.28

View File

@ -10,6 +10,7 @@ from unstructured.documents.elements import (
Address,
ListItem,
NarrativeText,
Table,
Text,
Title,
)
@ -271,6 +272,25 @@ def test_partition_doc_from_file_without_metadata_date(
assert elements[0].metadata.date == "2020-07-05"
def test_partition_doc_grabs_emphasized_texts():
    """`partition_doc()` reports emphasized text and its tags in element metadata."""
    emphasized_contents = ["bold", "italic", "bold-italic", "bold-italic"]
    emphasized_tags = ["b", "i", "b", "i"]

    elements = partition_doc("example-docs/fake-doc-emphasized-text.doc")

    # -- first element is a Table and it carries the emphasis metadata --
    table = elements[0]
    assert isinstance(table, Table)
    assert table.metadata.emphasized_text_contents == emphasized_contents
    assert table.metadata.emphasized_text_tags == emphasized_tags

    # -- second element is a paragraph reporting the same emphasis metadata --
    emphasized_paragraph = elements[1]
    assert emphasized_paragraph == NarrativeText("I am a bold italic bold-italic text.")
    assert emphasized_paragraph.metadata.emphasized_text_contents == emphasized_contents
    assert emphasized_paragraph.metadata.emphasized_text_tags == emphasized_tags

    # -- third element is a paragraph with no emphasis metadata at all --
    plain_paragraph = elements[2]
    assert plain_paragraph == NarrativeText("I am a normal text.")
    assert plain_paragraph.metadata.emphasized_text_contents is None
    assert plain_paragraph.metadata.emphasized_text_tags is None
def test_partition_doc_with_json(mock_document, tmpdir):
docx_filename = os.path.join(tmpdir.dirname, "mock_document.docx")
doc_filename = os.path.join(tmpdir.dirname, "mock_document.doc")

View File

@ -1,6 +1,5 @@
# pyright: reportPrivateUsage=false
import os
import pathlib
from tempfile import SpooledTemporaryFile
from typing import Dict, List, cast
@ -8,27 +7,29 @@ from typing import Dict, List, cast
import docx
import pytest
from docx.document import Document
from pytest_mock import MockFixture
from test_unstructured.unit_utils import assert_round_trips_through_JSON
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import (
Address,
CompositeElement,
Element,
Footer,
Header,
ListItem,
NarrativeText,
Table,
TableChunk,
Text,
Title,
)
from unstructured.partition.doc import partition_doc
from unstructured.partition.docx import _DocxPartitioner, partition_docx
from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA
def test_parition_docx_from_team_chat():
elements = partition_docx(filename="example-docs/teams_chat.docx")
elements = cast(List[Text], partition_docx(example_doc_path("teams_chat.docx")))
assert [element.text for element in elements] == [
"0:0:0.0 --> 0:0:1.510\nSome Body\nOK. Yeah.",
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
@ -37,10 +38,10 @@ def test_parition_docx_from_team_chat():
def test_partition_docx_from_filename(
mock_document_filename: str,
mock_document_file_path: str,
expected_elements: List[Element],
):
elements = partition_docx(filename=mock_document_filename)
elements = partition_docx(mock_document_file_path)
assert elements == expected_elements
assert elements[0].metadata.page_number is None
@ -50,19 +51,20 @@ def test_partition_docx_from_filename(
assert {element.metadata.detection_origin for element in elements} == {"docx"}
def test_partition_docx_from_filename_with_metadata_filename(mock_document, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
elements = partition_docx(filename=filename, metadata_filename="test")
def test_partition_docx_from_filename_with_metadata_filename(mock_document_file_path: str):
elements = partition_docx(mock_document_file_path, metadata_filename="test")
assert all(element.metadata.filename == "test" for element in elements)
def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpdir):
# Test that the partition_docx function can handle a SpooledTemporaryFile
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
def test_partition_docx_with_spooled_file(
mock_document_file_path: str, expected_elements: List[Text]
):
"""`partition_docx()` accepts a SpooledTemporaryFile as its `file` argument.
with open(filename, "rb") as test_file:
`python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need
to ensure the source file is appropriately converted in this case.
"""
with open(mock_document_file_path, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile()
spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0)
@ -72,28 +74,18 @@ def test_partition_docx_with_spooled_file(mock_document, expected_elements, tmpd
assert element.metadata.filename is None
def test_partition_docx_from_file(mock_document, expected_elements, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
with open(filename, "rb") as f:
def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: List[Text]):
with open(mock_document_file_path, "rb") as f:
elements = partition_docx(file=f)
assert elements == expected_elements
for element in elements:
assert element.metadata.filename is None
@pytest.mark.parametrize(
"infer_table_structure",
[
True,
False,
],
)
def test_partition_docx_infer_table_structure(infer_table_structure):
@pytest.mark.parametrize("infer_table_structure", [True, False])
def test_partition_docx_infer_table_structure(infer_table_structure: bool):
elements = partition_docx(
filename="example-docs/fake_table.docx",
infer_table_structure=infer_table_structure,
example_doc_path("fake_table.docx"), infer_table_structure=infer_table_structure
)
table_element_has_text_as_html_field = (
hasattr(elements[0].metadata, "text_as_html")
@ -102,58 +94,58 @@ def test_partition_docx_infer_table_structure(infer_table_structure):
assert table_element_has_text_as_html_field == infer_table_structure
def test_partition_docx_from_file_with_metadata_filename(mock_document, expected_elements, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
with open(filename, "rb") as f:
def test_partition_docx_from_file_with_metadata_filename(
mock_document_file_path: str, expected_elements: List[Text]
):
with open(mock_document_file_path, "rb") as f:
elements = partition_docx(file=f, metadata_filename="test")
assert elements == expected_elements
for element in elements:
assert element.metadata.filename == "test"
def test_partition_docx_raises_with_both_specified(mock_document, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
with open(filename, "rb") as f, pytest.raises(ValueError):
partition_docx(filename=filename, file=f)
def test_partition_docx_raises_with_both_specified(mock_document_file_path: str):
with open(mock_document_file_path, "rb") as f:
with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
partition_docx(filename=mock_document_file_path, file=f)
def test_partition_docx_raises_with_neither():
with pytest.raises(ValueError):
with pytest.raises(ValueError, match="Exactly one of filename and file must be specified"):
partition_docx()
def test_partition_docx_processes_table(filename="example-docs/fake_table.docx"):
elements = partition_docx(filename=filename)
def test_partition_docx_processes_table():
elements = partition_docx(example_doc_path("fake_table.docx"))
assert isinstance(elements[0], Table)
assert (
elements[0].metadata.text_as_html
== """<table>
<thead>
<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>
</thead>
<tbody>
<tr><td>Lorem ipsum </td><td>A Link example</td></tr>
</tbody>
</table>"""
assert elements[0].metadata.text_as_html == (
"<table>\n"
"<thead>\n"
"<tr><th>Header Col 1 </th><th>Header Col 2 </th></tr>\n"
"</thead>\n"
"<tbody>\n"
"<tr><td>Lorem ipsum </td><td>A Link example</td></tr>\n"
"</tbody>\n"
"</table>"
)
assert elements[0].metadata.filename == "fake_table.docx"
def test_partition_docx_grabs_header_and_footer(filename="example-docs/handbook-1p.docx"):
elements = partition_docx(filename=filename)
def test_partition_docx_grabs_header_and_footer():
elements = partition_docx(example_doc_path("handbook-1p.docx"))
assert elements[0] == Header("US Trustee Handbook")
assert elements[-1] == Footer("Copyright")
for element in elements:
assert element.metadata.filename == "handbook-1p.docx"
def test_partition_docx_includes_pages_if_present(filename="example-docs/handbook-1p.docx"):
elements = partition_docx(filename=filename, include_page_breaks=False)
def test_partition_docx_includes_pages_if_present():
elements = cast(
List[Text], partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=False)
)
assert "PageBreak" not in [elem.category for elem in elements]
assert elements[1].metadata.page_number == 1
assert elements[-2].metadata.page_number == 2
@ -161,8 +153,11 @@ def test_partition_docx_includes_pages_if_present(filename="example-docs/handboo
assert element.metadata.filename == "handbook-1p.docx"
def test_partition_docx_includes_page_breaks(filename="example-docs/handbook-1p.docx"):
elements = partition_docx(filename=filename, include_page_breaks=True)
def test_partition_docx_includes_page_breaks():
elements = cast(
List[Text], partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True)
)
assert "PageBreak" in [elem.category for elem in elements]
assert elements[1].metadata.page_number == 1
assert elements[-2].metadata.page_number == 2
@ -170,115 +165,81 @@ def test_partition_docx_includes_page_breaks(filename="example-docs/handbook-1p.
assert element.metadata.filename == "handbook-1p.docx"
def test_partition_docx_detects_lists(filename="example-docs/example-list-items-multiple.docx"):
elements = partition_docx(filename=filename)
list_elements = []
narrative_elements = []
for element in elements:
if isinstance(element, ListItem):
list_elements.append(element)
else:
narrative_elements.append(element)
def test_partition_docx_detects_lists():
elements = partition_docx(example_doc_path("example-list-items-multiple.docx"))
assert elements[-1] == ListItem(
"This is simply dummy text of the printing and typesetting industry.",
)
assert len(list_elements) == 10
assert sum(1 for e in elements if isinstance(e, ListItem)) == 10
def test_partition_docx_from_filename_exclude_metadata(filename="example-docs/handbook-1p.docx"):
elements = partition_docx(filename=filename, include_metadata=False)
def test_partition_docx_from_filename_exclude_metadata():
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_metadata=False)
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
def test_partition_docx_from_file_exclude_metadata(mock_document, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
with open(filename, "rb") as f:
def test_partition_docx_from_file_exclude_metadata(mock_document_file_path: str):
with open(mock_document_file_path, "rb") as f:
elements = partition_docx(file=f, include_metadata=False)
assert elements[0].metadata.filetype is None
assert elements[0].metadata.page_name is None
assert elements[0].metadata.filename is None
def test_partition_docx_metadata_date(
mocker,
filename="example-docs/fake.docx",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
def test_partition_docx_metadata_date(mocker: MockFixture):
mocker.patch(
"unstructured.partition.docx.get_last_modified_date",
return_value=mocked_last_modification_date,
"unstructured.partition.docx.get_last_modified_date", return_value="2029-07-05T09:24:28"
)
elements = partition_docx(filename=filename)
elements = partition_docx(example_doc_path("fake.docx"))
assert elements[0].metadata.last_modified == mocked_last_modification_date
assert elements[0].metadata.last_modified == "2029-07-05T09:24:28"
def test_partition_docx_metadata_date_with_custom_metadata(
mocker,
filename="example-docs/fake.docx",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modified_date = "2020-07-05T09:24:28"
def test_partition_docx_metadata_date_with_custom_metadata(mocker: MockFixture):
mocker.patch(
"unstructured.partition.docx.get_last_modified_date",
return_value=mocked_last_modification_date,
"unstructured.partition.docx.get_last_modified_date", return_value="2023-11-01T14:13:07"
)
elements = partition_docx(
filename=filename,
metadata_last_modified=expected_last_modified_date,
example_doc_path("fake.docx"), metadata_last_modified="2020-07-05T09:24:28"
)
assert elements[0].metadata.last_modified == expected_last_modified_date
assert elements[0].metadata.last_modified == "2020-07-05T09:24:28"
def test_partition_docx_from_file_metadata_date(
mocker,
filename="example-docs/fake.docx",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
def test_partition_docx_from_file_metadata_date(mocker: MockFixture):
mocker.patch(
"unstructured.partition.docx.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
return_value="2029-07-05T09:24:28",
)
with open(filename, "rb") as f:
with open(example_doc_path("fake.docx"), "rb") as f:
elements = partition_docx(file=f)
assert elements[0].metadata.last_modified == mocked_last_modification_date
assert elements[0].metadata.last_modified == "2029-07-05T09:24:28"
def test_partition_docx_from_file_metadata_date_with_custom_metadata(
mocker,
filename="example-docs/fake.docx",
):
mocked_last_modification_date = "2029-07-05T09:24:28"
expected_last_modified_date = "2020-07-05T09:24:28"
def test_partition_docx_from_file_metadata_date_with_custom_metadata(mocker: MockFixture):
mocker.patch(
"unstructured.partition.docx.get_last_modified_date_from_file",
return_value=mocked_last_modification_date,
return_value="2023-11-01T14:13:07",
)
with open(filename, "rb") as f:
elements = partition_docx(file=f, metadata_last_modified=expected_last_modified_date)
assert elements[0].metadata.last_modified == expected_last_modified_date
with open(example_doc_path("fake.docx"), "rb") as f:
elements = partition_docx(file=f, metadata_last_modified="2020-07-05T09:24:28")
assert elements[0].metadata.last_modified == "2020-07-05T09:24:28"
def test_partition_docx_from_file_without_metadata_date(
filename="example-docs/fake.docx",
):
def test_partition_docx_from_file_without_metadata_date():
"""Test partition_docx() with file that are not possible to get last modified date"""
with open(filename, "rb") as f:
with open(example_doc_path("fake.docx"), "rb") as f:
sf = SpooledTemporaryFile()
sf.write(f.read())
sf.seek(0)
@ -344,20 +305,11 @@ def test_table_emphasis(
assert emphasized_text_tags == expected_emphasized_text_tags
@pytest.mark.parametrize(
("filename", "partition_func"),
[
("fake-doc-emphasized-text.docx", partition_docx),
("fake-doc-emphasized-text.doc", partition_doc),
],
)
def test_partition_docx_grabs_emphasized_texts(
filename,
partition_func,
expected_emphasized_text_contents,
expected_emphasized_text_tags,
expected_emphasized_text_contents: List[str],
expected_emphasized_text_tags: List[str],
):
elements = partition_func(filename=f"example-docs/{filename}")
elements = partition_docx(example_doc_path("fake-doc-emphasized-text.docx"))
assert isinstance(elements[0], Table)
assert elements[0].metadata.emphasized_text_contents == expected_emphasized_text_contents
@ -372,11 +324,8 @@ def test_partition_docx_grabs_emphasized_texts(
assert elements[2].metadata.emphasized_text_tags is None
def test_partition_docx_with_json(mock_document, tmpdir):
filename = os.path.join(tmpdir.dirname, "mock_document.docx")
mock_document.save(filename)
elements = partition_docx(filename=filename)
def test_partition_docx_with_json(mock_document_file_path: str):
elements = partition_docx(mock_document_file_path)
assert_round_trips_through_JSON(elements)
@ -448,33 +397,30 @@ def test_parse_category_depth_by_style_ilvl():
assert partitioner._parse_category_depth_by_style_ilvl() == 0
def test_add_chunking_strategy_on_partition_docx_default_args(
filename="example-docs/handbook-1p.docx",
):
chunk_elements = partition_docx(filename, chunking_strategy="by_title")
elements = partition_docx(filename)
def test_add_chunking_strategy_on_partition_docx_default_args():
chunk_elements = partition_docx(
example_doc_path("handbook-1p.docx"), chunking_strategy="by_title"
)
elements = partition_docx(example_doc_path("handbook-1p.docx"))
chunks = chunk_by_title(elements)
assert chunk_elements != elements
assert chunk_elements == chunks
def test_add_chunking_strategy_on_partition_docx(
filename="example-docs/fake-doc-emphasized-text.docx",
):
def test_add_chunking_strategy_on_partition_docx():
docx_path = example_doc_path("fake-doc-emphasized-text.docx")
chunk_elements = partition_docx(
filename,
chunking_strategy="by_title",
max_characters=9,
combine_text_under_n_chars=5,
docx_path, chunking_strategy="by_title", max_characters=9, combine_text_under_n_chars=5
)
elements = partition_docx(filename)
elements = partition_docx(docx_path)
chunks = chunk_by_title(elements, max_characters=9, combine_text_under_n_chars=5)
assert chunk_elements == chunks
assert elements != chunk_elements
for chunk in chunks:
assert isinstance(chunk, (CompositeElement, TableChunk))
assert len(chunk.text) <= 9
@ -507,7 +453,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages():
def test_partition_docx_includes_hyperlink_metadata():
elements = cast(List[Text], partition_docx(get_test_file_path("hlink-meta.docx")))
elements = cast(List[Text], partition_docx(example_doc_path("hlink-meta.docx")))
# -- regular paragraph, no hyperlinks --
element = elements[0]
@ -593,8 +539,13 @@ def test_partition_docx_includes_hyperlink_metadata():
# -- module-level fixtures -----------------------------------------------------------------------
def example_doc_path(filename: str) -> str:
    """Return the path, as a `str`, of `filename` inside the `example-docs/` directory.

    The `example-docs/` directory is resolved relative to this module: it is looked up in the
    directory four levels above this file.
    """
    # -- start at this file and step up four times, like chaining `.parent` four times --
    base_dir = pathlib.Path(__file__)
    for _ in range(4):
        base_dir = base_dir.parent
    return str(base_dir / "example-docs" / filename)
@pytest.fixture()
def expected_elements():
def expected_elements() -> List[Text]:
return [
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
@ -608,12 +559,12 @@ def expected_elements():
@pytest.fixture()
def expected_emphasized_text_contents():
def expected_emphasized_text_contents() -> List[str]:
return ["bold", "italic", "bold-italic", "bold-italic"]
@pytest.fixture()
def expected_emphasized_text_tags():
def expected_emphasized_text_tags() -> List[str]:
return ["b", "i", "b", "i"]
@ -627,12 +578,6 @@ def expected_emphasized_texts():
]
def get_test_file_path(filename: str) -> str:
    """Return the path, as a `str`, of `filename` in the `test_files` directory beside this module.

    The `get_` prefix on the name is deliberate; without it this helper would get picked up as a
    test-function.
    """
    test_files_dir = pathlib.Path(__file__).parent / "test_files"
    return str(test_files_dir / filename)
@pytest.fixture()
def mock_document():
document = docx.Document()
@ -661,8 +606,7 @@ def mock_document():
@pytest.fixture()
def mock_document_filename(mock_document: Document, tmp_path: pathlib.Path) -> str:
def mock_document_file_path(mock_document: Document, tmp_path: pathlib.Path) -> str:
filename = str(tmp_path / "mock_document.docx")
print(f"filename = {filename}")
mock_document.save(filename)
return filename

View File

@ -1,8 +1,10 @@
from typing import Sequence
from typing import Iterator, Sequence
from docx.blkcntnr import BlockItemContainer
from docx.enum.section import WD_SECTION
from docx.oxml.section import CT_SectPr
from docx.table import Table
from docx.text.paragraph import Paragraph
class Section:
_sectPr: CT_SectPr
@ -20,6 +22,7 @@ class Section:
def footer(self) -> _Footer: ...
@property
def header(self) -> _Header: ...
def iter_inner_content(self) -> Iterator[Paragraph | Table]: ...
@property
def start_type(self) -> WD_SECTION: ...

View File

@ -1 +1 @@
__version__ = "0.10.29-dev7" # pragma: no cover
__version__ = "0.10.29-dev8" # pragma: no cover

View File

@ -14,7 +14,6 @@ from typing import (
Iterator,
List,
Optional,
Sequence,
Tuple,
Type,
Union,
@ -25,18 +24,15 @@ from typing import (
import docx
from docx.document import Document
from docx.enum.section import WD_SECTION_START
from docx.oxml.ns import nsmap, qn
from docx.oxml.section import CT_SectPr
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.oxml.text.run import CT_R
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.section import Section, _Footer, _Header
from docx.table import Table as DocxTable
from docx.text.hyperlink import Hyperlink
from docx.text.paragraph import Paragraph
from docx.text.run import Run
from lxml import etree
from tabulate import tabulate
from typing_extensions import TypeAlias
from unstructured.chunking.title import add_chunking_strategy
@ -59,7 +55,6 @@ from unstructured.documents.elements import (
)
from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
from unstructured.partition.common import (
convert_ms_office_table_to_text,
exactly_one,
get_last_modified_date,
get_last_modified_date_from_file,
@ -234,28 +229,6 @@ def partition_docx(
class _DocxPartitioner:
"""Provides `.partition()` for MS-Word 2007+ (.docx) files."""
# TODO: I think we can do better on metadata.filename. Should that only be populated when a
# `metadata_filename` argument was provided to `partition_docx()`? What about when not but
# we do get a `filename` arg or a `file` arg that has a `.name` attribute?
# TODO: get last-modified date from document-properties (stored in docx package) rather than
# relying on last filesystem-write date; maybe fall-back to filesystem-date.
# TODO: improve `._element_contains_pagebreak()`. It uses substring matching on the rendered
# XML text which is error-prone and not performant. Use XPath instead with the specific
# locations a page-break can be located. Also, there can be more than one, so return a
# count instead of a boolean.
# TODO: Improve document-contains-pagebreaks algorithm to use XPath and to search for
# `w:lastRenderedPageBreak` alone. Make it independent and don't rely on anything like
# the "_element_contains_pagebreak()" function.
# TODO: Improve ._is_list_item() to include list-styles such that telling whether a paragraph is
# a list-item is encapsulated in a single place rather than distributed around the code.
# TODO: Improve ._is_list_item() method of detecting a numbered-list-item to use XPath instead
# of a substring match on the rendered XML. Include all permutations of how a numbered
# list can be manually applied (as opposed to by using a style).
# TODO: Move _SectBlockItemIterator upstream into `python-docx`. It requires too much
# domain-specific knowledge to be comfortable here and is of general use so welcome in the
# library.
# TODO: Move Paragraph._get_paragraph_runs() monkey-patch upstream to `python-docx`.
def __init__(
self,
filename: Optional[str],
@ -315,18 +288,62 @@ class _DocxPartitioner:
yield from self._iter_section_page_breaks(section_idx, section)
yield from self._iter_section_headers(section)
for block_item in _SectBlockItemIterator.iter_sect_block_items(section, self._document):
# -- a block-item can only be a Paragraph ... --
for block_item in section.iter_inner_content():
# -- a block-item can be a Paragraph or a Table, maybe others later so elif here.
# -- Paragraph is more common so check that first.
if isinstance(block_item, Paragraph):
yield from self._iter_paragraph_elements(block_item)
# -- a paragraph can contain a page-break --
yield from self._iter_maybe_paragraph_page_breaks(block_item)
# -- ... or a Table --
else:
elif isinstance( # pyright: ignore[reportUnnecessaryIsInstance]
block_item, DocxTable
):
yield from self._iter_table_element(block_item)
yield from self._iter_section_footers(section)
@staticmethod
def _convert_table_to_html(table: DocxTable) -> str:
    """Render `table` as an HTML `<table>` string.

    The text of every cell in every row is collected and handed to `tabulate` using its "html"
    table format. The first row is unconditionally treated as the column-headings row.
    """
    cell_texts_by_row = []
    for row in table.rows:
        cell_texts_by_row.append([cell.text for cell in row.cells])

    return tabulate(cell_texts_by_row, headers="firstrow", tablefmt="html")
@staticmethod
def _convert_table_to_plain_text(table: DocxTable) -> str:
    """Render `table` as plain text.

    The text of every cell in every row is collected and handed to `tabulate` using its "plain"
    table format, which places each row on its own line and pads cells with spaces so columns
    line up.

    The first row is unconditionally treated as the column-headings row, although that row is
    not visually differentiated in this format.
    """
    cell_texts_by_row = []
    for row in table.rows:
        cell_texts_by_row.append([cell.text for cell in row.cells])

    return tabulate(cell_texts_by_row, headers="firstrow", tablefmt="plain")
@lazyproperty
def _document(self) -> Document:
"""The python-docx `Document` object loaded from file or filename."""
@ -562,8 +579,8 @@ class _DocxPartitioner:
# -- to skip, for example, an empty table, or accommodate nested tables.
html_table = None
if self._infer_table_structure:
html_table = convert_ms_office_table_to_text(table, as_html=True)
text_table = convert_ms_office_table_to_text(table, as_html=False)
html_table = self._convert_table_to_html(table)
text_table = self._convert_table_to_plain_text(table)
emphasized_text_contents, emphasized_text_tags = self._table_emphasis(table)
yield Table(
@ -800,161 +817,3 @@ class _DocxPartitioner:
def _parse_category_depth_by_style_ilvl(self) -> int:
# TODO(newelh) Parsing category depth by style ilvl is not yet implemented
return 0
class _SectBlockItemIterator:
    """Generates the block-items (docx `Paragraph` or `Table` objects) in a section.

    Kept separate from `_SectBlockElementIterator`, which produces the underlying XML elements,
    because these two aspects are expected to live in different places upstream; keeping them
    apart makes them easier to transplant.
    """

    @classmethod
    def iter_sect_block_items(cls, section: Section, document: Document) -> Iterator[BlockItem]:
        """Generate each Paragraph or Table object in `section`.

        Each block element found in the section is wrapped in its proxy object, with `document`
        passed as the second argument to that proxy's constructor.
        """
        for element in _SectBlockElementIterator.iter_sect_block_elements(section._sectPr):
            # -- a `CT_P` element becomes a Paragraph, anything else is wrapped as a table --
            if isinstance(element, CT_P):
                yield Paragraph(element, document)
            else:
                yield DocxTable(element, document)
class _SectBlockElementIterator:
    """Generates the block-item XML elements in a section.

    A block-item element is a `CT_P` (paragraph) or a `CT_Tbl` (table).
    """

    # -- Compiled XPath expressions, cached on the CLASS. A new instance is created for each call
    # -- to `.iter_sect_block_elements()`, so the cache only pays off when it is stored on the
    # -- class itself rather than on the instance.
    _compiled_blocks_xpath: Optional[etree.XPath] = None
    _compiled_count_xpath: Optional[etree.XPath] = None

    def __init__(self, sectPr: CT_SectPr):
        self._sectPr = sectPr

    @classmethod
    def iter_sect_block_elements(cls, sectPr: CT_SectPr) -> Iterator[BlockElement]:
        """Generate each CT_P or CT_Tbl element within the extents governed by `sectPr`."""
        return cls(sectPr)._iter_sect_block_elements()

    def _iter_sect_block_elements(self) -> Iterator[BlockElement]:
        """Generate each CT_P or CT_Tbl element in section."""
        # -- General strategy is to get all block (<w:p> and <w:tbl>) elements from start of doc
        # -- to and including this section, then compute the count of those elements that came
        # -- from prior sections and skip that many to leave only the ones in this section. It's
        # -- possible to express this "between here and there" (end of prior section and end of
        # -- this one) concept in XPath, but it would be harder to follow because there are
        # -- special cases (e.g. no prior section) and the boundary expressions are fairly hairy.
        # -- I also believe it would be computationally more expensive than doing it this
        # -- straightforward albeit (theoretically) slightly wasteful way.
        sectPr, sectPrs = self._sectPr, self._sectPrs
        sectPr_idx = sectPrs.index(sectPr)

        # -- count block items belonging to prior sections --
        n_blks_to_skip = (
            0
            if sectPr_idx == 0
            else self._count_of_blocks_in_and_above_section(sectPrs[sectPr_idx - 1])
        )

        # -- and skip those in set of all blks from doc start to end of this section --
        yield from self._blocks_in_and_above_section(sectPr)[n_blks_to_skip:]

    def _blocks_in_and_above_section(self, sectPr: CT_SectPr) -> Sequence[BlockElement]:
        """All ps and tbls in section defined by `sectPr` and all prior sections."""
        # -- assign via the class, not `self`; assigning through `self` would create an instance
        # -- attribute and the expression would be recompiled for every new instance --
        cls = type(self)
        if cls._compiled_blocks_xpath is None:
            cls._compiled_blocks_xpath = etree.XPath(
                self._blocks_in_and_above_section_xpath,
                namespaces=nsmap,
                regexp=False,
            )
        xpath = cls._compiled_blocks_xpath
        # -- XPath callable results are Any (basically), so need a cast --
        return cast(Sequence[BlockElement], xpath(sectPr))

    @lazyproperty
    def _blocks_in_and_above_section_xpath(self) -> str:
        """XPath expr for ps and tbls in context of a sectPr and all prior sectPrs."""
        # -- "p_sect" is a section with sectPr located at w:p/w:pPr/w:sectPr. "body_sect" is a
        # -- section with sectPr located at w:body/w:sectPr. The last section in the document is a
        # -- "body_sect". All others are of the "p_sect" variety. "term" means "terminal", like
        # -- the last p or tbl in the section. "pred" means "predecessor", like a preceding p or
        # -- tbl in the section.

        # -- the terminal block in a p-based sect is the p the sectPr appears in --
        p_sect_term_block = "./parent::w:pPr/parent::w:p"
        # -- the terminus of a body-based sect is the sectPr itself (not a block) --
        body_sect_term = "self::w:sectPr[parent::w:body]"
        # -- all the ps and tbls preceding (but not including) the context node --
        pred_ps_and_tbls = "preceding-sibling::*[self::w:p | self::w:tbl]"

        # -- p_sect_term_block and body_sect_term(inus) are mutually exclusive. So the result is
        # -- either the union of nodes found by the first two selectors or the nodes found by the
        # -- last selector, never both.
        return (
            # -- include the p containing a sectPr --
            f"{p_sect_term_block}"
            # -- along with all the blocks that precede it --
            f" | {p_sect_term_block}/{pred_ps_and_tbls}"
            # -- or all the preceding blocks if sectPr is body-based (last sectPr) --
            f" | {body_sect_term}/{pred_ps_and_tbls}"
        )

    def _count_of_blocks_in_and_above_section(self, sectPr: CT_SectPr) -> int:
        """Count of ps and tbls in section defined by `sectPr` and all prior sections."""
        # -- cached on the class for the same reason as the blocks XPath above --
        cls = type(self)
        if cls._compiled_count_xpath is None:
            cls._compiled_count_xpath = etree.XPath(
                f"count({self._blocks_in_and_above_section_xpath})",
                namespaces=nsmap,
                regexp=False,
            )
        xpath = cls._compiled_count_xpath
        # -- numeric XPath results are always float, so need an int() conversion --
        return int(cast(float, xpath(sectPr)))

    @lazyproperty
    def _sectPrs(self) -> Sequence[CT_SectPr]:
        """All w:sectPr elements in document, in document-order."""
        return self._sectPr.xpath(
            "/w:document/w:body/w:p/w:pPr/w:sectPr | /w:document/w:body/w:sectPr",
        )
# == monkey-patch docx.text.Paragraph.runs ===========================================
def _get_paragraph_runs(paragraph: Paragraph) -> Sequence[Run]:
    """Get all runs in `paragraph`, including runs nested inside hyperlinks.

    Without this, the default runs function skips over hyperlinks.

    Args:
        paragraph (Paragraph): A Paragraph object.

    Returns:
        list: A list of Run objects, each having `paragraph` as its parent.
    """
    run_tag = qn("w:r")
    hyperlink_tag = qn("w:hyperlink")
    runs: List[Run] = []

    def _collect_runs(node: BaseOxmlElement) -> None:
        """Append the runs under `node` to `runs`, recursing into hyperlinks."""
        for child in node:
            if child.tag == run_tag:
                # -- the Paragraph has runs as direct children --
                runs.append(Run(cast(CT_R, child), paragraph))
            elif child.tag == hyperlink_tag:
                # -- but it also has hyperlink children that themselves contain runs --
                _collect_runs(child)

    _collect_runs(paragraph._element)
    return runs
# -- Replace `Paragraph.runs` with a read-only property that delegates to
# -- `_get_paragraph_runs()`. The lambda looks that function up by name on each access.
Paragraph.runs = property(  # pyright: ignore[reportGeneralTypeIssues]
    lambda self: _get_paragraph_runs(self),
)
# ====================================================================================