fix(docx): improve page-break detection (#2036)

Page breaks are reliably indicated by `w:lastRenderedPageBreak` elements
present in the document XML. "Hard" page-breaks inserted by the author do
NOT reliably indicate page breaks and, when present, are redundant with a
`w:lastRenderedPageBreak` element, so they cause over-counting if used.

Use rendered page-breaks only.
This commit is contained in:
Steve Canny 2023-11-09 12:34:30 -08:00 committed by GitHub
parent 3fe480799a
commit d06bcc41bb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 74 additions and 83 deletions

View File

@ -14,6 +14,7 @@
* **Fix ingest partition parameters not being passed to the api.** When using the --partition-by-api flag via unstructured-ingest, none of the partition arguments are forwarded, meaning that these options are disregarded. With this change, we now pass through all of the relevant partition arguments to the api. This allows a user to specify all of the same partition arguments they would locally and have them respected when specifying --partition-by-api.
* **Support tables in section-less DOCX.** Generalize solution for MS Chat Transcripts exported as DOCX by including tables in the partitioned output when present.
* **Improve DOCX page-break detection.** DOCX page breaks are reliably indicated by `w:lastRenderedPageBreak` elements present in the document XML. "Hard" page-breaks inserted by the author do NOT reliably indicate page breaks and, when present, are redundant with a `w:lastRenderedPageBreak` element, so they cause over-counting if used. Use rendered page-breaks only.
## 0.10.29

Binary file not shown.

Binary file not shown.

View File

@ -232,28 +232,46 @@ def test_partition_docx_grabs_header_and_footer():
assert element.metadata.filename == "handbook-1p.docx"
def test_partition_docx_includes_pages_if_present():
elements = cast(
List[Text], partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=False)
# -- page-break behaviors ------------------------------------------------------------------------
def test_partition_docx_includes_neither_page_breaks_nor_numbers_when_rendered_breaks_not_present():
"""Hard page-breaks by themselves are not enough to locate page-breaks in a document.
In particular, they are redundant when rendered page-breaks are present, which they usually are
in a native Word document, so lead to double-counting those page-breaks. When rendered page
breaks are *not* present, only a small fraction will be represented by hard page-breaks so hard
breaks are a false-positive and will generally produce incorrect page numbers.
"""
elements = partition_docx(
example_doc_path("handbook-1p-no-rendered-page-breaks.docx"), include_page_breaks=True
)
assert "PageBreak" not in [elem.category for elem in elements]
assert "PageBreak" not in [type(e).__name__ for e in elements]
assert all(e.metadata.page_number is None for e in elements)
def test_partition_docx_includes_page_numbers_when_page_break_elements_are_suppressed():
"""Page-number metadata is not suppressed when `include_page_breaks` arg is False.
Only inclusion of PageBreak elements is affected by that option.
"""
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=False)
assert "PageBreak" not in [type(e).__name__ for e in elements]
assert elements[1].metadata.page_number == 1
assert elements[-2].metadata.page_number == 2
for element in elements:
assert element.metadata.filename == "handbook-1p.docx"
def test_partition_docx_includes_page_breaks():
elements = cast(
List[Text], partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True)
)
def test_partition_docx_includes_page_break_elements_when_so_instructed():
elements = partition_docx(example_doc_path("handbook-1p.docx"), include_page_breaks=True)
assert "PageBreak" in [elem.category for elem in elements]
assert "PageBreak" in [type(e).__name__ for e in elements]
assert elements[1].metadata.page_number == 1
assert elements[-2].metadata.page_number == 2
for element in elements:
assert element.metadata.filename == "handbook-1p.docx"
# ------------------------------------------------------------------------------------------------
def test_partition_docx_detects_lists():

View File

@ -39,7 +39,6 @@
"languages": [
"eng"
],
"page_number": 1,
"emphasized_text_contents": [
"CHAPTER 1"
],
@ -67,7 +66,6 @@
"languages": [
"eng"
],
"page_number": 1,
"emphasized_text_contents": [
"INTRODUCTION"
],
@ -94,8 +92,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "CHAPTER 1 INTRODUCTION"
},
@ -116,8 +113,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "A.\tPURPOSE"
},
@ -138,8 +134,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "The United States Trustee appoints and supervises standing trustees and monitors and supervises cases under chapter 13 of title 11 of the United States Code. 28 U.S.C. § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586, establishes or clarifies the position of the United States Trustee Program (Program) on the duties owed by a standing trustee to the debtors, creditors, other parties in interest, and the United States Trustee. The Handbook does not present a full and complete statement of the law; it should not be used as a substitute for legal research and analysis. The standing trustee must be familiar with relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11 U.S.C. § 321, 28 U.S.C. § 586, 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in this Handbook but these are not considered mandatory."
},
@ -160,8 +155,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "Nothing in this Handbook should be construed to excuse the standing trustee from complying with all duties imposed by the Bankruptcy Code and Rules, local rules, and orders of the court. The standing trustee should notify the United States Trustee whenever the provision of the Handbook conflicts with the local rules or orders of the court. The standing trustee is accountable for all duties set forth in this Handbook, but need not personally perform any duty unless otherwise indicated. All statutory references in this Handbook refer to the Bankruptcy Code, 11 U.S.C. § 101 et seq., unless otherwise indicated."
},
@ -206,7 +200,7 @@
"eng"
]
},
"text": ""
"text": "This Handbook does not create additional rights against the standing trustee or United States Trustee in favor of other parties."
},
{
"type": "Title",
@ -225,8 +219,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE"
},
@ -247,8 +240,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the responsibilities for daytoday administration of cases. Debtors, creditors, and third parties with adverse interests to the trustee were concerned that the court, which previously appointed and supervised the trustee, would not impartially adjudicate their rights as adversaries of that trustee. To address these concerns, judicial and administrative functions within the bankruptcy system were bifurcated."
},
@ -269,8 +261,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "Many administrative functions formerly performed by the court were placed within the Department of Justice through the creation of the Program. Among the administrative functions assigned to the United States Trustee were the appointment and supervision of chapter 13 trustees./ This Handbook is issued under the authority of the Programs enabling statutes. "
},
@ -291,8 +282,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t"
},
@ -313,8 +303,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The standing trustee is more than a mere disbursing agent. The standing trustee must be personally involved in the trustee operation. If the standing trustee is or becomes unable to perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. § 58.3(b)."
},
@ -335,8 +324,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "Although this Handbook is not intended to be a complete statutory reference, the standing trustees primary statutory duties are set forth in 11 U.S.C. § 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. § 704. These duties include, but are not limited to, the following:"
},
@ -362,4 +350,4 @@
},
"text": "Copyright"
}
]
]

View File

@ -35,7 +35,6 @@
"languages": [
"eng"
],
"page_number": 1,
"emphasized_text_contents": [
"CHAPTER 1"
],
@ -61,7 +60,6 @@
"languages": [
"eng"
],
"page_number": 1,
"emphasized_text_contents": [
"INTRODUCTION"
],
@ -86,8 +84,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "CHAPTER 1 INTRODUCTION"
},
@ -106,8 +103,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "A.\tPURPOSE"
},
@ -126,8 +122,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "The United States Trustee appoints and supervises standing trustees and monitors and supervises cases under chapter 13 of title 11 of the United States Code. 28 U.S.C. § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586, establishes or clarifies the position of the United States Trustee Program (Program) on the duties owed by a standing trustee to the debtors, creditors, other parties in interest, and the United States Trustee. The Handbook does not present a full and complete statement of the law; it should not be used as a substitute for legal research and analysis. The standing trustee must be familiar with relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11 U.S.C. § 321, 28 U.S.C. § 586, 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in this Handbook but these are not considered mandatory."
},
@ -146,8 +141,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
]
},
"text": "Nothing in this Handbook should be construed to excuse the standing trustee from complying with all duties imposed by the Bankruptcy Code and Rules, local rules, and orders of the court. The standing trustee should notify the United States Trustee whenever the provision of the Handbook conflicts with the local rules or orders of the court. The standing trustee is accountable for all duties set forth in this Handbook, but need not personally perform any duty unless otherwise indicated. All statutory references in this Handbook refer to the Bankruptcy Code, 11 U.S.C. § 101 et seq., unless otherwise indicated."
},
@ -188,7 +182,7 @@
"eng"
]
},
"text": ""
"text": "This Handbook does not create additional rights against the standing trustee or United States Trustee in favor of other parties."
},
{
"type": "Title",
@ -205,8 +199,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE"
},
@ -225,8 +218,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the responsibilities for daytoday administration of cases. Debtors, creditors, and third parties with adverse interests to the trustee were concerned that the court, which previously appointed and supervised the trustee, would not impartially adjudicate their rights as adversaries of that trustee. To address these concerns, judicial and administrative functions within the bankruptcy system were bifurcated."
},
@ -245,8 +237,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "Many administrative functions formerly performed by the court were placed within the Department of Justice through the creation of the Program. Among the administrative functions assigned to the United States Trustee were the appointment and supervision of chapter 13 trustees./ This Handbook is issued under the authority of the Programs enabling statutes. "
},
@ -265,8 +256,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t"
},
@ -285,8 +275,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The standing trustee is more than a mere disbursing agent. The standing trustee must be personally involved in the trustee operation. If the standing trustee is or becomes unable to perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. § 58.3(b)."
},
@ -305,8 +294,7 @@
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
]
},
"text": "Although this Handbook is not intended to be a complete statutory reference, the standing trustees primary statutory duties are set forth in 11 U.S.C. § 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. § 704. These duties include, but are not limited to, the following:"
},
@ -330,4 +318,4 @@
},
"text": "Copyright"
}
]
]

View File

@ -27,7 +27,6 @@ from docx.document import Document
from docx.enum.section import WD_SECTION_START
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.oxml.xmlchemy import BaseOxmlElement
from docx.section import Section, _Footer, _Header
from docx.table import Table as DocxTable
from docx.table import _Cell, _Row
@ -411,8 +410,24 @@ class _DocxPartitioner:
@lazyproperty
def _document_contains_pagebreaks(self) -> bool:
"""True when there is at least one page-break detected in the document."""
return self._element_contains_pagebreak(self._document.element)
"""True when there is at least one page-break detected in the document.
Only `w:lastRenderedPageBreak` elements reliably indicate a page-break. These are reliably
inserted by Microsoft Word, but probably don't appear in documents converted into .docx
format from for example .odt format.
"""
xpath = (
# NOTE(scanny) - w:lastRenderedPageBreak (lrpb) is run (w:r) inner content. `w:r` can
# appear in a paragraph (w:p). w:r can also appear in a hyperlink (w:hyperlink), which
# is w:p inner-content and both of these can occur inside a table-cell as well as the
# document body
"./w:body/w:p/w:r/w:lastRenderedPageBreak"
" | ./w:body/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
" | ./w:body/w:tbl/w:tr/w:tc/w:p/w:r/w:lastRenderedPageBreak"
" | ./w:body/w:tbl/w:tr/w:tc/w:p/w:hyperlink/w:r/w:lastRenderedPageBreak"
)
return bool(self._document.element.xpath(xpath))
@lazyproperty
def _document_contains_sections(self) -> bool:
@ -424,24 +439,6 @@ class _DocxPartitioner:
"""
return bool(self._document.sections)
def _element_contains_pagebreak(self, element: BaseOxmlElement) -> bool:
"""True when `element` contains a page break.
Checks for both "hard" page breaks (page breaks explicitly inserted by the user)
and "soft" page breaks, which are sometimes inserted by the MS Word renderer.
Note that soft page breaks aren't always present. Whether or not pages are
tracked may depend on your Word renderer.
"""
page_break_indicators = [
["w:br", 'type="page"'], # "Hard" page break inserted by user
["lastRenderedPageBreak"], # "Soft" page break inserted by renderer
]
if hasattr(element, "xml"):
for indicators in page_break_indicators:
if all(indicator in element.xml for indicator in indicators):
return True
return False
def _increment_page_number(self) -> Iterator[PageBreak]:
"""Increment page-number by 1 and generate a PageBreak element if enabled."""
self._page_counter += 1
@ -509,7 +506,6 @@ class _DocxPartitioner:
def has_page_break_implementation_we_have_so_far() -> bool:
"""Needs to become more sophisticated."""
page_break_indicators = [
["w:br", 'type="page"'], # "Hard" page break inserted by user
["lastRenderedPageBreak"], # "Soft" page break inserted by renderer
]
for indicators in page_break_indicators: