fix: reduce false-positive Title elements on Chinese text (#3836)

**Summary**
Improve element-type mapping for Chinese text. Fixes a bug where Chinese
text would produce large numbers of false-positive `Title` elements.

Fixes #3084

---------

Co-authored-by: scanny <scanny@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
This commit is contained in:
Steve Canny 2024-12-17 17:16:42 -08:00 committed by GitHub
parent 9a9bf4c4f5
commit 9ece0b5ad2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 847 additions and 852 deletions

View File

@ -1,4 +1,4 @@
## 0.16.12-dev3
## 0.16.12-dev4
### Enhancements
@ -10,6 +10,7 @@
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
- **Improve element-type mapping for Chinese text.** Fixes a bug where Chinese text would produce large numbers of false-positive `Title` elements.
## 0.16.11

View File

@ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
"handbook-1p.docx",
{
("Header", None): 1,
("Title", 0): 1,
("Title", 1): 1,
("Title", 2): 1,
("UncategorizedText", 0): 6,
("ListItem", 3): 3,
("NarrativeText", 4): 7,
("NarrativeText", 0): 7,
("Footer", None): 1,
},
(0.43, 0.07, 0.65),
(0.78, 0.72, 0.81),
),
(
"handbook-1p.docx",
{
("Header", None): 1,
("Title", 0): 6,
("UncategorizedText", 0): 6,
("NarrativeText", 0): 7,
("PageBreak", None): 1,
("Footer", None): 1,

View File

@ -1286,7 +1286,7 @@ def expected_docx_elements():
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),

View File

@ -275,7 +275,7 @@ def expected_elements() -> list[Element]:
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),

View File

@ -627,7 +627,7 @@ def expected_elements() -> list[Text]:
Title("These are a few of my favorite things:"),
ListItem("Parrots"),
ListItem("Hockey"),
Title("Analysis"),
Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."),
Text("2023"),
@ -1210,7 +1210,7 @@ class Describe_DocxPartitioner:
opts_args["file_path"] = example_doc_path("page-breaks.docx")
opts = DocxPartitionerOptions(**opts_args)
expected = [
# NOTE(scanny) - -- page 1 --
# -- page 1 --
NarrativeText(
"First page, tab here:\t"
"followed by line-break here:\n"
@ -1220,28 +1220,28 @@ class Describe_DocxPartitioner:
"and hard page-break here>>"
),
PageBreak(""),
# NOTE(scanny) - -- page 2 --
# -- page 2 --
NarrativeText(
"<<Text on second page. The font is big so it breaks onto third page--"
"------------------here-->> <<but break falls inside link so text stays"
" together."
),
PageBreak(""),
# NOTE(scanny) - -- page 3 --
# -- page 3 --
NarrativeText("Continuous section break here>>"),
NarrativeText("<<followed by text on same page"),
NarrativeText("Odd-page section break here>>"),
PageBreak(""),
# NOTE(scanny) - -- page 4 --
# -- page 4 --
PageBreak(""),
# NOTE(scanny) - -- page 5 --
# -- page 5 --
NarrativeText("<<producing two page-breaks to get from page-3 to page-5."),
NarrativeText(
'Then text gets big again so a "natural" rendered page break happens again here>> '
),
PageBreak(""),
# NOTE(scanny) - -- page 6 --
Title("<<and then more text proceeds."),
# -- page 6 --
Text("<<and then more text proceeds."),
]
elements = _DocxPartitioner.iter_document_elements(opts)

View File

@ -23,7 +23,6 @@ from unstructured.documents.elements import (
Table,
TableChunk,
Text,
Title,
)
from unstructured.partition.docx import partition_docx
from unstructured.partition.odt import partition_odt
@ -44,7 +43,7 @@ def test_partition_odt_from_filename():
elements = partition_odt(example_doc_path("fake.odt"))
assert elements == [
Title("Lorem ipsum dolor sit amet."),
Text("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"
@ -63,7 +62,7 @@ def test_partition_odt_from_file():
elements = partition_odt(file=f)
assert elements == [
Title("Lorem ipsum dolor sit amet."),
Text("Lorem ipsum dolor sit amet."),
Table(
"Header row Mon Wed Fri"
" Color Blue Red Green"

View File

@ -23,7 +23,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "5209312022a75a31d95385fdccff68fa",
"text": "CHAPTER 1",
"metadata": {
@ -51,7 +51,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "22a23e29022f32945965002cd734a8f0",
"text": "INTRODUCTION",
"metadata": {
@ -79,7 +79,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "4c175cf543957acc4420221de28d3fca",
"text": "CHAPTER 1 \u2013 INTRODUCTION",
"metadata": {
@ -101,7 +101,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "77022a5264f552b223538977cd40f640",
"text": "A.\tPURPOSE",
"metadata": {
@ -189,7 +189,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "e341ffc123dd2827638aba18149c4175",
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
"metadata": {
@ -255,7 +255,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
"metadata": {

View File

@ -23,7 +23,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "5209312022a75a31d95385fdccff68fa",
"text": "CHAPTER 1",
"metadata": {
@ -51,7 +51,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "22a23e29022f32945965002cd734a8f0",
"text": "INTRODUCTION",
"metadata": {
@ -79,7 +79,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "4c175cf543957acc4420221de28d3fca",
"text": "CHAPTER 1 \u2013 INTRODUCTION",
"metadata": {
@ -101,7 +101,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "77022a5264f552b223538977cd40f640",
"text": "A.\tPURPOSE",
"metadata": {
@ -189,7 +189,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "e341ffc123dd2827638aba18149c4175",
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
"metadata": {
@ -255,7 +255,7 @@
}
},
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
"metadata": {

View File

@ -1,6 +1,6 @@
[
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "56d531394823d81787d77a04462ed096",
"text": "Lorem ipsum dolor sit amet.",
"metadata": {
@ -17,6 +17,13 @@
"date_created": "1686809759.687",
"date_modified": "1686809743.0",
"permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -28,6 +35,17 @@
"deleted": false,
"pendingOwner": false
},
{
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
},
{
"id": "09147371668407854156",
"displayName": "roman",
@ -38,24 +56,6 @@
"role": "writer",
"deleted": false,
"pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
}
]
}

View File

@ -1,6 +1,6 @@
[
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "56d531394823d81787d77a04462ed096",
"text": "Lorem ipsum dolor sit amet.",
"metadata": {
@ -17,6 +17,13 @@
"date_created": "1718722775.76",
"date_modified": "1718722788.018",
"permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -39,13 +46,6 @@
"deleted": false,
"pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "09147371668407854156",
"displayName": "roman",

View File

@ -1,6 +1,6 @@
[
{
"type": "Title",
"type": "UncategorizedText",
"element_id": "cc23ac9998df1db62b795ec4e5133ab0",
"text": "Title",
"metadata": {
@ -22,6 +22,13 @@
"date_created": "1686809758.931",
"date_modified": "1686809744.0",
"permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -33,24 +40,6 @@
"deleted": false,
"pendingOwner": false
},
{
"id": "09147371668407854156",
"displayName": "roman",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
"emailAddress": "roman@unstructured.io",
"role": "writer",
"deleted": false,
"pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "04774006893477068632",
"displayName": "ryan",
@ -61,6 +50,17 @@
"role": "owner",
"deleted": false,
"pendingOwner": false
},
{
"id": "09147371668407854156",
"displayName": "roman",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
"emailAddress": "roman@unstructured.io",
"role": "writer",
"deleted": false,
"pendingOwner": false
}
]
}
@ -89,6 +89,13 @@
"date_created": "1686809758.931",
"date_modified": "1686809744.0",
"permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -100,6 +107,17 @@
"deleted": false,
"pendingOwner": false
},
{
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
},
{
"id": "09147371668407854156",
"displayName": "roman",
@ -110,24 +128,6 @@
"role": "writer",
"deleted": false,
"pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
}
]
}

View File

@ -27,7 +27,7 @@
"eng"
],
"page_number": 1,
"orig_elements": "eJztlU1v2jAYx79K5PMgifNm71Z1leilIAqnFkWO/RiiJnbkOBsd4rvPhtK1E4dt0qQx7RQ/74//P1l52CFooAVly1qgjwFKODBKuIRIZAXJc8oqwDxNMkllFgmGPgSoBcsEs8zl75A/lL0eDIeD3YFp676vterLl6SHHWq18OEkiQnZr1wPA1wbUTaaM6vNsZLZjV8h3OgWQjMoBSb8os1TOKjemoHbwYB4b8CWtV0DI6F5H26YEpXWT6O4GzvHFu33bpKsG7DPnR+PWNc1tZvotgs/KzHWHaht20htWmb7kZay5uBKB6/I2M0WndEc3HXUum3Gp4gXYQNMgCml1tZ9TgM6U7fMPPuEhqn1wNbQewUQqDVa+XUsbK3PXN4HCzP0FiCYvOztq06NJof2yFX8SEgKSgkTaSogo1lW0IRgUeTuUFQVJvzyCEHbbVhffwVRenVKrpV19z0Kdz25mi1u5kGMVmdSLVsf06pD+A+xPoPSeTvnKdXQVuDVid/C/b70G6SL2jZwjmhFRERFlnIBmJBIFEwkNM8hYhHnVVX9Y0Rv7xbz6afl9eJ2endJUN/t/VNckyzPCEtlgYXDWUkc5RyqnCcYA5X48rj+LShe31fwOOAoToJfZsNzwAWN3W8uTomgKc6KIk5S8GYS0/Q/m99lczV+tLPlfDa9vzlDYvUNQ4PhWg=="
"orig_elements": "eJztlV1r2zAUhv+K0fWS+NvW7kpXSG+akCZXbTD6OHJMLcnI8pYu5L9PSpqtHYGxjcEydmUdnffovDoPwg87BC1IULZqOHofoIQBwSUTEPKsKPMcEwoxS5NMYJGFnKB3AZJgCSeWOP0O+UXV68EwOMQdGNn0faNVX72IHnZIau7TSRKV5X7tzjDAtOFVqxmx2hwrid14C5ONljAxg1JgJp+0eZoMqrdmYHYwwN8GsCWya2HENesnG6I41fppFHVjt7FF+73rJJoW7HPn2yPSdW3jOjp3k4+Kj3UHaitboY0kth9pIRoGrnTwExm73rwzmoG7jqplOz5l/BA2QDiYSmht3efUoDONJObZC1qi6oHU0PsJIFA1Wns7FrbWK1f3wdIMvQUIpi++fdXpoOnheOQqvickOMYl4WnKIcNZVuCkjHmRu0VBaVyyyyMEstuQvvkMvPLTqZhW1t33OLjr6dV8ebMIIrQ+I7WkPsroIf2HWJ9B6XY7t1OpQVLw04lew/1m+hXSlXJuoNbG21965Rm6tOQh5lnKOMRlGfKC8ATnOYQkZIxS+o/Rvb1bLmYfVtfL29ndJQF+4/unGSdZnpUkFUXMHVoq4jBnQHOWxDFgEV8e478Fy9d3FzwOcRglwW9xYjnEBY7crzBKS47TOCuKKEnBh0mE0/+cfpXT1fjRzleL+ez+5gdU1l8AKnj1Ng=="
}
},
{
@ -383,7 +383,7 @@
"eng"
],
"page_number": 2,
"orig_elements": "eJxVkMtOwzAQRX8l8pompGmlhB2IIJAQldp0VarIsSdpVNtj+QFBVf4dG+iC3bzPvXO4EBAgQbl25OQuIauCccp51RXLjlf5ii0L3lddWZXlmt2uC3KTEAmOcupomL+QGLQWvWHwk2swcrR2RGXbv6HDhUjksV0UeVnOx3DDAEPDW4GMOjS/m9SdooTshBIy45UCk32iOWdeWWc8c94A/5/ARKUWsODIbHaiineI50Wu01CYyDwHUj8KcF864gnVWoyBGNRlH4qnqEFNUvRoJHV2gX0/MgirPn4kDWyuDTIIdtQgRXrtxCcIqgZPB7DRIAE1kGhLh0qrvOwgelpGvoPJRfZD+u62m9c62TwlzXOd7N9emvox2TX3Tb1Lmu1+19R1vHwV24xOAJmP32EBjeE="
"orig_elements": "eJxVkMtOwzAQRX8l8pomtGmlhB2IIJAQldp0VarIsSdpVNtj+QGBKv+ODe2C3bzPvbM/ExAgQblm4OQuIcucccp52eaLlpfzJVvkvCvboiyKFbtd5eQmIRIc5dTRMH8mMWgsesPgN9dg5GDtgMo2l6H9mUjksZ3n86KYDuGGAYaGNwIZdWj+Nqk7RgnZESVkxisFJvtEc8q8ss545rwB/j+BkUotYMaR2exIFW8RT7O5TkNhJNMUSN0gwH3piCdUazEEYlCXfSieogY1StGhkdTZGXbdwCCs+viRNLC5Nsgg2FG9FOm1E58gqOo97cFGgwRUT6ItHSqN8rKF6GkR+Q5GF9kP6bvbrF+rZP2U1M9Vsnt7qavHZFvf19U2qTe7bV1V8fJV7E4FpdCjGb6B1/HKdPgBKuqS2A=="
}
},
{
@ -572,7 +572,7 @@
"eng"
],
"page_number": 2,
"orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgrCoAEVOPAkRtsfyo02F8u+1oSy689yZ8bl3DlcCHARIW7SMvEbkJcvSYZqNR1NaZiU8jwHGFDJIKWMTOp2Qp4gIsJRRS/38lYRHYdDpCm61Ai1aY1qUpvgbOlyJQBbao1E6m/Un/4eGCjUrOFbUor5vUnsOFpIzCki0kxJ08o36kjhprHaVdRrY/wI6KhSHAcPKJGcqWYl4GaQq9kJH+t6T6paD/VEBT6hSvPVE7y75kixGBbITvEYtqDUDrOu2Ar/qwkViz2ZKYwU+jmwEjx+dcAROZeNoAyYEJCAbEmIprxTSiRJCpmHgW+hsYL/FR7vL5/k+32w/o8U+Xy130eY9mkdeXS9W648o3+53+XJ5vAEenvPWciD96RdNTpBK"
"orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgmRRAYqceBIibI/lR5sW8e+1aVl0N+9z7+wvBDgIkLbsGHmOyFOWpcM0G4+mtMoqeBwDjClkkFLGJnQ6IQ8REWApo5b6+QsJQWnQ6RpuuQItOmM6lKb8G9pfiEAW2qNROptdj/6Ghho1KznW1KL+3aT2FCQkJxSQaCcl6OQT9Tlx0ljtaus0sP8J9FQoDgOGtUlOVLIK8TxIVewLPblePanpONgvFfCEKsU7T/Tqkg/JYlQge8Eb1IJaM8Cm6Wrwqy58JPZspjTW4O3IVvD43glP4FS2jrZggkECsiXBlvKVUjpRQfA0DHwLvQ3sl/hgd/k8L/LN9j1aFPlquYs2r9E88tX1YrV+i/JtscuXy8MNcNdcSC8YWtTdN7A8HLsefwA0I5VB"
}
},
{

View File

@ -1 +1 @@
__version__ = "0.16.12-dev3" # pragma: no cover
__version__ = "0.16.12-dev4" # pragma: no cover

View File

@ -48,7 +48,6 @@ from unstructured.partition.text_type import (
is_bulleted_text,
is_email_address,
is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip,
)
from unstructured.partition.utils.constants import PartitionStrategy
@ -412,15 +411,15 @@ class _DocxPartitioner:
)
)
# NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and
# do not contribute to the document-element stream.
# -- blank paragraphs are commonly used for spacing between paragraphs and do not
# -- contribute to the document-element stream
if not text.strip():
return
metadata = self._paragraph_metadata(paragraph)
# NOTE(scanny) - a list-item gets some special treatment, mutating the text to remove a
# bullet-character if present.
# -- a list-item gets some special treatment, mutating the text to remove a
# -- bullet-character if present
if self._is_list_item(paragraph):
clean_text = clean_bullets(text).strip()
if clean_text:
@ -431,19 +430,19 @@ class _DocxPartitioner:
)
return
# NOTE(scanny) - determine element-type from an explicit Word paragraph-style if possible
# -- determine element-type from an explicit Word paragraph-style if possible --
TextSubCls = self._style_based_element_type(paragraph)
if TextSubCls:
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
return
# NOTE(scanny) - try to recognize the element type by parsing its text
# -- try to recognize the element type by parsing its text --
TextSubCls = self._parse_paragraph_text_for_element_type(paragraph)
if TextSubCls:
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
return
# NOTE(scanny) - if all that fails we give it the default `Text` element-type
# -- if all that fails we give it the default `Text` element-type --
yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
def _convert_table_to_html(self, table: DocxTable) -> str:
@ -576,20 +575,20 @@ class _DocxPartitioner:
page_break = paragraph.rendered_page_breaks[0]
# NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break
# -- preceding-fragment is None when first paragraph content is a page-break --
preceding_paragraph_fragment = page_break.preceding_paragraph_fragment
if preceding_paragraph_fragment:
yield preceding_paragraph_fragment
yield page_break
# NOTE(scanny) - following-fragment is None when page-break is last paragraph content.
# This is probably quite rare (Word moves these to the start of the next paragraph) but
# easier to check for it than prove it can't happen.
# -- following-fragment is None when page-break is last paragraph content. This is
# -- probably quite rare (Word moves these to the start of the next paragraph) but
# -- easier to check for it than prove it can't happen.
following_paragraph_fragment = page_break.following_paragraph_fragment
# NOTE(scanny) - the paragraph fragment following a page-break can itself contain
# another page-break. This would also be quite rare, but it can happen so we just
# recurse into the second fragment the same way we handled the original paragraph.
# -- the paragraph fragment following a page-break can itself contain another
# -- page-break; this would also be quite rare, but it can happen so we just recurse
# -- into the second fragment the same way we handled the original paragraph
if following_paragraph_fragment:
yield from iter_paragraph_items(following_paragraph_fragment)
@ -901,8 +900,6 @@ class _DocxPartitioner:
return EmailAddress
if is_possible_narrative_text(text):
return NarrativeText
if is_possible_title(text):
return Title
return None