fix: reduce false-positive Title elements on Chinese text (#3836)

**Summary**
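Improve element-type mapping for Chinese text. Fixes a bug where Chinese
text would produce large numbers of false-positive `Title` elements.
The DOCX partitioner no longer falls back to the `is_possible_title()`
text heuristic, so a paragraph that matches no other rule now becomes a
generic `Text` (`UncategorizedText`) element instead of a `Title`. This
is why several expected test outputs change from `Title` to
`UncategorizedText`.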

Fixes #3084

---------

Co-authored-by: scanny <scanny@users.noreply.github.com>
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
This commit is contained in:
Steve Canny 2024-12-17 17:16:42 -08:00 committed by GitHub
parent 9a9bf4c4f5
commit 9ece0b5ad2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
15 changed files with 847 additions and 852 deletions

View File

@ -1,4 +1,4 @@
## 0.16.12-dev3 ## 0.16.12-dev4
### Enhancements ### Enhancements
@ -10,6 +10,7 @@
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
## 0.16.11 ## 0.16.11

View File

@ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
"handbook-1p.docx", "handbook-1p.docx",
{ {
("Header", None): 1, ("Header", None): 1,
("Title", 0): 1, ("UncategorizedText", 0): 6,
("Title", 1): 1,
("Title", 2): 1,
("ListItem", 3): 3, ("ListItem", 3): 3,
("NarrativeText", 4): 7, ("NarrativeText", 0): 7,
("Footer", None): 1, ("Footer", None): 1,
}, },
(0.43, 0.07, 0.65), (0.78, 0.72, 0.81),
), ),
( (
"handbook-1p.docx", "handbook-1p.docx",
{ {
("Header", None): 1, ("Header", None): 1,
("Title", 0): 6, ("UncategorizedText", 0): 6,
("NarrativeText", 0): 7, ("NarrativeText", 0): 7,
("PageBreak", None): 1, ("PageBreak", None): 1,
("Footer", None): 1, ("Footer", None): 1,

View File

@ -1286,7 +1286,7 @@ def expected_docx_elements():
Title("These are a few of my favorite things:"), Title("These are a few of my favorite things:"),
ListItem("Parrots"), ListItem("Parrots"),
ListItem("Hockey"), ListItem("Hockey"),
Title("Analysis"), Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."), NarrativeText("This is my third thought."),
Text("2023"), Text("2023"),

View File

@ -275,7 +275,7 @@ def expected_elements() -> list[Element]:
Title("These are a few of my favorite things:"), Title("These are a few of my favorite things:"),
ListItem("Parrots"), ListItem("Parrots"),
ListItem("Hockey"), ListItem("Hockey"),
Title("Analysis"), Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."), NarrativeText("This is my third thought."),
Text("2023"), Text("2023"),

View File

@ -627,7 +627,7 @@ def expected_elements() -> list[Text]:
Title("These are a few of my favorite things:"), Title("These are a few of my favorite things:"),
ListItem("Parrots"), ListItem("Parrots"),
ListItem("Hockey"), ListItem("Hockey"),
Title("Analysis"), Text("Analysis"),
NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my first thought. This is my second thought."),
NarrativeText("This is my third thought."), NarrativeText("This is my third thought."),
Text("2023"), Text("2023"),
@ -1210,7 +1210,7 @@ class Describe_DocxPartitioner:
opts_args["file_path"] = example_doc_path("page-breaks.docx") opts_args["file_path"] = example_doc_path("page-breaks.docx")
opts = DocxPartitionerOptions(**opts_args) opts = DocxPartitionerOptions(**opts_args)
expected = [ expected = [
# NOTE(scanny) - -- page 1 -- # -- page 1 --
NarrativeText( NarrativeText(
"First page, tab here:\t" "First page, tab here:\t"
"followed by line-break here:\n" "followed by line-break here:\n"
@ -1220,28 +1220,28 @@ class Describe_DocxPartitioner:
"and hard page-break here>>" "and hard page-break here>>"
), ),
PageBreak(""), PageBreak(""),
# NOTE(scanny) - -- page 2 -- # -- page 2 --
NarrativeText( NarrativeText(
"<<Text on second page. The font is big so it breaks onto third page--" "<<Text on second page. The font is big so it breaks onto third page--"
"------------------here-->> <<but break falls inside link so text stays" "------------------here-->> <<but break falls inside link so text stays"
" together." " together."
), ),
PageBreak(""), PageBreak(""),
# NOTE(scanny) - -- page 3 -- # -- page 3 --
NarrativeText("Continuous section break here>>"), NarrativeText("Continuous section break here>>"),
NarrativeText("<<followed by text on same page"), NarrativeText("<<followed by text on same page"),
NarrativeText("Odd-page section break here>>"), NarrativeText("Odd-page section break here>>"),
PageBreak(""), PageBreak(""),
# NOTE(scanny) - -- page 4 -- # -- page 4 --
PageBreak(""), PageBreak(""),
# NOTE(scanny) - -- page 5 -- # -- page 5 --
NarrativeText("<<producing two page-breaks to get from page-3 to page-5."), NarrativeText("<<producing two page-breaks to get from page-3 to page-5."),
NarrativeText( NarrativeText(
'Then text gets big again so a "natural" rendered page break happens again here>> ' 'Then text gets big again so a "natural" rendered page break happens again here>> '
), ),
PageBreak(""), PageBreak(""),
# NOTE(scanny) - -- page 6 -- # -- page 6 --
Title("<<and then more text proceeds."), Text("<<and then more text proceeds."),
] ]
elements = _DocxPartitioner.iter_document_elements(opts) elements = _DocxPartitioner.iter_document_elements(opts)

View File

@ -23,7 +23,6 @@ from unstructured.documents.elements import (
Table, Table,
TableChunk, TableChunk,
Text, Text,
Title,
) )
from unstructured.partition.docx import partition_docx from unstructured.partition.docx import partition_docx
from unstructured.partition.odt import partition_odt from unstructured.partition.odt import partition_odt
@ -44,7 +43,7 @@ def test_partition_odt_from_filename():
elements = partition_odt(example_doc_path("fake.odt")) elements = partition_odt(example_doc_path("fake.odt"))
assert elements == [ assert elements == [
Title("Lorem ipsum dolor sit amet."), Text("Lorem ipsum dolor sit amet."),
Table( Table(
"Header row Mon Wed Fri" "Header row Mon Wed Fri"
" Color Blue Red Green" " Color Blue Red Green"
@ -63,7 +62,7 @@ def test_partition_odt_from_file():
elements = partition_odt(file=f) elements = partition_odt(file=f)
assert elements == [ assert elements == [
Title("Lorem ipsum dolor sit amet."), Text("Lorem ipsum dolor sit amet."),
Table( Table(
"Header row Mon Wed Fri" "Header row Mon Wed Fri"
" Color Blue Red Green" " Color Blue Red Green"

View File

@ -23,7 +23,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "5209312022a75a31d95385fdccff68fa", "element_id": "5209312022a75a31d95385fdccff68fa",
"text": "CHAPTER 1", "text": "CHAPTER 1",
"metadata": { "metadata": {
@ -51,7 +51,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "22a23e29022f32945965002cd734a8f0", "element_id": "22a23e29022f32945965002cd734a8f0",
"text": "INTRODUCTION", "text": "INTRODUCTION",
"metadata": { "metadata": {
@ -79,7 +79,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "4c175cf543957acc4420221de28d3fca", "element_id": "4c175cf543957acc4420221de28d3fca",
"text": "CHAPTER 1 \u2013 INTRODUCTION", "text": "CHAPTER 1 \u2013 INTRODUCTION",
"metadata": { "metadata": {
@ -101,7 +101,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "77022a5264f552b223538977cd40f640", "element_id": "77022a5264f552b223538977cd40f640",
"text": "A.\tPURPOSE", "text": "A.\tPURPOSE",
"metadata": { "metadata": {
@ -189,7 +189,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "e341ffc123dd2827638aba18149c4175", "element_id": "e341ffc123dd2827638aba18149c4175",
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE", "text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
"metadata": { "metadata": {
@ -255,7 +255,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "1b11ebe52652656e0ed8c12e5969de9b", "element_id": "1b11ebe52652656e0ed8c12e5969de9b",
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t", "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
"metadata": { "metadata": {

View File

@ -23,7 +23,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "5209312022a75a31d95385fdccff68fa", "element_id": "5209312022a75a31d95385fdccff68fa",
"text": "CHAPTER 1", "text": "CHAPTER 1",
"metadata": { "metadata": {
@ -51,7 +51,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "22a23e29022f32945965002cd734a8f0", "element_id": "22a23e29022f32945965002cd734a8f0",
"text": "INTRODUCTION", "text": "INTRODUCTION",
"metadata": { "metadata": {
@ -79,7 +79,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "4c175cf543957acc4420221de28d3fca", "element_id": "4c175cf543957acc4420221de28d3fca",
"text": "CHAPTER 1 \u2013 INTRODUCTION", "text": "CHAPTER 1 \u2013 INTRODUCTION",
"metadata": { "metadata": {
@ -101,7 +101,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "77022a5264f552b223538977cd40f640", "element_id": "77022a5264f552b223538977cd40f640",
"text": "A.\tPURPOSE", "text": "A.\tPURPOSE",
"metadata": { "metadata": {
@ -189,7 +189,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "e341ffc123dd2827638aba18149c4175", "element_id": "e341ffc123dd2827638aba18149c4175",
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE", "text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
"metadata": { "metadata": {
@ -255,7 +255,7 @@
} }
}, },
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "1b11ebe52652656e0ed8c12e5969de9b", "element_id": "1b11ebe52652656e0ed8c12e5969de9b",
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t", "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
"metadata": { "metadata": {

View File

@ -1,6 +1,6 @@
[ [
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "56d531394823d81787d77a04462ed096", "element_id": "56d531394823d81787d77a04462ed096",
"text": "Lorem ipsum dolor sit amet.", "text": "Lorem ipsum dolor sit amet.",
"metadata": { "metadata": {
@ -17,6 +17,13 @@
"date_created": "1686809759.687", "date_created": "1686809759.687",
"date_modified": "1686809743.0", "date_modified": "1686809743.0",
"permissions_data": [ "permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{ {
"id": "18298851591250030956", "id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -28,6 +35,17 @@
"deleted": false, "deleted": false,
"pendingOwner": false "pendingOwner": false
}, },
{
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
},
{ {
"id": "09147371668407854156", "id": "09147371668407854156",
"displayName": "roman", "displayName": "roman",
@ -38,24 +56,6 @@
"role": "writer", "role": "writer",
"deleted": false, "deleted": false,
"pendingOwner": false "pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
} }
] ]
} }

View File

@ -1,6 +1,6 @@
[ [
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "56d531394823d81787d77a04462ed096", "element_id": "56d531394823d81787d77a04462ed096",
"text": "Lorem ipsum dolor sit amet.", "text": "Lorem ipsum dolor sit amet.",
"metadata": { "metadata": {
@ -17,6 +17,13 @@
"date_created": "1718722775.76", "date_created": "1718722775.76",
"date_modified": "1718722788.018", "date_modified": "1718722788.018",
"permissions_data": [ "permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{ {
"id": "18298851591250030956", "id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -39,13 +46,6 @@
"deleted": false, "deleted": false,
"pendingOwner": false "pendingOwner": false
}, },
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{ {
"id": "09147371668407854156", "id": "09147371668407854156",
"displayName": "roman", "displayName": "roman",

View File

@ -1,6 +1,6 @@
[ [
{ {
"type": "Title", "type": "UncategorizedText",
"element_id": "cc23ac9998df1db62b795ec4e5133ab0", "element_id": "cc23ac9998df1db62b795ec4e5133ab0",
"text": "Title", "text": "Title",
"metadata": { "metadata": {
@ -22,6 +22,13 @@
"date_created": "1686809758.931", "date_created": "1686809758.931",
"date_modified": "1686809744.0", "date_modified": "1686809744.0",
"permissions_data": [ "permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{ {
"id": "18298851591250030956", "id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -33,24 +40,6 @@
"deleted": false, "deleted": false,
"pendingOwner": false "pendingOwner": false
}, },
{
"id": "09147371668407854156",
"displayName": "roman",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
"emailAddress": "roman@unstructured.io",
"role": "writer",
"deleted": false,
"pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{ {
"id": "04774006893477068632", "id": "04774006893477068632",
"displayName": "ryan", "displayName": "ryan",
@ -61,6 +50,17 @@
"role": "owner", "role": "owner",
"deleted": false, "deleted": false,
"pendingOwner": false "pendingOwner": false
},
{
"id": "09147371668407854156",
"displayName": "roman",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
"emailAddress": "roman@unstructured.io",
"role": "writer",
"deleted": false,
"pendingOwner": false
} }
] ]
} }
@ -89,6 +89,13 @@
"date_created": "1686809758.931", "date_created": "1686809758.931",
"date_modified": "1686809744.0", "date_modified": "1686809744.0",
"permissions_data": [ "permissions_data": [
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{ {
"id": "18298851591250030956", "id": "18298851591250030956",
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
@ -100,6 +107,17 @@
"deleted": false, "deleted": false,
"pendingOwner": false "pendingOwner": false
}, },
{
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
},
{ {
"id": "09147371668407854156", "id": "09147371668407854156",
"displayName": "roman", "displayName": "roman",
@ -110,24 +128,6 @@
"role": "writer", "role": "writer",
"deleted": false, "deleted": false,
"pendingOwner": false "pendingOwner": false
},
{
"id": "anyoneWithLink",
"type": "anyone",
"kind": "drive#permission",
"role": "reader",
"allowFileDiscovery": false
},
{
"id": "04774006893477068632",
"displayName": "ryan",
"type": "user",
"kind": "drive#permission",
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
"emailAddress": "ryan@unstructured.io",
"role": "owner",
"deleted": false,
"pendingOwner": false
} }
] ]
} }

View File

@ -27,7 +27,7 @@
"eng" "eng"
], ],
"page_number": 1, "page_number": 1,
"orig_elements": "eJztlU1v2jAYx79K5PMgifNm71Z1leilIAqnFkWO/RiiJnbkOBsd4rvPhtK1E4dt0qQx7RQ/74//P1l52CFooAVly1qgjwFKODBKuIRIZAXJc8oqwDxNMkllFgmGPgSoBcsEs8zl75A/lL0eDIeD3YFp676vterLl6SHHWq18OEkiQnZr1wPA1wbUTaaM6vNsZLZjV8h3OgWQjMoBSb8os1TOKjemoHbwYB4b8CWtV0DI6F5H26YEpXWT6O4GzvHFu33bpKsG7DPnR+PWNc1tZvotgs/KzHWHaht20htWmb7kZay5uBKB6/I2M0WndEc3HXUum3Gp4gXYQNMgCml1tZ9TgM6U7fMPPuEhqn1wNbQewUQqDVa+XUsbK3PXN4HCzP0FiCYvOztq06NJof2yFX8SEgKSgkTaSogo1lW0IRgUeTuUFQVJvzyCEHbbVhffwVRenVKrpV19z0Kdz25mi1u5kGMVmdSLVsf06pD+A+xPoPSeTvnKdXQVuDVid/C/b70G6SL2jZwjmhFRERFlnIBmJBIFEwkNM8hYhHnVVX9Y0Rv7xbz6afl9eJ2endJUN/t/VNckyzPCEtlgYXDWUkc5RyqnCcYA5X48rj+LShe31fwOOAoToJfZsNzwAWN3W8uTomgKc6KIk5S8GYS0/Q/m99lczV+tLPlfDa9vzlDYvUNQ4PhWg==" "orig_elements": "eJztlV1r2zAUhv+K0fWS+NvW7kpXSG+akCZXbTD6OHJMLcnI8pYu5L9PSpqtHYGxjcEydmUdnffovDoPwg87BC1IULZqOHofoIQBwSUTEPKsKPMcEwoxS5NMYJGFnKB3AZJgCSeWOP0O+UXV68EwOMQdGNn0faNVX72IHnZIau7TSRKV5X7tzjDAtOFVqxmx2hwrid14C5ONljAxg1JgJp+0eZoMqrdmYHYwwN8GsCWya2HENesnG6I41fppFHVjt7FF+73rJJoW7HPn2yPSdW3jOjp3k4+Kj3UHaitboY0kth9pIRoGrnTwExm73rwzmoG7jqplOz5l/BA2QDiYSmht3efUoDONJObZC1qi6oHU0PsJIFA1Wns7FrbWK1f3wdIMvQUIpi++fdXpoOnheOQqvickOMYl4WnKIcNZVuCkjHmRu0VBaVyyyyMEstuQvvkMvPLTqZhW1t33OLjr6dV8ebMIIrQ+I7WkPsroIf2HWJ9B6XY7t1OpQVLw04lew/1m+hXSlXJuoNbG21965Rm6tOQh5lnKOMRlGfKC8ATnOYQkZIxS+o/Rvb1bLmYfVtfL29ndJQF+4/unGSdZnpUkFUXMHVoq4jBnQHOWxDFgEV8e478Fy9d3FzwOcRglwW9xYjnEBY7crzBKS47TOCuKKEnBh0mE0/+cfpXT1fjRzleL+ez+5gdU1l8AKnj1Ng=="
} }
}, },
{ {
@ -383,7 +383,7 @@
"eng" "eng"
], ],
"page_number": 2, "page_number": 2,
"orig_elements": "eJxVkMtOwzAQRX8l8pompGmlhB2IIJAQldp0VarIsSdpVNtj+QFBVf4dG+iC3bzPvXO4EBAgQbl25OQuIauCccp51RXLjlf5ii0L3lddWZXlmt2uC3KTEAmOcupomL+QGLQWvWHwk2swcrR2RGXbv6HDhUjksV0UeVnOx3DDAEPDW4GMOjS/m9SdooTshBIy45UCk32iOWdeWWc8c94A/5/ARKUWsODIbHaiineI50Wu01CYyDwHUj8KcF864gnVWoyBGNRlH4qnqEFNUvRoJHV2gX0/MgirPn4kDWyuDTIIdtQgRXrtxCcIqgZPB7DRIAE1kGhLh0qrvOwgelpGvoPJRfZD+u62m9c62TwlzXOd7N9emvox2TX3Tb1Lmu1+19R1vHwV24xOAJmP32EBjeE=" "orig_elements": "eJxVkMtOwzAQRX8l8pomtGmlhB2IIJAQldp0VarIsSdpVNtj+QGBKv+ODe2C3bzPvbM/ExAgQblm4OQuIcucccp52eaLlpfzJVvkvCvboiyKFbtd5eQmIRIc5dTRMH8mMWgsesPgN9dg5GDtgMo2l6H9mUjksZ3n86KYDuGGAYaGNwIZdWj+Nqk7RgnZESVkxisFJvtEc8q8ss545rwB/j+BkUotYMaR2exIFW8RT7O5TkNhJNMUSN0gwH3piCdUazEEYlCXfSieogY1StGhkdTZGXbdwCCs+viRNLC5Nsgg2FG9FOm1E58gqOo97cFGgwRUT6ItHSqN8rKF6GkR+Q5GF9kP6bvbrF+rZP2U1M9Vsnt7qavHZFvf19U2qTe7bV1V8fJV7E4FpdCjGb6B1/HKdPgBKuqS2A=="
} }
}, },
{ {
@ -572,7 +572,7 @@
"eng" "eng"
], ],
"page_number": 2, "page_number": 2,
"orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgrCoAEVOPAkRtsfyo02F8u+1oSy689yZ8bl3DlcCHARIW7SMvEbkJcvSYZqNR1NaZiU8jwHGFDJIKWMTOp2Qp4gIsJRRS/38lYRHYdDpCm61Ai1aY1qUpvgbOlyJQBbao1E6m/Un/4eGCjUrOFbUor5vUnsOFpIzCki0kxJ08o36kjhprHaVdRrY/wI6KhSHAcPKJGcqWYl4GaQq9kJH+t6T6paD/VEBT6hSvPVE7y75kixGBbITvEYtqDUDrOu2Ar/qwkViz2ZKYwU+jmwEjx+dcAROZeNoAyYEJCAbEmIprxTSiRJCpmHgW+hsYL/FR7vL5/k+32w/o8U+Xy130eY9mkdeXS9W648o3+53+XJ5vAEenvPWciD96RdNTpBK" "orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgmRRAYqceBIibI/lR5sW8e+1aVl0N+9z7+wvBDgIkLbsGHmOyFOWpcM0G4+mtMoqeBwDjClkkFLGJnQ6IQ8REWApo5b6+QsJQWnQ6RpuuQItOmM6lKb8G9pfiEAW2qNROptdj/6Ghho1KznW1KL+3aT2FCQkJxSQaCcl6OQT9Tlx0ljtaus0sP8J9FQoDgOGtUlOVLIK8TxIVewLPblePanpONgvFfCEKsU7T/Tqkg/JYlQge8Eb1IJaM8Cm6Wrwqy58JPZspjTW4O3IVvD43glP4FS2jrZggkECsiXBlvKVUjpRQfA0DHwLvQ3sl/hgd/k8L/LN9j1aFPlquYs2r9E88tX1YrV+i/JtscuXy8MNcNdcSC8YWtTdN7A8HLsefwA0I5VB"
} }
}, },
{ {

View File

@ -1 +1 @@
__version__ = "0.16.12-dev3" # pragma: no cover __version__ = "0.16.12-dev4" # pragma: no cover

View File

@ -48,7 +48,6 @@ from unstructured.partition.text_type import (
is_bulleted_text, is_bulleted_text,
is_email_address, is_email_address,
is_possible_narrative_text, is_possible_narrative_text,
is_possible_title,
is_us_city_state_zip, is_us_city_state_zip,
) )
from unstructured.partition.utils.constants import PartitionStrategy from unstructured.partition.utils.constants import PartitionStrategy
@ -412,15 +411,15 @@ class _DocxPartitioner:
) )
) )
# NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and # -- blank paragraphs are commonly used for spacing between paragraphs and do not
# do not contribute to the document-element stream. # -- contribute to the document-element stream
if not text.strip(): if not text.strip():
return return
metadata = self._paragraph_metadata(paragraph) metadata = self._paragraph_metadata(paragraph)
# NOTE(scanny) - a list-item gets some special treatment, mutating the text to remove a # -- a list-item gets some special treatment, mutating the text to remove a
# bullet-character if present. # -- bullet-character if present
if self._is_list_item(paragraph): if self._is_list_item(paragraph):
clean_text = clean_bullets(text).strip() clean_text = clean_bullets(text).strip()
if clean_text: if clean_text:
@ -431,19 +430,19 @@ class _DocxPartitioner:
) )
return return
# NOTE(scanny) - determine element-type from an explicit Word paragraph-style if possible # -- determine element-type from an explicit Word paragraph-style if possible --
TextSubCls = self._style_based_element_type(paragraph) TextSubCls = self._style_based_element_type(paragraph)
if TextSubCls: if TextSubCls:
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
return return
# NOTE(scanny) - try to recognize the element type by parsing its text # -- try to recognize the element type by parsing its text --
TextSubCls = self._parse_paragraph_text_for_element_type(paragraph) TextSubCls = self._parse_paragraph_text_for_element_type(paragraph)
if TextSubCls: if TextSubCls:
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
return return
# NOTE(scanny) - if all that fails we give it the default `Text` element-type # -- if all that fails we give it the default `Text` element-type --
yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN) yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
def _convert_table_to_html(self, table: DocxTable) -> str: def _convert_table_to_html(self, table: DocxTable) -> str:
@ -576,20 +575,20 @@ class _DocxPartitioner:
page_break = paragraph.rendered_page_breaks[0] page_break = paragraph.rendered_page_breaks[0]
# NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break # -- preceding-fragment is None when first paragraph content is a page-break --
preceding_paragraph_fragment = page_break.preceding_paragraph_fragment preceding_paragraph_fragment = page_break.preceding_paragraph_fragment
if preceding_paragraph_fragment: if preceding_paragraph_fragment:
yield preceding_paragraph_fragment yield preceding_paragraph_fragment
yield page_break yield page_break
# NOTE(scanny) - following-fragment is None when page-break is last paragraph content. # -- following-fragment is None when page-break is last paragraph content. This is
# This is probably quite rare (Word moves these to the start of the next paragraph) but # -- probably quite rare (Word moves these to the start of the next paragraph) but
# easier to check for it than prove it can't happen. # -- easier to check for it than prove it can't happen.
following_paragraph_fragment = page_break.following_paragraph_fragment following_paragraph_fragment = page_break.following_paragraph_fragment
# NOTE(scanny) - the paragraph fragment following a page-break can itself contain # -- the paragraph fragment following a page-break can itself contain another
# another page-break. This would also be quite rare, but it can happen so we just # -- page-break; this would also be quite rare, but it can happen so we just recurse
# recurse into the second fragment the same way we handled the original paragraph. # -- into the second fragment the same way we handled the original paragraph
if following_paragraph_fragment: if following_paragraph_fragment:
yield from iter_paragraph_items(following_paragraph_fragment) yield from iter_paragraph_items(following_paragraph_fragment)
@ -901,8 +900,6 @@ class _DocxPartitioner:
return EmailAddress return EmailAddress
if is_possible_narrative_text(text): if is_possible_narrative_text(text):
return NarrativeText return NarrativeText
if is_possible_title(text):
return Title
return None return None