mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-31 12:23:49 +00:00
fix: improve false-positive Title elements on Chinese text (#3836)
**Summary** Improve element-type mapping for Chinese text. Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements. Fixes #3084 --------- Co-authored-by: scanny <scanny@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
This commit is contained in:
parent
9a9bf4c4f5
commit
9ece0b5ad2
@ -1,4 +1,4 @@
|
||||
## 0.16.12-dev3
|
||||
## 0.16.12-dev4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -10,6 +10,7 @@
|
||||
|
||||
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
|
||||
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
|
||||
- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
|
||||
|
||||
## 0.16.11
|
||||
|
||||
|
@ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in
|
||||
"handbook-1p.docx",
|
||||
{
|
||||
("Header", None): 1,
|
||||
("Title", 0): 1,
|
||||
("Title", 1): 1,
|
||||
("Title", 2): 1,
|
||||
("UncategorizedText", 0): 6,
|
||||
("ListItem", 3): 3,
|
||||
("NarrativeText", 4): 7,
|
||||
("NarrativeText", 0): 7,
|
||||
("Footer", None): 1,
|
||||
},
|
||||
(0.43, 0.07, 0.65),
|
||||
(0.78, 0.72, 0.81),
|
||||
),
|
||||
(
|
||||
"handbook-1p.docx",
|
||||
{
|
||||
("Header", None): 1,
|
||||
("Title", 0): 6,
|
||||
("UncategorizedText", 0): 6,
|
||||
("NarrativeText", 0): 7,
|
||||
("PageBreak", None): 1,
|
||||
("Footer", None): 1,
|
||||
|
@ -1286,7 +1286,7 @@ def expected_docx_elements():
|
||||
Title("These are a few of my favorite things:"),
|
||||
ListItem("Parrots"),
|
||||
ListItem("Hockey"),
|
||||
Title("Analysis"),
|
||||
Text("Analysis"),
|
||||
NarrativeText("This is my first thought. This is my second thought."),
|
||||
NarrativeText("This is my third thought."),
|
||||
Text("2023"),
|
||||
|
@ -275,7 +275,7 @@ def expected_elements() -> list[Element]:
|
||||
Title("These are a few of my favorite things:"),
|
||||
ListItem("Parrots"),
|
||||
ListItem("Hockey"),
|
||||
Title("Analysis"),
|
||||
Text("Analysis"),
|
||||
NarrativeText("This is my first thought. This is my second thought."),
|
||||
NarrativeText("This is my third thought."),
|
||||
Text("2023"),
|
||||
|
@ -627,7 +627,7 @@ def expected_elements() -> list[Text]:
|
||||
Title("These are a few of my favorite things:"),
|
||||
ListItem("Parrots"),
|
||||
ListItem("Hockey"),
|
||||
Title("Analysis"),
|
||||
Text("Analysis"),
|
||||
NarrativeText("This is my first thought. This is my second thought."),
|
||||
NarrativeText("This is my third thought."),
|
||||
Text("2023"),
|
||||
@ -1210,7 +1210,7 @@ class Describe_DocxPartitioner:
|
||||
opts_args["file_path"] = example_doc_path("page-breaks.docx")
|
||||
opts = DocxPartitionerOptions(**opts_args)
|
||||
expected = [
|
||||
# NOTE(scanny) - -- page 1 --
|
||||
# -- page 1 --
|
||||
NarrativeText(
|
||||
"First page, tab here:\t"
|
||||
"followed by line-break here:\n"
|
||||
@ -1220,28 +1220,28 @@ class Describe_DocxPartitioner:
|
||||
"and hard page-break here>>"
|
||||
),
|
||||
PageBreak(""),
|
||||
# NOTE(scanny) - -- page 2 --
|
||||
# -- page 2 --
|
||||
NarrativeText(
|
||||
"<<Text on second page. The font is big so it breaks onto third page--"
|
||||
"------------------here-->> <<but break falls inside link so text stays"
|
||||
" together."
|
||||
),
|
||||
PageBreak(""),
|
||||
# NOTE(scanny) - -- page 3 --
|
||||
# -- page 3 --
|
||||
NarrativeText("Continuous section break here>>"),
|
||||
NarrativeText("<<followed by text on same page"),
|
||||
NarrativeText("Odd-page section break here>>"),
|
||||
PageBreak(""),
|
||||
# NOTE(scanny) - -- page 4 --
|
||||
# -- page 4 --
|
||||
PageBreak(""),
|
||||
# NOTE(scanny) - -- page 5 --
|
||||
# -- page 5 --
|
||||
NarrativeText("<<producing two page-breaks to get from page-3 to page-5."),
|
||||
NarrativeText(
|
||||
'Then text gets big again so a "natural" rendered page break happens again here>> '
|
||||
),
|
||||
PageBreak(""),
|
||||
# NOTE(scanny) - -- page 6 --
|
||||
Title("<<and then more text proceeds."),
|
||||
# -- page 6 --
|
||||
Text("<<and then more text proceeds."),
|
||||
]
|
||||
|
||||
elements = _DocxPartitioner.iter_document_elements(opts)
|
||||
|
@ -23,7 +23,6 @@ from unstructured.documents.elements import (
|
||||
Table,
|
||||
TableChunk,
|
||||
Text,
|
||||
Title,
|
||||
)
|
||||
from unstructured.partition.docx import partition_docx
|
||||
from unstructured.partition.odt import partition_odt
|
||||
@ -44,7 +43,7 @@ def test_partition_odt_from_filename():
|
||||
elements = partition_odt(example_doc_path("fake.odt"))
|
||||
|
||||
assert elements == [
|
||||
Title("Lorem ipsum dolor sit amet."),
|
||||
Text("Lorem ipsum dolor sit amet."),
|
||||
Table(
|
||||
"Header row Mon Wed Fri"
|
||||
" Color Blue Red Green"
|
||||
@ -63,7 +62,7 @@ def test_partition_odt_from_file():
|
||||
elements = partition_odt(file=f)
|
||||
|
||||
assert elements == [
|
||||
Title("Lorem ipsum dolor sit amet."),
|
||||
Text("Lorem ipsum dolor sit amet."),
|
||||
Table(
|
||||
"Header row Mon Wed Fri"
|
||||
" Color Blue Red Green"
|
||||
|
@ -23,7 +23,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "5209312022a75a31d95385fdccff68fa",
|
||||
"text": "CHAPTER 1",
|
||||
"metadata": {
|
||||
@ -51,7 +51,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "22a23e29022f32945965002cd734a8f0",
|
||||
"text": "INTRODUCTION",
|
||||
"metadata": {
|
||||
@ -79,7 +79,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "4c175cf543957acc4420221de28d3fca",
|
||||
"text": "CHAPTER 1 \u2013 INTRODUCTION",
|
||||
"metadata": {
|
||||
@ -101,7 +101,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "77022a5264f552b223538977cd40f640",
|
||||
"text": "A.\tPURPOSE",
|
||||
"metadata": {
|
||||
@ -189,7 +189,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "e341ffc123dd2827638aba18149c4175",
|
||||
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
|
||||
"metadata": {
|
||||
@ -255,7 +255,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
|
||||
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
|
||||
"metadata": {
|
||||
|
@ -23,7 +23,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "5209312022a75a31d95385fdccff68fa",
|
||||
"text": "CHAPTER 1",
|
||||
"metadata": {
|
||||
@ -51,7 +51,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "22a23e29022f32945965002cd734a8f0",
|
||||
"text": "INTRODUCTION",
|
||||
"metadata": {
|
||||
@ -79,7 +79,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "4c175cf543957acc4420221de28d3fca",
|
||||
"text": "CHAPTER 1 \u2013 INTRODUCTION",
|
||||
"metadata": {
|
||||
@ -101,7 +101,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "77022a5264f552b223538977cd40f640",
|
||||
"text": "A.\tPURPOSE",
|
||||
"metadata": {
|
||||
@ -189,7 +189,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "e341ffc123dd2827638aba18149c4175",
|
||||
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
|
||||
"metadata": {
|
||||
@ -255,7 +255,7 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "1b11ebe52652656e0ed8c12e5969de9b",
|
||||
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
|
||||
"metadata": {
|
||||
|
@ -1,6 +1,6 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "56d531394823d81787d77a04462ed096",
|
||||
"text": "Lorem ipsum dolor sit amet.",
|
||||
"metadata": {
|
||||
@ -17,6 +17,13 @@
|
||||
"date_created": "1686809759.687",
|
||||
"date_modified": "1686809743.0",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "anyoneWithLink",
|
||||
"type": "anyone",
|
||||
"kind": "drive#permission",
|
||||
"role": "reader",
|
||||
"allowFileDiscovery": false
|
||||
},
|
||||
{
|
||||
"id": "18298851591250030956",
|
||||
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
|
||||
@ -28,6 +35,17 @@
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "04774006893477068632",
|
||||
"displayName": "ryan",
|
||||
"type": "user",
|
||||
"kind": "drive#permission",
|
||||
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
|
||||
"emailAddress": "ryan@unstructured.io",
|
||||
"role": "owner",
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "09147371668407854156",
|
||||
"displayName": "roman",
|
||||
@ -38,24 +56,6 @@
|
||||
"role": "writer",
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "anyoneWithLink",
|
||||
"type": "anyone",
|
||||
"kind": "drive#permission",
|
||||
"role": "reader",
|
||||
"allowFileDiscovery": false
|
||||
},
|
||||
{
|
||||
"id": "04774006893477068632",
|
||||
"displayName": "ryan",
|
||||
"type": "user",
|
||||
"kind": "drive#permission",
|
||||
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
|
||||
"emailAddress": "ryan@unstructured.io",
|
||||
"role": "owner",
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
}
|
||||
]
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "56d531394823d81787d77a04462ed096",
|
||||
"text": "Lorem ipsum dolor sit amet.",
|
||||
"metadata": {
|
||||
@ -17,6 +17,13 @@
|
||||
"date_created": "1718722775.76",
|
||||
"date_modified": "1718722788.018",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "anyoneWithLink",
|
||||
"type": "anyone",
|
||||
"kind": "drive#permission",
|
||||
"role": "reader",
|
||||
"allowFileDiscovery": false
|
||||
},
|
||||
{
|
||||
"id": "18298851591250030956",
|
||||
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
|
||||
@ -39,13 +46,6 @@
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "anyoneWithLink",
|
||||
"type": "anyone",
|
||||
"kind": "drive#permission",
|
||||
"role": "reader",
|
||||
"allowFileDiscovery": false
|
||||
},
|
||||
{
|
||||
"id": "09147371668407854156",
|
||||
"displayName": "roman",
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "cc23ac9998df1db62b795ec4e5133ab0",
|
||||
"text": "Title",
|
||||
"metadata": {
|
||||
@ -22,6 +22,13 @@
|
||||
"date_created": "1686809758.931",
|
||||
"date_modified": "1686809744.0",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "anyoneWithLink",
|
||||
"type": "anyone",
|
||||
"kind": "drive#permission",
|
||||
"role": "reader",
|
||||
"allowFileDiscovery": false
|
||||
},
|
||||
{
|
||||
"id": "18298851591250030956",
|
||||
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
|
||||
@ -33,24 +40,6 @@
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "09147371668407854156",
|
||||
"displayName": "roman",
|
||||
"type": "user",
|
||||
"kind": "drive#permission",
|
||||
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
|
||||
"emailAddress": "roman@unstructured.io",
|
||||
"role": "writer",
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "anyoneWithLink",
|
||||
"type": "anyone",
|
||||
"kind": "drive#permission",
|
||||
"role": "reader",
|
||||
"allowFileDiscovery": false
|
||||
},
|
||||
{
|
||||
"id": "04774006893477068632",
|
||||
"displayName": "ryan",
|
||||
@ -61,6 +50,17 @@
|
||||
"role": "owner",
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "09147371668407854156",
|
||||
"displayName": "roman",
|
||||
"type": "user",
|
||||
"kind": "drive#permission",
|
||||
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
|
||||
"emailAddress": "roman@unstructured.io",
|
||||
"role": "writer",
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -89,6 +89,13 @@
|
||||
"date_created": "1686809758.931",
|
||||
"date_modified": "1686809744.0",
|
||||
"permissions_data": [
|
||||
{
|
||||
"id": "anyoneWithLink",
|
||||
"type": "anyone",
|
||||
"kind": "drive#permission",
|
||||
"role": "reader",
|
||||
"allowFileDiscovery": false
|
||||
},
|
||||
{
|
||||
"id": "18298851591250030956",
|
||||
"displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com",
|
||||
@ -100,6 +107,17 @@
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "04774006893477068632",
|
||||
"displayName": "ryan",
|
||||
"type": "user",
|
||||
"kind": "drive#permission",
|
||||
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
|
||||
"emailAddress": "ryan@unstructured.io",
|
||||
"role": "owner",
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "09147371668407854156",
|
||||
"displayName": "roman",
|
||||
@ -110,24 +128,6 @@
|
||||
"role": "writer",
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
},
|
||||
{
|
||||
"id": "anyoneWithLink",
|
||||
"type": "anyone",
|
||||
"kind": "drive#permission",
|
||||
"role": "reader",
|
||||
"allowFileDiscovery": false
|
||||
},
|
||||
{
|
||||
"id": "04774006893477068632",
|
||||
"displayName": "ryan",
|
||||
"type": "user",
|
||||
"kind": "drive#permission",
|
||||
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
|
||||
"emailAddress": "ryan@unstructured.io",
|
||||
"role": "owner",
|
||||
"deleted": false,
|
||||
"pendingOwner": false
|
||||
}
|
||||
]
|
||||
}
|
||||
|
@ -27,7 +27,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 1,
|
||||
"orig_elements": "eJztlU1v2jAYx79K5PMgifNm71Z1leilIAqnFkWO/RiiJnbkOBsd4rvPhtK1E4dt0qQx7RQ/74//P1l52CFooAVly1qgjwFKODBKuIRIZAXJc8oqwDxNMkllFgmGPgSoBcsEs8zl75A/lL0eDIeD3YFp676vterLl6SHHWq18OEkiQnZr1wPA1wbUTaaM6vNsZLZjV8h3OgWQjMoBSb8os1TOKjemoHbwYB4b8CWtV0DI6F5H26YEpXWT6O4GzvHFu33bpKsG7DPnR+PWNc1tZvotgs/KzHWHaht20htWmb7kZay5uBKB6/I2M0WndEc3HXUum3Gp4gXYQNMgCml1tZ9TgM6U7fMPPuEhqn1wNbQewUQqDVa+XUsbK3PXN4HCzP0FiCYvOztq06NJof2yFX8SEgKSgkTaSogo1lW0IRgUeTuUFQVJvzyCEHbbVhffwVRenVKrpV19z0Kdz25mi1u5kGMVmdSLVsf06pD+A+xPoPSeTvnKdXQVuDVid/C/b70G6SL2jZwjmhFRERFlnIBmJBIFEwkNM8hYhHnVVX9Y0Rv7xbz6afl9eJ2endJUN/t/VNckyzPCEtlgYXDWUkc5RyqnCcYA5X48rj+LShe31fwOOAoToJfZsNzwAWN3W8uTomgKc6KIk5S8GYS0/Q/m99lczV+tLPlfDa9vzlDYvUNQ4PhWg=="
|
||||
"orig_elements": "eJztlV1r2zAUhv+K0fWS+NvW7kpXSG+akCZXbTD6OHJMLcnI8pYu5L9PSpqtHYGxjcEydmUdnffovDoPwg87BC1IULZqOHofoIQBwSUTEPKsKPMcEwoxS5NMYJGFnKB3AZJgCSeWOP0O+UXV68EwOMQdGNn0faNVX72IHnZIau7TSRKV5X7tzjDAtOFVqxmx2hwrid14C5ONljAxg1JgJp+0eZoMqrdmYHYwwN8GsCWya2HENesnG6I41fppFHVjt7FF+73rJJoW7HPn2yPSdW3jOjp3k4+Kj3UHaitboY0kth9pIRoGrnTwExm73rwzmoG7jqplOz5l/BA2QDiYSmht3efUoDONJObZC1qi6oHU0PsJIFA1Wns7FrbWK1f3wdIMvQUIpi++fdXpoOnheOQqvickOMYl4WnKIcNZVuCkjHmRu0VBaVyyyyMEstuQvvkMvPLTqZhW1t33OLjr6dV8ebMIIrQ+I7WkPsroIf2HWJ9B6XY7t1OpQVLw04lew/1m+hXSlXJuoNbG21965Rm6tOQh5lnKOMRlGfKC8ATnOYQkZIxS+o/Rvb1bLmYfVtfL29ndJQF+4/unGSdZnpUkFUXMHVoq4jBnQHOWxDFgEV8e478Fy9d3FzwOcRglwW9xYjnEBY7crzBKS47TOCuKKEnBh0mE0/+cfpXT1fjRzleL+ez+5gdU1l8AKnj1Ng=="
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -383,7 +383,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 2,
|
||||
"orig_elements": "eJxVkMtOwzAQRX8l8pompGmlhB2IIJAQldp0VarIsSdpVNtj+QFBVf4dG+iC3bzPvXO4EBAgQbl25OQuIauCccp51RXLjlf5ii0L3lddWZXlmt2uC3KTEAmOcupomL+QGLQWvWHwk2swcrR2RGXbv6HDhUjksV0UeVnOx3DDAEPDW4GMOjS/m9SdooTshBIy45UCk32iOWdeWWc8c94A/5/ARKUWsODIbHaiineI50Wu01CYyDwHUj8KcF864gnVWoyBGNRlH4qnqEFNUvRoJHV2gX0/MgirPn4kDWyuDTIIdtQgRXrtxCcIqgZPB7DRIAE1kGhLh0qrvOwgelpGvoPJRfZD+u62m9c62TwlzXOd7N9emvox2TX3Tb1Lmu1+19R1vHwV24xOAJmP32EBjeE="
|
||||
"orig_elements": "eJxVkMtOwzAQRX8l8pomtGmlhB2IIJAQldp0VarIsSdpVNtj+QGBKv+ODe2C3bzPvbM/ExAgQblm4OQuIcucccp52eaLlpfzJVvkvCvboiyKFbtd5eQmIRIc5dTRMH8mMWgsesPgN9dg5GDtgMo2l6H9mUjksZ3n86KYDuGGAYaGNwIZdWj+Nqk7RgnZESVkxisFJvtEc8q8ss545rwB/j+BkUotYMaR2exIFW8RT7O5TkNhJNMUSN0gwH3piCdUazEEYlCXfSieogY1StGhkdTZGXbdwCCs+viRNLC5Nsgg2FG9FOm1E58gqOo97cFGgwRUT6ItHSqN8rKF6GkR+Q5GF9kP6bvbrF+rZP2U1M9Vsnt7qavHZFvf19U2qTe7bV1V8fJV7E4FpdCjGb6B1/HKdPgBKuqS2A=="
|
||||
}
|
||||
},
|
||||
{
|
||||
@ -572,7 +572,7 @@
|
||||
"eng"
|
||||
],
|
||||
"page_number": 2,
|
||||
"orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgrCoAEVOPAkRtsfyo02F8u+1oSy689yZ8bl3DlcCHARIW7SMvEbkJcvSYZqNR1NaZiU8jwHGFDJIKWMTOp2Qp4gIsJRRS/38lYRHYdDpCm61Ai1aY1qUpvgbOlyJQBbao1E6m/Un/4eGCjUrOFbUor5vUnsOFpIzCki0kxJ08o36kjhprHaVdRrY/wI6KhSHAcPKJGcqWYl4GaQq9kJH+t6T6paD/VEBT6hSvPVE7y75kixGBbITvEYtqDUDrOu2Ar/qwkViz2ZKYwU+jmwEjx+dcAROZeNoAyYEJCAbEmIprxTSiRJCpmHgW+hsYL/FR7vL5/k+32w/o8U+Xy130eY9mkdeXS9W648o3+53+XJ5vAEenvPWciD96RdNTpBK"
|
||||
"orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgmRRAYqceBIibI/lR5sW8e+1aVl0N+9z7+wvBDgIkLbsGHmOyFOWpcM0G4+mtMoqeBwDjClkkFLGJnQ6IQ8REWApo5b6+QsJQWnQ6RpuuQItOmM6lKb8G9pfiEAW2qNROptdj/6Ghho1KznW1KL+3aT2FCQkJxSQaCcl6OQT9Tlx0ljtaus0sP8J9FQoDgOGtUlOVLIK8TxIVewLPblePanpONgvFfCEKsU7T/Tqkg/JYlQge8Eb1IJaM8Cm6Wrwqy58JPZspjTW4O3IVvD43glP4FS2jrZggkECsiXBlvKVUjpRQfA0DHwLvQ3sl/hgd/k8L/LN9j1aFPlquYs2r9E88tX1YrV+i/JtscuXy8MNcNdcSC8YWtTdN7A8HLsefwA0I5VB"
|
||||
}
|
||||
},
|
||||
{
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.16.12-dev3" # pragma: no cover
|
||||
__version__ = "0.16.12-dev4" # pragma: no cover
|
||||
|
@ -48,7 +48,6 @@ from unstructured.partition.text_type import (
|
||||
is_bulleted_text,
|
||||
is_email_address,
|
||||
is_possible_narrative_text,
|
||||
is_possible_title,
|
||||
is_us_city_state_zip,
|
||||
)
|
||||
from unstructured.partition.utils.constants import PartitionStrategy
|
||||
@ -412,15 +411,15 @@ class _DocxPartitioner:
|
||||
)
|
||||
)
|
||||
|
||||
# NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and
|
||||
# do not contribute to the document-element stream.
|
||||
# -- blank paragraphs are commonly used for spacing between paragraphs and do not
|
||||
# -- contribute to the document-element stream
|
||||
if not text.strip():
|
||||
return
|
||||
|
||||
metadata = self._paragraph_metadata(paragraph)
|
||||
|
||||
# NOTE(scanny) - a list-item gets some special treatment, mutating the text to remove a
|
||||
# bullet-character if present.
|
||||
# -- a list-item gets some special treatment, mutating the text to remove a
|
||||
# -- bullet-character if present
|
||||
if self._is_list_item(paragraph):
|
||||
clean_text = clean_bullets(text).strip()
|
||||
if clean_text:
|
||||
@ -431,19 +430,19 @@ class _DocxPartitioner:
|
||||
)
|
||||
return
|
||||
|
||||
# NOTE(scanny) - determine element-type from an explicit Word paragraph-style if possible
|
||||
# -- determine element-type from an explicit Word paragraph-style if possible --
|
||||
TextSubCls = self._style_based_element_type(paragraph)
|
||||
if TextSubCls:
|
||||
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
||||
return
|
||||
|
||||
# NOTE(scanny) - try to recognize the element type by parsing its text
|
||||
# -- try to recognize the element type by parsing its text --
|
||||
TextSubCls = self._parse_paragraph_text_for_element_type(paragraph)
|
||||
if TextSubCls:
|
||||
yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
||||
return
|
||||
|
||||
# NOTE(scanny) - if all that fails we give it the default `Text` element-type
|
||||
# -- if all that fails we give it the default `Text` element-type --
|
||||
yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN)
|
||||
|
||||
def _convert_table_to_html(self, table: DocxTable) -> str:
|
||||
@ -576,20 +575,20 @@ class _DocxPartitioner:
|
||||
|
||||
page_break = paragraph.rendered_page_breaks[0]
|
||||
|
||||
# NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break
|
||||
# -- preceding-fragment is None when first paragraph content is a page-break --
|
||||
preceding_paragraph_fragment = page_break.preceding_paragraph_fragment
|
||||
if preceding_paragraph_fragment:
|
||||
yield preceding_paragraph_fragment
|
||||
|
||||
yield page_break
|
||||
|
||||
# NOTE(scanny) - following-fragment is None when page-break is last paragraph content.
|
||||
# This is probably quite rare (Word moves these to the start of the next paragraph) but
|
||||
# easier to check for it than prove it can't happen.
|
||||
# -- following-fragment is None when page-break is last paragraph content. This is
|
||||
# -- probably quite rare (Word moves these to the start of the next paragraph) but
|
||||
# -- easier to check for it than prove it can't happen.
|
||||
following_paragraph_fragment = page_break.following_paragraph_fragment
|
||||
# NOTE(scanny) - the paragraph fragment following a page-break can itself contain
|
||||
# another page-break. This would also be quite rare, but it can happen so we just
|
||||
# recurse into the second fragment the same way we handled the original paragraph.
|
||||
# -- the paragraph fragment following a page-break can itself contain another
|
||||
# -- page-break; this would also be quite rare, but it can happen so we just recurse
|
||||
# -- into the second fragment the same way we handled the original paragraph
|
||||
if following_paragraph_fragment:
|
||||
yield from iter_paragraph_items(following_paragraph_fragment)
|
||||
|
||||
@ -901,8 +900,6 @@ class _DocxPartitioner:
|
||||
return EmailAddress
|
||||
if is_possible_narrative_text(text):
|
||||
return NarrativeText
|
||||
if is_possible_title(text):
|
||||
return Title
|
||||
|
||||
return None
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user