mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-26 15:42:15 +00:00 
			
		
		
		
	fix: improve false-positive Title elements on Chinese text (#3836)
**Summary** Improve element-type mapping for Chinese text. Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements. Fixes #3084 --------- Co-authored-by: scanny <scanny@users.noreply.github.com> Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									9a9bf4c4f5
								
							
						
					
					
						commit
						9ece0b5ad2
					
				| @ -1,4 +1,4 @@ | |||||||
| ## 0.16.12-dev3 | ## 0.16.12-dev4 | ||||||
| 
 | 
 | ||||||
| ### Enhancements | ### Enhancements | ||||||
| 
 | 
 | ||||||
| @ -10,6 +10,7 @@ | |||||||
| 
 | 
 | ||||||
| - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. | - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. | ||||||
| - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. | - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. | ||||||
|  | - **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements. | ||||||
| 
 | 
 | ||||||
| ## 0.16.11 | ## 0.16.11 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in | |||||||
|             "handbook-1p.docx", |             "handbook-1p.docx", | ||||||
|             { |             { | ||||||
|                 ("Header", None): 1, |                 ("Header", None): 1, | ||||||
|                 ("Title", 0): 1, |                 ("UncategorizedText", 0): 6, | ||||||
|                 ("Title", 1): 1, |  | ||||||
|                 ("Title", 2): 1, |  | ||||||
|                 ("ListItem", 3): 3, |                 ("ListItem", 3): 3, | ||||||
|                 ("NarrativeText", 4): 7, |                 ("NarrativeText", 0): 7, | ||||||
|                 ("Footer", None): 1, |                 ("Footer", None): 1, | ||||||
|             }, |             }, | ||||||
|             (0.43, 0.07, 0.65), |             (0.78, 0.72, 0.81), | ||||||
|         ), |         ), | ||||||
|         ( |         ( | ||||||
|             "handbook-1p.docx", |             "handbook-1p.docx", | ||||||
|             { |             { | ||||||
|                 ("Header", None): 1, |                 ("Header", None): 1, | ||||||
|                 ("Title", 0): 6, |                 ("UncategorizedText", 0): 6, | ||||||
|                 ("NarrativeText", 0): 7, |                 ("NarrativeText", 0): 7, | ||||||
|                 ("PageBreak", None): 1, |                 ("PageBreak", None): 1, | ||||||
|                 ("Footer", None): 1, |                 ("Footer", None): 1, | ||||||
|  | |||||||
| @ -1286,7 +1286,7 @@ def expected_docx_elements(): | |||||||
|         Title("These are a few of my favorite things:"), |         Title("These are a few of my favorite things:"), | ||||||
|         ListItem("Parrots"), |         ListItem("Parrots"), | ||||||
|         ListItem("Hockey"), |         ListItem("Hockey"), | ||||||
|         Title("Analysis"), |         Text("Analysis"), | ||||||
|         NarrativeText("This is my first thought. This is my second thought."), |         NarrativeText("This is my first thought. This is my second thought."), | ||||||
|         NarrativeText("This is my third thought."), |         NarrativeText("This is my third thought."), | ||||||
|         Text("2023"), |         Text("2023"), | ||||||
|  | |||||||
| @ -275,7 +275,7 @@ def expected_elements() -> list[Element]: | |||||||
|         Title("These are a few of my favorite things:"), |         Title("These are a few of my favorite things:"), | ||||||
|         ListItem("Parrots"), |         ListItem("Parrots"), | ||||||
|         ListItem("Hockey"), |         ListItem("Hockey"), | ||||||
|         Title("Analysis"), |         Text("Analysis"), | ||||||
|         NarrativeText("This is my first thought. This is my second thought."), |         NarrativeText("This is my first thought. This is my second thought."), | ||||||
|         NarrativeText("This is my third thought."), |         NarrativeText("This is my third thought."), | ||||||
|         Text("2023"), |         Text("2023"), | ||||||
|  | |||||||
| @ -627,7 +627,7 @@ def expected_elements() -> list[Text]: | |||||||
|         Title("These are a few of my favorite things:"), |         Title("These are a few of my favorite things:"), | ||||||
|         ListItem("Parrots"), |         ListItem("Parrots"), | ||||||
|         ListItem("Hockey"), |         ListItem("Hockey"), | ||||||
|         Title("Analysis"), |         Text("Analysis"), | ||||||
|         NarrativeText("This is my first thought. This is my second thought."), |         NarrativeText("This is my first thought. This is my second thought."), | ||||||
|         NarrativeText("This is my third thought."), |         NarrativeText("This is my third thought."), | ||||||
|         Text("2023"), |         Text("2023"), | ||||||
| @ -1210,7 +1210,7 @@ class Describe_DocxPartitioner: | |||||||
|         opts_args["file_path"] = example_doc_path("page-breaks.docx") |         opts_args["file_path"] = example_doc_path("page-breaks.docx") | ||||||
|         opts = DocxPartitionerOptions(**opts_args) |         opts = DocxPartitionerOptions(**opts_args) | ||||||
|         expected = [ |         expected = [ | ||||||
|             # NOTE(scanny) - -- page 1 -- |             # -- page 1 -- | ||||||
|             NarrativeText( |             NarrativeText( | ||||||
|                 "First page, tab here:\t" |                 "First page, tab here:\t" | ||||||
|                 "followed by line-break here:\n" |                 "followed by line-break here:\n" | ||||||
| @ -1220,28 +1220,28 @@ class Describe_DocxPartitioner: | |||||||
|                 "and hard page-break here>>" |                 "and hard page-break here>>" | ||||||
|             ), |             ), | ||||||
|             PageBreak(""), |             PageBreak(""), | ||||||
|             # NOTE(scanny) - -- page 2 -- |             # -- page 2 -- | ||||||
|             NarrativeText( |             NarrativeText( | ||||||
|                 "<<Text on second page. The font is big so it breaks onto third page--" |                 "<<Text on second page. The font is big so it breaks onto third page--" | ||||||
|                 "------------------here-->> <<but break falls inside link so text stays" |                 "------------------here-->> <<but break falls inside link so text stays" | ||||||
|                 " together." |                 " together." | ||||||
|             ), |             ), | ||||||
|             PageBreak(""), |             PageBreak(""), | ||||||
|             # NOTE(scanny) - -- page 3 -- |             # -- page 3 -- | ||||||
|             NarrativeText("Continuous section break here>>"), |             NarrativeText("Continuous section break here>>"), | ||||||
|             NarrativeText("<<followed by text on same page"), |             NarrativeText("<<followed by text on same page"), | ||||||
|             NarrativeText("Odd-page section break here>>"), |             NarrativeText("Odd-page section break here>>"), | ||||||
|             PageBreak(""), |             PageBreak(""), | ||||||
|             # NOTE(scanny) - -- page 4 -- |             # -- page 4 -- | ||||||
|             PageBreak(""), |             PageBreak(""), | ||||||
|             # NOTE(scanny) - -- page 5 -- |             # -- page 5 -- | ||||||
|             NarrativeText("<<producing two page-breaks to get from page-3 to page-5."), |             NarrativeText("<<producing two page-breaks to get from page-3 to page-5."), | ||||||
|             NarrativeText( |             NarrativeText( | ||||||
|                 'Then text gets big again so a "natural" rendered page break happens again here>> ' |                 'Then text gets big again so a "natural" rendered page break happens again here>> ' | ||||||
|             ), |             ), | ||||||
|             PageBreak(""), |             PageBreak(""), | ||||||
|             # NOTE(scanny) - -- page 6 -- |             # -- page 6 -- | ||||||
|             Title("<<and then more text proceeds."), |             Text("<<and then more text proceeds."), | ||||||
|         ] |         ] | ||||||
| 
 | 
 | ||||||
|         elements = _DocxPartitioner.iter_document_elements(opts) |         elements = _DocxPartitioner.iter_document_elements(opts) | ||||||
|  | |||||||
| @ -23,7 +23,6 @@ from unstructured.documents.elements import ( | |||||||
|     Table, |     Table, | ||||||
|     TableChunk, |     TableChunk, | ||||||
|     Text, |     Text, | ||||||
|     Title, |  | ||||||
| ) | ) | ||||||
| from unstructured.partition.docx import partition_docx | from unstructured.partition.docx import partition_docx | ||||||
| from unstructured.partition.odt import partition_odt | from unstructured.partition.odt import partition_odt | ||||||
| @ -44,7 +43,7 @@ def test_partition_odt_from_filename(): | |||||||
|     elements = partition_odt(example_doc_path("fake.odt")) |     elements = partition_odt(example_doc_path("fake.odt")) | ||||||
| 
 | 
 | ||||||
|     assert elements == [ |     assert elements == [ | ||||||
|         Title("Lorem ipsum dolor sit amet."), |         Text("Lorem ipsum dolor sit amet."), | ||||||
|         Table( |         Table( | ||||||
|             "Header row Mon Wed Fri" |             "Header row Mon Wed Fri" | ||||||
|             " Color Blue Red Green" |             " Color Blue Red Green" | ||||||
| @ -63,7 +62,7 @@ def test_partition_odt_from_file(): | |||||||
|         elements = partition_odt(file=f) |         elements = partition_odt(file=f) | ||||||
| 
 | 
 | ||||||
|     assert elements == [ |     assert elements == [ | ||||||
|         Title("Lorem ipsum dolor sit amet."), |         Text("Lorem ipsum dolor sit amet."), | ||||||
|         Table( |         Table( | ||||||
|             "Header row Mon Wed Fri" |             "Header row Mon Wed Fri" | ||||||
|             " Color Blue Red Green" |             " Color Blue Red Green" | ||||||
|  | |||||||
| @ -23,7 +23,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "5209312022a75a31d95385fdccff68fa", |     "element_id": "5209312022a75a31d95385fdccff68fa", | ||||||
|     "text": "CHAPTER 1", |     "text": "CHAPTER 1", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -51,7 +51,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "22a23e29022f32945965002cd734a8f0", |     "element_id": "22a23e29022f32945965002cd734a8f0", | ||||||
|     "text": "INTRODUCTION", |     "text": "INTRODUCTION", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -79,7 +79,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "4c175cf543957acc4420221de28d3fca", |     "element_id": "4c175cf543957acc4420221de28d3fca", | ||||||
|     "text": "CHAPTER 1 \u2013 INTRODUCTION", |     "text": "CHAPTER 1 \u2013 INTRODUCTION", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -101,7 +101,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "77022a5264f552b223538977cd40f640", |     "element_id": "77022a5264f552b223538977cd40f640", | ||||||
|     "text": "A.\tPURPOSE", |     "text": "A.\tPURPOSE", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -189,7 +189,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "e341ffc123dd2827638aba18149c4175", |     "element_id": "e341ffc123dd2827638aba18149c4175", | ||||||
|     "text": "B.\tROLE OF THE UNITED STATES TRUSTEE", |     "text": "B.\tROLE OF THE UNITED STATES TRUSTEE", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -255,7 +255,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "1b11ebe52652656e0ed8c12e5969de9b", |     "element_id": "1b11ebe52652656e0ed8c12e5969de9b", | ||||||
|     "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t", |     "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t", | ||||||
|     "metadata": { |     "metadata": { | ||||||
|  | |||||||
| @ -23,7 +23,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "5209312022a75a31d95385fdccff68fa", |     "element_id": "5209312022a75a31d95385fdccff68fa", | ||||||
|     "text": "CHAPTER 1", |     "text": "CHAPTER 1", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -51,7 +51,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "22a23e29022f32945965002cd734a8f0", |     "element_id": "22a23e29022f32945965002cd734a8f0", | ||||||
|     "text": "INTRODUCTION", |     "text": "INTRODUCTION", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -79,7 +79,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "4c175cf543957acc4420221de28d3fca", |     "element_id": "4c175cf543957acc4420221de28d3fca", | ||||||
|     "text": "CHAPTER 1 \u2013 INTRODUCTION", |     "text": "CHAPTER 1 \u2013 INTRODUCTION", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -101,7 +101,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "77022a5264f552b223538977cd40f640", |     "element_id": "77022a5264f552b223538977cd40f640", | ||||||
|     "text": "A.\tPURPOSE", |     "text": "A.\tPURPOSE", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -189,7 +189,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "e341ffc123dd2827638aba18149c4175", |     "element_id": "e341ffc123dd2827638aba18149c4175", | ||||||
|     "text": "B.\tROLE OF THE UNITED STATES TRUSTEE", |     "text": "B.\tROLE OF THE UNITED STATES TRUSTEE", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -255,7 +255,7 @@ | |||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "1b11ebe52652656e0ed8c12e5969de9b", |     "element_id": "1b11ebe52652656e0ed8c12e5969de9b", | ||||||
|     "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t", |     "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t", | ||||||
|     "metadata": { |     "metadata": { | ||||||
|  | |||||||
| @ -1,6 +1,6 @@ | |||||||
| [ | [ | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "56d531394823d81787d77a04462ed096", |     "element_id": "56d531394823d81787d77a04462ed096", | ||||||
|     "text": "Lorem ipsum dolor sit amet.", |     "text": "Lorem ipsum dolor sit amet.", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -17,6 +17,13 @@ | |||||||
|         "date_created": "1686809759.687", |         "date_created": "1686809759.687", | ||||||
|         "date_modified": "1686809743.0", |         "date_modified": "1686809743.0", | ||||||
|         "permissions_data": [ |         "permissions_data": [ | ||||||
|  |           { | ||||||
|  |             "id": "anyoneWithLink", | ||||||
|  |             "type": "anyone", | ||||||
|  |             "kind": "drive#permission", | ||||||
|  |             "role": "reader", | ||||||
|  |             "allowFileDiscovery": false | ||||||
|  |           }, | ||||||
|           { |           { | ||||||
|             "id": "18298851591250030956", |             "id": "18298851591250030956", | ||||||
|             "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", |             "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", | ||||||
| @ -28,6 +35,17 @@ | |||||||
|             "deleted": false, |             "deleted": false, | ||||||
|             "pendingOwner": false |             "pendingOwner": false | ||||||
|           }, |           }, | ||||||
|  |           { | ||||||
|  |             "id": "04774006893477068632", | ||||||
|  |             "displayName": "ryan", | ||||||
|  |             "type": "user", | ||||||
|  |             "kind": "drive#permission", | ||||||
|  |             "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64", | ||||||
|  |             "emailAddress": "ryan@unstructured.io", | ||||||
|  |             "role": "owner", | ||||||
|  |             "deleted": false, | ||||||
|  |             "pendingOwner": false | ||||||
|  |           }, | ||||||
|           { |           { | ||||||
|             "id": "09147371668407854156", |             "id": "09147371668407854156", | ||||||
|             "displayName": "roman", |             "displayName": "roman", | ||||||
| @ -38,24 +56,6 @@ | |||||||
|             "role": "writer", |             "role": "writer", | ||||||
|             "deleted": false, |             "deleted": false, | ||||||
|             "pendingOwner": false |             "pendingOwner": false | ||||||
|           }, |  | ||||||
|           { |  | ||||||
|             "id": "anyoneWithLink", |  | ||||||
|             "type": "anyone", |  | ||||||
|             "kind": "drive#permission", |  | ||||||
|             "role": "reader", |  | ||||||
|             "allowFileDiscovery": false |  | ||||||
|           }, |  | ||||||
|           { |  | ||||||
|             "id": "04774006893477068632", |  | ||||||
|             "displayName": "ryan", |  | ||||||
|             "type": "user", |  | ||||||
|             "kind": "drive#permission", |  | ||||||
|             "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64", |  | ||||||
|             "emailAddress": "ryan@unstructured.io", |  | ||||||
|             "role": "owner", |  | ||||||
|             "deleted": false, |  | ||||||
|             "pendingOwner": false |  | ||||||
|           } |           } | ||||||
|         ] |         ] | ||||||
|       } |       } | ||||||
|  | |||||||
| @ -1,6 +1,6 @@ | |||||||
| [ | [ | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "56d531394823d81787d77a04462ed096", |     "element_id": "56d531394823d81787d77a04462ed096", | ||||||
|     "text": "Lorem ipsum dolor sit amet.", |     "text": "Lorem ipsum dolor sit amet.", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -17,6 +17,13 @@ | |||||||
|         "date_created": "1718722775.76", |         "date_created": "1718722775.76", | ||||||
|         "date_modified": "1718722788.018", |         "date_modified": "1718722788.018", | ||||||
|         "permissions_data": [ |         "permissions_data": [ | ||||||
|  |           { | ||||||
|  |             "id": "anyoneWithLink", | ||||||
|  |             "type": "anyone", | ||||||
|  |             "kind": "drive#permission", | ||||||
|  |             "role": "reader", | ||||||
|  |             "allowFileDiscovery": false | ||||||
|  |           }, | ||||||
|           { |           { | ||||||
|             "id": "18298851591250030956", |             "id": "18298851591250030956", | ||||||
|             "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", |             "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", | ||||||
| @ -39,13 +46,6 @@ | |||||||
|             "deleted": false, |             "deleted": false, | ||||||
|             "pendingOwner": false |             "pendingOwner": false | ||||||
|           }, |           }, | ||||||
|           { |  | ||||||
|             "id": "anyoneWithLink", |  | ||||||
|             "type": "anyone", |  | ||||||
|             "kind": "drive#permission", |  | ||||||
|             "role": "reader", |  | ||||||
|             "allowFileDiscovery": false |  | ||||||
|           }, |  | ||||||
|           { |           { | ||||||
|             "id": "09147371668407854156", |             "id": "09147371668407854156", | ||||||
|             "displayName": "roman", |             "displayName": "roman", | ||||||
|  | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -1,6 +1,6 @@ | |||||||
| [ | [ | ||||||
|   { |   { | ||||||
|     "type": "Title", |     "type": "UncategorizedText", | ||||||
|     "element_id": "cc23ac9998df1db62b795ec4e5133ab0", |     "element_id": "cc23ac9998df1db62b795ec4e5133ab0", | ||||||
|     "text": "Title", |     "text": "Title", | ||||||
|     "metadata": { |     "metadata": { | ||||||
| @ -22,6 +22,13 @@ | |||||||
|         "date_created": "1686809758.931", |         "date_created": "1686809758.931", | ||||||
|         "date_modified": "1686809744.0", |         "date_modified": "1686809744.0", | ||||||
|         "permissions_data": [ |         "permissions_data": [ | ||||||
|  |           { | ||||||
|  |             "id": "anyoneWithLink", | ||||||
|  |             "type": "anyone", | ||||||
|  |             "kind": "drive#permission", | ||||||
|  |             "role": "reader", | ||||||
|  |             "allowFileDiscovery": false | ||||||
|  |           }, | ||||||
|           { |           { | ||||||
|             "id": "18298851591250030956", |             "id": "18298851591250030956", | ||||||
|             "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", |             "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", | ||||||
| @ -33,24 +40,6 @@ | |||||||
|             "deleted": false, |             "deleted": false, | ||||||
|             "pendingOwner": false |             "pendingOwner": false | ||||||
|           }, |           }, | ||||||
|           { |  | ||||||
|             "id": "09147371668407854156", |  | ||||||
|             "displayName": "roman", |  | ||||||
|             "type": "user", |  | ||||||
|             "kind": "drive#permission", |  | ||||||
|             "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64", |  | ||||||
|             "emailAddress": "roman@unstructured.io", |  | ||||||
|             "role": "writer", |  | ||||||
|             "deleted": false, |  | ||||||
|             "pendingOwner": false |  | ||||||
|           }, |  | ||||||
|           { |  | ||||||
|             "id": "anyoneWithLink", |  | ||||||
|             "type": "anyone", |  | ||||||
|             "kind": "drive#permission", |  | ||||||
|             "role": "reader", |  | ||||||
|             "allowFileDiscovery": false |  | ||||||
|           }, |  | ||||||
|           { |           { | ||||||
|             "id": "04774006893477068632", |             "id": "04774006893477068632", | ||||||
|             "displayName": "ryan", |             "displayName": "ryan", | ||||||
| @ -61,6 +50,17 @@ | |||||||
|             "role": "owner", |             "role": "owner", | ||||||
|             "deleted": false, |             "deleted": false, | ||||||
|             "pendingOwner": false |             "pendingOwner": false | ||||||
|  |           }, | ||||||
|  |           { | ||||||
|  |             "id": "09147371668407854156", | ||||||
|  |             "displayName": "roman", | ||||||
|  |             "type": "user", | ||||||
|  |             "kind": "drive#permission", | ||||||
|  |             "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64", | ||||||
|  |             "emailAddress": "roman@unstructured.io", | ||||||
|  |             "role": "writer", | ||||||
|  |             "deleted": false, | ||||||
|  |             "pendingOwner": false | ||||||
|           } |           } | ||||||
|         ] |         ] | ||||||
|       } |       } | ||||||
| @ -89,6 +89,13 @@ | |||||||
|         "date_created": "1686809758.931", |         "date_created": "1686809758.931", | ||||||
|         "date_modified": "1686809744.0", |         "date_modified": "1686809744.0", | ||||||
|         "permissions_data": [ |         "permissions_data": [ | ||||||
|  |           { | ||||||
|  |             "id": "anyoneWithLink", | ||||||
|  |             "type": "anyone", | ||||||
|  |             "kind": "drive#permission", | ||||||
|  |             "role": "reader", | ||||||
|  |             "allowFileDiscovery": false | ||||||
|  |           }, | ||||||
|           { |           { | ||||||
|             "id": "18298851591250030956", |             "id": "18298851591250030956", | ||||||
|             "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", |             "displayName": "ingest@unstructured-ingest-test.iam.gserviceaccount.com", | ||||||
| @ -100,6 +107,17 @@ | |||||||
|             "deleted": false, |             "deleted": false, | ||||||
|             "pendingOwner": false |             "pendingOwner": false | ||||||
|           }, |           }, | ||||||
|  |           { | ||||||
|  |             "id": "04774006893477068632", | ||||||
|  |             "displayName": "ryan", | ||||||
|  |             "type": "user", | ||||||
|  |             "kind": "drive#permission", | ||||||
|  |             "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64", | ||||||
|  |             "emailAddress": "ryan@unstructured.io", | ||||||
|  |             "role": "owner", | ||||||
|  |             "deleted": false, | ||||||
|  |             "pendingOwner": false | ||||||
|  |           }, | ||||||
|           { |           { | ||||||
|             "id": "09147371668407854156", |             "id": "09147371668407854156", | ||||||
|             "displayName": "roman", |             "displayName": "roman", | ||||||
| @ -110,24 +128,6 @@ | |||||||
|             "role": "writer", |             "role": "writer", | ||||||
|             "deleted": false, |             "deleted": false, | ||||||
|             "pendingOwner": false |             "pendingOwner": false | ||||||
|           }, |  | ||||||
|           { |  | ||||||
|             "id": "anyoneWithLink", |  | ||||||
|             "type": "anyone", |  | ||||||
|             "kind": "drive#permission", |  | ||||||
|             "role": "reader", |  | ||||||
|             "allowFileDiscovery": false |  | ||||||
|           }, |  | ||||||
|           { |  | ||||||
|             "id": "04774006893477068632", |  | ||||||
|             "displayName": "ryan", |  | ||||||
|             "type": "user", |  | ||||||
|             "kind": "drive#permission", |  | ||||||
|             "photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64", |  | ||||||
|             "emailAddress": "ryan@unstructured.io", |  | ||||||
|             "role": "owner", |  | ||||||
|             "deleted": false, |  | ||||||
|             "pendingOwner": false |  | ||||||
|           } |           } | ||||||
|         ] |         ] | ||||||
|       } |       } | ||||||
|  | |||||||
| @ -27,7 +27,7 @@ | |||||||
|         "eng" |         "eng" | ||||||
|       ], |       ], | ||||||
|       "page_number": 1, |       "page_number": 1, | ||||||
|       "orig_elements": "eJztlU1v2jAYx79K5PMgifNm71Z1leilIAqnFkWO/RiiJnbkOBsd4rvPhtK1E4dt0qQx7RQ/74//P1l52CFooAVly1qgjwFKODBKuIRIZAXJc8oqwDxNMkllFgmGPgSoBcsEs8zl75A/lL0eDIeD3YFp676vterLl6SHHWq18OEkiQnZr1wPA1wbUTaaM6vNsZLZjV8h3OgWQjMoBSb8os1TOKjemoHbwYB4b8CWtV0DI6F5H26YEpXWT6O4GzvHFu33bpKsG7DPnR+PWNc1tZvotgs/KzHWHaht20htWmb7kZay5uBKB6/I2M0WndEc3HXUum3Gp4gXYQNMgCml1tZ9TgM6U7fMPPuEhqn1wNbQewUQqDVa+XUsbK3PXN4HCzP0FiCYvOztq06NJof2yFX8SEgKSgkTaSogo1lW0IRgUeTuUFQVJvzyCEHbbVhffwVRenVKrpV19z0Kdz25mi1u5kGMVmdSLVsf06pD+A+xPoPSeTvnKdXQVuDVid/C/b70G6SL2jZwjmhFRERFlnIBmJBIFEwkNM8hYhHnVVX9Y0Rv7xbz6afl9eJ2endJUN/t/VNckyzPCEtlgYXDWUkc5RyqnCcYA5X48rj+LShe31fwOOAoToJfZsNzwAWN3W8uTomgKc6KIk5S8GYS0/Q/m99lczV+tLPlfDa9vzlDYvUNQ4PhWg==" |       "orig_elements": "eJztlV1r2zAUhv+K0fWS+NvW7kpXSG+akCZXbTD6OHJMLcnI8pYu5L9PSpqtHYGxjcEydmUdnffovDoPwg87BC1IULZqOHofoIQBwSUTEPKsKPMcEwoxS5NMYJGFnKB3AZJgCSeWOP0O+UXV68EwOMQdGNn0faNVX72IHnZIau7TSRKV5X7tzjDAtOFVqxmx2hwrid14C5ONljAxg1JgJp+0eZoMqrdmYHYwwN8GsCWya2HENesnG6I41fppFHVjt7FF+73rJJoW7HPn2yPSdW3jOjp3k4+Kj3UHaitboY0kth9pIRoGrnTwExm73rwzmoG7jqplOz5l/BA2QDiYSmht3efUoDONJObZC1qi6oHU0PsJIFA1Wns7FrbWK1f3wdIMvQUIpi++fdXpoOnheOQqvickOMYl4WnKIcNZVuCkjHmRu0VBaVyyyyMEstuQvvkMvPLTqZhW1t33OLjr6dV8ebMIIrQ+I7WkPsroIf2HWJ9B6XY7t1OpQVLw04lew/1m+hXSlXJuoNbG21965Rm6tOQh5lnKOMRlGfKC8ATnOYQkZIxS+o/Rvb1bLmYfVtfL29ndJQF+4/unGSdZnpUkFUXMHVoq4jBnQHOWxDFgEV8e478Fy9d3FzwOcRglwW9xYjnEBY7crzBKS47TOCuKKEnBh0mE0/+cfpXT1fjRzleL+ez+5gdU1l8AKnj1Ng==" | ||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
| @ -383,7 +383,7 @@ | |||||||
|         "eng" |         "eng" | ||||||
|       ], |       ], | ||||||
|       "page_number": 2, |       "page_number": 2, | ||||||
|       "orig_elements": "eJxVkMtOwzAQRX8l8pompGmlhB2IIJAQldp0VarIsSdpVNtj+QFBVf4dG+iC3bzPvXO4EBAgQbl25OQuIauCccp51RXLjlf5ii0L3lddWZXlmt2uC3KTEAmOcupomL+QGLQWvWHwk2swcrR2RGXbv6HDhUjksV0UeVnOx3DDAEPDW4GMOjS/m9SdooTshBIy45UCk32iOWdeWWc8c94A/5/ARKUWsODIbHaiineI50Wu01CYyDwHUj8KcF864gnVWoyBGNRlH4qnqEFNUvRoJHV2gX0/MgirPn4kDWyuDTIIdtQgRXrtxCcIqgZPB7DRIAE1kGhLh0qrvOwgelpGvoPJRfZD+u62m9c62TwlzXOd7N9emvox2TX3Tb1Lmu1+19R1vHwV24xOAJmP32EBjeE=" |       "orig_elements": "eJxVkMtOwzAQRX8l8pomtGmlhB2IIJAQldp0VarIsSdpVNtj+QGBKv+ODe2C3bzPvbM/ExAgQblm4OQuIcucccp52eaLlpfzJVvkvCvboiyKFbtd5eQmIRIc5dTRMH8mMWgsesPgN9dg5GDtgMo2l6H9mUjksZ3n86KYDuGGAYaGNwIZdWj+Nqk7RgnZESVkxisFJvtEc8q8ss545rwB/j+BkUotYMaR2exIFW8RT7O5TkNhJNMUSN0gwH3piCdUazEEYlCXfSieogY1StGhkdTZGXbdwCCs+viRNLC5Nsgg2FG9FOm1E58gqOo97cFGgwRUT6ItHSqN8rKF6GkR+Q5GF9kP6bvbrF+rZP2U1M9Vsnt7qavHZFvf19U2qTe7bV1V8fJV7E4FpdCjGb6B1/HKdPgBKuqS2A==" | ||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
| @ -572,7 +572,7 @@ | |||||||
|         "eng" |         "eng" | ||||||
|       ], |       ], | ||||||
|       "page_number": 2, |       "page_number": 2, | ||||||
|       "orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgrCoAEVOPAkRtsfyo02F8u+1oSy689yZ8bl3DlcCHARIW7SMvEbkJcvSYZqNR1NaZiU8jwHGFDJIKWMTOp2Qp4gIsJRRS/38lYRHYdDpCm61Ai1aY1qUpvgbOlyJQBbao1E6m/Un/4eGCjUrOFbUor5vUnsOFpIzCki0kxJ08o36kjhprHaVdRrY/wI6KhSHAcPKJGcqWYl4GaQq9kJH+t6T6paD/VEBT6hSvPVE7y75kixGBbITvEYtqDUDrOu2Ar/qwkViz2ZKYwU+jmwEjx+dcAROZeNoAyYEJCAbEmIprxTSiRJCpmHgW+hsYL/FR7vL5/k+32w/o8U+Xy130eY9mkdeXS9W648o3+53+XJ5vAEenvPWciD96RdNTpBK" |       "orig_elements": "eJxVkMtuwjAQRX8l8rokDZRHukOFVmxAgmRRAYqceBIibI/lR5sW8e+1aVl0N+9z7+wvBDgIkLbsGHmOyFOWpcM0G4+mtMoqeBwDjClkkFLGJnQ6IQ8REWApo5b6+QsJQWnQ6RpuuQItOmM6lKb8G9pfiEAW2qNROptdj/6Ghho1KznW1KL+3aT2FCQkJxSQaCcl6OQT9Tlx0ljtaus0sP8J9FQoDgOGtUlOVLIK8TxIVewLPblePanpONgvFfCEKsU7T/Tqkg/JYlQge8Eb1IJaM8Cm6Wrwqy58JPZspjTW4O3IVvD43glP4FS2jrZggkECsiXBlvKVUjpRQfA0DHwLvQ3sl/hgd/k8L/LN9j1aFPlquYs2r9E88tX1YrV+i/JtscuXy8MNcNdcSC8YWtTdN7A8HLsefwA0I5VB" | ||||||
|     } |     } | ||||||
|   }, |   }, | ||||||
|   { |   { | ||||||
|  | |||||||
| @ -1 +1 @@ | |||||||
| __version__ = "0.16.12-dev3"  # pragma: no cover | __version__ = "0.16.12-dev4"  # pragma: no cover | ||||||
|  | |||||||
| @ -48,7 +48,6 @@ from unstructured.partition.text_type import ( | |||||||
|     is_bulleted_text, |     is_bulleted_text, | ||||||
|     is_email_address, |     is_email_address, | ||||||
|     is_possible_narrative_text, |     is_possible_narrative_text, | ||||||
|     is_possible_title, |  | ||||||
|     is_us_city_state_zip, |     is_us_city_state_zip, | ||||||
| ) | ) | ||||||
| from unstructured.partition.utils.constants import PartitionStrategy | from unstructured.partition.utils.constants import PartitionStrategy | ||||||
| @ -412,15 +411,15 @@ class _DocxPartitioner: | |||||||
|             ) |             ) | ||||||
|         ) |         ) | ||||||
| 
 | 
 | ||||||
|         # NOTE(scanny) - blank paragraphs are commonly used for spacing between paragraphs and |         # -- blank paragraphs are commonly used for spacing between paragraphs and do not | ||||||
|         # do not contribute to the document-element stream. |         # -- contribute to the document-element stream | ||||||
|         if not text.strip(): |         if not text.strip(): | ||||||
|             return |             return | ||||||
| 
 | 
 | ||||||
|         metadata = self._paragraph_metadata(paragraph) |         metadata = self._paragraph_metadata(paragraph) | ||||||
| 
 | 
 | ||||||
|         # NOTE(scanny) - a list-item gets some special treatment, mutating the text to remove a |         # -- a list-item gets some special treatment, mutating the text to remove a | ||||||
|         # bullet-character if present. |         # -- bullet-character if present | ||||||
|         if self._is_list_item(paragraph): |         if self._is_list_item(paragraph): | ||||||
|             clean_text = clean_bullets(text).strip() |             clean_text = clean_bullets(text).strip() | ||||||
|             if clean_text: |             if clean_text: | ||||||
| @ -431,19 +430,19 @@ class _DocxPartitioner: | |||||||
|                 ) |                 ) | ||||||
|             return |             return | ||||||
| 
 | 
 | ||||||
|         # NOTE(scanny) - determine element-type from an explicit Word paragraph-style if possible |         # -- determine element-type from an explicit Word paragraph-style if possible -- | ||||||
|         TextSubCls = self._style_based_element_type(paragraph) |         TextSubCls = self._style_based_element_type(paragraph) | ||||||
|         if TextSubCls: |         if TextSubCls: | ||||||
|             yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) |             yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) | ||||||
|             return |             return | ||||||
| 
 | 
 | ||||||
|         # NOTE(scanny) - try to recognize the element type by parsing its text |         # -- try to recognize the element type by parsing its text -- | ||||||
|         TextSubCls = self._parse_paragraph_text_for_element_type(paragraph) |         TextSubCls = self._parse_paragraph_text_for_element_type(paragraph) | ||||||
|         if TextSubCls: |         if TextSubCls: | ||||||
|             yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) |             yield TextSubCls(text=text, metadata=metadata, detection_origin=DETECTION_ORIGIN) | ||||||
|             return |             return | ||||||
| 
 | 
 | ||||||
|         # NOTE(scanny) - if all that fails we give it the default `Text` element-type |         # -- if all that fails we give it the default `Text` element-type -- | ||||||
|         yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN) |         yield Text(text, metadata=metadata, detection_origin=DETECTION_ORIGIN) | ||||||
| 
 | 
 | ||||||
|     def _convert_table_to_html(self, table: DocxTable) -> str: |     def _convert_table_to_html(self, table: DocxTable) -> str: | ||||||
| @ -576,20 +575,20 @@ class _DocxPartitioner: | |||||||
| 
 | 
 | ||||||
|             page_break = paragraph.rendered_page_breaks[0] |             page_break = paragraph.rendered_page_breaks[0] | ||||||
| 
 | 
 | ||||||
|             # NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break |             # -- preceding-fragment is None when first paragraph content is a page-break -- | ||||||
|             preceding_paragraph_fragment = page_break.preceding_paragraph_fragment |             preceding_paragraph_fragment = page_break.preceding_paragraph_fragment | ||||||
|             if preceding_paragraph_fragment: |             if preceding_paragraph_fragment: | ||||||
|                 yield preceding_paragraph_fragment |                 yield preceding_paragraph_fragment | ||||||
| 
 | 
 | ||||||
|             yield page_break |             yield page_break | ||||||
| 
 | 
 | ||||||
|             # NOTE(scanny) - following-fragment is None when page-break is last paragraph content. |             # -- following-fragment is None when page-break is last paragraph content. This is | ||||||
|             # This is probably quite rare (Word moves these to the start of the next paragraph) but |             # -- probably quite rare (Word moves these to the start of the next paragraph) but | ||||||
|             # easier to check for it than prove it can't happen. |             # -- easier to check for it than prove it can't happen. | ||||||
|             following_paragraph_fragment = page_break.following_paragraph_fragment |             following_paragraph_fragment = page_break.following_paragraph_fragment | ||||||
|             # NOTE(scanny) - the paragraph fragment following a page-break can itself contain |             # -- the paragraph fragment following a page-break can itself contain another | ||||||
|             # another page-break. This would also be quite rare, but it can happen so we just |             # -- page-break; this would also be quite rare, but it can happen so we just recurse | ||||||
|             # recurse into the second fragment the same way we handled the original paragraph. |             # -- into the second fragment the same way we handled the original paragraph | ||||||
|             if following_paragraph_fragment: |             if following_paragraph_fragment: | ||||||
|                 yield from iter_paragraph_items(following_paragraph_fragment) |                 yield from iter_paragraph_items(following_paragraph_fragment) | ||||||
| 
 | 
 | ||||||
| @ -901,8 +900,6 @@ class _DocxPartitioner: | |||||||
|             return EmailAddress |             return EmailAddress | ||||||
|         if is_possible_narrative_text(text): |         if is_possible_narrative_text(text): | ||||||
|             return NarrativeText |             return NarrativeText | ||||||
|         if is_possible_title(text): |  | ||||||
|             return Title |  | ||||||
| 
 | 
 | ||||||
|         return None |         return None | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Steve Canny
						Steve Canny