diff --git a/CHANGELOG.md b/CHANGELOG.md index d13d85980..aa832741f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.16.12-dev3 +## 0.16.12-dev4 ### Enhancements @@ -10,6 +10,7 @@ - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted. - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file. +- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements. ## 0.16.11 diff --git a/test_unstructured/metrics/test_element_type.py b/test_unstructured/metrics/test_element_type.py index d1faba6a2..9a44a08f0 100644 --- a/test_unstructured/metrics/test_element_type.py +++ b/test_unstructured/metrics/test_element_type.py @@ -74,20 +74,18 @@ def test_get_element_type_frequency(filename: str, frequency: dict[tuple[str, in "handbook-1p.docx", { ("Header", None): 1, - ("Title", 0): 1, - ("Title", 1): 1, - ("Title", 2): 1, + ("UncategorizedText", 0): 6, ("ListItem", 3): 3, - ("NarrativeText", 4): 7, + ("NarrativeText", 0): 7, ("Footer", None): 1, }, - (0.43, 0.07, 0.65), + (0.78, 0.72, 0.81), ), ( "handbook-1p.docx", { ("Header", None): 1, - ("Title", 0): 6, + ("UncategorizedText", 0): 6, ("NarrativeText", 0): 7, ("PageBreak", None): 1, ("Footer", None): 1, diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py index 944305817..74187aa3b 100644 --- a/test_unstructured/partition/test_auto.py +++ b/test_unstructured/partition/test_auto.py @@ -1286,7 +1286,7 @@ def expected_docx_elements(): Title("These are a few of my favorite things:"), ListItem("Parrots"), ListItem("Hockey"), - Title("Analysis"), + Text("Analysis"), NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my third thought."), Text("2023"), diff --git a/test_unstructured/partition/test_doc.py b/test_unstructured/partition/test_doc.py index 7c8c4d3ef..e2698a3f7 100644 --- a/test_unstructured/partition/test_doc.py +++ b/test_unstructured/partition/test_doc.py @@ -275,7 +275,7 @@ def expected_elements() -> list[Element]: Title("These are a few of my favorite things:"), ListItem("Parrots"), ListItem("Hockey"), - Title("Analysis"), + Text("Analysis"), NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my third thought."), Text("2023"), diff --git a/test_unstructured/partition/test_docx.py b/test_unstructured/partition/test_docx.py index 1330b4a79..34a27cfde 100644 --- a/test_unstructured/partition/test_docx.py +++ b/test_unstructured/partition/test_docx.py @@ -627,7 +627,7 @@ def expected_elements() -> list[Text]: Title("These are a few of my favorite things:"), ListItem("Parrots"), ListItem("Hockey"), - Title("Analysis"), + Text("Analysis"), NarrativeText("This is my first thought. This is my second thought."), NarrativeText("This is my third thought."), Text("2023"), @@ -1210,7 +1210,7 @@ class Describe_DocxPartitioner: opts_args["file_path"] = example_doc_path("page-breaks.docx") opts = DocxPartitionerOptions(**opts_args) expected = [ - # NOTE(scanny) - -- page 1 -- + # -- page 1 -- NarrativeText( "First page, tab here:\t" "followed by line-break here:\n" @@ -1220,28 +1220,28 @@ class Describe_DocxPartitioner: "and hard page-break here>>" ), PageBreak(""), - # NOTE(scanny) - -- page 2 -- + # -- page 2 -- NarrativeText( "<> <>"), NarrativeText("<>"), PageBreak(""), - # NOTE(scanny) - -- page 4 -- + # -- page 4 -- PageBreak(""), - # NOTE(scanny) - -- page 5 -- + # -- page 5 -- NarrativeText("<> ' ), PageBreak(""), - # NOTE(scanny) - -- page 6 -- - Title("< str: @@ -576,20 +575,20 @@ class _DocxPartitioner: page_break = paragraph.rendered_page_breaks[0] - # NOTE(scanny)- preceding-fragment is None when first paragraph content is a page-break + # -- preceding-fragment is None when first paragraph content is a page-break -- preceding_paragraph_fragment = page_break.preceding_paragraph_fragment if preceding_paragraph_fragment: yield preceding_paragraph_fragment yield page_break - # NOTE(scanny) - following-fragment is None when page-break is last paragraph content. - # This is probably quite rare (Word moves these to the start of the next paragraph) but - # easier to check for it than prove it can't happen. + # -- following-fragment is None when page-break is last paragraph content. This is + # -- probably quite rare (Word moves these to the start of the next paragraph) but + # -- easier to check for it than prove it can't happen. following_paragraph_fragment = page_break.following_paragraph_fragment - # NOTE(scanny) - the paragraph fragment following a page-break can itself contain - # another page-break. This would also be quite rare, but it can happen so we just - # recurse into the second fragment the same way we handled the original paragraph. + # -- the paragraph fragment following a page-break can itself contain another + # -- page-break; this would also be quite rare, but it can happen so we just recurse + # -- into the second fragment the same way we handled the original paragraph if following_paragraph_fragment: yield from iter_paragraph_items(following_paragraph_fragment) @@ -901,8 +900,6 @@ class _DocxPartitioner: return EmailAddress if is_possible_narrative_text(text): return NarrativeText - if is_possible_title(text): - return Title return None