fix: fix text_type.py exceeds_cap_ratio() returns (#478)

In some cases, the function is_possible_narrative_text receives an incorrect return value from the function exceeds_cap_ratio and misclassifies the text, so some of the return values of exceeds_cap_ratio are corrected. --------- Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
2025-11-03 19:43:24 +00:00 · 2023-04-15 03:53:10 +09:00 · 2023-04-15 03:53:10 +09:00 · 8456676fad
commit 8456676fad
parent 46ac2a2226
3 changed files with 14 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -13,6 +13,7 @@
 * unstructured-documents encode xml string if document_tree is `None` in `_read_xml`.
 * Update to `_read_xml` so that Markdown files with embedded HTML process correctly.
 * Fallback to "fast" strategy only emits a warning if the user specifies the "hi_res" strategy.
+* unstructured-partition-text_type exceeds_cap_ratio: fix return values and how capitalization ratios are calculated

 ## 0.5.12

--- a/test_unstructured/partition/test_text_type.py
+++ b/test_unstructured/partition/test_text_type.py
@ -215,7 +215,7 @@ def test_contains_english_word(text, expected, monkeypatch):
        ("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False),
        ("This Has All Caps. It's Weird But Two Sentences", False),
        ("The Business Report is expected within 6 hours of closing", False),
-        ("", False),
+        ("", True),
    ],
 )
 def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
--- a/unstructured/partition/text_type.py
+++ b/unstructured/partition/text_type.py
@ -265,15 +265,24 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
    # The assumption is that sections with multiple sentences are not titles.
    if sentence_count(text, 3) > 1:
-        logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
        return False

    if text.isupper():
        return False

-    tokens = word_tokenize(text)
+    # NOTE(jay-ylee) - The word_tokenize function also splits punctuation and other special
+    # characters into separate tokens, which distorts the ratio measurement.
+    # Therefore, only purely alphabetic tokens are used to measure the ratio.
+    # ex. word_tokenize("ITEM 1. Financial Statements (Unaudited)")
+    #     = ['ITEM', '1', '.', 'Financial', 'Statements', '(', 'Unaudited', ')'],
+    # however, "ITEM 1. Financial Statements (Unaudited)" is a Title, not NarrativeText
+    tokens = [tk for tk in word_tokenize(text) if tk.isalpha()]
+
+    # NOTE(jay-ylee) - If no alphabetic tokens remain after filtering, return must be True to
+    # avoid being misclassified as Narrative Text.
    if len(tokens) == 0:
-        return False
+        return True
+
    capitalized = sum([word.istitle() or word.isupper() for word in tokens])
    ratio = capitalized / len(tokens)
    return ratio > threshold