diff --git a/CHANGELOG.md b/CHANGELOG.md index 63ed7124a..3fb7ac2d2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ * unstructured-documents encode xml string if document_tree is `None` in `_read_xml`. * Update to `_read_xml` so that Markdown files with embedded HTML process correctly. * Fallback to "fast" strategy only emits a warning if the user specifies the "hi_res" strategy. +* unstructured-partition-text_type: fix the `exceeds_cap_ratio` return value for text with no alphabetic tokens and how capitalization ratios are calculated. ## 0.5.12 diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py index 1b8e52de6..948c6daf2 100644 --- a/test_unstructured/partition/test_text_type.py +++ b/test_unstructured/partition/test_text_type.py @@ -215,7 +215,7 @@ def test_contains_english_word(text, expected, monkeypatch): ("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False), ("This Has All Caps. It's Weird But Two Sentences", False), ("The Business Report is expected within 6 hours of closing", False), - ("", False), + ("", True), ], ) def test_contains_exceeds_cap_ratio(text, expected, monkeypatch): diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index 3099dab5f..63cae6898 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -265,15 +265,24 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool: # NOTE(robinson) - Currently limiting this to only sections of text with one sentence. # The assumption is that sections with multiple sentences are not titles. if sentence_count(text, 3) > 1: - logger.debug(f"Text does not contain multiple sentences:\n\n{text}") return False if text.isupper(): return False - tokens = word_tokenize(text) + # NOTE(jay-ylee) - The word_tokenize function also splits special characters out + # as separate tokens, which skews the ratio measurement. + # Therefore, only purely alphabetic tokens are used to measure the ratio. 
+ # ex. word_tokenize("ITEM 1. Financial Statements (Unaudited)") + # = ['ITEM', '1', '.', 'Financial', 'Statements', '(', 'Unaudited', ')'], + # however, "ITEM 1. Financial Statements (Unaudited)" is Title, not NarrativeText + tokens = [tk for tk in word_tokenize(text) if tk.isalpha()] + + # NOTE(jay-ylee) - If no alphabetic tokens remain, return must be True to + # avoid being misclassified as Narrative Text. if len(tokens) == 0: - return False + return True + capitalized = sum([word.istitle() or word.isupper() for word in tokens]) ratio = capitalized / len(tokens) return ratio > threshold