fix: correct return values in exceeds_cap_ratio (#489)

* fix: fix text_type.py exceeds_cap_ratio() returns. There are cases where the function is_possible_narrative_text receives an incorrect return value from the function exceeds_cap_ratio and therefore misclassifies the text, so some of the return values of exceeds_cap_ratio are corrected. * Update text_type.py exceeds_cap_ratio() .. * Update text_type.py .. * Update CHANGELOG.md .. * linting, linting, linting ... * update tests * more test fixes * Update text_type.py .. * bump version and changelog * add punctuation check --------- Co-authored-by: Matt Robinson <mrobinson@unstructured.io> Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
2025-12-16 09:47:18 +00:00 · 2023-04-24 23:45:09 +09:00 · 2023-04-24 23:45:09 +09:00 · be8e6da884
commit be8e6da884
parent 894a190001
5 changed files with 15 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.6.2-dev0
+## 0.6.2-dev1

 ### Enhancements

@ -9,6 +9,8 @@

 ### Fixes

+* Fix how `exceeds_cap_ratio` handles all-uppercase text (returns `True` instead of `False`)
+
 ## 0.6.1

 ### Enhancements
--- a/test_unstructured/partition/test_text_type.py
+++ b/test_unstructured/partition/test_text_type.py
@ -106,6 +106,7 @@ def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
        ("1.A.RISKS", True),  # Tests that "RISKS" gets flagged as an english word
        ("1. Unstructured Technologies", True),  # Make sure we're English words :-)
        ("Big/Brown/Sheet", True),
+        ("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False),
    ],
 )
 def test_is_possible_title(text, expected, monkeypatch):
@ -212,7 +213,7 @@ def test_contains_english_word(text, expected, monkeypatch):
        ("Intellectual Property in the United States", True),
        ("Intellectual property helps incentivize innovation.", False),
        ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
-        ("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False),
+        ("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", True),
        ("This Has All Caps. It's Weird But Two Sentences", False),
        ("The Business Report is expected within 6 hours of closing", False),
        ("", True),
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.6.2-dev0"  # pragma: no cover
+__version__ = "0.6.2-dev1"  # pragma: no cover
--- a/unstructured/nlp/patterns.py
+++ b/unstructured/nlp/patterns.py
@ -94,3 +94,7 @@ EMAIL_DATETIMETZ_PATTERN = "[a-zA-z]{3},\s[0-9]{2}\s[a-zA-Z]{3}\s[0-9]{4}\s[0-9]

 EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"  # noqa: W605 NOTE(harrell)
 # - skipping qa because we need the escape for the regex
+
+
+ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
+ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
--- a/unstructured/partition/text_type.py
+++ b/unstructured/partition/text_type.py
@ -13,6 +13,7 @@ from unstructured.cleaners.core import remove_punctuation
 from unstructured.logger import logger
 from unstructured.nlp.english_words import ENGLISH_WORDS
 from unstructured.nlp.patterns import (
+    ENDS_IN_PUNCT_RE,
    UNICODE_BULLETS_RE,
    US_CITY_STATE_ZIP_RE,
    US_PHONE_NUMBERS_RE,
@ -123,6 +124,9 @@ def is_possible_title(
        logger.debug("Not a title. Text is empty.")
        return False

+    if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None:
+        return False
+
    title_max_word_length = int(
        os.environ.get("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", title_max_word_length),
    )
@ -268,7 +272,7 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
        return False

    if text.isupper():
-        return False
+        return True

    # NOTE(jay-ylee) - The word_tokenize function also recognizes and separates special characters
    # into one word, causing problems with ratio measurement.