diff --git a/CHANGELOG.md b/CHANGELOG.md index 58f48f12a..17d9fc751 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.6.2-dev0 +## 0.6.2-dev1 ### Enhancements @@ -9,6 +9,8 @@ ### Fixes +* Fix how `exceeds_cap_ratio` handles all-caps text (returns `True` instead of `False`) + ## 0.6.1 ### Enhancements diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py index 948c6daf2..fbb740a9e 100644 --- a/test_unstructured/partition/test_text_type.py +++ b/test_unstructured/partition/test_text_type.py @@ -106,6 +106,7 @@ def test_text_type_handles_non_english_examples_with_env_var(monkeypatch): ("1.A.RISKS", True), # Tests that "RISKS" gets flagged as an english word ("1. Unstructured Technologies", True), # Make sure we're English words :-) ("Big/Brown/Sheet", True), + ("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False), ], ) def test_is_possible_title(text, expected, monkeypatch): @@ -212,7 +213,7 @@ def test_contains_english_word(text, expected, monkeypatch): ("Intellectual Property in the United States", True), ("Intellectual property helps incentivize innovation.", False), ("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False), - ("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False), + ("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", True), ("This Has All Caps. 
It's Weird But Two Sentences", False), ("The Business Report is expected within 6 hours of closing", False), ("", True), diff --git a/unstructured/__version__.py b/unstructured/__version__.py index fcec1a4df..4dd7b1de3 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.6.2-dev0" # pragma: no cover +__version__ = "0.6.2-dev1" # pragma: no cover diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py index 8922adf1b..f935e732f 100644 --- a/unstructured/nlp/patterns.py +++ b/unstructured/nlp/patterns.py @@ -94,3 +94,7 @@ EMAIL_DATETIMETZ_PATTERN = "[a-zA-z]{3},\s[0-9]{2}\s[a-zA-Z]{3}\s[0-9]{4}\s[0-9] EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell) # - skipping qa because we need the escape for the regex + + +ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z" +ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN) diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index 63cae6898..436af6956 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -13,6 +13,7 @@ from unstructured.cleaners.core import remove_punctuation from unstructured.logger import logger from unstructured.nlp.english_words import ENGLISH_WORDS from unstructured.nlp.patterns import ( + ENDS_IN_PUNCT_RE, UNICODE_BULLETS_RE, US_CITY_STATE_ZIP_RE, US_PHONE_NUMBERS_RE, @@ -123,6 +124,9 @@ def is_possible_title( logger.debug("Not a title. 
Text is empty.") return False + if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None: + return False + title_max_word_length = int( os.environ.get("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", title_max_word_length), ) @@ -268,7 +272,7 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool: return False if text.isupper(): - return False + return True # NOTE(jay-ylee) - The word_tokenize function also recognizes and separates special characters # into one word, causing problems with ratio measurement.