fix: correct return values in exceeds_cap_ratio (#489)

* fix: fix text_type.py exceeds_cap_ratio() returns

In some cases, `is_possible_narrative_text` receives an incorrect return value from `exceeds_cap_ratio` and therefore misclassifies the text, so some of the return values of `exceeds_cap_ratio` have been corrected.

* Update text_type.py exceeds_cap_ratio()

..

* Update text_type.py

..

* Update CHANGELOG.md

..

* linting, linting, linting ...

* update tests

* more test fixes

* Update text_type.py

..

* bump version and changelog

* add punctuation check

---------

Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
JaeyongLee 2023-04-24 23:45:09 +09:00 committed by GitHub
parent 894a190001
commit be8e6da884
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 15 additions and 4 deletions

View File

@ -1,4 +1,4 @@
## 0.6.2-dev0
## 0.6.2-dev1
### Enhancements
@ -9,6 +9,8 @@
### Fixes
* Fix how `exceeds_cap_ratio` handles all-uppercase text (returns `True` instead of `False`)
## 0.6.1
### Enhancements

View File

@ -106,6 +106,7 @@ def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
("1.A.RISKS", True), # Tests that "RISKS" gets flagged as an english word
("1. Unstructured Technologies", True), # Make sure we're English words :-)
("Big/Brown/Sheet", True),
("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False),
],
)
def test_is_possible_title(text, expected, monkeypatch):
@ -212,7 +213,7 @@ def test_contains_english_word(text, expected, monkeypatch):
("Intellectual Property in the United States", True),
("Intellectual property helps incentivize innovation.", False),
("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False),
("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", True),
("This Has All Caps. It's Weird But Two Sentences", False),
("The Business Report is expected within 6 hours of closing", False),
("", True),

View File

@ -1 +1 @@
__version__ = "0.6.2-dev0" # pragma: no cover
__version__ = "0.6.2-dev1" # pragma: no cover

View File

@ -94,3 +94,7 @@ EMAIL_DATETIMETZ_PATTERN = "[a-zA-z]{3},\s[0-9]{2}\s[a-zA-Z]{3}\s[0-9]{4}\s[0-9]
EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell)
# - skipping qa because we need the escape for the regex
# Matches a single character that is neither a word character (`\w`) nor
# whitespace (`\s`) at the very end of the string (`\Z`), i.e. text whose final
# character is punctuation or another symbol. Note that `_` counts as a word
# character and therefore does not match.
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)

View File

@ -13,6 +13,7 @@ from unstructured.cleaners.core import remove_punctuation
from unstructured.logger import logger
from unstructured.nlp.english_words import ENGLISH_WORDS
from unstructured.nlp.patterns import (
ENDS_IN_PUNCT_RE,
UNICODE_BULLETS_RE,
US_CITY_STATE_ZIP_RE,
US_PHONE_NUMBERS_RE,
@ -123,6 +124,9 @@ def is_possible_title(
logger.debug("Not a title. Text is empty.")
return False
if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None:
return False
title_max_word_length = int(
os.environ.get("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", title_max_word_length),
)
@ -268,7 +272,7 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
return False
if text.isupper():
return False
return True
# NOTE(jay-ylee) - The word_tokenize function also treats each special character as a
# separate word, which distorts the capitalization ratio measurement.