mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-13 20:15:54 +00:00
fix: correct return values in exceeds_cap_ratio
(#489)
* fix: fix text_type.py exceeds_cap_ratio() returns. There are cases where the function is_possible_narrative_text receives an incorrect return value from the function exceeds_cap_ratio and misclassifies the text, so some of the return values of exceeds_cap_ratio are corrected.
* Update text_type.py exceeds_cap_ratio() ..
* Update text_type.py ..
* Update CHANGELOG.md ..
* linting, linting, linting ...
* update tests
* more test fixes
* Update text_type.py ..
* bump version and changelog
* add punctuation check
---------
Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
Co-authored-by: Matt Robinson <mrobinson@unstructuredai.io>
This commit is contained in:
parent
894a190001
commit
be8e6da884
@ -1,4 +1,4 @@
|
||||
## 0.6.2-dev0
|
||||
## 0.6.2-dev1
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -9,6 +9,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Fix how `exceeds_cap_ratio` handles all-uppercase text (returns `True` instead of `False`)
|
||||
|
||||
## 0.6.1
|
||||
|
||||
### Enhancements
|
||||
|
@ -106,6 +106,7 @@ def test_text_type_handles_non_english_examples_with_env_var(monkeypatch):
|
||||
("1.A.RISKS", True), # Tests that "RISKS" gets flagged as an english word
|
||||
("1. Unstructured Technologies", True), # Make sure we're English words :-)
|
||||
("Big/Brown/Sheet", True),
|
||||
("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False),
|
||||
],
|
||||
)
|
||||
def test_is_possible_title(text, expected, monkeypatch):
|
||||
@ -212,7 +213,7 @@ def test_contains_english_word(text, expected, monkeypatch):
|
||||
("Intellectual Property in the United States", True),
|
||||
("Intellectual property helps incentivize innovation.", False),
|
||||
("THIS IS ALL CAPS. BUT IT IS TWO SENTENCES.", False),
|
||||
("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False),
|
||||
("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", True),
|
||||
("This Has All Caps. It's Weird But Two Sentences", False),
|
||||
("The Business Report is expected within 6 hours of closing", False),
|
||||
("", True),
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.6.2-dev0" # pragma: no cover
|
||||
__version__ = "0.6.2-dev1" # pragma: no cover
|
||||
|
@ -94,3 +94,7 @@ EMAIL_DATETIMETZ_PATTERN = "[a-zA-z]{3},\s[0-9]{2}\s[a-zA-Z]{3}\s[0-9]{4}\s[0-9]
|
||||
|
||||
EMAIL_ADDRESS_PATTERN = "[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+" # noqa: W605 NOTE(harrell)
|
||||
# - skipping qa because we need the escape for the regex
|
||||
|
||||
|
||||
ENDS_IN_PUNCT_PATTERN = r"[^\w\s]\Z"
|
||||
ENDS_IN_PUNCT_RE = re.compile(ENDS_IN_PUNCT_PATTERN)
|
||||
|
@ -13,6 +13,7 @@ from unstructured.cleaners.core import remove_punctuation
|
||||
from unstructured.logger import logger
|
||||
from unstructured.nlp.english_words import ENGLISH_WORDS
|
||||
from unstructured.nlp.patterns import (
|
||||
ENDS_IN_PUNCT_RE,
|
||||
UNICODE_BULLETS_RE,
|
||||
US_CITY_STATE_ZIP_RE,
|
||||
US_PHONE_NUMBERS_RE,
|
||||
@ -123,6 +124,9 @@ def is_possible_title(
|
||||
logger.debug("Not a title. Text is empty.")
|
||||
return False
|
||||
|
||||
if text.isupper() and ENDS_IN_PUNCT_RE.search(text) is not None:
|
||||
return False
|
||||
|
||||
title_max_word_length = int(
|
||||
os.environ.get("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", title_max_word_length),
|
||||
)
|
||||
@ -268,7 +272,7 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
|
||||
return False
|
||||
|
||||
if text.isupper():
|
||||
return False
|
||||
return True
|
||||
|
||||
# NOTE(jay-ylee) - The word_tokenize function also recognizes and separates special characters
|
||||
# into one word, causing problems with ratio measurement.
|
||||
|
Loading…
x
Reference in New Issue
Block a user