diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a20aa043..9dccda63c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.5.8-dev5 +## 0.5.8-dev6 ### Enhancements @@ -44,8 +44,6 @@ ## 0.5.6 -* Fix problem with PDF partition (duplicated test) - ### Enhancements * `contains_english_word()`, used heavily in text processing, is 10x faster. @@ -57,6 +55,8 @@ ### Fixes +* Fix problem with PDF partition (duplicated test) + ## 0.5.4 ### Enhancements diff --git a/test_unstructured/partition/test_text_type.py b/test_unstructured/partition/test_text_type.py index c4633d51e..1b8e52de6 100644 --- a/test_unstructured/partition/test_text_type.py +++ b/test_unstructured/partition/test_text_type.py @@ -190,6 +190,11 @@ def test_contains_verb(text, expected, monkeypatch): ("Parrot Beak", True), ("parrot beak", True), ("parrot!", True), + ("?parrot", True), + ("zombie?parrot", True), + ("notaWordHa 'parrot'", True), + ("notaWordHa'parrot'", False), + ('notaWordHa "parrot,"', True), ("daljdf adlfajldj ajadfa", False), ("BTAR ADFJA L", False), ("Unstructured Technologies", True), diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ba72c0621..ed263d35c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.5.8-dev5" # pragma: no cover +__version__ = "0.5.8-dev6" # pragma: no cover diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index 465098748..3099dab5f 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -20,7 +20,8 @@ from unstructured.nlp.patterns import ( from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"] -ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]") +ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+") +NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]") def is_possible_narrative_text( @@ -188,11 +189,16 @@ def 
contains_english_word(text: str) -> bool: text = text.lower() words = ENGLISH_WORD_SPLIT_RE.split(text) for word in words: - # NOTE(robinson) - to ignore punctuation at the ends of words like "best." - word = "".join([character for character in word if character.isalpha()]) + # NOTE(Crag): Remove every character that is not a lowercase + # ASCII letter (a-z). These removed chars will usually be trailing or + # leading characters not already matched in ENGLISH_WORD_SPLIT_RE. + # The possessive case is also generally ok: + # "beggar's" -> "beggars" (still an English word) + # and of course: + # "'beggars'" -> "beggars" (also still an English word) + word = NON_LOWERCASE_ALPHA_RE.sub("", word) if len(word) > 1 and word in ENGLISH_WORDS: return True - return False