chore: use only regex for contains_english_word. (#382)

Updates the set of characters that text is split on when building candidate English words, and now uses a regex to strip non-alphabetic characters from each candidate word.

Note: this was originally an attempt to speed up contains_english_word(), but there was no measurable change in performance.
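For illustration, a minimal, self-contained sketch of the approach described above, using the two patterns introduced in this change. The tiny ENGLISH_WORDS set here is only a stand-in for the library's full word list:

    import re

    # Patterns introduced by this change (see the diff below).
    ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")
    NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]")

    # Illustrative stand-in for the library's ENGLISH_WORDS word list.
    ENGLISH_WORDS = {"parrot", "beak", "beggars"}

    def contains_english_word(text: str) -> bool:
        text = text.lower()
        for word in ENGLISH_WORD_SPLIT_RE.split(text):
            # Strip leftover non-alphabetic characters (quotes, digits, ...).
            word = NON_LOWERCASE_ALPHA_RE.sub("", word)
            if len(word) > 1 and word in ENGLISH_WORDS:
                return True
        return False

    print(contains_english_word("zombie?parrot"))       # True  ("?" is a split character)
    print(contains_english_word("notaWordHa'parrot'"))  # False (the apostrophe is not)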
cragwolfe 2023-03-30 09:57:43 -07:00 committed by GitHub
parent e5dd9d5676
commit 32c79caee3
4 changed files with 19 additions and 8 deletions

View File

@@ -1,4 +1,4 @@
-## 0.5.8-dev5
+## 0.5.8-dev6
 ### Enhancements
@@ -44,8 +44,6 @@
## 0.5.6
* Fix problem with PDF partition (duplicated test)
### Enhancements
* `contains_english_word()`, used heavily in text processing, is 10x faster.
@@ -57,6 +55,8 @@
### Fixes
* Fix problem with PDF partition (duplicated test)
## 0.5.4
### Enhancements

View File

@@ -190,6 +190,11 @@ def test_contains_verb(text, expected, monkeypatch):
     ("Parrot Beak", True),
     ("parrot beak", True),
     ("parrot!", True),
+    ("?parrot", True),
+    ("zombie?parrot", True),
+    ("notaWordHa 'parrot'", True),
+    ("notaWordHa'parrot'", False),
+    ('notaWordHa "parrot,"', True),
     ("daljdf adlfajldj ajadfa", False),
     ("BTAR ADFJA L", False),
     ("Unstructured Technologies", True),

View File

@@ -1 +1 @@
-__version__ = "0.5.8-dev5" # pragma: no cover
+__version__ = "0.5.8-dev6" # pragma: no cover

View File

@@ -20,7 +20,8 @@ from unstructured.nlp.patterns import (
 from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
 POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
-ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")
+ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")
+NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]")
 def is_possible_narrative_text(
@@ -188,11 +189,16 @@ def contains_english_word(text: str) -> bool:
     text = text.lower()
     words = ENGLISH_WORD_SPLIT_RE.split(text)
     for word in words:
-        # NOTE(robinson) - to ignore punctuation at the ends of words like "best."
-        word = "".join([character for character in word if character.isalpha()])
+        # NOTE(Crag): Remove any non-lowercase alphabetical
+        # characters. These removed chars will usually be trailing or
+        # leading characters not already matched in ENGLISH_WORD_SPLIT_RE.
+        # The possessive case is also generally ok:
+        # "beggar's" -> "beggars" (still an english word)
+        # and of course:
+        # "'beggars'"-> "beggars" (also still an english word)
+        word = NON_LOWERCASE_ALPHA_RE.sub("", word)
         if len(word) > 1 and word in ENGLISH_WORDS:
             return True
     return False
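For comparison, a small sketch of how the old and new split patterns differ. In the old character class the unescaped hyphen sits between two "|" characters, where it appears to form a one-character range, so hyphens were never actually split characters (while "|" was); the new pattern escapes the hyphen, adds ",", "!" and "?", and uses "+" so runs of separators collapse instead of producing empty strings:

    import re

    OLD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")  # pattern removed in this change
    NEW_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")  # pattern added in this change

    text = "best-effort ... zombie?parrot"
    print(OLD_SPLIT_RE.split(text))
    # ['best-effort', '', '', '', '', 'zombie?parrot']  hyphen and "?" kept, empties from the dots
    print(NEW_SPLIT_RE.split(text))
    # ['best', 'effort', 'zombie', 'parrot']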