mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-18 10:44:23 +00:00
chore: use only regex for contains_english_word. (#382)
Updates the characters to split on when creating candidate English words. Now uses a regex to strip non-alphabetic characters from each word. Note: This was originally an attempt to speed up contains_english_word(), but there is no measurable change in performance.
This commit is contained in:
parent
e5dd9d5676
commit
32c79caee3
@ -1,4 +1,4 @@
|
|||||||
## 0.5.8-dev5
|
## 0.5.8-dev6
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -44,8 +44,6 @@
|
|||||||
|
|
||||||
## 0.5.6
|
## 0.5.6
|
||||||
|
|
||||||
* Fix problem with PDF partition (duplicated test)
|
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* `contains_english_word()`, used heavily in text processing, is 10x faster.
|
* `contains_english_word()`, used heavily in text processing, is 10x faster.
|
||||||
@ -57,6 +55,8 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* Fix problem with PDF partition (duplicated test)
|
||||||
|
|
||||||
## 0.5.4
|
## 0.5.4
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|||||||
@ -190,6 +190,11 @@ def test_contains_verb(text, expected, monkeypatch):
|
|||||||
("Parrot Beak", True),
|
("Parrot Beak", True),
|
||||||
("parrot beak", True),
|
("parrot beak", True),
|
||||||
("parrot!", True),
|
("parrot!", True),
|
||||||
|
("?parrot", True),
|
||||||
|
("zombie?parrot", True),
|
||||||
|
("notaWordHa 'parrot'", True),
|
||||||
|
("notaWordHa'parrot'", False),
|
||||||
|
('notaWordHa "parrot,"', True),
|
||||||
("daljdf adlfajldj ajadfa", False),
|
("daljdf adlfajldj ajadfa", False),
|
||||||
("BTAR ADFJA L", False),
|
("BTAR ADFJA L", False),
|
||||||
("Unstructured Technologies", True),
|
("Unstructured Technologies", True),
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.5.8-dev5" # pragma: no cover
|
__version__ = "0.5.8-dev6" # pragma: no cover
|
||||||
|
|||||||
@ -20,7 +20,8 @@ from unstructured.nlp.patterns import (
|
|||||||
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
|
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
|
||||||
|
|
||||||
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
|
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
|
||||||
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")
|
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")
|
||||||
|
NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]")
|
||||||
|
|
||||||
|
|
||||||
def is_possible_narrative_text(
|
def is_possible_narrative_text(
|
||||||
@ -188,11 +189,16 @@ def contains_english_word(text: str) -> bool:
|
|||||||
text = text.lower()
|
text = text.lower()
|
||||||
words = ENGLISH_WORD_SPLIT_RE.split(text)
|
words = ENGLISH_WORD_SPLIT_RE.split(text)
|
||||||
for word in words:
|
for word in words:
|
||||||
# NOTE(robinson) - to ignore punctuation at the ends of words like "best."
|
# NOTE(Crag): Remove any non-lowercase alphabetical
|
||||||
word = "".join([character for character in word if character.isalpha()])
|
# characters. These removed chars will usually be trailing or
|
||||||
|
# leading characters not already matched in ENGLISH_WORD_SPLIT_RE.
|
||||||
|
# The possessive case is also generally ok:
|
||||||
|
# "beggar's" -> "beggars" (still an english word)
|
||||||
|
# and of course:
|
||||||
|
# "'beggars'"-> "beggars" (also still an english word)
|
||||||
|
word = NON_LOWERCASE_ALPHA_RE.sub("", word)
|
||||||
if len(word) > 1 and word in ENGLISH_WORDS:
|
if len(word) > 1 and word in ENGLISH_WORDS:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user