chore: use only regex for contains_english_word. (#382)

Updates the set of characters that text is split on when building candidate English words, and now uses a regex to strip non-alphabetic characters from each candidate word.

Note: this was originally an attempt to speed up contains_english_word(), but there was no measurable change in performance.
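For illustration, a minimal, self-contained sketch of the approach described above, using the two patterns introduced in this change. The tiny ENGLISH_WORDS set here is only a stand-in for the library's full word list:

    import re

    # Patterns introduced by this change (see the diff below).
    ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")
    NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]")

    # Illustrative stand-in for the library's ENGLISH_WORDS word list.
    ENGLISH_WORDS = {"parrot", "beak", "beggars"}

    def contains_english_word(text: str) -> bool:
        text = text.lower()
        for word in ENGLISH_WORD_SPLIT_RE.split(text):
            # Strip leftover non-alphabetic characters (quotes, digits, ...).
            word = NON_LOWERCASE_ALPHA_RE.sub("", word)
            if len(word) > 1 and word in ENGLISH_WORDS:
                return True
        return False

    print(contains_english_word("zombie?parrot"))       # True  ("?" is a split character)
    print(contains_english_word("notaWordHa'parrot'"))  # False (the apostrophe is not)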
cragwolfe 2023-03-30 09:57:43 -07:00 committed by GitHub
parent e5dd9d5676
commit 32c79caee3
4 changed files with 19 additions and 8 deletions

View File

@@ -1,4 +1,4 @@
-## 0.5.8-dev5
+## 0.5.8-dev6
 ### Enhancements
@@ -44,8 +44,6 @@
## 0.5.6
* Fix problem with PDF partition (duplicated test)
### Enhancements
* `contains_english_word()`, used heavily in text processing, is 10x faster.
@@ -57,6 +55,8 @@
### Fixes
* Fix problem with PDF partition (duplicated test)
## 0.5.4
### Enhancements

View File

@@ -190,6 +190,11 @@ def test_contains_verb(text, expected, monkeypatch):
     ("Parrot Beak", True),
     ("parrot beak", True),
     ("parrot!", True),
+    ("?parrot", True),
+    ("zombie?parrot", True),
+    ("notaWordHa 'parrot'", True),
+    ("notaWordHa'parrot'", False),
+    ('notaWordHa "parrot,"', True),
     ("daljdf adlfajldj ajadfa", False),
     ("BTAR ADFJA L", False),
     ("Unstructured Technologies", True),

View File

@@ -1 +1 @@
-__version__ = "0.5.8-dev5" # pragma: no cover
+__version__ = "0.5.8-dev6" # pragma: no cover

View File

@@ -20,7 +20,8 @@ from unstructured.nlp.patterns import (
 from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
 POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
-ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")
+ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")
+NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]")
 def is_possible_narrative_text(
@@ -188,11 +189,16 @@ def contains_english_word(text: str) -> bool:
     text = text.lower()
     words = ENGLISH_WORD_SPLIT_RE.split(text)
     for word in words:
-        # NOTE(robinson) - to ignore punctuation at the ends of words like "best."
-        word = "".join([character for character in word if character.isalpha()])
+        # NOTE(Crag): Remove any non-lowercase alphabetical
+        # characters. These removed chars will usually be trailing or
+        # leading characters not already matched in ENGLISH_WORD_SPLIT_RE.
+        # The possessive case is also generally ok:
+        # "beggar's" -> "beggars" (still an english word)
+        # and of course:
+        # "'beggars'"-> "beggars" (also still an english word)
+        word = NON_LOWERCASE_ALPHA_RE.sub("", word)
         if len(word) > 1 and word in ENGLISH_WORDS:
             return True
     return False
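For comparison, a small sketch of how the old and new split patterns differ. In the old character class the unescaped hyphen sits between two "|" characters, where it appears to form a one-character range, so hyphens were never actually split characters (while "|" was); the new pattern escapes the hyphen, adds ",", "!" and "?", and uses "+" so runs of separators collapse instead of producing empty strings:

    import re

    OLD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")  # pattern removed in this change
    NEW_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")  # pattern added in this change

    text = "best-effort ... zombie?parrot"
    print(OLD_SPLIT_RE.split(text))
    # ['best-effort', '', '', '', '', 'zombie?parrot']  hyphen and "?" kept, empties from the dots
    print(NEW_SPLIT_RE.split(text))
    # ['best', 'effort', 'zombie', 'parrot']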