mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 23:51:47 +00:00
chore: use only regex for contains_english_word. (#382)
Updates the characters used to split text when creating candidate English words. Now uses a regex to strip non-alphabetic characters from each word. Note: This was originally an attempt to speed up contains_english_word(), but there is no measurable change in performance.
This commit is contained in:
parent
e5dd9d5676
commit
32c79caee3
@ -1,4 +1,4 @@
|
||||
## 0.5.8-dev5
|
||||
## 0.5.8-dev6
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -44,8 +44,6 @@
|
||||
|
||||
## 0.5.6
|
||||
|
||||
* Fix problem with PDF partition (duplicated test)
|
||||
|
||||
### Enhancements
|
||||
|
||||
* `contains_english_word()`, used heavily in text processing, is 10x faster.
|
||||
@ -57,6 +55,8 @@
|
||||
|
||||
### Fixes
|
||||
|
||||
* Fix problem with PDF partition (duplicated test)
|
||||
|
||||
## 0.5.4
|
||||
|
||||
### Enhancements
|
||||
|
||||
@ -190,6 +190,11 @@ def test_contains_verb(text, expected, monkeypatch):
|
||||
("Parrot Beak", True),
|
||||
("parrot beak", True),
|
||||
("parrot!", True),
|
||||
("?parrot", True),
|
||||
("zombie?parrot", True),
|
||||
("notaWordHa 'parrot'", True),
|
||||
("notaWordHa'parrot'", False),
|
||||
('notaWordHa "parrot,"', True),
|
||||
("daljdf adlfajldj ajadfa", False),
|
||||
("BTAR ADFJA L", False),
|
||||
("Unstructured Technologies", True),
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.5.8-dev5" # pragma: no cover
|
||||
__version__ = "0.5.8-dev6" # pragma: no cover
|
||||
|
||||
@ -20,7 +20,8 @@ from unstructured.nlp.patterns import (
|
||||
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
|
||||
|
||||
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
|
||||
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")
|
||||
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")
|
||||
NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]")
|
||||
|
||||
|
||||
def is_possible_narrative_text(
|
||||
@ -188,11 +189,16 @@ def contains_english_word(text: str) -> bool:
|
||||
text = text.lower()
|
||||
words = ENGLISH_WORD_SPLIT_RE.split(text)
|
||||
for word in words:
|
||||
# NOTE(robinson) - to ignore punctuation at the ends of words like "best."
|
||||
word = "".join([character for character in word if character.isalpha()])
|
||||
# NOTE(Crag): Remove any non-lowercase alphabetical
|
||||
# characters. These removed chars will usually be trailing or
|
||||
# leading characters not already matched in ENGLISH_WORD_SPLIT_RE.
|
||||
# The possessive case is also generally ok:
|
||||
# "beggar's" -> "beggars" (still an english word)
|
||||
# and of course:
|
||||
# "'beggars'"-> "beggars" (also still an english word)
|
||||
word = NON_LOWERCASE_ALPHA_RE.sub("", word)
|
||||
if len(word) > 1 and word in ENGLISH_WORDS:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user