mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-18 10:44:23 +00:00
chore: use only regex for contains_english_word. (#382)
Updates the characters to split on when creating candidate English words. Now uses a regex to strip non-alphabetic characters from each word. Note: This was originally an attempt to speed up contains_english_word(), but there is no measurable change in performance.
This commit is contained in:
parent
e5dd9d5676
commit
32c79caee3
@ -1,4 +1,4 @@
|
|||||||
## 0.5.8-dev5
|
## 0.5.8-dev6
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
@ -44,8 +44,6 @@
|
|||||||
|
|
||||||
## 0.5.6
|
## 0.5.6
|
||||||
|
|
||||||
* Fix problem with PDF partition (duplicated test)
|
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* `contains_english_word()`, used heavily in text processing, is 10x faster.
|
* `contains_english_word()`, used heavily in text processing, is 10x faster.
|
||||||
@ -57,6 +55,8 @@
|
|||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
|
* Fix problem with PDF partition (duplicated test)
|
||||||
|
|
||||||
## 0.5.4
|
## 0.5.4
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|||||||
@ -190,6 +190,11 @@ def test_contains_verb(text, expected, monkeypatch):
|
|||||||
("Parrot Beak", True),
|
("Parrot Beak", True),
|
||||||
("parrot beak", True),
|
("parrot beak", True),
|
||||||
("parrot!", True),
|
("parrot!", True),
|
||||||
|
("?parrot", True),
|
||||||
|
("zombie?parrot", True),
|
||||||
|
("notaWordHa 'parrot'", True),
|
||||||
|
("notaWordHa'parrot'", False),
|
||||||
|
('notaWordHa "parrot,"', True),
|
||||||
("daljdf adlfajldj ajadfa", False),
|
("daljdf adlfajldj ajadfa", False),
|
||||||
("BTAR ADFJA L", False),
|
("BTAR ADFJA L", False),
|
||||||
("Unstructured Technologies", True),
|
("Unstructured Technologies", True),
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
__version__ = "0.5.8-dev5" # pragma: no cover
|
__version__ = "0.5.8-dev6" # pragma: no cover
|
||||||
|
|||||||
@ -20,7 +20,8 @@ from unstructured.nlp.patterns import (
|
|||||||
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
|
from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
|
||||||
|
|
||||||
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
|
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
|
||||||
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")
|
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s\-,.!?_\/]+")
|
||||||
|
NON_LOWERCASE_ALPHA_RE = re.compile(r"[^a-z]")
|
||||||
|
|
||||||
|
|
||||||
def is_possible_narrative_text(
|
def is_possible_narrative_text(
|
||||||
@ -188,11 +189,16 @@ def contains_english_word(text: str) -> bool:
|
|||||||
text = text.lower()
|
text = text.lower()
|
||||||
words = ENGLISH_WORD_SPLIT_RE.split(text)
|
words = ENGLISH_WORD_SPLIT_RE.split(text)
|
||||||
for word in words:
|
for word in words:
|
||||||
# NOTE(robinson) - to ignore punctuation at the ends of words like "best."
|
# NOTE(Crag): Remove any non-lowercase alphabetical
|
||||||
word = "".join([character for character in word if character.isalpha()])
|
# characters. These removed chars will usually be trailing or
|
||||||
|
# leading characters not already matched in ENGLISH_WORD_SPLIT_RE.
|
||||||
|
# The possessive case is also generally ok:
|
||||||
|
# "beggar's" -> "beggars" (still an english word)
|
||||||
|
# and of course:
|
||||||
|
# "'beggars'"-> "beggars" (also still an english word)
|
||||||
|
word = NON_LOWERCASE_ALPHA_RE.sub("", word)
|
||||||
if len(word) > 1 and word in ENGLISH_WORDS:
|
if len(word) > 1 and word in ENGLISH_WORDS:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user