fix: more english words; split on punctuation (#191)

* add a bigger list of english words

* update thresholds and add tests

* update docs; bump version

* fix version

* add additional english words back in

* linting, linting, linting

* add slashes

* work -> word
This commit is contained in:
Matt Robinson 2023-02-02 12:25:47 -05:00 committed by GitHub
parent 0589344ff7
commit a7ca58e0bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 427177 additions and 19 deletions

View File

@ -1,4 +1,4 @@
## 0.4.5-dev4
## 0.4.5-dev5
* Loosen the default cap threshold to `0.5`.
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
@ -10,7 +10,7 @@
* Adds an `Address` element for capturing elements that only contain an address.
* Suppress the `UserWarning` when detectron is called.
* Checks that titles and narrative text have at least one English word.
* Checks that titles and narrative text are at least 75% alpha characters.
* Checks that titles and narrative text are at least 50% alpha characters.
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
environment variable for controlling the max number of words in a title.

View File

@ -252,7 +252,7 @@ for consideration as narrative text. The function performs the following checks
takes precedence over the kwarg.
* If the text contains too many non-alpha characters it is
not narrative text.
The default is to expect a minimum of 75% alpha characters
The default is to expect a minimum of 50% alpha characters
(not counting spaces). You can change the minimum value with the
``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_RATIO`` environment variable.
The environment variable takes precedence over the kwarg.
@ -290,7 +290,7 @@ for consideration as a title. The function performs the following checks:
the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
variable takes precedence over the kwarg.
* If a text contains too many non-alpha characters it is not a
title. The default is to expect a minimum of 75% alpha characters
title. The default is to expect a minimum of 50% alpha characters
(not counting spaces). You can change the minimum value with the
``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_TITLE_NON_ALPHA_RATIO`` environment variable.
The environment variable takes precedence over the kwarg.

View File

@ -72,4 +72,6 @@ setup(
],
"local-inference": ["unstructured-inference>=0.2.4"],
},
package_dir={"unstructured": "unstructured"},
package_data={"unstructured": ["nlp/*.txt"]},
)

View File

@ -46,8 +46,8 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
assert has_verb is expected
is_possible_narrative = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
assert is_possible_narrative is expected
@pytest.mark.parametrize(
@ -65,6 +65,9 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
("BTAR ADFJA L", False), # Doesn't have english words
("ITEM 1A. RISK FACTORS " * 15, False), # Title is too long
("/--------BREAK-------/", False), # Contains too many non-alpha characters
("1.A.RISKS", True), # Tests that "RISKS" gets flagged as an english word
("1. Unstructured Technologies", True), # Make sure we're English words :-)
("Big/Brown/Sheet", True),
],
)
def test_is_possible_title(text, expected, monkeypatch):
@ -144,11 +147,12 @@ def test_contains_verb(text, expected, monkeypatch):
("daljdf adlfajldj ajadfa", False),
("BTAR ADFJA L", False),
("Unstructured Technologies", True),
("1.A.RISKS", True), # Test crammed together words get picked up
("Big/Brown/Sheep", True),
],
)
def test_contains_english_word(text, expected, monkeypatch):
has_verb = text_type.contains_english_word(text)
assert has_verb is expected
assert text_type.contains_english_word(text) is expected
@pytest.mark.parametrize(

View File

@ -1 +1 @@
__version__ = "0.4.5-dev4" # pragma: no cover
__version__ = "0.4.5-dev5" # pragma: no cover

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,17 @@
from nltk.corpus import words as nltk_words
import pathlib
import os
from typing import List
ADDITIONAL_ENGLISH_WORDS = [
"unstructured",
"technologies",
]
ENGLISH_WORDS = nltk_words.words() + ADDITIONAL_ENGLISH_WORDS
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# NOTE(robinson) - the list of English words is based on the nltk.corpus.words corpus
# and the list of English words found at the link below. Add more words to the text
# file if needed.
# ref: https://github.com/jeremy-rifkin/Wordlist
ENGLISH_WORDS_FILE = os.path.join(DIRECTORY, "english-words.txt")
with open(ENGLISH_WORDS_FILE, "r") as f:
BASE_ENGLISH_WORDS = f.read().split("\n")
# NOTE(robinson) - add new words that we want to pass for the English check in here
ADDITIONAL_ENGLISH_WORDS: List[str] = []
ENGLISH_WORDS: List[str] = BASE_ENGLISH_WORDS + ADDITIONAL_ENGLISH_WORDS

View File

@ -1,5 +1,6 @@
"""partition.py implements logic for partitioning plain text documents into sections."""
import os
import re
import sys
from typing import List, Optional
@ -17,10 +18,11 @@ from unstructured.logger import logger
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")
def is_possible_narrative_text(
text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.75, language: str = "en"
text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.5, language: str = "en"
) -> bool:
"""Checks to see if the text passes all of the checks for a narrative text section.
You can change the cap threshold using the cap_threshold kwarg or the
@ -76,7 +78,7 @@ def is_possible_title(
text: str,
sentence_min_length: int = 5,
title_max_word_length: int = 12,
non_alpha_threshold: float = 0.75,
non_alpha_threshold: float = 0.5,
language: str = "en",
) -> bool:
"""Checks to see if the text passes all of the checks for a valid title.
@ -164,7 +166,7 @@ def contains_verb(text: str) -> bool:
def contains_english_word(text: str) -> bool:
"""Checks to see if the text contains an English word."""
text = text.lower()
words = text.split(" ")
words = ENGLISH_WORD_SPLIT_RE.split(text)
for word in words:
# NOTE(robinson) - to ignore punctuation at the ends of words like "best."
word = "".join([character for character in word if character.isalpha()])
@ -200,7 +202,7 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
return count
def under_non_alpha_ratio(text: str, threshold: float = 0.75):
def under_non_alpha_ratio(text: str, threshold: float = 0.5):
"""Checks if the proportion of non-alpha characters in the text snippet exceeds a given
threshold. This helps prevent text like "-----------BREAK---------" from being tagged
as a title or narrative text. The ratio does not count spaces.