mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-05 03:23:03 +00:00
fix: more english words; split on punctuation (#191)
* add a bigger list of english words * update thresholds and add tests * update docs; bump version * fix version * add additional english words back in * linting, linting, linting * add slashes * work -> word
This commit is contained in:
parent
0589344ff7
commit
a7ca58e0bc
@ -1,4 +1,4 @@
|
||||
## 0.4.5-dev4
|
||||
## 0.4.5-dev5
|
||||
|
||||
* Loosen the default cap threshold to `0.5`.
|
||||
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
|
||||
@ -10,7 +10,7 @@
|
||||
* Adds an `Address` element for capturing elements that only contain an address.
|
||||
* Suppress the `UserWarning` when detectron is called.
|
||||
* Checks that titles and narrative text have at least one English word.
|
||||
* Checks that titles and narrative text are at least 75% alpha characters.
|
||||
* Checks that titles and narrative text are at least 50% alpha characters.
|
||||
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
|
||||
environment variable for controlling the max number of words in a title.
|
||||
|
||||
|
||||
@ -252,7 +252,7 @@ for consideration as narrative text. The function performs the following checks
|
||||
takes precedence over the kwarg.
|
||||
* If the text contains too many non-alpha characters it is
|
||||
not narrative text.
|
||||
The default is to expect a minimum of 75% alpha characters
|
||||
The default is to expect a minimum of 50% alpha characters
|
||||
(not counting spaces). You can change the minimum value with the
|
||||
``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_RATIO`` environment variable.
|
||||
The environment variable takes precedence over the kwarg.
|
||||
@ -290,7 +290,7 @@ for consideration as a title. The function performs the following checks:
|
||||
the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
|
||||
variable takes precedence over the kwarg.
|
||||
* If a text contains too many non-alpha characters it is not a
|
||||
title. The default is to expect a minimum of 75% alpha characters
|
||||
title. The default is to expect a minimum of 50% alpha characters
|
||||
(not counting spaces). You can change the minimum value with the
|
||||
``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_TITLE_NON_ALPHA_RATIO`` environment variable.
|
||||
The environment variable takes precedence over the kwarg.
|
||||
|
||||
2
setup.py
2
setup.py
@ -72,4 +72,6 @@ setup(
|
||||
],
|
||||
"local-inference": ["unstructured-inference>=0.2.4"],
|
||||
},
|
||||
package_dir={"unstructured": "unstructured"},
|
||||
package_data={"unstructured": ["nlp/*.txt"]},
|
||||
)
|
||||
|
||||
@ -46,8 +46,8 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
|
||||
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
|
||||
monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
|
||||
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
|
||||
has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
|
||||
assert has_verb is expected
|
||||
is_possible_narrative = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
|
||||
assert is_possible_narrative is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -65,6 +65,9 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
|
||||
("BTAR ADFJA L", False), # Doesn't have english words
|
||||
("ITEM 1A. RISK FACTORS " * 15, False), # Title is too long
|
||||
("/--------BREAK-------/", False), # Contains too many non-alpha characters
|
||||
("1.A.RISKS", True), # Tests that "RISKS" gets flagged as an english word
|
||||
("1. Unstructured Technologies", True), # Make sure we're English words :-)
|
||||
("Big/Brown/Sheet", True),
|
||||
],
|
||||
)
|
||||
def test_is_possible_title(text, expected, monkeypatch):
|
||||
@ -144,11 +147,12 @@ def test_contains_verb(text, expected, monkeypatch):
|
||||
("daljdf adlfajldj ajadfa", False),
|
||||
("BTAR ADFJA L", False),
|
||||
("Unstructured Technologies", True),
|
||||
("1.A.RISKS", True), # Test crammed together words get picked up
|
||||
("Big/Brown/Sheep", True),
|
||||
],
|
||||
)
|
||||
def test_contains_english_word(text, expected, monkeypatch):
|
||||
has_verb = text_type.contains_english_word(text)
|
||||
assert has_verb is expected
|
||||
assert text_type.contains_english_word(text) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.5-dev4" # pragma: no cover
|
||||
__version__ = "0.4.5-dev5" # pragma: no cover
|
||||
|
||||
427140
unstructured/nlp/english-words.txt
Normal file
427140
unstructured/nlp/english-words.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -1,7 +1,17 @@
|
||||
from nltk.corpus import words as nltk_words
|
||||
import pathlib
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
ADDITIONAL_ENGLISH_WORDS = [
|
||||
"unstructured",
|
||||
"technologies",
|
||||
]
|
||||
ENGLISH_WORDS = nltk_words.words() + ADDITIONAL_ENGLISH_WORDS
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
# NOTE(robinson) - the list of English words is based on the nltk.corpus.words corpus
|
||||
# and the list of English words found here at the link below. Add more words to the text
|
||||
# file if needed.
|
||||
# ref: https://github.com/jeremy-rifkin/Wordlist
|
||||
ENGLISH_WORDS_FILE = os.path.join(DIRECTORY, "english-words.txt")
|
||||
|
||||
with open(ENGLISH_WORDS_FILE, "r") as f:
|
||||
BASE_ENGLISH_WORDS = f.read().split("\n")
|
||||
|
||||
# NOTE(robinson) - add new words that we want to pass for the English check in here
|
||||
ADDITIONAL_ENGLISH_WORDS: List[str] = []
|
||||
ENGLISH_WORDS: List[str] = BASE_ENGLISH_WORDS + ADDITIONAL_ENGLISH_WORDS
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
"""partition.py implements logic for partitioning plain text documents into sections."""
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
from typing import List, Optional
|
||||
@ -17,10 +18,11 @@ from unstructured.logger import logger
|
||||
|
||||
|
||||
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
|
||||
# Delimiters used to break text into candidate words: whitespace, ".", "-", "_" and "/".
# Inside a character class "|" is a literal rather than alternation, so the hyphen is
# escaped to keep "|-|" from being parsed as a range (which left "-" unmatched). The
# literal "|" stays in the class so existing pipe-splitting behavior is unchanged.
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|.\-_/]")
|
||||
|
||||
|
||||
def is_possible_narrative_text(
|
||||
text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.75, language: str = "en"
|
||||
text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.5, language: str = "en"
|
||||
) -> bool:
|
||||
"""Checks to see if the text passes all of the checks for a narrative text section.
|
||||
You can change the cap threshold using the cap_threshold kwarg or the
|
||||
@ -76,7 +78,7 @@ def is_possible_title(
|
||||
text: str,
|
||||
sentence_min_length: int = 5,
|
||||
title_max_word_length: int = 12,
|
||||
non_alpha_threshold: float = 0.75,
|
||||
non_alpha_threshold: float = 0.5,
|
||||
language: str = "en",
|
||||
) -> bool:
|
||||
"""Checks to see if the text passes all of the checks for a valid title.
|
||||
@ -164,7 +166,7 @@ def contains_verb(text: str) -> bool:
|
||||
def contains_english_word(text: str) -> bool:
|
||||
"""Checks to see if the text contains an English word."""
|
||||
text = text.lower()
|
||||
words = text.split(" ")
|
||||
words = ENGLISH_WORD_SPLIT_RE.split(text)
|
||||
for word in words:
|
||||
# NOTE(robinson) - to ignore punctuation at the ends of words like "best."
|
||||
word = "".join([character for character in word if character.isalpha()])
|
||||
@ -200,7 +202,7 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
|
||||
return count
|
||||
|
||||
|
||||
def under_non_alpha_ratio(text: str, threshold: float = 0.75):
|
||||
def under_non_alpha_ratio(text: str, threshold: float = 0.5):
|
||||
"""Checks if the proportion of non-alpha characters in the text snippet exceeds a given
|
||||
threshold. This helps prevent text like "-----------BREAK---------" from being tagged
|
||||
as a title or narrative text. The ratio does not count spaces.
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user