fix: more english words; split on punctuation (#191)

* add a bigger list of english words

* update thresholds and add tests

* update docs; bump version

* fix version

* add additional english words back in

* linting, linting, linting

* add slashes

* work -> word
This commit is contained in:
Matt Robinson 2023-02-02 12:25:47 -05:00 committed by GitHub
parent 0589344ff7
commit a7ca58e0bc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 427177 additions and 19 deletions

View File

@ -1,4 +1,4 @@
## 0.4.5-dev4
## 0.4.5-dev5
* Loosen the default cap threshold to `0.5`.
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
@ -10,7 +10,7 @@
* Adds an `Address` element for capturing elements that only contain an address.
* Suppress the `UserWarning` when detectron is called.
* Checks that titles and narrative text have at least one English word.
* Checks that titles and narrative text are at least 75% alpha characters.
* Checks that titles and narrative text are at least 50% alpha characters.
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
environment variable for controlling the max number of words in a title.

View File

@ -252,7 +252,7 @@ for consideration as narrative text. The function performs the following checks
takes precedence over the kwarg.
* If the text contains too many non-alpha characters it is
not narrative text.
The default is to expect a minimum of 75% alpha characters
The default is to expect a minimum of 50% alpha characters
(not counting spaces). You can change the minimum value with the
``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_RATIO`` environment variable.
The environment variable takes precedence over the kwarg.
@ -290,7 +290,7 @@ for consideration as a title. The function performs the following checks:
the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
variable takes precedence over the kwarg.
* If a text contains too many non-alpha characters it is not a
title. The default is to expect a minimum of 75% alpha characters
title. The default is to expect a minimum of 50% alpha characters
(not counting spaces). You can change the minimum value with the
``non_alpha_ratio`` kwarg or the ``UNSTRUCTURED_TITLE_NON_ALPHA_RATIO`` environment variable.
The environment variable takes precedence over the kwarg.

View File

@ -72,4 +72,6 @@ setup(
],
"local-inference": ["unstructured-inference>=0.2.4"],
},
package_dir={"unstructured": "unstructured"},
package_data={"unstructured": ["nlp/*.txt"]},
)

View File

@ -46,8 +46,8 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
monkeypatch.setattr(text_type, "pos_tag", mock_pos_tag)
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
has_verb = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
assert has_verb is expected
is_possible_narrative = text_type.is_possible_narrative_text(text, cap_threshold=0.3)
assert is_possible_narrative is expected
@pytest.mark.parametrize(
@ -65,6 +65,9 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
("BTAR ADFJA L", False), # Doesn't have english words
("ITEM 1A. RISK FACTORS " * 15, False), # Title is too long
("/--------BREAK-------/", False), # Contains too many non-alpha characters
("1.A.RISKS", True), # Tests that "RISKS" gets flagged as an english word
("1. Unstructured Technologies", True), # Make sure we're English words :-)
("Big/Brown/Sheet", True),
],
)
def test_is_possible_title(text, expected, monkeypatch):
@ -144,11 +147,12 @@ def test_contains_verb(text, expected, monkeypatch):
("daljdf adlfajldj ajadfa", False),
("BTAR ADFJA L", False),
("Unstructured Technologies", True),
("1.A.RISKS", True), # Test crammed together words get picked up
("Big/Brown/Sheep", True),
],
)
def test_contains_english_word(text, expected, monkeypatch):
has_verb = text_type.contains_english_word(text)
assert has_verb is expected
assert text_type.contains_english_word(text) is expected
@pytest.mark.parametrize(

View File

@ -1 +1 @@
__version__ = "0.4.5-dev4" # pragma: no cover
__version__ = "0.4.5-dev5" # pragma: no cover

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,17 @@
from nltk.corpus import words as nltk_words
import pathlib
import os
from typing import List
ADDITIONAL_ENGLISH_WORDS = [
"unstructured",
"technologies",
]
ENGLISH_WORDS = nltk_words.words() + ADDITIONAL_ENGLISH_WORDS
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# NOTE(robinson) - the list of English words is based on the nltk.corpus.words corpus
# and the list of English words found at the link below. Add more words to the text
# file if needed.
# ref: https://github.com/jeremy-rifkin/Wordlist
ENGLISH_WORDS_FILE = os.path.join(DIRECTORY, "english-words.txt")
with open(ENGLISH_WORDS_FILE, "r") as f:
BASE_ENGLISH_WORDS = f.read().split("\n")
# NOTE(robinson) - add new words that we want to pass for the English check in here
ADDITIONAL_ENGLISH_WORDS: List[str] = []
ENGLISH_WORDS: List[str] = BASE_ENGLISH_WORDS + ADDITIONAL_ENGLISH_WORDS

View File

@ -1,5 +1,6 @@
"""partition.py implements logic for partitioning plain text documents into sections."""
import os
import re
import sys
from typing import List, Optional
@ -17,10 +18,11 @@ from unstructured.logger import logger
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
ENGLISH_WORD_SPLIT_RE = re.compile(r"[\s|\.|-|_|\/]")
def is_possible_narrative_text(
text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.75, language: str = "en"
text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.5, language: str = "en"
) -> bool:
"""Checks to see if the text passes all of the checks for a narrative text section.
You can change the cap threshold using the cap_threshold kwarg or the
@ -76,7 +78,7 @@ def is_possible_title(
text: str,
sentence_min_length: int = 5,
title_max_word_length: int = 12,
non_alpha_threshold: float = 0.75,
non_alpha_threshold: float = 0.5,
language: str = "en",
) -> bool:
"""Checks to see if the text passes all of the checks for a valid title.
@ -164,7 +166,7 @@ def contains_verb(text: str) -> bool:
def contains_english_word(text: str) -> bool:
"""Checks to see if the text contains an English word."""
text = text.lower()
words = text.split(" ")
words = ENGLISH_WORD_SPLIT_RE.split(text)
for word in words:
# NOTE(robinson) - to ignore punctuation at the ends of words like "best."
word = "".join([character for character in word if character.isalpha()])
@ -200,7 +202,7 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
return count
def under_non_alpha_ratio(text: str, threshold: float = 0.75):
def under_non_alpha_ratio(text: str, threshold: float = 0.5):
"""Checks if the proportion of non-alpha characters in the text snippet exceeds a given
threshold. This helps prevent text like "-----------BREAK---------" from being tagged
as a title or narrative text. The ratio does not count spaces.