fix: require a minimum prop of alpha characters for titles and narrative text (#190)

* added alpha ratio check * added tests for alpha ratio * bump changelog and update docs * update changelog/version; update docs * ofr -> or
2025-12-27 15:13:35 +00:00 · 2023-02-02 09:59:04 -05:00 · 2023-02-02 09:59:04 -05:00 · 0589344ff7
commit 0589344ff7
parent 1230a163fd
5 changed files with 107 additions and 16 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,4 +1,4 @@
-## 0.4.5-dev3
+## 0.4.5-dev4

 * Loosen the default cap threshold to `0.5`.
 * Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
@ -10,6 +10,7 @@
 * Adds an `Address` element for capturing elements that only contain an address.
 * Suppress the `UserWarning` when detectron is called.
 * Checks that titles and narrative text have at least one English word.
+* Checks that titles and narrative text are at least 75% alpha characters.
 * Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
  environment variable for controlling the max number of words in a title.

--- a/docs/source/bricks.rst
+++ b/docs/source/bricks.rst
@ -250,6 +250,12 @@ for consideration as narrative text. The function performs the following checks
  ``cap_threshold=1.0``. You can also set the threshold by using the
  ``UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable
  takes precedence over the kwarg.
+* If the text contains too many non-alpha characters it is
+  not narrative text.
+  The default is to expect a minimum of 75% alpha characters
+  (not counting spaces). You can change the minimum value with the
+  ``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD`` environment variable.
+  The environment variable takes precedence over the kwarg.
 * The cap ratio test does not apply to text that is all uppercase.


@ -280,9 +286,14 @@ for consideration as a title. The function performs the following checks:

 * Empty text cannot be a title
 * Text that is all numeric cannot be a title.
-* If a title contains too many words it is not a title. The default max length is ``15``. You can change the max length with
+* If a title contains too many words it is not a title. The default max length is ``12``. You can change the max length with
  the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
  variable takes precedence over the kwarg.
+* If the text contains too many non-alpha characters it is not a
+  title. The default is to expect a minimum of 75% alpha characters
+  (not counting spaces). You can change the minimum value with the
+  ``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD`` environment variable.
+  The environment variable takes precedence over the kwarg.
 * Narrative text must contain at least one English word (if ``language`` is set to "en")
 * If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5.
 * If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles.
--- a/test_unstructured/partition/test_text_type.py
+++ b/test_unstructured/partition/test_text_type.py
@ -38,6 +38,7 @@ def test_headings_are_not_narrative_text(text, expected):
        ("7", False),  # Fails because it is numeric
        ("intellectual property", False),  # Fails because it does not contain a verb
        ("Dal;kdjfal adawels adfjwalsdf. Addad jaja fjawlek", False),
+        ("---------------Aske the teacher for an apple----------", False),  # Too many non-alpha
        ("", False),  # Doesn't have english words  # Fails because it is empty
    ],
 )
@ -63,13 +64,13 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
        ("To My Dearest Friends,", False),  # Ends with a comma
        ("BTAR ADFJA L", False),  # Doesn't have english words
        ("ITEM 1A. RISK FACTORS " * 15, False),  # Title is too long
+        ("/--------BREAK-------/", False),  # Contains too many non-alpha characters
    ],
 )
 def test_is_possible_title(text, expected, monkeypatch):
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
-    has_verb = text_type.is_possible_title(text)
-    assert has_verb is expected
+    assert text_type.is_possible_title(text) is expected


@pytest.mark.parametrize(
@ -178,6 +179,30 @@ def test_set_caps_ratio_with_environment_variable(monkeypatch):
    mock_exceeds.assert_called_once_with(text, threshold=0.8)


+def test_set_title_non_alpha_threshold_with_environment_variable(monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    monkeypatch.setenv("UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD", 0.8)
+
+    text = "/--------------- All the king's horses----------------/"
+    with patch.object(text_type, "under_non_alpha_ratio", return_value=False) as mock_exceeds:
+        text_type.is_possible_title(text)
+
+    mock_exceeds.assert_called_once_with(text, threshold=0.8)
+
+
+def test_set_narrative_text_non_alpha_threshold_with_environment_variable(monkeypatch):
+    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
+    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
+    monkeypatch.setenv("UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD", 0.8)
+
+    text = "/--------------- All the king's horses----------------/"
+    with patch.object(text_type, "under_non_alpha_ratio", return_value=False) as mock_exceeds:
+        text_type.is_possible_narrative_text(text)
+
+    mock_exceeds.assert_called_once_with(text, threshold=0.8)
+
+
 def test_set_title_max_word_length_with_environment_variable(monkeypatch):
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
--- a/unstructured/version.py
+++ b/unstructured/version.py
@ -1 +1 @@
-__version__ = "0.4.5-dev3"  # pragma: no cover
+__version__ = "0.4.5-dev4"  # pragma: no cover
--- a/unstructured/partition/text_type.py
+++ b/unstructured/partition/text_type.py
@ -19,7 +19,9 @@ from unstructured.logger import logger
 POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]


-def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language: str = "en") -> bool:
+def is_possible_narrative_text(
+    text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.75, language: str = "en"
+) -> bool:
    """Checks to see if the text passes all of the checks for a narrative text section.
    You can change the cap threshold using the cap_threshold kwarg or the
    NARRATIVE_TEXT_CAP_THRESHOLD environment variable. The environment variable takes
@ -28,11 +30,14 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
    Parameters
    ----------
    text
-        the input text
+        The input text to check
    cap_threshold
-        the percentage of capitalized words necessary to disqualify the segment as narrative
+        The percentage of capitalized words necessary to disqualify the segment as narrative
+    non_alpha_threshold
+        The minimum proportion of alpha characters the text needs to be considered
+        narrative text
    language
-        the two letter language code for the text. defaults to "en" for English
+        The two letter language code for the text. defaults to "en" for English
    """
    if len(text) == 0:
        logger.debug("Not narrative. Text is empty.")
@ -54,6 +59,12 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
        logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
        return False

+    non_alpha_threshold = float(
+        os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD", non_alpha_threshold)
+    )
+    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
+        return False
+
    if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
        logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
        return False
@ -62,20 +73,26 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:


 def is_possible_title(
-    text: str, sentence_min_length: int = 5, title_max_word_length: int = 12, language: str = "en"
+    text: str,
+    sentence_min_length: int = 5,
+    title_max_word_length: int = 12,
+    non_alpha_threshold: float = 0.75,
+    language: str = "en",
 ) -> bool:
    """Checks to see if the text passes all of the checks for a valid title.

    Parameters
    ----------
    text
-        the input text
+        The input text to check
    sentence_min_length
-        the minimum number of words required to consider a section of text a sentence
+        The minimum number of words required to consider a section of text a sentence
    title_max_word_length
-        the maximum number of words a title can contain
+        The maximum number of words a title can contain
+    non_alpha_threshold
+        The minimum proportion of alpha characters the text needs to be considered a title
    language
-        the two letter language code for the text. defaults to "en" for English
+        The two letter language code for the text. defaults to "en" for English
    """
    if len(text) == 0:
        logger.debug("Not a title. Text is empty.")
@ -89,6 +106,12 @@ def is_possible_title(
    if len(text.split(" ")) > title_max_word_length:
        return False

+    non_alpha_threshold = float(
+        os.environ.get("UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD", non_alpha_threshold)
+    )
+    if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
+        return False
+
    # NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
    if text.endswith(","):
        return False
@ -177,9 +200,40 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
    return count


+def under_non_alpha_ratio(text: str, threshold: float = 0.75):
+    """Checks if the proportion of alpha characters in the text snippet falls below a given
+    threshold. This helps prevent text like "-----------BREAK---------" from being tagged
+    as a title or narrative text. The ratio does not count spaces.
+
+    Parameters
+    ----------
+    text
+        The input string to test
+    threshold
+        If the proportion of alpha characters (ignoring whitespace) is below this
+        threshold, the function returns True
+    """
+    if len(text) == 0:
+        return False
+
+    alpha_count = len([char for char in text if char.strip() and char.isalpha()])
+    total_count = len([char for char in text if char.strip()])
+    ratio = alpha_count / total_count
+    return ratio < threshold
+
+
 def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
-    """Checks the title ratio in a section of text. If a sufficient proportion of the text is
-    capitalized."""
+    """Checks the title ratio in a section of text. If a sufficient proportion of the words
+    are capitalized, that can be indicative of non-narrative text (e.g. "1A. Risk Factors").
+
+    Parameters
+    ----------
+    text
+        The input string to test
+    threshold
+        If the percentage of words beginning with a capital letter exceeds this threshold,
+        the function returns True
+    """
    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
    # The assumption is that sections with multiple sentences are not titles.
    if sentence_count(text, 3) > 1: