fix: require a minimum prop of alpha characters for titles and narrative text (#190)

* added alpha ratio check

* added tests for alpha ratio

* bump changelog and update docs

* update changelog/version; update docs

* ofr -> or
This commit is contained in:
Matt Robinson 2023-02-02 09:59:04 -05:00 committed by GitHub
parent 1230a163fd
commit 0589344ff7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 107 additions and 16 deletions

View File

@ -1,4 +1,4 @@
## 0.4.5-dev3
## 0.4.5-dev4
* Loosen the default cap threshold to `0.5`.
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
@ -10,6 +10,7 @@
* Adds an `Address` element for capturing elements that only contain an address.
* Suppress the `UserWarning` when detectron is called.
* Checks that titles and narrative text have at least one English word.
* Checks that titles and narrative text are at least 75% alpha characters.
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
environment variable for controlling the max number of words in a title.

View File

@ -250,6 +250,12 @@ for consideration as narrative text. The function performs the following checks
``cap_threshold=1.0``. You can also set the threshold by using the
``UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable
takes precedence over the kwarg.
* If the text contains too many non-alpha characters it is
not narrative text.
The default is to expect a minimum of 75% alpha characters
(not counting spaces). You can change the minimum value with the
``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD`` environment variable.
The environment variable takes precedence over the kwarg.
* The cap ratio test does not apply to text that is all uppercase.
@ -280,9 +286,14 @@ for consideration as a title. The function performs the following checks:
* Empty text cannot be a title
* Text that is all numeric cannot be a title.
* If a title contains too many words it is not a title. The default max length is ``15``. You can change the max length with
* If a title contains too many words it is not a title. The default max length is ``12``. You can change the max length with
the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
variable takes precedence over the kwarg.
* If the text contains too many non-alpha characters it is not a
title. The default is to expect a minimum of 75% alpha characters
(not counting spaces). You can change the minimum value with the
``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD`` environment variable.
The environment variable takes precedence over the kwarg.
* Titles must contain at least one English word (if ``language`` is set to "en")
* If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5.
* If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles.

View File

@ -38,6 +38,7 @@ def test_headings_are_not_narrative_text(text, expected):
("7", False), # Fails because it is numeric
("intellectual property", False), # Fails because it does not contain a verb
("Dal;kdjfal adawels adfjwalsdf. Addad jaja fjawlek", False),
("---------------Aske the teacher for an apple----------", False), # Too many non-alpha
("", False), # Doesn't have english words # Fails because it is empty
],
)
@ -63,13 +64,13 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
("To My Dearest Friends,", False), # Ends with a comma
("BTAR ADFJA L", False), # Doesn't have english words
("ITEM 1A. RISK FACTORS " * 15, False), # Title is too long
("/--------BREAK-------/", False), # Contains too many non-alpha characters
],
)
def test_is_possible_title(text, expected, monkeypatch):
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
has_verb = text_type.is_possible_title(text)
assert has_verb is expected
assert text_type.is_possible_title(text) is expected
@pytest.mark.parametrize(
@ -178,6 +179,30 @@ def test_set_caps_ratio_with_environment_variable(monkeypatch):
mock_exceeds.assert_called_once_with(text, threshold=0.8)
def test_set_title_non_alpha_threshold_with_environment_variable(monkeypatch):
    """The UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD environment variable sets the threshold
    that is_possible_title passes to under_non_alpha_ratio."""
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    # NOTE - Environment values are strings. Passing a float makes pytest emit a warning
    # and coerce the value implicitly, so pass the string form explicitly.
    monkeypatch.setenv("UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD", "0.8")

    text = "/--------------- All the king's horses----------------/"
    with patch.object(text_type, "under_non_alpha_ratio", return_value=False) as mock_under:
        text_type.is_possible_title(text)

    # The string from the environment is converted to a float before the ratio check.
    mock_under.assert_called_once_with(text, threshold=0.8)
def test_set_narrative_text_non_alpha_threshold_with_environment_variable(monkeypatch):
    """The UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD environment variable sets the
    threshold that is_possible_narrative_text passes to under_non_alpha_ratio."""
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    # NOTE - Environment values are strings. Passing a float makes pytest emit a warning
    # and coerce the value implicitly, so pass the string form explicitly.
    monkeypatch.setenv("UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD", "0.8")

    text = "/--------------- All the king's horses----------------/"
    with patch.object(text_type, "under_non_alpha_ratio", return_value=False) as mock_under:
        text_type.is_possible_narrative_text(text)

    # The string from the environment is converted to a float before the ratio check.
    mock_under.assert_called_once_with(text, threshold=0.8)
def test_set_title_max_word_length_with_environment_variable(monkeypatch):
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)

View File

@ -1 +1 @@
__version__ = "0.4.5-dev3" # pragma: no cover
__version__ = "0.4.5-dev4" # pragma: no cover

View File

@ -19,7 +19,9 @@ from unstructured.logger import logger
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language: str = "en") -> bool:
def is_possible_narrative_text(
text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.75, language: str = "en"
) -> bool:
"""Checks to see if the text passes all of the checks for a narrative text section.
You can change the cap threshold using the cap_threshold kwarg or the
NARRATIVE_TEXT_CAP_THRESHOLD environment variable. The environment variable takes
@ -28,11 +30,14 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
Parameters
----------
text
the input text
The input text to check
cap_threshold
the percentage of capitalized words necessary to disqualify the segment as narrative
The percentage of capitalized words necessary to disqualify the segment as narrative
non_alpha_threshold
The minimum proportion of alpha characters the text needs to be considered
narrative text
language
the two letter language code for the text. defaults to "en" for English
The two letter language code for the text. defaults to "en" for English
"""
if len(text) == 0:
logger.debug("Not narrative. Text is empty.")
@ -54,6 +59,12 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
return False
non_alpha_threshold = float(
os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD", non_alpha_threshold)
)
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
return False
if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
return False
@ -62,20 +73,26 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
def is_possible_title(
text: str, sentence_min_length: int = 5, title_max_word_length: int = 12, language: str = "en"
text: str,
sentence_min_length: int = 5,
title_max_word_length: int = 12,
non_alpha_threshold: float = 0.75,
language: str = "en",
) -> bool:
"""Checks to see if the text passes all of the checks for a valid title.
Parameters
----------
text
the input text
The input text to check
sentence_min_length
the minimum number of words required to consider a section of text a sentence
The minimum number of words required to consider a section of text a sentence
title_max_word_length
the maximum number of words a title can contain
The maximum number of words a title can contain
non_alpha_threshold
The minimum number of alpha characters the text needs to be considered a title
language
the two letter language code for the text. defaults to "en" for English
The two letter language code for the text. defaults to "en" for English
"""
if len(text) == 0:
logger.debug("Not a title. Text is empty.")
@ -89,6 +106,12 @@ def is_possible_title(
if len(text.split(" ")) > title_max_word_length:
return False
non_alpha_threshold = float(
os.environ.get("UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD", non_alpha_threshold)
)
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
return False
# NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
if text.endswith(","):
return False
@ -177,9 +200,40 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
return count
def under_non_alpha_ratio(text: str, threshold: float = 0.75) -> bool:
    """Checks if the proportion of alpha characters in the text snippet falls below a given
    threshold. This helps prevent text like "-----------BREAK---------" from being tagged
    as a title or narrative text. The ratio does not count spaces.

    Parameters
    ----------
    text
        The input string to test
    threshold
        The minimum proportion of alpha characters expected among the non-whitespace
        characters. If the actual proportion is below this threshold, the function
        returns True

    Returns
    -------
    True if the proportion of alpha characters is strictly less than the threshold. False
    otherwise, including when the text is empty or contains only whitespace.
    """
    # NOTE - Whitespace is excluded from both the numerator and the denominator.
    non_space_chars = [char for char in text if not char.isspace()]

    # Empty and whitespace-only text has no characters to measure. Returning early also
    # avoids dividing by zero below.
    if not non_space_chars:
        return False

    alpha_count = sum(1 for char in non_space_chars if char.isalpha())
    ratio = alpha_count / len(non_space_chars)
    return ratio < threshold
def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
"""Checks the title ratio in a section of text. If a sufficient proportion of the text is
capitalized."""
"""Checks the title ratio in a section of text. If a sufficient proportion of the words
are capitalized, that can be indiciated on non-narrative text (i.e. "1A. Risk Factors").
Parameters
----------
text
The input string to test
threshold
If the percentage of words beginning with a capital letter exceeds this threshold,
the function returns True
"""
# NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
# The assumption is that sections with multiple sentences are not titles.
if sentence_count(text, 3) > 1: