mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 07:03:52 +00:00
fix: require a minimum prop of alpha characters for titles and narrative text (#190)
* added alpha ratio check * added tests for alpha ratio * bump changelog and update docs * update changelog/version; update docs * ofr -> or
This commit is contained in:
parent
1230a163fd
commit
0589344ff7
@ -1,4 +1,4 @@
|
||||
## 0.4.5-dev3
|
||||
## 0.4.5-dev4
|
||||
|
||||
* Loosen the default cap threshold to `0.5`.
|
||||
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
|
||||
@ -10,6 +10,7 @@
|
||||
* Adds an `Address` element for capturing elements that only contain an address.
|
||||
* Suppress the `UserWarning` when detectron is called.
|
||||
* Checks that titles and narrative text have at least one English word.
|
||||
* Checks that titles and narrative text are at least 75% alpha characters.
|
||||
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
|
||||
environment variable for controlling the max number of words in a title.
|
||||
|
||||
|
||||
@ -250,6 +250,12 @@ for consideration as narrative text. The function performs the following checks
|
||||
``cap_threshold=1.0``. You can also set the threshold by using the
|
||||
``UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable
|
||||
takes precedence over the kwarg.
|
||||
* If the text contains too many non-alpha characters it is
|
||||
not narrative text.
|
||||
The default is to expect a minimum of 75% alpha characters
|
||||
(not counting spaces). You can change the minimum value with the
|
||||
``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD`` environment variable.
|
||||
The environment variable takes precedence over the kwarg.
|
||||
* The cap ratio test does not apply to text that is all uppercase.
|
||||
|
||||
|
||||
@ -280,9 +286,14 @@ for consideration as a title. The function performs the following checks:
|
||||
|
||||
* Empty text cannot be a title
|
||||
* Text that is all numeric cannot be a title.
|
||||
* If a title contains too many words it is not a title. The default max length is ``15``. You can change the max length with
|
||||
* If a title contains too many words it is not a title. The default max length is ``12``. You can change the max length with
|
||||
the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
|
||||
variable takes precedence over the kwarg.
|
||||
* If the text contains too many non-alpha characters it is not a
|
||||
title. The default is to expect a minimum of 75% alpha characters
|
||||
(not counting spaces). You can change the minimum value with the
|
||||
``non_alpha_threshold`` kwarg or the ``UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD`` environment variable.
|
||||
The environment variable takes precedence over the kwarg.
|
||||
* A title must contain at least one English word (if ``language`` is set to "en")
|
||||
* If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5.
|
||||
* If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles.
|
||||
|
||||
@ -38,6 +38,7 @@ def test_headings_are_not_narrative_text(text, expected):
|
||||
("7", False), # Fails because it is numeric
|
||||
("intellectual property", False), # Fails because it does not contain a verb
|
||||
("Dal;kdjfal adawels adfjwalsdf. Addad jaja fjawlek", False),
|
||||
("---------------Aske the teacher for an apple----------", False), # Too many non-alpha
|
||||
("", False), # Doesn't have english words # Fails because it is empty
|
||||
],
|
||||
)
|
||||
@ -63,13 +64,13 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
|
||||
("To My Dearest Friends,", False), # Ends with a comma
|
||||
("BTAR ADFJA L", False), # Doesn't have english words
|
||||
("ITEM 1A. RISK FACTORS " * 15, False), # Title is too long
|
||||
("/--------BREAK-------/", False), # Contains too many non-alpha characters
|
||||
],
|
||||
)
|
||||
def test_is_possible_title(text, expected, monkeypatch):
|
||||
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
|
||||
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
|
||||
has_verb = text_type.is_possible_title(text)
|
||||
assert has_verb is expected
|
||||
assert text_type.is_possible_title(text) is expected
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@ -178,6 +179,30 @@ def test_set_caps_ratio_with_environment_variable(monkeypatch):
|
||||
mock_exceeds.assert_called_once_with(text, threshold=0.8)
|
||||
|
||||
|
||||
def test_set_title_non_alpha_threshold_with_environment_variable(monkeypatch):
    """The UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD environment variable should be parsed
    and forwarded to under_non_alpha_ratio as the threshold kwarg."""
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    # NOTE: environment variable values are always strings. Passing a float makes
    # pytest emit a warning and coerce the value implicitly, so pass the string form.
    monkeypatch.setenv("UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD", "0.8")

    text = "/--------------- All the king's horses----------------/"
    with patch.object(text_type, "under_non_alpha_ratio", return_value=False) as mock_under:
        text_type.is_possible_title(text)

    # The string "0.8" from the environment must arrive as the float 0.8
    mock_under.assert_called_once_with(text, threshold=0.8)
|
||||
|
||||
|
||||
def test_set_narrative_text_non_alpha_threshold_with_environment_variable(monkeypatch):
    """The UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD environment variable should be
    parsed and forwarded to under_non_alpha_ratio as the threshold kwarg."""
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    # NOTE: environment variable values are always strings. Passing a float makes
    # pytest emit a warning and coerce the value implicitly, so pass the string form.
    monkeypatch.setenv("UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD", "0.8")

    text = "/--------------- All the king's horses----------------/"
    with patch.object(text_type, "under_non_alpha_ratio", return_value=False) as mock_under:
        text_type.is_possible_narrative_text(text)

    # The string "0.8" from the environment must arrive as the float 0.8
    mock_under.assert_called_once_with(text, threshold=0.8)
|
||||
|
||||
|
||||
def test_set_title_max_word_length_with_environment_variable(monkeypatch):
|
||||
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
|
||||
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.5-dev3" # pragma: no cover
|
||||
__version__ = "0.4.5-dev4" # pragma: no cover
|
||||
|
||||
@ -19,7 +19,9 @@ from unstructured.logger import logger
|
||||
POS_VERB_TAGS: Final[List[str]] = ["VB", "VBG", "VBD", "VBN", "VBP", "VBZ"]
|
||||
|
||||
|
||||
def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language: str = "en") -> bool:
|
||||
def is_possible_narrative_text(
|
||||
text: str, cap_threshold: float = 0.5, non_alpha_threshold: float = 0.75, language: str = "en"
|
||||
) -> bool:
|
||||
"""Checks to see if the text passes all of the checks for a narrative text section.
|
||||
You can change the cap threshold using the cap_threshold kwarg or the
|
||||
NARRATIVE_TEXT_CAP_THRESHOLD environment variable. The environment variable takes
|
||||
@ -28,11 +30,14 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
|
||||
Parameters
|
||||
----------
|
||||
text
|
||||
the input text
|
||||
The input text to check
|
||||
cap_threshold
|
||||
the percentage of capitalized words necessary to disqualify the segment as narrative
|
||||
The percentage of capitalized words necessary to disqualify the segment as narrative
|
||||
non_alpha_threshold
|
||||
The minimum proportion of alpha characters the text needs to be considered
|
||||
narrative text
|
||||
language
|
||||
the two letter language code for the text. defaults to "en" for English
|
||||
The two letter language code for the text. defaults to "en" for English
|
||||
"""
|
||||
if len(text) == 0:
|
||||
logger.debug("Not narrative. Text is empty.")
|
||||
@ -54,6 +59,12 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
|
||||
logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
|
||||
return False
|
||||
|
||||
non_alpha_threshold = float(
|
||||
os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_NON_ALPHA_THRESHOLD", non_alpha_threshold)
|
||||
)
|
||||
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
|
||||
return False
|
||||
|
||||
if (sentence_count(text, 3) < 2) and (not contains_verb(text)):
|
||||
logger.debug(f"Not narrative. Text does not contain a verb:\n\n{text}")
|
||||
return False
|
||||
@ -62,20 +73,26 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
|
||||
|
||||
|
||||
def is_possible_title(
|
||||
text: str, sentence_min_length: int = 5, title_max_word_length: int = 12, language: str = "en"
|
||||
text: str,
|
||||
sentence_min_length: int = 5,
|
||||
title_max_word_length: int = 12,
|
||||
non_alpha_threshold: float = 0.75,
|
||||
language: str = "en",
|
||||
) -> bool:
|
||||
"""Checks to see if the text passes all of the checks for a valid title.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text
|
||||
the input text
|
||||
The input text to check
|
||||
sentence_min_length
|
||||
the minimum number of words required to consider a section of text a sentence
|
||||
The minimum number of words required to consider a section of text a sentence
|
||||
title_max_word_length
|
||||
the maximum number of words a title can contain
|
||||
The maximum number of words a title can contain
|
||||
non_alpha_threshold
|
||||
The minimum proportion of alpha characters the text needs to be considered a title
|
||||
language
|
||||
the two letter language code for the text. defaults to "en" for English
|
||||
The two letter language code for the text. defaults to "en" for English
|
||||
"""
|
||||
if len(text) == 0:
|
||||
logger.debug("Not a title. Text is empty.")
|
||||
@ -89,6 +106,12 @@ def is_possible_title(
|
||||
if len(text.split(" ")) > title_max_word_length:
|
||||
return False
|
||||
|
||||
non_alpha_threshold = float(
|
||||
os.environ.get("UNSTRUCTURED_TITLE_NON_ALPHA_THRESHOLD", non_alpha_threshold)
|
||||
)
|
||||
if under_non_alpha_ratio(text, threshold=non_alpha_threshold):
|
||||
return False
|
||||
|
||||
# NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
|
||||
if text.endswith(","):
|
||||
return False
|
||||
@ -177,9 +200,40 @@ def sentence_count(text: str, min_length: Optional[int] = None) -> int:
|
||||
return count
|
||||
|
||||
|
||||
def under_non_alpha_ratio(text: str, threshold: float = 0.75) -> bool:
    """Checks if the proportion of alpha characters in the text snippet falls below a given
    threshold. This helps prevent text like "-----------BREAK---------" from being tagged
    as a title or narrative text. The ratio does not count whitespace.

    Parameters
    ----------
    text
        The input string to test
    threshold
        The minimum proportion of alpha characters. If the proportion of alpha characters
        (ignoring whitespace) is below this threshold, the function returns True

    Returns
    -------
    bool
        True when the alpha proportion is strictly below the threshold. False otherwise,
        including for empty or whitespace-only text.
    """
    non_space_chars = [char for char in text if not char.isspace()]
    # Empty and whitespace-only text has no characters to measure. Returning False here
    # also avoids dividing by zero below.
    if not non_space_chars:
        return False

    alpha_count = sum(1 for char in non_space_chars if char.isalpha())
    ratio = alpha_count / len(non_space_chars)
    return ratio < threshold
|
||||
|
||||
|
||||
def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
|
||||
"""Checks the title ratio in a section of text. If a sufficient proportion of the text is
|
||||
capitalized."""
|
||||
"""Checks the title ratio in a section of text. If a sufficient proportion of the words
|
||||
are capitalized, that can be indicative of non-narrative text (i.e. "1A. Risk Factors").
|
||||
|
||||
Parameters
|
||||
----------
|
||||
text
|
||||
The input string to test
|
||||
threshold
|
||||
If the percentage of words beginning with a capital letter exceeds this threshold,
|
||||
the function returns True
|
||||
"""
|
||||
# NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
|
||||
# The assumption is that sections with multiple sentences are not titles.
|
||||
if sentence_count(text, 3) > 1:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user