mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-24 05:34:58 +00:00
feat: set a user controlled max word length for titles (#189)
* update the docs * add option for title max word length * bump version; update changelog * change max length to 12 * docs updates * to -> too
This commit is contained in:
parent
2d08fcbf83
commit
1230a163fd
@ -1,7 +1,8 @@
|
||||
## 0.4.5-dev2
|
||||
## 0.4.5-dev3
|
||||
|
||||
* Loosen the default cap threshold to `0.5`.
|
||||
* Add a `NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling the cap ratio threshold.
|
||||
* Add a `UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD` environment variable for controlling
|
||||
the cap ratio threshold.
|
||||
* Unknown text elements are identified as `Text` for HTML and plain text documents.
|
||||
* `Body Text` styles no longer default to `NarrativeText` for Word documents. The style information
|
||||
is insufficient to determine that the text is narrative.
|
||||
@ -9,6 +10,8 @@
|
||||
* Adds an `Address` element for capturing elements that only contain an address.
|
||||
* Suppress the `UserWarning` when detectron is called.
|
||||
* Checks that titles and narrative text have at least one English word.
|
||||
* Restricts titles to a maximum word length. Adds a `UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`
|
||||
environment variable for controlling the max number of words in a title.
|
||||
|
||||
## 0.4.4
|
||||
|
||||
|
||||
@ -248,7 +248,7 @@ for consideration as narrative text. The function performs the following checks
|
||||
* Text that exceeds the specified caps ratio cannot be narrative text. The threshold
|
||||
is configurable with the ``cap_threshold`` kwarg. To ignore this check, you can set
|
||||
``cap_threshold=1.0``. You can also set the threshold by using the
|
||||
``NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable
|
||||
``UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD`` environment variable. The environment variable
|
||||
takes precedence over the kwarg.
|
||||
* The cap ratio test does not apply to text that is all uppercase.
|
||||
|
||||
@ -279,7 +279,10 @@ The ``is_possible_title`` function determines if a section of text is a candidat
|
||||
for consideration as a title. The function performs the following checks:
|
||||
|
||||
* Empty text cannot be a title
|
||||
* Text that is all numeric cannot be a title
|
||||
* Text that is all numeric cannot be a title.
|
||||
* If a title contains too many words it is not a title. The default max length is ``12``. You can change the max length with
|
||||
the ``title_max_word_length`` kwarg or the ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` environment variable. The environment
|
||||
variable takes precedence over the kwarg.
|
||||
* A title must contain at least one English word (if ``language`` is set to "en")
|
||||
* If a title contains more than one sentence that exceeds a certain length, it cannot be a title. Sentence length threshold is controlled by the ``sentence_min_length`` kwarg and defaults to 5.
|
||||
* If a segment of text ends in a comma, it is not considered a potential title. This is to avoid salutations like "To My Dearest Friends," getting flagged as titles.
|
||||
@ -379,10 +382,7 @@ Examples:
|
||||
Determines if the section of text exceeds the specified caps ratio. Used in
|
||||
``is_possible_narrative_text`` and ``is_possible_title``, but can be used independently
|
||||
as well. You can set the caps threshold using the ``threshold`` kwarg. The threshold
|
||||
defaults to ``0.3``. Only runs on sections of text that are a single sentence.
|
||||
You can also set the threshold using the ``NARRATIVE_TEXT_CAP_THRESHOLD`` environment
|
||||
variable. The environment variable takes precedence over the kwarg. The caps ratio
|
||||
check does not apply to text that is all capitalized.
|
||||
defaults to ``0.3``. Only runs on sections of text that are a single sentence. The caps ratio check does not apply to text that is all capitalized.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@ -62,6 +62,7 @@ def test_is_possible_narrative_text(text, expected, monkeypatch):
|
||||
("ITEM 1A. RISK FACTORS", True), # Two "sentences", but both are short
|
||||
("To My Dearest Friends,", False), # Ends with a comma
|
||||
("BTAR ADFJA L", False), # Doesn't have english words
|
||||
("ITEM 1A. RISK FACTORS " * 15, False), # Title is too long
|
||||
],
|
||||
)
|
||||
def test_is_possible_title(text, expected, monkeypatch):
|
||||
@ -168,7 +169,7 @@ def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
|
||||
def test_set_caps_ratio_with_environment_variable(monkeypatch):
|
||||
monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
|
||||
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
|
||||
monkeypatch.setenv("NARRATIVE_TEXT_CAP_THRESHOLD", 0.8)
|
||||
monkeypatch.setenv("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", 0.8)
|
||||
|
||||
text = "All The King's Horses. And All The King's Men."
|
||||
with patch.object(text_type, "exceeds_cap_ratio", return_value=False) as mock_exceeds:
|
||||
@ -177,6 +178,15 @@ def test_set_caps_ratio_with_environment_variable(monkeypatch):
|
||||
mock_exceeds.assert_called_once_with(text, threshold=0.8)
|
||||
|
||||
|
||||
def test_set_title_max_word_length_with_environment_variable(monkeypatch):
    """Check that ``UNSTRUCTURED_TITLE_MAX_WORD_LENGTH`` limits the words allowed in a title.

    The tokenizers are replaced with the module's mocks so the test does not
    depend on downloaded tokenizer data.
    """
    monkeypatch.setattr(text_type, "word_tokenize", mock_word_tokenize)
    monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
    # Environment variables are strings; passing one explicitly avoids relying on
    # pytest's implicit (and warned-about) conversion of non-string values.
    monkeypatch.setenv("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", "5")

    # Six space-separated words: one more than the limit configured above.
    text = "Intellectual Property in the United States"
    # Exercise the title check itself; the narrative-text check never reads this
    # environment variable, so asserting on it would not cover the feature.
    assert text_type.is_possible_title(text) is False
|
||||
|
||||
|
||||
def test_sentence_count(monkeypatch):
|
||||
monkeypatch.setattr(text_type, "sent_tokenize", mock_sent_tokenize)
|
||||
text = "Hi my name is Matt. I work with Crag."
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.4.5-dev2" # pragma: no cover
|
||||
__version__ = "0.4.5-dev3" # pragma: no cover
|
||||
|
||||
@ -47,7 +47,9 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
|
||||
|
||||
# NOTE(robinson): it gets read in from the environment as a string so we need to
|
||||
# cast it to a float
|
||||
cap_threshold = float(os.environ.get("NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold))
|
||||
cap_threshold = float(
|
||||
os.environ.get("UNSTRUCTURED_NARRATIVE_TEXT_CAP_THRESHOLD", cap_threshold)
|
||||
)
|
||||
if exceeds_cap_ratio(text, threshold=cap_threshold):
|
||||
logger.debug(f"Not narrative. Text exceeds cap ratio {cap_threshold}:\n\n{text}")
|
||||
return False
|
||||
@ -59,7 +61,9 @@ def is_possible_narrative_text(text: str, cap_threshold: float = 0.5, language:
|
||||
return True
|
||||
|
||||
|
||||
def is_possible_title(text: str, sentence_min_length: int = 5, language: str = "en") -> bool:
|
||||
def is_possible_title(
|
||||
text: str, sentence_min_length: int = 5, title_max_word_length: int = 12, language: str = "en"
|
||||
) -> bool:
|
||||
"""Checks to see if the text passes all of the checks for a valid title.
|
||||
|
||||
Parameters
|
||||
@ -68,6 +72,8 @@ def is_possible_title(text: str, sentence_min_length: int = 5, language: str = "
|
||||
the input text
|
||||
sentence_min_length
|
||||
the minimum number of words required to consider a section of text a sentence
|
||||
title_max_word_length
|
||||
the maximum number of words a title can contain
|
||||
language
|
||||
the two letter language code for the text. defaults to "en" for English
|
||||
"""
|
||||
@ -75,6 +81,14 @@ def is_possible_title(text: str, sentence_min_length: int = 5, language: str = "
|
||||
logger.debug("Not a title. Text is empty.")
|
||||
return False
|
||||
|
||||
title_max_word_length = int(
|
||||
os.environ.get("UNSTRUCTURED_TITLE_MAX_WORD_LENGTH", title_max_word_length)
|
||||
)
|
||||
# NOTE(robinson) - splitting on spaces here instead of word tokenizing because it
|
||||
# is less expensive and actual tokenization doesn't add much value for the length check
|
||||
if len(text.split(" ")) > title_max_word_length:
|
||||
return False
|
||||
|
||||
# NOTE(robinson) - Prevent flagging salutations like "To My Dearest Friends," as titles
|
||||
if text.endswith(","):
|
||||
return False
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user