mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-13 17:07:29 +00:00
fix: fix text_type.py exceeds_cap_ratio() returns (#478)
In some cases, is_possible_narrative_text receives an incorrect return value from exceeds_cap_ratio and misclassifies the text, so several return values of exceeds_cap_ratio have been corrected. --------- Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
parent
46ac2a2226
commit
8456676fad
@ -13,6 +13,7 @@
|
||||
* unstructured-documents encode xml string if document_tree is `None` in `_read_xml`.
|
||||
* Update to `_read_xml` so that Markdown files with embedded HTML process correctly.
|
||||
* Fallback to "fast" strategy only emits a warning if the user specifies the "hi_res" strategy.
|
||||
* unstructured-partition-text_type exceeds_cap_ratio: fix the return values and how capitalization ratios are calculated.
|
||||
|
||||
## 0.5.12
|
||||
|
||||
|
||||
@ -215,7 +215,7 @@ def test_contains_english_word(text, expected, monkeypatch):
|
||||
("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False),
|
||||
("This Has All Caps. It's Weird But Two Sentences", False),
|
||||
("The Business Report is expected within 6 hours of closing", False),
|
||||
("", False),
|
||||
("", True),
|
||||
],
|
||||
)
|
||||
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
|
||||
|
||||
@ -265,15 +265,24 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
|
||||
# NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
|
||||
# The assumption is that sections with multiple sentences are not titles.
|
||||
if sentence_count(text, 3) > 1:
|
||||
logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
|
||||
return False
|
||||
|
||||
if text.isupper():
|
||||
return False
|
||||
|
||||
tokens = word_tokenize(text)
|
||||
# NOTE(jay-ylee) - The word_tokenize function also recognizes and separates special characters
|
||||
# into one word, causing problems with ratio measurement.
|
||||
# Therefore, only words consisting of alphabets are used to measure the ratio.
|
||||
# ex. word_tokenize("ITEM 1. Financial Statements (Unaudited)")
|
||||
# = ['ITEM', '1', '.', 'Financial', 'Statements', '(', 'Unaudited', ')'],
|
||||
# however, "ITEM 1. Financial Statements (Unaudited)" is Title, not NarrativeText
|
||||
tokens = [tk for tk in word_tokenize(text) if tk.isalpha()]
|
||||
|
||||
# NOTE(jay-ylee) - If no alphabetic tokens remain after filtering, the return value must be True to
|
||||
# avoid being misclassified as Narrative Text.
|
||||
if len(tokens) == 0:
|
||||
return False
|
||||
return True
|
||||
|
||||
capitalized = sum([word.istitle() or word.isupper() for word in tokens])
|
||||
ratio = capitalized / len(tokens)
|
||||
return ratio > threshold
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user