mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-03 19:43:24 +00:00 
			
		
		
		
	fix: fix text_type.py exceeds_cap_ratio() returns (#478)
In some cases, the function is_possible_narrative_text receives an incorrect return value from exceeds_cap_ratio and therefore misclassifies the text, so some of the return values of exceeds_cap_ratio are corrected. --------- Co-authored-by: Matt Robinson <mrobinson@unstructured.io>
This commit is contained in:
		
							parent
							
								
									46ac2a2226
								
							
						
					
					
						commit
						8456676fad
					
				@ -13,6 +13,7 @@
 | 
			
		||||
* unstructured-documents encode xml string if document_tree is `None` in `_read_xml`.
 | 
			
		||||
* Update to `_read_xml` so that Markdown files with embedded HTML process correctly.
 | 
			
		||||
* Fallback to "fast" strategy only emits a warning if the user specifies the "hi_res" strategy.
 | 
			
		||||
* unstructured-partition-text_type `exceeds_cap_ratio`: fix return values and how capitalization ratios are calculated.
 | 
			
		||||
 | 
			
		||||
## 0.5.12
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -215,7 +215,7 @@ def test_contains_english_word(text, expected, monkeypatch):
 | 
			
		||||
        ("LOOK AT THIS IT IS CAPS BUT NOT A TITLE.", False),
 | 
			
		||||
        ("This Has All Caps. It's Weird But Two Sentences", False),
 | 
			
		||||
        ("The Business Report is expected within 6 hours of closing", False),
 | 
			
		||||
        ("", False),
 | 
			
		||||
        ("", True),
 | 
			
		||||
    ],
 | 
			
		||||
)
 | 
			
		||||
def test_contains_exceeds_cap_ratio(text, expected, monkeypatch):
 | 
			
		||||
 | 
			
		||||
@ -265,15 +265,24 @@ def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool:
 | 
			
		||||
    # NOTE(robinson) - Currently limiting this to only sections of text with one sentence.
 | 
			
		||||
    # The assumption is that sections with multiple sentences are not titles.
 | 
			
		||||
    if sentence_count(text, 3) > 1:
 | 
			
		||||
        logger.debug(f"Text does not contain multiple sentences:\n\n{text}")
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
    if text.isupper():
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
    tokens = word_tokenize(text)
 | 
			
		||||
    # NOTE(jay-ylee) - The word_tokenize function also recognizes and separates special characters
 | 
			
		||||
    # into one word, causing problems with ratio measurement.
 | 
			
		||||
    # Therefore, only words consisting of alphabets are used to measure the ratio.
 | 
			
		||||
    # ex. word_tokenize("ITEM 1. Financial Statements (Unaudited)")
 | 
			
		||||
    #     = ['ITEM', '1', '.', 'Financial', 'Statements', '(', 'Unaudited', ')'],
 | 
			
		||||
    # however, "ITEM 1. Financial Statements (Unaudited)" is Title, not NarrativeText
 | 
			
		||||
    tokens = [tk for tk in word_tokenize(text) if tk.isalpha()]
 | 
			
		||||
 | 
			
		||||
    # NOTE(jay-ylee) - If word_tokenize(text) is empty, return must be True to
 | 
			
		||||
    # avoid being misclassified as Narrative Text.
 | 
			
		||||
    if len(tokens) == 0:
 | 
			
		||||
        return False
 | 
			
		||||
        return True
 | 
			
		||||
 | 
			
		||||
    capitalized = sum([word.istitle() or word.isupper() for word in tokens])
 | 
			
		||||
    ratio = capitalized / len(tokens)
 | 
			
		||||
    return ratio > threshold
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user