mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-10-25 23:24:24 +00:00 
			
		
		
		
	Feat: Bag of words for testing metric (#1650)
This PR adds the `bag_of_words` function to count the frequency of words for evaluation.

**Testing**

```Python
from unstructured.metrics.text_extraction import bag_of_words

string = "The dog loved the cat, but the cat loved the cow."
print(bag_of_words(string))
```

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
This commit is contained in:
		
							parent
							
								
									b38a6b3022
								
							
						
					
					
						commit
						a5d7ae4611
					
				| @ -14,6 +14,7 @@ | ||||
| * | ||||
| ### Features | ||||
| 
 | ||||
| * **Adds `bag_of_words` function** In order to count the word frequency to evaluate extraction accuracy. | ||||
| * **Adds `edit_distance` calculation metrics** In order to benchmark the cleaned, extracted text with unstructured, `edit_distance` (`Levenshtein distance`) is included. | ||||
| * **Adds detection_origin field to metadata** Problem: Currently there isn't an easy way to find out how an element was created. With this change that information is added. Importance: With this information the developers and users are now able to know how an element was created to make decisions on how to use it. In order to use this feature, | ||||
| setting UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true is needed. | ||||
|  | ||||
| @ -2,7 +2,7 @@ import re | ||||
| 
 | ||||
| import pytest | ||||
| 
 | ||||
| from unstructured.metrics.text_extraction import calculate_edit_distance | ||||
| from unstructured.metrics import text_extraction | ||||
| from unstructured.partition.auto import partition | ||||
| 
 | ||||
| 
 | ||||
| @ -16,32 +16,85 @@ def test_calculate_edit_distance(): | ||||
|     source_cct_addn_char = "I like pizza. I like beagles." | ||||
|     source_cct_dup_word = "I like pizza pizza. I like bagels." | ||||
| 
 | ||||
|     assert round(calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) == 1.0 | ||||
|     assert ( | ||||
|         round(calculate_edit_distance(source_cct_word_space, source_cct, return_as="score"), 2) | ||||
|         round(text_extraction.calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) | ||||
|         == 1.0 | ||||
|     ) | ||||
|     assert ( | ||||
|         round( | ||||
|             text_extraction.calculate_edit_distance( | ||||
|                 source_cct_word_space, | ||||
|                 source_cct, | ||||
|                 return_as="score", | ||||
|             ), | ||||
|             2, | ||||
|         ) | ||||
|         == 0.75 | ||||
|     ) | ||||
|     assert ( | ||||
|         round(calculate_edit_distance(source_cct_spaces, source_cct, return_as="score"), 2) == 0.39 | ||||
|         round( | ||||
|             text_extraction.calculate_edit_distance( | ||||
|                 source_cct_spaces, | ||||
|                 source_cct, | ||||
|                 return_as="score", | ||||
|             ), | ||||
|             2, | ||||
|         ) | ||||
|         == 0.39 | ||||
|     ) | ||||
|     assert ( | ||||
|         round(calculate_edit_distance(source_cct_no_space, source_cct, return_as="score"), 2) | ||||
|         round( | ||||
|             text_extraction.calculate_edit_distance( | ||||
|                 source_cct_no_space, | ||||
|                 source_cct, | ||||
|                 return_as="score", | ||||
|             ), | ||||
|             2, | ||||
|         ) | ||||
|         == 0.64 | ||||
|     ) | ||||
|     assert ( | ||||
|         round(calculate_edit_distance(source_cct_one_sentence, source_cct, return_as="score"), 2) | ||||
|         round( | ||||
|             text_extraction.calculate_edit_distance( | ||||
|                 source_cct_one_sentence, | ||||
|                 source_cct, | ||||
|                 return_as="score", | ||||
|             ), | ||||
|             2, | ||||
|         ) | ||||
|         == 0.0 | ||||
|     ) | ||||
|     assert ( | ||||
|         round(calculate_edit_distance(source_cct_missing_word, source_cct, return_as="score"), 2) | ||||
|         round( | ||||
|             text_extraction.calculate_edit_distance( | ||||
|                 source_cct_missing_word, | ||||
|                 source_cct, | ||||
|                 return_as="score", | ||||
|             ), | ||||
|             2, | ||||
|         ) | ||||
|         == 0.57 | ||||
|     ) | ||||
|     assert ( | ||||
|         round(calculate_edit_distance(source_cct_addn_char, source_cct, return_as="score"), 2) | ||||
|         round( | ||||
|             text_extraction.calculate_edit_distance( | ||||
|                 source_cct_addn_char, | ||||
|                 source_cct, | ||||
|                 return_as="score", | ||||
|             ), | ||||
|             2, | ||||
|         ) | ||||
|         == 0.89 | ||||
|     ) | ||||
|     assert ( | ||||
|         round(calculate_edit_distance(source_cct_dup_word, source_cct, return_as="score"), 2) | ||||
|         round( | ||||
|             text_extraction.calculate_edit_distance( | ||||
|                 source_cct_dup_word, | ||||
|                 source_cct, | ||||
|                 return_as="score", | ||||
|             ), | ||||
|             2, | ||||
|         ) | ||||
|         == 0.79 | ||||
|     ) | ||||
| 
 | ||||
| @ -59,11 +112,70 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte | ||||
|     elements = partition(filename=f"example-docs/{filename}") | ||||
|     output_cct = "\n".join([str(el) for el in elements]) | ||||
| 
 | ||||
|     score = calculate_edit_distance(output_cct, source_cct, return_as="score") | ||||
|     distance = calculate_edit_distance(output_cct, source_cct, return_as="distance") | ||||
|     score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score") | ||||
|     distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance") | ||||
| 
 | ||||
|     assert score >= 0 | ||||
|     assert score <= 1.0 | ||||
|     assert distance >= 0 | ||||
|     assert round(score, 2) == expected_score | ||||
|     assert distance == expected_distance | ||||
| 
 | ||||
| 
 | ||||
| @pytest.mark.parametrize( | ||||
|     ("text", "expected"), | ||||
|     [ | ||||
|         ( | ||||
|             "The dog loved the cat, but the cat loved the cow", | ||||
|             {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1}, | ||||
|         ), | ||||
|         ( | ||||
|             "Hello my name is H a r p e r, what's your name?", | ||||
|             {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1}, | ||||
|         ), | ||||
|         ( | ||||
|             "I have a dog and a cat, I love my dog.", | ||||
|             {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1}, | ||||
|         ), | ||||
|         ( | ||||
|             "My dog's hair is red, but the dogs' houses are blue.", | ||||
|             { | ||||
|                 "my": 1, | ||||
|                 "dog's": 1, | ||||
|                 "hair": 1, | ||||
|                 "is": 1, | ||||
|                 "red": 1, | ||||
|                 "but": 1, | ||||
|                 "the": 1, | ||||
|                 "dogs'": 1, | ||||
|                 "houses": 1, | ||||
|                 "are": 1, | ||||
|                 "blue": 1, | ||||
|             }, | ||||
|         ), | ||||
|         ( | ||||
|             """Sometimes sentences have a dash - like this one! | ||||
|             A hyphen connects 2 words with no gap: easy-peasy.""", | ||||
|             { | ||||
|                 "sometimes": 1, | ||||
|                 "sentences": 1, | ||||
|                 "have": 1, | ||||
|                 "a": 2, | ||||
|                 "dash": 1, | ||||
|                 "like": 1, | ||||
|                 "this": 1, | ||||
|                 "one": 1, | ||||
|                 "hyphen": 1, | ||||
|                 "connects": 1, | ||||
|                 "2": 1, | ||||
|                 "words": 1, | ||||
|                 "with": 1, | ||||
|                 "no": 1, | ||||
|                 "gap": 1, | ||||
|                 "easy-peasy": 1, | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
| ) | ||||
| def test_bag_of_words(text, expected): | ||||
|     assert text_extraction.bag_of_words(text) == expected | ||||
|  | ||||
| @ -2,7 +2,7 @@ import quopri | ||||
| import re | ||||
| import sys | ||||
| import unicodedata | ||||
| from typing import Tuple | ||||
| from typing import Optional, Tuple | ||||
| 
 | ||||
| import numpy as np | ||||
| 
 | ||||
| @ -302,6 +302,15 @@ def remove_punctuation(s: str) -> str: | ||||
|     return s | ||||
| 
 | ||||
| 
 | ||||
| def remove_sentence_punctuation(s: str, exclude_punctuation: Optional[list]) -> str: | ||||
|     tbl_new = tbl.copy() | ||||
|     if exclude_punctuation: | ||||
|         for punct in exclude_punctuation: | ||||
|             del tbl_new[ord(punct)] | ||||
|     s = s.translate(tbl_new) | ||||
|     return s | ||||
| 
 | ||||
| 
 | ||||
| def clean_extra_whitespace(text: str) -> str: | ||||
|     """Cleans extra whitespace characters that appear between words. | ||||
| 
 | ||||
|  | ||||
| @ -1,7 +1,9 @@ | ||||
| from typing import Tuple | ||||
| from typing import Dict, Tuple | ||||
| 
 | ||||
| from rapidfuzz.distance import Levenshtein | ||||
| 
 | ||||
| from unstructured.cleaners.core import clean_bullets, remove_sentence_punctuation | ||||
| 
 | ||||
| 
 | ||||
| def calculate_edit_distance( | ||||
|     output: str, | ||||
| @ -50,3 +52,33 @@ def calculate_edit_distance( | ||||
|     elif return_as == "distance": | ||||
|         return distance | ||||
|     return 0.0 | ||||
| 
 | ||||
| 
 | ||||
| def bag_of_words(text: str) -> Dict[str, int]: | ||||
|     bow: Dict[str, int] = {} | ||||
|     incorrect_word: str = "" | ||||
|     words = clean_bullets(remove_sentence_punctuation(text.lower(), ["-", "'"])).split() | ||||
| 
 | ||||
|     i = 0 | ||||
|     while i < len(words): | ||||
|         if len(words[i]) > 1: | ||||
|             if words[i] in bow: | ||||
|                 bow[words[i]] += 1 | ||||
|             else: | ||||
|                 bow[words[i]] = 1 | ||||
|             i += 1 | ||||
|         else: | ||||
|             j = i | ||||
|             incorrect_word = "" | ||||
| 
 | ||||
|             while j < len(words) and len(words[j]) == 1: | ||||
|                 incorrect_word += words[j] | ||||
|                 j += 1 | ||||
| 
 | ||||
|             if len(incorrect_word) == 1 and words[i].isalnum(): | ||||
|                 if incorrect_word in bow: | ||||
|                     bow[incorrect_word] += 1 | ||||
|                 else: | ||||
|                     bow[incorrect_word] = 1 | ||||
|             i = j | ||||
|     return bow | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Mallori Harrell
						Mallori Harrell