| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  | import re | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import pytest | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  | from unstructured.metrics import text_extraction | 
					
						
							| 
									
										
										
										
											2024-05-07 15:57:38 +02:00
										 |  |  | from unstructured.metrics.table.table_extraction import ( | 
					
						
							| 
									
										
										
										
											2024-06-19 09:03:38 +02:00
										 |  |  |     deckerd_table_to_html, | 
					
						
							| 
									
										
										
										
											2024-05-07 15:57:38 +02:00
										 |  |  |     extract_cells_from_table_as_cells, | 
					
						
							|  |  |  |     extract_cells_from_text_as_html, | 
					
						
							| 
									
										
										
										
											2024-06-19 09:03:38 +02:00
										 |  |  |     html_table_to_deckerd, | 
					
						
							| 
									
										
										
										
											2024-05-07 15:57:38 +02:00
										 |  |  | ) | 
					
						
							| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  | from unstructured.partition.auto import partition | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_calculate_edit_distance(): | 
					
						
							|  |  |  |     source_cct = "I like pizza. I like bagels." | 
					
						
							|  |  |  |     source_cct_word_space = "I like p i z z a . I like bagles." | 
					
						
							|  |  |  |     source_cct_spaces = re.sub(r"\s+", " ", " ".join(source_cct)) | 
					
						
							|  |  |  |     source_cct_no_space = source_cct.replace(" ", "") | 
					
						
							|  |  |  |     source_cct_one_sentence = "I like pizza." | 
					
						
							|  |  |  |     source_cct_missing_word = "I like pizza. I like ." | 
					
						
							|  |  |  |     source_cct_addn_char = "I like pizza. I like beagles." | 
					
						
							|  |  |  |     source_cct_dup_word = "I like pizza pizza. I like bagels." | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     assert ( | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  |         round(text_extraction.calculate_edit_distance(source_cct, source_cct, return_as="score"), 2) | 
					
						
							|  |  |  |         == 1.0 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         round( | 
					
						
							|  |  |  |             text_extraction.calculate_edit_distance( | 
					
						
							|  |  |  |                 source_cct_word_space, | 
					
						
							|  |  |  |                 source_cct, | 
					
						
							|  |  |  |                 return_as="score", | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             2, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  |         == 0.75 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert ( | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  |         round( | 
					
						
							|  |  |  |             text_extraction.calculate_edit_distance( | 
					
						
							|  |  |  |                 source_cct_spaces, | 
					
						
							|  |  |  |                 source_cct, | 
					
						
							|  |  |  |                 return_as="score", | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             2, | 
					
						
							|  |  |  |         ) | 
					
						
							|  |  |  |         == 0.39 | 
					
						
							| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  |     ) | 
					
						
							|  |  |  |     assert ( | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  |         round( | 
					
						
							|  |  |  |             text_extraction.calculate_edit_distance( | 
					
						
							|  |  |  |                 source_cct_no_space, | 
					
						
							|  |  |  |                 source_cct, | 
					
						
							|  |  |  |                 return_as="score", | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             2, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  |         == 0.64 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert ( | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  |         round( | 
					
						
							|  |  |  |             text_extraction.calculate_edit_distance( | 
					
						
							|  |  |  |                 source_cct_one_sentence, | 
					
						
							|  |  |  |                 source_cct, | 
					
						
							|  |  |  |                 return_as="score", | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             2, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  |         == 0.0 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert ( | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  |         round( | 
					
						
							|  |  |  |             text_extraction.calculate_edit_distance( | 
					
						
							|  |  |  |                 source_cct_missing_word, | 
					
						
							|  |  |  |                 source_cct, | 
					
						
							|  |  |  |                 return_as="score", | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             2, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  |         == 0.57 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert ( | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  |         round( | 
					
						
							|  |  |  |             text_extraction.calculate_edit_distance( | 
					
						
							|  |  |  |                 source_cct_addn_char, | 
					
						
							|  |  |  |                 source_cct, | 
					
						
							|  |  |  |                 return_as="score", | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             2, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  |         == 0.89 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert ( | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  |         round( | 
					
						
							|  |  |  |             text_extraction.calculate_edit_distance( | 
					
						
							|  |  |  |                 source_cct_dup_word, | 
					
						
							|  |  |  |                 source_cct, | 
					
						
							|  |  |  |                 return_as="score", | 
					
						
							|  |  |  |             ), | 
					
						
							|  |  |  |             2, | 
					
						
							|  |  |  |         ) | 
					
						
							| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  |         == 0.79 | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("filename", "expected_score", "expected_distance"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ("fake-text.txt", 0.78, 38), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance): | 
					
						
							|  |  |  |     with open("example-docs/fake-text.txt") as f: | 
					
						
							|  |  |  |         source_cct = f.read() | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     elements = partition(filename=f"example-docs/{filename}") | 
					
						
							|  |  |  |     output_cct = "\n".join([str(el) for el in elements]) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  |     score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score") | 
					
						
							|  |  |  |     distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance") | 
					
						
							| 
									
										
										
										
											2023-10-06 21:21:14 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  |     assert score >= 0 | 
					
						
							|  |  |  |     assert score <= 1.0 | 
					
						
							|  |  |  |     assert distance >= 0 | 
					
						
							|  |  |  |     assert round(score, 2) == expected_score | 
					
						
							|  |  |  |     assert distance == expected_distance | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("text", "expected"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             "The dog loved the cat, but the cat loved the cow", | 
					
						
							|  |  |  |             {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1}, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             "Hello my name is H a r p e r, what's your name?", | 
					
						
							|  |  |  |             {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1}, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             "I have a dog and a cat, I love my dog.", | 
					
						
							|  |  |  |             {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1}, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             "My dog's hair is red, but the dogs' houses are blue.", | 
					
						
							|  |  |  |             { | 
					
						
							|  |  |  |                 "my": 1, | 
					
						
							|  |  |  |                 "dog's": 1, | 
					
						
							|  |  |  |                 "hair": 1, | 
					
						
							|  |  |  |                 "is": 1, | 
					
						
							|  |  |  |                 "red": 1, | 
					
						
							|  |  |  |                 "but": 1, | 
					
						
							|  |  |  |                 "the": 1, | 
					
						
							|  |  |  |                 "dogs'": 1, | 
					
						
							|  |  |  |                 "houses": 1, | 
					
						
							|  |  |  |                 "are": 1, | 
					
						
							|  |  |  |                 "blue": 1, | 
					
						
							|  |  |  |             }, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             """Sometimes sentences have a dash - like this one!
 | 
					
						
							| 
									
										
										
										
											2024-06-14 11:03:27 +02:00
										 |  |  |                     A hyphen connects 2 words with no gap: easy-peasy.""",
 | 
					
						
							| 
									
										
										
										
											2023-10-10 13:46:01 -05:00
										 |  |  |             { | 
					
						
							|  |  |  |                 "sometimes": 1, | 
					
						
							|  |  |  |                 "sentences": 1, | 
					
						
							|  |  |  |                 "have": 1, | 
					
						
							|  |  |  |                 "a": 2, | 
					
						
							|  |  |  |                 "dash": 1, | 
					
						
							|  |  |  |                 "like": 1, | 
					
						
							|  |  |  |                 "this": 1, | 
					
						
							|  |  |  |                 "one": 1, | 
					
						
							|  |  |  |                 "hyphen": 1, | 
					
						
							|  |  |  |                 "connects": 1, | 
					
						
							|  |  |  |                 "2": 1, | 
					
						
							|  |  |  |                 "words": 1, | 
					
						
							|  |  |  |                 "with": 1, | 
					
						
							|  |  |  |                 "no": 1, | 
					
						
							|  |  |  |                 "gap": 1, | 
					
						
							|  |  |  |                 "easy-peasy": 1, | 
					
						
							|  |  |  |             }, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_bag_of_words(text, expected): | 
					
						
							|  |  |  |     assert text_extraction.bag_of_words(text) == expected | 
					
						
							| 
									
										
										
										
											2023-10-10 13:54:49 -07:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("output_text", "source_text", "expected_percentage"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             "extra", | 
					
						
							|  |  |  |             "", | 
					
						
							|  |  |  |             0, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             "", | 
					
						
							|  |  |  |             "Source text has a sentence.", | 
					
						
							|  |  |  |             1, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             "The original s e n t e n c e is normal.", | 
					
						
							|  |  |  |             "The original sentence is normal...", | 
					
						
							|  |  |  |             0.2, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             "We saw 23% improvement in this quarter.", | 
					
						
							|  |  |  |             "We saw 23% improvement in sales this quarter.", | 
					
						
							| 
									
										
										
										
											2023-11-01 16:50:34 -04:00
										 |  |  |             0.125, | 
					
						
							| 
									
										
										
										
											2023-10-10 13:54:49 -07:00
										 |  |  |         ), | 
					
						
							|  |  |  |         ( | 
					
						
							|  |  |  |             "no", | 
					
						
							|  |  |  |             "Is it possible to have more than everything missing?", | 
					
						
							|  |  |  |             1, | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_calculate_percent_missing_text(output_text, source_text, expected_percentage): | 
					
						
							|  |  |  |     assert ( | 
					
						
							|  |  |  |         text_extraction.calculate_percent_missing_text(output_text, source_text) | 
					
						
							|  |  |  |         == expected_percentage | 
					
						
							|  |  |  |     ) | 
					
						
							| 
									
										
										
										
											2024-05-07 15:57:38 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 11:03:27 +02:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("table_as_cells", "expected_extraction"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         pytest.param( | 
					
						
							|  |  |  |             [ | 
					
						
							| 
									
										
										
										
											2024-05-07 15:57:38 +02:00
										 |  |  |                 {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."}, | 
					
						
							|  |  |  |                 {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"}, | 
					
						
							|  |  |  |             ], | 
					
						
							| 
									
										
										
										
											2024-06-14 11:03:27 +02:00
										 |  |  |             [ | 
					
						
							|  |  |  |                 {"row_index": 0, "col_index": 0, "content": "Month A."}, | 
					
						
							|  |  |  |                 {"row_index": 1, "col_index": 0, "content": "22"}, | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             id="Simple table, 1 head cell, 1 body cell, no spans", | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         pytest.param( | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."}, | 
					
						
							|  |  |  |                 {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."}, | 
					
						
							|  |  |  |                 {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."}, | 
					
						
							|  |  |  |                 {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"}, | 
					
						
							|  |  |  |                 {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"}, | 
					
						
							|  |  |  |                 {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"}, | 
					
						
							|  |  |  |                 {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"}, | 
					
						
							|  |  |  |                 {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"}, | 
					
						
							|  |  |  |                 {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"}, | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 {"row_index": 0, "col_index": 0, "content": "Month A."}, | 
					
						
							|  |  |  |                 {"row_index": 0, "col_index": 1, "content": "Month B."}, | 
					
						
							|  |  |  |                 {"row_index": 0, "col_index": 2, "content": "Month C."}, | 
					
						
							|  |  |  |                 {"row_index": 1, "col_index": 0, "content": "11"}, | 
					
						
							|  |  |  |                 {"row_index": 1, "col_index": 1, "content": "12"}, | 
					
						
							|  |  |  |                 {"row_index": 1, "col_index": 2, "content": "13"}, | 
					
						
							|  |  |  |                 {"row_index": 2, "col_index": 0, "content": "21"}, | 
					
						
							|  |  |  |                 {"row_index": 2, "col_index": 1, "content": "22"}, | 
					
						
							|  |  |  |                 {"row_index": 2, "col_index": 2, "content": "23"}, | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             id="Simple table, 3 head cell, 5 body cell, no spans", | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         # +----------+---------------------+----------+ | 
					
						
							|  |  |  |         # |          |       h1col23       |  h1col4  | | 
					
						
							|  |  |  |         # | h12col1  |----------+----------+----------| | 
					
						
							|  |  |  |         # |          |  h2col2  |       h2col34       | | 
					
						
							|  |  |  |         # |----------|----------+----------+----------+ | 
					
						
							|  |  |  |         # |  r3col1  |  r3col2  |                     | | 
					
						
							|  |  |  |         # |----------+----------|      r34col34       | | 
					
						
							|  |  |  |         # |       r4col12       |                     | | 
					
						
							|  |  |  |         # +----------+----------+----------+----------+ | 
					
						
							|  |  |  |         pytest.param( | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "y": 0, | 
					
						
							|  |  |  |                     "x": 0, | 
					
						
							|  |  |  |                     "w": 2, | 
					
						
							|  |  |  |                     "h": 1, | 
					
						
							|  |  |  |                     "content": "h12col1", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "y": 0, | 
					
						
							|  |  |  |                     "x": 1, | 
					
						
							|  |  |  |                     "w": 1, | 
					
						
							|  |  |  |                     "h": 2, | 
					
						
							|  |  |  |                     "content": "h1col23", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "y": 0, | 
					
						
							|  |  |  |                     "x": 3, | 
					
						
							|  |  |  |                     "w": 1, | 
					
						
							|  |  |  |                     "h": 1, | 
					
						
							|  |  |  |                     "content": "h1col4", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "y": 1, | 
					
						
							|  |  |  |                     "x": 1, | 
					
						
							|  |  |  |                     "w": 1, | 
					
						
							|  |  |  |                     "h": 1, | 
					
						
							|  |  |  |                     "content": "h2col2", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "y": 1, | 
					
						
							|  |  |  |                     "x": 2, | 
					
						
							|  |  |  |                     "w": 1, | 
					
						
							|  |  |  |                     "h": 2, | 
					
						
							|  |  |  |                     "content": "h2col34", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "y": 2, | 
					
						
							|  |  |  |                     "x": 0, | 
					
						
							|  |  |  |                     "w": 1, | 
					
						
							|  |  |  |                     "h": 1, | 
					
						
							|  |  |  |                     "content": "r3col1", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "y": 2, | 
					
						
							|  |  |  |                     "x": 1, | 
					
						
							|  |  |  |                     "w": 1, | 
					
						
							|  |  |  |                     "h": 1, | 
					
						
							|  |  |  |                     "content": "r3col2", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "y": 2, | 
					
						
							|  |  |  |                     "x": 2, | 
					
						
							|  |  |  |                     "w": 2, | 
					
						
							|  |  |  |                     "h": 2, | 
					
						
							|  |  |  |                     "content": "r34col34", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "y": 3, | 
					
						
							|  |  |  |                     "x": 0, | 
					
						
							|  |  |  |                     "w": 1, | 
					
						
							|  |  |  |                     "h": 2, | 
					
						
							|  |  |  |                     "content": "r4col12", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 0, | 
					
						
							|  |  |  |                     "col_index": 0, | 
					
						
							|  |  |  |                     "content": "h12col1", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 0, | 
					
						
							|  |  |  |                     "col_index": 1, | 
					
						
							|  |  |  |                     "content": "h1col23", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 0, | 
					
						
							|  |  |  |                     "col_index": 3, | 
					
						
							|  |  |  |                     "content": "h1col4", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 1, | 
					
						
							|  |  |  |                     "col_index": 1, | 
					
						
							|  |  |  |                     "content": "h2col2", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 1, | 
					
						
							|  |  |  |                     "col_index": 2, | 
					
						
							|  |  |  |                     "content": "h2col34", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 2, | 
					
						
							|  |  |  |                     "col_index": 0, | 
					
						
							|  |  |  |                     "content": "r3col1", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 2, | 
					
						
							|  |  |  |                     "col_index": 1, | 
					
						
							|  |  |  |                     "content": "r3col2", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 2, | 
					
						
							|  |  |  |                     "col_index": 2, | 
					
						
							|  |  |  |                     "content": "r34col34", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 3, | 
					
						
							|  |  |  |                     "col_index": 0, | 
					
						
							|  |  |  |                     "content": "r4col12", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             id="various spans, with 2 row header", | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction): | 
					
						
							|  |  |  |     example_element = { | 
					
						
							|  |  |  |         "type": "Table", | 
					
						
							|  |  |  |         "metadata": {"table_as_cells": table_as_cells}, | 
					
						
							| 
									
										
										
										
											2024-05-07 15:57:38 +02:00
										 |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-06-14 11:03:27 +02:00
										 |  |  |     assert extract_cells_from_table_as_cells(example_element) == expected_extraction | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-05-07 15:57:38 +02:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-06-14 11:03:27 +02:00
										 |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     ("text_as_html", "expected_extraction"), | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         pytest.param( | 
					
						
							|  |  |  |             """
 | 
					
						
							|  |  |  | <table> | 
					
						
							|  |  |  |     <thead> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <th>Month A.</th> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </thead> | 
					
						
							|  |  |  |     <tbody> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td>22</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </tbody> | 
					
						
							|  |  |  | </table>" | 
					
						
							|  |  |  |             """,
 | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 {"row_index": 0, "col_index": 0, "content": "Month A."}, | 
					
						
							|  |  |  |                 {"row_index": 1, "col_index": 0, "content": "22"}, | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             id="Simple table, 1 head cell, 1 body cell, no spans", | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         pytest.param( | 
					
						
							|  |  |  |             """
 | 
					
						
							|  |  |  | <table> | 
					
						
							|  |  |  |     <thead> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <th>Month A.</th> | 
					
						
							|  |  |  |             <th>Month B.</th> | 
					
						
							|  |  |  |             <th>Month C.</th> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </thead> | 
					
						
							|  |  |  |     <tbody> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td>11</td> | 
					
						
							|  |  |  |             <td>12</td> | 
					
						
							|  |  |  |             <td>13</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td>21</td> | 
					
						
							|  |  |  |             <td>22</td> | 
					
						
							|  |  |  |             <td>23</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </tbody> | 
					
						
							|  |  |  | </table>" | 
					
						
							|  |  |  | """,
 | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 {"row_index": 0, "col_index": 0, "content": "Month A."}, | 
					
						
							|  |  |  |                 {"row_index": 0, "col_index": 1, "content": "Month B."}, | 
					
						
							|  |  |  |                 {"row_index": 0, "col_index": 2, "content": "Month C."}, | 
					
						
							|  |  |  |                 {"row_index": 1, "col_index": 0, "content": "11"}, | 
					
						
							|  |  |  |                 {"row_index": 1, "col_index": 1, "content": "12"}, | 
					
						
							|  |  |  |                 {"row_index": 1, "col_index": 2, "content": "13"}, | 
					
						
							|  |  |  |                 {"row_index": 2, "col_index": 0, "content": "21"}, | 
					
						
							|  |  |  |                 {"row_index": 2, "col_index": 1, "content": "22"}, | 
					
						
							|  |  |  |                 {"row_index": 2, "col_index": 2, "content": "23"}, | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             id="Simple table, 3 head cell, 5 body cell, no spans", | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |         # +----------+---------------------+----------+ | 
					
						
							|  |  |  |         # |          |       h1col23       |  h1col4  | | 
					
						
							|  |  |  |         # | h12col1  |----------+----------+----------| | 
					
						
							|  |  |  |         # |          |  h2col2  |       h2col34       | | 
					
						
							|  |  |  |         # |----------|----------+----------+----------+ | 
					
						
							|  |  |  |         # |  r3col1  |  r3col2  |                     | | 
					
						
							|  |  |  |         # |----------+----------|      r34col34       | | 
					
						
							|  |  |  |         # |       r4col12       |                     | | 
					
						
							|  |  |  |         # +----------+----------+----------+----------+ | 
					
						
							|  |  |  |         pytest.param( | 
					
						
							|  |  |  |             """
 | 
					
						
							|  |  |  | <table> | 
					
						
							|  |  |  |     <thead> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <th rowspan="2">h12col1</th> | 
					
						
							|  |  |  |             <th colspan="2">h1col23</th> | 
					
						
							|  |  |  |             <th>h1col4</th> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <th>h2col2</th> | 
					
						
							|  |  |  |             <th colspan="2">h2col34</th> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </thead> | 
					
						
							|  |  |  |     <tbody> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td>r3col1</td> | 
					
						
							|  |  |  |             <td>r3col2</td> | 
					
						
							|  |  |  |             <td colspan="2" rowspan="2">r34col34</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td colspan="2">r4col12</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </tbody> | 
					
						
							|  |  |  | </table> | 
					
						
							|  |  |  | """,
 | 
					
						
							|  |  |  |             [ | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 0, | 
					
						
							|  |  |  |                     "col_index": 0, | 
					
						
							|  |  |  |                     "content": "h12col1", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 0, | 
					
						
							|  |  |  |                     "col_index": 1, | 
					
						
							|  |  |  |                     "content": "h1col23", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 0, | 
					
						
							|  |  |  |                     "col_index": 3, | 
					
						
							|  |  |  |                     "content": "h1col4", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 1, | 
					
						
							|  |  |  |                     "col_index": 1, | 
					
						
							|  |  |  |                     "content": "h2col2", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 1, | 
					
						
							|  |  |  |                     "col_index": 2, | 
					
						
							|  |  |  |                     "content": "h2col34", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 2, | 
					
						
							|  |  |  |                     "col_index": 0, | 
					
						
							|  |  |  |                     "content": "r3col1", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 2, | 
					
						
							|  |  |  |                     "col_index": 1, | 
					
						
							|  |  |  |                     "content": "r3col2", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 2, | 
					
						
							|  |  |  |                     "col_index": 2, | 
					
						
							|  |  |  |                     "content": "r34col34", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |                 { | 
					
						
							|  |  |  |                     "row_index": 3, | 
					
						
							|  |  |  |                     "col_index": 0, | 
					
						
							|  |  |  |                     "content": "r4col12", | 
					
						
							|  |  |  |                 }, | 
					
						
							|  |  |  |             ], | 
					
						
							|  |  |  |             id="various spans, with 2 row header", | 
					
						
							|  |  |  |         ), | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_html_table_extraction_from_prediction(text_as_html, expected_extraction): | 
					
						
							|  |  |  |     example_element = { | 
					
						
							|  |  |  |         "type": "Table", | 
					
						
							|  |  |  |         "metadata": { | 
					
						
							|  |  |  |             "text_as_html": text_as_html, | 
					
						
							|  |  |  |         }, | 
					
						
							|  |  |  |     } | 
					
						
							| 
									
										
										
										
											2024-05-07 15:57:38 +02:00
										 |  |  |     assert extract_cells_from_text_as_html(example_element) == expected_extraction | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def test_cells_extraction_from_prediction_when_missing_prediction(): | 
					
						
							|  |  |  |     example_element = {"type": "Table", "metadata": {"text_as_html": "", "table_as_cells": []}} | 
					
						
							|  |  |  |     assert extract_cells_from_text_as_html(example_element) is None | 
					
						
							|  |  |  |     assert extract_cells_from_table_as_cells(example_element) is None | 
					
						
							| 
									
										
										
										
											2024-06-19 09:03:38 +02:00
										 |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | def _trim_html(html: str) -> str: | 
					
						
							|  |  |  |     html_lines = [line.strip() for line in html.split("\n") if line] | 
					
						
							|  |  |  |     return "".join(html_lines) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.parametrize( | 
					
						
							|  |  |  |     "html_to_test", | 
					
						
							|  |  |  |     [ | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  | <table> | 
					
						
							|  |  |  |     <thead> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <th>Month A.</th> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </thead> | 
					
						
							|  |  |  |     <tbody> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td>22</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </tbody> | 
					
						
							|  |  |  | </table> | 
					
						
							|  |  |  | """,
 | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  | <table> | 
					
						
							|  |  |  |     <thead> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <th>Month A.</th> | 
					
						
							|  |  |  |             <th>Month B.</th> | 
					
						
							|  |  |  |             <th>Month C.</th> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </thead> | 
					
						
							|  |  |  |     <tbody> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td>11</td> | 
					
						
							|  |  |  |             <td>12</td> | 
					
						
							|  |  |  |             <td>13</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td>21</td> | 
					
						
							|  |  |  |             <td>22</td> | 
					
						
							|  |  |  |             <td>23</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </tbody> | 
					
						
							|  |  |  | </table> | 
					
						
							|  |  |  | """,
 | 
					
						
							|  |  |  |         """
 | 
					
						
							|  |  |  | <table> | 
					
						
							|  |  |  |     <thead> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <th rowspan="2">h12col1</th> | 
					
						
							|  |  |  |             <th colspan="2">h1col23</th> | 
					
						
							|  |  |  |             <th>h1col4</th> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <th>h2col2</th> | 
					
						
							|  |  |  |             <th colspan="2">h2col34</th> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </thead> | 
					
						
							|  |  |  |     <tbody> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td>r3col1</td> | 
					
						
							|  |  |  |             <td>r3col2</td> | 
					
						
							|  |  |  |             <td colspan="2" rowspan="2">r34col34</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |         <tr> | 
					
						
							|  |  |  |             <td colspan="2">r4col12</td> | 
					
						
							|  |  |  |         </tr> | 
					
						
							|  |  |  |     </tbody> | 
					
						
							|  |  |  | </table> | 
					
						
							|  |  |  | """,
 | 
					
						
							|  |  |  |     ], | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_deckerd_html_converter(html_to_test): | 
					
						
							|  |  |  |     deckerd_table = html_table_to_deckerd(html_to_test) | 
					
						
							|  |  |  |     html_table = deckerd_table_to_html(deckerd_table) | 
					
						
							|  |  |  |     assert _trim_html(html_to_test) == html_table |