"""Tests for the text-extraction and table-extraction metric helpers."""

import re

import pytest

from unstructured.metrics import text_extraction
from unstructured.metrics.table.table_extraction import (
    deckerd_table_to_html,
    extract_cells_from_table_as_cells,
    extract_cells_from_text_as_html,
    html_table_to_deckerd,
)
from unstructured.partition.auto import partition


def test_calculate_edit_distance():
    """Compare several variants of a sentence against a fixed reference.

    Each variant is scored with ``return_as="score"`` and the score, rounded
    to two decimals, must equal the value paired with it below.
    """
    reference = "I like pizza. I like bagels."
    # Every character of the reference separated by a single space.
    spaced_out = re.sub(r"\s+", " ", " ".join(reference))

    variants = [
        (reference, 1.0),
        # "bagles" is spelled this way in the fixture; altering it may change
        # the expected score.
        ("I like p i z z a . I like bagles.", 0.75),
        (spaced_out, 0.39),
        (reference.replace(" ", ""), 0.64),
        ("I like pizza.", 0.0),
        ("I like pizza. I like .", 0.57),
        ("I like pizza. I like beagles.", 0.89),
        ("I like pizza pizza. I like bagels.", 0.79),
    ]

    for candidate, expected_score in variants:
        score = text_extraction.calculate_edit_distance(
            candidate,
            reference,
            return_as="score",
        )
        assert round(score, 2) == expected_score


@pytest.mark.parametrize(
    ("filename", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", 0.78, 38),
    ],
)
def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
    """Partition an example document and score it against ``fake-text.txt``.

    The reference text is always read from ``example-docs/fake-text.txt``;
    only the partitioned document depends on *filename*.
    """
    with open("example-docs/fake-text.txt") as reference_file:
        reference_text = reference_file.read()

    partitioned = partition(filename=f"example-docs/{filename}")
    extracted_text = "\n".join(str(element) for element in partitioned)

    score = text_extraction.calculate_edit_distance(
        extracted_text, reference_text, return_as="score"
    )
    distance = text_extraction.calculate_edit_distance(
        extracted_text, reference_text, return_as="distance"
    )

    # Range checks first, then the exact expectations.
    assert 0 <= score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog loved the cat, but the cat loved the cow",
            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
        ),
        (
            "Hello my name is H a r p e r, what's your name?",
            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
        ),
        (
            "I have a dog and a cat, I love my dog.",
            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
        ),
        (
            "My dog's hair is red, but the dogs' houses are blue.",
            {
                "my": 1,
                "dog's": 1,
                "hair": 1,
                "is": 1,
                "red": 1,
                "but": 1,
                "the": 1,
                "dogs'": 1,
                "houses": 1,
                "are": 1,
                "blue": 1,
            },
        ),
        (
            """Sometimes sentences have a dash - like this one! 
A hyphen connects 2 words with no gap: easy-peasy.""",
            {
                "sometimes": 1,
                "sentences": 1,
                "have": 1,
                "a": 2,
                "dash": 1,
                "like": 1,
                "this": 1,
                "one": 1,
                "hyphen": 1,
                "connects": 1,
                "2": 1,
                "words": 1,
                "with": 1,
                "no": 1,
                "gap": 1,
                "easy-peasy": 1,
            },
        ),
    ],
)
def test_bag_of_words(text, expected):
    """``bag_of_words`` must return exactly the expected word-count mapping."""
    word_counts = text_extraction.bag_of_words(text)
    assert word_counts == expected


@pytest.mark.parametrize(
    ("output_text", "source_text", "expected_percentage"),
    [
        ("extra", "", 0),
        ("", "Source text has a sentence.", 1),
        (
            "The original s e n t e n c e is normal.",
            "The original sentence is normal...",
            0.2,
        ),
        (
            "We saw 23% improvement in this quarter.",
            "We saw 23% improvement in sales this quarter.",
            0.125,
        ),
        ("no", "Is it possible to have more than everything missing?", 1),
    ],
)
def test_calculate_percent_missing_text(output_text, source_text, expected_percentage):
    """``calculate_percent_missing_text`` must equal the expected fraction."""
    missing = text_extraction.calculate_percent_missing_text(output_text, source_text)
    assert missing == expected_percentage


@pytest.mark.parametrize(
    ("table_as_cells", "expected_extraction"),
    [
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
                {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
                {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
                {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
                {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
                {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
                {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          | h1col23             | h1col4   |
        # | h12col1  |----------+----------+----------|
        # |          | h2col2   | h2col34             |
        # |----------|----------+----------+----------+
        # | r3col1   | r3col2   |                     |
        # |----------+----------| r34col34            |
        # | r4col12             |                     |
        # +----------+----------+----------+----------+
        # NOTE(review): the "w"/"h" values below look transposed relative to
        # the diagram (e.g. h12col1 is drawn one column wide and two rows tall
        # but carries w=2, h=1). The expected output only lists row, column and
        # content - confirm whether the span values matter here.
        pytest.param(
            [
                {"y": 0, "x": 0, "w": 2, "h": 1, "content": "h12col1"},
                {"y": 0, "x": 1, "w": 1, "h": 2, "content": "h1col23"},
                {"y": 0, "x": 3, "w": 1, "h": 1, "content": "h1col4"},
                {"y": 1, "x": 1, "w": 1, "h": 1, "content": "h2col2"},
                {"y": 1, "x": 2, "w": 1, "h": 2, "content": "h2col34"},
                {"y": 2, "x": 0, "w": 1, "h": 1, "content": "r3col1"},
                {"y": 2, "x": 1, "w": 1, "h": 1, "content": "r3col2"},
                {"y": 2, "x": 2, "w": 2, "h": 2, "content": "r34col34"},
                {"y": 3, "x": 0, "w": 1, "h": 2, "content": "r4col12"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
    """Cells extracted from ``metadata.table_as_cells`` must match the expectation."""
    table_element = {
        "type": "Table",
        "metadata": {"table_as_cells": table_as_cells},
    }
    extracted = extract_cells_from_table_as_cells(table_element)
    assert extracted == expected_extraction


# NOTE(review): the ``text_as_html`` literals in this module contain no HTML
# tags as seen here; verify that the markup was not lost before relying on
# these fixtures.
@pytest.mark.parametrize(
    ("text_as_html", "expected_extraction"),
    [
        # NOTE(review): this literal ends with a stray double quote before the
        # closing delimiter - confirm it is intentional.
        pytest.param(
            """
Month A.
22
" """,
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        # NOTE(review): same stray double quote as in the previous case.
        pytest.param(
            """
Month A. Month B. Month C.
11 12 13
21 22 23
" """,
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          | h1col23             | h1col4   |
        # | h12col1  |----------+----------+----------|
        # |          | h2col2   | h2col34             |
        # |----------|----------+----------+----------+
        # | r3col1   | r3col2   |                     |
        # |----------+----------| r34col34            |
        # | r4col12             |                     |
        # +----------+----------+----------+----------+
        pytest.param(
            """
h12col1 h1col23 h1col4
h2col2 h2col34
r3col1 r3col2 r34col34
r4col12
""",
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
    """Cells extracted from ``metadata.text_as_html`` must match the expectation."""
    table_element = {
        "type": "Table",
        "metadata": {"text_as_html": text_as_html},
    }
    extracted = extract_cells_from_text_as_html(table_element)
    assert extracted == expected_extraction


def test_cells_extraction_from_prediction_when_missing_prediction():
    """Both extractors must return ``None`` for an empty table prediction."""
    empty_prediction = {
        "type": "Table",
        "metadata": {"text_as_html": "", "table_as_cells": []},
    }
    assert extract_cells_from_text_as_html(empty_prediction) is None
    assert extract_cells_from_table_as_cells(empty_prediction) is None


def _trim_html(html: str) -> str:
    """Strip each non-empty line of *html* and concatenate them without a separator."""
    return "".join(row.strip() for row in html.split("\n") if row)


@pytest.mark.parametrize(
    "html_to_test",
    [
        """
Month A.
22
""",
        """
Month A. Month B. Month C.
11 12 13
21 22 23
""",
        """
h12col1 h1col23 h1col4
h2col2 h2col34
r3col1 r3col2 r34col34
r4col12
""",
    ],
)
def test_deckerd_html_converter(html_to_test):
    """Converting to the deckerd form and back must give the trimmed input."""
    round_tripped = deckerd_table_to_html(html_table_to_deckerd(html_to_test))
    assert _trim_html(html_to_test) == round_tripped