unstructured/test_unstructured/metrics/test_text_extraction.py

import re

import pytest

from unstructured.metrics import text_extraction
from unstructured.metrics.table.table_extraction import (
    deckerd_table_to_html,
    extract_cells_from_table_as_cells,
    extract_cells_from_text_as_html,
    html_table_to_deckerd,
)
from unstructured.partition.auto import partition


def test_calculate_edit_distance():
    """Check the rounded similarity score for several distortions of one reference string."""
    reference = "I like pizza. I like bagels."
    # Each entry pairs a distorted "output" string with the score expected against `reference`.
    scored_variants = [
        # identical text
        (reference, 1.0),
        # letters of one word spaced out, plus a different spelling of the last word
        ("I like p i z z a . I like bagles.", 0.75),
        # characters joined with spaces, then whitespace runs collapsed to one space
        (re.sub(r"\s+", " ", " ".join(reference)), 0.39),
        # all spaces removed
        (reference.replace(" ", ""), 0.64),
        # only the first sentence
        ("I like pizza.", 0.0),
        # last word missing
        ("I like pizza. I like .", 0.57),
        # one additional character
        ("I like pizza. I like beagles.", 0.89),
        # one duplicated word
        ("I like pizza pizza. I like bagels.", 0.79),
    ]

    for output_text, expected_score in scored_variants:
        score = text_extraction.calculate_edit_distance(
            output_text,
            reference,
            return_as="score",
        )
        assert round(score, 2) == expected_score


@pytest.mark.parametrize(
    ("filename", "standardize_whitespaces", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", False, 0.78, 38),
        ("fake-text.txt", True, 0.92, 12),
    ],
)
def test_calculate_edit_distance_with_filename(
    filename, standardize_whitespaces, expected_score, expected_distance
):
    """Compare the partitioned text of an example document with the document's raw text.

    The raw file content is the reference ("source") and the newline-joined string form of the
    partitioned elements is the "output". Both the score and the raw distance are checked
    against the parametrized expectations.
    """
    # Read the reference from the same parametrized file that is partitioned below. The path was
    # previously hard-coded, so a new `filename` entry would have been compared against the
    # wrong document.
    document_path = f"example-docs/{filename}"
    with open(document_path) as f:
        source_cct = f.read()

    elements = partition(filename=document_path)
    output_cct = "\n".join(str(el) for el in elements)

    score = text_extraction.calculate_edit_distance(
        output_cct, source_cct, return_as="score", standardize_whitespaces=standardize_whitespaces
    )
    distance = text_extraction.calculate_edit_distance(
        output_cct,
        source_cct,
        return_as="distance",
        standardize_whitespaces=standardize_whitespaces,
    )

    # Sanity bounds first, then the exact expectations.
    assert 0 <= score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance


@pytest.mark.parametrize(
    ("text1", "text2"),
    [
        (
            "The  dog\rloved the cat, but\t\n    the cat\tloved the\n cow",
            "The dog loved the cat, but the cat loved the cow",
        ),
        (
            "Hello    my\tname\tis H a r p e r, \nwhat's your\vname?",
            "Hello my name is H a r p e r, what's your name?",
        ),
        (
            "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
            "I have a dog and a cat, I love my dog.",
        ),
        (
            """
            Name    Age City           Occupation
            Alice   30  New York       Engineer
            Bob     25  Los Angeles    Designer
            Charlie 35  Chicago        Teacher
            David   40  San Francisco  Developer
            """,
            """
            Name\tAge\tCity\tOccupation
            Alice\t30\tNew York\tEngineer
            Bob\t25\tLos Angeles\tDesigner
            Charlie\t35\tChicago\tTeacher
            David\t40\tSan Francisco\tDeveloper
            """,
        ),
        (
            """
            Name\tAge\tCity\tOccupation
            Alice\t30\tNew York\tEngineer
            Bob\t25\tLos Angeles\tDesigner
            Charlie\t35\tChicago\tTeacher
            David\t40\tSan Francisco\tDeveloper
            """,
            "Name\tAge\tCity\tOccupation\n\n \nAlice\t30\tNew York\tEngineer\nBob\t25\tLos Angeles\tDesigner\nCharlie\t35\tChicago\tTeacher\nDavid\t40\tSan Francisco\tDeveloper",  # noqa: E501
        ),
    ],
)
def test_calculate_edit_distance_with_various_whitespace_1(text1, text2):
    """Each pair must match exactly with whitespace standardization and differ without it."""

    def compare(return_as, standardize_whitespaces):
        # Same two texts every time; only the return form and the whitespace flag vary.
        return text_extraction.calculate_edit_distance(
            text1, text2, return_as=return_as, standardize_whitespaces=standardize_whitespaces
        )

    # With standardization: perfect score and zero distance.
    assert compare("score", True) == 1.0
    assert compare("distance", True) == 0
    # Without standardization: the texts are no longer identical.
    assert compare("score", False) < 1.0
    assert compare("distance", False) > 0


def test_calculate_edit_distance_with_various_whitespace_2():
    """Whitespace standardization must improve both score and distance for these two tables."""
    tab_separated_table = """
            Name\tAge\tCity\tOccupation
            Alice\t30\tNew York\tEngineer
            Bob\t25\tLos Angeles\tDesigner
            Charlie\t35\tChicago\tTeacher
            David\t40\tSan Francisco\tDeveloper
            """
    bordered_table = """

            | Name    | Age | City         | Occupation     |
            |---------|-----|--------------|----------------|
            | Alice   | 30  | New York     | Engineer       |
            | Bob     | 25  | Los Angeles  | Designer       |
            | Charlie | 35  | Chicago      | Teacher        |
            | David   | 40  | San Francisco| Developer      |

            """

    def compare(return_as, standardize_whitespaces):
        # The tab-separated table is the output, the bordered table is the source.
        return text_extraction.calculate_edit_distance(
            tab_separated_table,
            bordered_table,
            return_as=return_as,
            standardize_whitespaces=standardize_whitespaces,
        )

    # Higher similarity score once whitespace is standardized ...
    assert compare("score", True) > compare("score", False)
    # ... and correspondingly a smaller raw distance.
    assert compare("distance", True) < compare("distance", False)


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog loved the cat, but the cat loved the cow",
            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
        ),
        (
            "Hello my name is H a r p e r, what's your name?",
            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
        ),
        (
            "I have a dog and a cat, I love my dog.",
            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
        ),
        (
            "My dog's hair is red, but the dogs' houses are blue.",
            {
                "my": 1,
                "dog's": 1,
                "hair": 1,
                "is": 1,
                "red": 1,
                "but": 1,
                "the": 1,
                "dogs'": 1,
                "houses": 1,
                "are": 1,
                "blue": 1,
            },
        ),
        (
            """Sometimes sentences have a dash - like this one!
                    A hyphen connects 2 words with no gap: easy-peasy.""",
            {
                "sometimes": 1,
                "sentences": 1,
                "have": 1,
                "a": 2,
                "dash": 1,
                "like": 1,
                "this": 1,
                "one": 1,
                "hyphen": 1,
                "connects": 1,
                "2": 1,
                "words": 1,
                "with": 1,
                "no": 1,
                "gap": 1,
                "easy-peasy": 1,
            },
        ),
    ],
)
def test_bag_of_words(text, expected):
    """The word-frequency mapping produced for `text` must equal `expected` exactly."""
    word_counts = text_extraction.bag_of_words(text)
    assert word_counts == expected


@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The  dog\rloved the cat, but\t\n    the cat\tloved the\n cow\n\n",
            "The dog loved the cat, but the cat loved the cow",
        ),
        (
            "\n\nHello    my\tname\tis H a r p e r, \nwhat's your\vname?",
            "Hello my name is H a r p e r, what's your name?",
        ),
        (
            "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
            "I have a dog and a cat, I love my dog.",
        ),
        (
            """L     is for the way you look at me
            O    is for the only one I see
            V    is very, very extraordinary
            E    is even more than anyone that you adore can""",
            "L is for the way you look at me O is for the only one I see V is very, very extraordinary E is even more than anyone that you adore can",  # noqa: E501
        ),
        (
            """
            | Name    | Age | City         | Occupation     |
            |---------|-----|--------------|----------------|
            | Alice   | 30  | New York     | Engineer       |
            | Bob     | 25  | Los Angeles  | Designer       |
            | Charlie | 35  | Chicago      | Teacher        |
            | David   | 40  | San Francisco| Developer      |
            """,
            "| Name | Age | City | Occupation | |---------|-----|--------------|----------------| | Alice | 30 | New York | Engineer | | Bob | 25 | Los Angeles | Designer | | Charlie | 35 | Chicago | Teacher | | David | 40 | San Francisco| Developer |",  # noqa: E501
        ),
    ],
)
def test_prepare_string(text, expected):
    """`prepare_str` yields `expected` with whitespace standardization, and `text` by default."""
    standardized = text_extraction.prepare_str(text, standardize_whitespaces=True)
    assert standardized == expected

    # Called without the flag, the input is expected to come back unchanged.
    default_result = text_extraction.prepare_str(text)
    assert default_result == text

@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # Complex sentences with standard quotes
        (
            '"The quick brown fox jumps over the lazy dog," said the narrator.',
            '"The quick brown fox jumps over the lazy dog," said the narrator.',
        ),
        # Mixed quotes in longer sentences
        (
            'She said "Hello" and then whispered \'Goodbye\' before leaving.',
            'She said "Hello" and then whispered \'Goodbye\' before leaving.',
        ),
        # Double low-9 quotes with complex content
        (
            '„To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
            '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
        ),
        # Angle quotes with nested quotes
        (
            '«When he said "life is beautiful," I believed him» wrote Maria.',
            '"When he said "life is beautiful," I believed him" wrote Maria.',
        ),
        # Heavy ornament quotes in dialogue
        (
            '❝Do you remember when we first met?❞ she asked with a smile.',
            '"Do you remember when we first met?" she asked with a smile.',
        ),
        # Double prime quotes with punctuation
        (
            '〝The meeting starts at 10:00, don\'t be late!〟 announced the manager.',
            '"The meeting starts at 10:00, don\'t be late!" announced the manager.',
        ),
        # Corner brackets with nested quotes
        (
            '「He told me "This is important" yesterday」, she explained.',
            '\'He told me "This is important" yesterday\', she explained.',
        ),
        # White corner brackets with multiple sentences
        (
            '『The sun was setting. The birds were singing. It was peaceful.』',
            '\'The sun was setting. The birds were singing. It was peaceful.\'',
        ),
        # Vertical corner brackets with numbers and special characters
        (
            '﹂Meeting #123 @ 15:00 - Don\'t forget!﹁',
            '\'Meeting #123 @ 15:00 - Don\'t forget!\'',
        ),
        # Complex mixed quote types
        (
            '「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
            '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"',
        ),
        # Quotes with multiple apostrophes
        (
            'It\'s John\'s book, isn\'t it?',
            "It's John's book, isn't it?",
        ),
        # Single angle quotes with nested content
        (
            '‹Testing the system\'s capability for "quoted" text›',
            '\'Testing the system\'s capability for "quoted" text\'',
        ),
        # Heavy single ornament quotes with multiple sentences
        (
            '❛First sentence. Second sentence. Third sentence.❜',
            '\'First sentence. Second sentence. Third sentence.\'',
        ),
        # Mix of various quote types in complex text
        (
            '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
            '\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
        ),
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """Typographic quote characters in `input_text` must come back as plain `"` or `'`."""
    standardized = text_extraction.standardize_quotes(input_text)
    assert standardized == expected_output

@pytest.mark.parametrize(
    ("output_text", "source_text", "expected_percentage"),
    [
        ("extra", "", 0),
        ("", "Source text has a sentence.", 1),
        (
            "The original s e n t e n c e is normal.",
            "The original sentence is normal...",
            0.2,
        ),
        (
            "We saw 23% improvement in this quarter.",
            "We saw 23% improvement in sales this quarter.",
            0.125,
        ),
        ("no", "Is it possible to have more than everything missing?", 1),
    ],
)
def test_calculate_percent_missing_text(output_text, source_text, expected_percentage):
    """The missing-text fraction of `output_text` relative to `source_text` must match exactly."""
    missing_fraction = text_extraction.calculate_percent_missing_text(output_text, source_text)
    assert missing_fraction == expected_percentage


@pytest.mark.parametrize(
    ("table_as_cells", "expected_extraction"),
    [
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
                {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
                {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
                {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
                {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
                {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
                {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        #
        # "w" is the number of columns a cell covers and "h" the number of rows, so the values
        # below follow the diagram: h12col1 is one column wide and two rows tall, while h1col23,
        # h2col34 and r4col12 are two columns wide and one row tall.
        pytest.param(
            [
                {"y": 0, "x": 0, "w": 1, "h": 2, "content": "h12col1"},
                {"y": 0, "x": 1, "w": 2, "h": 1, "content": "h1col23"},
                {"y": 0, "x": 3, "w": 1, "h": 1, "content": "h1col4"},
                {"y": 1, "x": 1, "w": 1, "h": 1, "content": "h2col2"},
                {"y": 1, "x": 2, "w": 2, "h": 1, "content": "h2col34"},
                {"y": 2, "x": 0, "w": 1, "h": 1, "content": "r3col1"},
                {"y": 2, "x": 1, "w": 1, "h": 1, "content": "r3col2"},
                {"y": 2, "x": 2, "w": 2, "h": 2, "content": "r34col34"},
                {"y": 3, "x": 0, "w": 2, "h": 1, "content": "r4col12"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
    """Cells stored under `metadata.table_as_cells` are extracted as row/column/content records.

    Each expected record carries the cell's `y` as `row_index`, its `x` as `col_index` and its
    `content`; span sizes (`w`/`h`) do not appear in the expected output.
    """
    example_element = {
        "type": "Table",
        "metadata": {"table_as_cells": table_as_cells},
    }
    assert extract_cells_from_table_as_cells(example_element) == expected_extraction


@pytest.mark.parametrize(
    ("text_as_html", "expected_extraction"),
    [
        # The fixtures below previously had a stray `"` after `</table>` in the first two cases;
        # it was not part of the table markup and has been removed.
        pytest.param(
            """
<table>
    <thead>
        <tr>
            <th>Month A.</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>22</td>
        </tr>
    </tbody>
</table>
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            """
<table>
    <thead>
        <tr>
            <th>Month A.</th>
            <th>Month B.</th>
            <th>Month C.</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>11</td>
            <td>12</td>
            <td>13</td>
        </tr>
        <tr>
            <td>21</td>
            <td>22</td>
            <td>23</td>
        </tr>
    </tbody>
</table>
""",
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        pytest.param(
            """
<table>
    <thead>
        <tr>
            <th rowspan="2">h12col1</th>
            <th colspan="2">h1col23</th>
            <th>h1col4</th>
        </tr>
        <tr>
            <th>h2col2</th>
            <th colspan="2">h2col34</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>r3col1</td>
            <td>r3col2</td>
            <td colspan="2" rowspan="2">r34col34</td>
        </tr>
        <tr>
            <td colspan="2">r4col12</td>
        </tr>
    </tbody>
</table>
""",
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
    """HTML stored under `metadata.text_as_html` is extracted as row/column/content records.

    Each expected record holds the `row_index` and `col_index` of a cell's top-left position
    together with its text `content`, including for cells that span rows or columns.
    """
    example_element = {
        "type": "Table",
        "metadata": {
            "text_as_html": text_as_html,
        },
    }
    assert extract_cells_from_text_as_html(example_element) == expected_extraction


def test_cells_extraction_from_prediction_when_missing_prediction():
    """Both extractors return None when the element carries empty predictions."""
    empty_metadata = {"text_as_html": "", "table_as_cells": []}
    element_without_prediction = {"type": "Table", "metadata": empty_metadata}

    for extractor in (extract_cells_from_text_as_html, extract_cells_from_table_as_cells):
        assert extractor(element_without_prediction) is None


def _trim_html(html: str) -> str:
    html_lines = [line.strip() for line in html.split("\n") if line]
    return "".join(html_lines)


@pytest.mark.parametrize(
    "html_to_test",
    [
        """
<table>
    <thead>
        <tr>
            <th>Month A.</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>22</td>
        </tr>
    </tbody>
</table>
""",
        """
<table>
    <thead>
        <tr>
            <th>Month A.</th>
            <th>Month B.</th>
            <th>Month C.</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>11</td>
            <td>12</td>
            <td>13</td>
        </tr>
        <tr>
            <td>21</td>
            <td>22</td>
            <td>23</td>
        </tr>
    </tbody>
</table>
""",
        """
<table>
    <thead>
        <tr>
            <th rowspan="2">h12col1</th>
            <th colspan="2">h1col23</th>
            <th>h1col4</th>
        </tr>
        <tr>
            <th>h2col2</th>
            <th colspan="2">h2col34</th>
        </tr>
    </thead>
    <tbody>
        <tr>
            <td>r3col1</td>
            <td>r3col2</td>
            <td colspan="2" rowspan="2">r34col34</td>
        </tr>
        <tr>
            <td colspan="2">r4col12</td>
        </tr>
    </tbody>
</table>
""",
    ],
)
def test_deckerd_html_converter(html_to_test):
    """Converting HTML to the Deckerd form and back must reproduce the whitespace-trimmed HTML."""
    round_tripped_html = deckerd_table_to_html(html_table_to_deckerd(html_to_test))
    expected_html = _trim_html(html_to_test)
    assert expected_html == round_tripped_html
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param defaults to (2, 1, 1) for (insertion, deletion,
substitution), meaning that we penalize the insertions needed to turn
the output (target) into the source (reference). In other words, missing
extraction is penalized more heavily.
- The function takes two strings, on the assumption that both strings
are already clean and concatenated (CCT).

**Important Note!**
The test case needs to be updated to use CCT once the function is ready.
For now it only tests the "functionality" of edit distance, not edit
distance with CCT as it is intended to be used.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								import re
 								import pytest
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words(string))
```

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								from unstructured.metrics import text_extraction
-												Add calculation of table related metrics based on table_as_cells (#2898)

This pull request adds metrics that are calculated based on
table_as_cells instead of text_as_html. This change is required for
comprehensive metrics calculation, as previously every predicted colspan
or rowspan was considered an incorrect prediction (even if it was
correct).

This change has to be merged after
https://github.com/Unstructured-IO/unstructured/pull/2892 which
introduces table_as_cells field
											
										
										
											2024-05-07 15:57:38 +02:00
+								from unstructured.metrics.table.table_extraction import (
-												feat: expose converters deckerd -> html and back (#3233)

This PR exposes functions in the evaluation module for easy conversion
between tables in Deckerd and HTML formats, which are useful in
evaluation experiments.
											
										
										
											2024-06-19 09:03:38 +02:00
+								    deckerd_table_to_html,
-												Add calculation of table related metrics based on table_as_cells (#2898)

This pull request add metrics that are calculated based on
table_as_cells instead of text_as_html. This change is required for
comprehensive metrics calculation, as previously every colspan or
rowspan predicted was considered to be an incorrect predicted (even if
it was correct prediction)

This change has to be merged after
https://github.com/Unstructured-IO/unstructured/pull/2892 which
introduces table_as_cells field
											
										
										
											2024-05-07 15:57:38 +02:00
+								    extract_cells_from_table_as_cells,
 								    extract_cells_from_text_as_html,
-												feat: expose converters deckerd -> html and back (#3233)

This PR exposes functions in evaluation module for easy conversion
between tables in Deckerd and HTML formats, which are useful in
evalution experiments.
											
										
										
											2024-06-19 09:03:38 +02:00
+								    html_table_to_deckerd,
-												Add calculation of table related metrics based on table_as_cells (#2898)

This pull request add metrics that are calculated based on
table_as_cells instead of text_as_html. This change is required for
comprehensive metrics calculation, as previously every colspan or
rowspan predicted was considered to be an incorrect predicted (even if
it was correct prediction)

This change has to be merged after
https://github.com/Unstructured-IO/unstructured/pull/2892 which
introduces table_as_cells field
											
										
										
											2024-05-07 15:57:38 +02:00
+								)
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								from unstructured.partition.auto import partition
 								def test_calculate_edit_distance():
 								    source_cct = "I like pizza. I like bagels."
 								    source_cct_word_space = "I like p i z z a . I like bagles."
 								    source_cct_spaces = re.sub(r"\s+", " ", " ".join(source_cct))
 								    source_cct_no_space = source_cct.replace(" ", "")
 								    source_cct_one_sentence = "I like pizza."
 								    source_cct_missing_word = "I like pizza. I like ."
 								    source_cct_addn_char = "I like pizza. I like beagles."
 								    source_cct_dup_word = "I like pizza pizza. I like bagels."
 								    assert (
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words(string))
```

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								        round(text_extraction.calculate_edit_distance(source_cct, source_cct, return_as="score"), 2)
 								        == 1.0
 								    )
 								    assert (
 								        round(
 								            text_extraction.calculate_edit_distance(
 								                source_cct_word_space,
 								                source_cct,
 								                return_as="score",
 								            ),
 ,
 								        )
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								        == 0.75
 								    )
 								    assert (
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words)

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								        round(
 								            text_extraction.calculate_edit_distance(
 								                source_cct_spaces,
 								                source_cct,
 								                return_as="score",
 								            ),
 ,
 								        )
 								        == 0.39
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								    )
 								    assert (
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words)

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								        round(
 								            text_extraction.calculate_edit_distance(
 								                source_cct_no_space,
 								                source_cct,
 								                return_as="score",
 								            ),
 ,
 								        )
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								        == 0.64
 								    )
 								    assert (
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words)

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								        round(
 								            text_extraction.calculate_edit_distance(
 								                source_cct_one_sentence,
 								                source_cct,
 								                return_as="score",
 								            ),
 ,
 								        )
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								        == 0.0
 								    )
 								    assert (
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words)

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								        round(
 								            text_extraction.calculate_edit_distance(
 								                source_cct_missing_word,
 								                source_cct,
 								                return_as="score",
 								            ),
 ,
 								        )
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								        == 0.57
 								    )
 								    assert (
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words)

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								        round(
 								            text_extraction.calculate_edit_distance(
 								                source_cct_addn_char,
 								                source_cct,
 								                return_as="score",
 								            ),
 ,
 								        )
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								        == 0.89
 								    )
 								    assert (
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words)

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								        round(
 								            text_extraction.calculate_edit_distance(
 								                source_cct_dup_word,
 								                source_cct,
 								                return_as="score",
 								            ),
 ,
 								        )
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								        == 0.79
 								    )
 								@pytest.mark.parametrize(
-												Ml 384/whitespaces in cct (#3747)

This change ensures that the CCT metric is not sensitive to differences
in whitespace (including newlines).
All whitespace in the strings is changed to a single space `" "` in both
GT and PRED before the metric is computed.

Additional changes in CHANGELOG due to auto-formatting.
											
										
										
											2024-10-24 15:02:34 +02:00
+								    ("filename", "standardize_whitespaces", "expected_score", "expected_distance"),
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								    [
-												Ml 384/whitespaces in cct (#3747)

This ticket ensures that CCT metric will not be sensitive to differences
in whitespace (including newline).
All whitespaces in string are changed to single space `" "` in both GT
and PRED before the metric is computed.

Additional changes in CHANGELOG due to auto-formatting.
											
										
										
											2024-10-24 15:02:34 +02:00
+								        ("fake-text.txt", False, 0.78, 38),
 								        ("fake-text.txt", True, 0.92, 12),
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								    ],
 								)
-												Ml 384/whitespaces in cct (#3747)

This ticket ensures that CCT metric will not be sensitive to differences
in whitespace (including newline).
All whitespaces in string are changed to single space `" "` in both GT
and PRED before the metric is computed.

Additional changes in CHANGELOG due to auto-formatting.
											
										
										
											2024-10-24 15:02:34 +02:00
+								def test_calculate_edit_distance_with_filename(
 								    filename, standardize_whitespaces, expected_score, expected_distance
 								):
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
+								    with open("example-docs/fake-text.txt") as f:
 								        source_cct = f.read()
 								    elements = partition(filename=f"example-docs/{filename}")
 								    output_cct = "\n".join([str(el) for el in elements])
-												Ml 384/whitespaces in cct (#3747)

This ticket ensures that CCT metric will not be sensitive to differences
in whitespace (including newline).
All whitespaces in string are changed to single space `" "` in both GT
and PRED before the metric is computed.

Additional changes in CHANGELOG due to auto-formatting.
											
										
										
											2024-10-24 15:02:34 +02:00
+								    score = text_extraction.calculate_edit_distance(
 								        output_cct, source_cct, return_as="score", standardize_whitespaces=standardize_whitespaces
 								    )
 								    distance = text_extraction.calculate_edit_distance(
 								        output_cct,
 								        source_cct,
 								        return_as="distance",
 								        standardize_whitespaces=standardize_whitespaces,
 								    )
-												feat: add calculate edit distance feature (#1656)

**Executive Summary**

Adds function to calculate edit distance (Levenshtein distance) between
two strings. The function can return as: 1. score (similarity = 1 -
distance/source_len) 2. distance (raw levenshtein distance)

**Technical details**
- The `weights` param is set to default at (2,1,1) for (insertion,
deletion, substitution), meaning that we will penalize the insertion we
need to add from output (target) in comparison with the source
(reference). In other word, the missing extraction will be penalized
higher.
- The function takes in 2 strings in an assumption that both string are
already clean and concatenated (CCT)

**Important Note!**
Test case needs to be updated to use CCT once the function is ready. It
is now only tested the "functionality" of edit distance, not the edit
distance with CCT as its intended to be.

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
											
										
										
											2023-10-06 21:21:14 -04:00
 								    assert score >= 0
 								    assert score <= 1.0
 								    assert distance >= 0
 								    assert round(score, 2) == expected_score
 								    assert distance == expected_distance
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words)

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
-												Ml 384/whitespaces in cct (#3747)

This ticket ensures that CCT metric will not be sensitive to differences
in whitespace (including newline).
All whitespaces in string are changed to single space `" "` in both GT
and PRED before the metric is computed.

Additional changes in CHANGELOG due to auto-formatting.
											
										
										
											2024-10-24 15:02:34 +02:00
+								@pytest.mark.parametrize(
 								    ("text1", "text2"),
 								    [
 								        (
 								            "The  dog\rloved the cat, but\t\n    the cat\tloved the\n cow",
 								            "The dog loved the cat, but the cat loved the cow",
 								        ),
 								        (
 								            "Hello    my\tname\tis H a r p e r, \nwhat's your\vname?",
 								            "Hello my name is H a r p e r, what's your name?",
 								        ),
 								        (
 								            "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
 								            "I have a dog and a cat, I love my dog.",
 								        ),
 								        (
 								            """
 								            Name    Age City           Occupation
 								            Alice   30  New York       Engineer
 								            Bob     25  Los Angeles    Designer
 								            Charlie 35  Chicago        Teacher
 								            David   40  San Francisco  Developer
 								            """,
 								            """
 								            Name\tAge\tCity\tOccupation
 								            Alice\t30\tNew York\tEngineer
 								            Bob\t25\tLos Angeles\tDesigner
 								            Charlie\t35\tChicago\tTeacher
 								            David\t40\tSan Francisco\tDeveloper
 								            """,
 								        ),
 								        (
 								            """
 								            Name\tAge\tCity\tOccupation
 								            Alice\t30\tNew York\tEngineer
 								            Bob\t25\tLos Angeles\tDesigner
 								            Charlie\t35\tChicago\tTeacher
 								            David\t40\tSan Francisco\tDeveloper
 								            """,
 								            "Name\tAge\tCity\tOccupation\n\n \nAlice\t30\tNew York\tEngineer\nBob\t25\tLos Angeles\tDesigner\nCharlie\t35\tChicago\tTeacher\nDavid\t40\tSan Francisco\tDeveloper",  # noqa: E501
 								        ),
 								    ],
 								)
 								def test_calculate_edit_distance_with_various_whitespace_1(text1, text2):
 								    # Each (text1, text2) pair has the same words but different whitespace
 								    # (spaces, tabs, newlines, \r, \v, \f), so the pair must compare as
 								    # identical only when whitespace standardization is switched on.

 								    # With standardize_whitespaces=True: score is exactly 1.0 and distance is 0.
 								    assert (
 								        text_extraction.calculate_edit_distance(
 								            text1, text2, return_as="score", standardize_whitespaces=True
 								        )
 								        == 1.0
 								    )
 								    assert (
 								        text_extraction.calculate_edit_distance(
 								            text1, text2, return_as="distance", standardize_whitespaces=True
 								        )
 								        == 0
 								    )
 								    # With standardize_whitespaces=False: the whitespace differences must
 								    # show up as a score below 1.0 and a positive distance.
 								    assert (
 								        text_extraction.calculate_edit_distance(
 								            text1, text2, return_as="score", standardize_whitespaces=False
 								        )
 								        < 1.0
 								    )
 								    assert (
 								        text_extraction.calculate_edit_distance(
 								            text1, text2, return_as="distance", standardize_whitespaces=False
 								        )
 								        > 0
 								    )
 								def test_calculate_edit_distance_with_various_whitespace_2():
 								    # Compares a tab-separated table with the same rows drawn using `|` and
 								    # `-` borders. The border characters are not whitespace, so the test only
 								    # checks relative ordering: standardizing whitespace must give a higher
 								    # score and a smaller distance than comparing the raw strings.
 								    source_cct_tabs = """
 								            Name\tAge\tCity\tOccupation
 								            Alice\t30\tNew York\tEngineer
 								            Bob\t25\tLos Angeles\tDesigner
 								            Charlie\t35\tChicago\tTeacher
 								            David\t40\tSan Francisco\tDeveloper
 								            """
 								    source_cct_with_borders = """
 								            | Name    | Age | City         | Occupation     |
 								            |---------|-----|--------------|----------------|
 								            | Alice   | 30  | New York     | Engineer       |
 								            | Bob     | 25  | Los Angeles  | Designer       |
 								            | Charlie | 35  | Chicago      | Teacher        |
 								            | David   | 40  | San Francisco| Developer      |
 								            """
 								    # Score: standardized comparison must be strictly higher.
 								    assert text_extraction.calculate_edit_distance(
 								        source_cct_tabs, source_cct_with_borders, return_as="score", standardize_whitespaces=True
 								    ) > text_extraction.calculate_edit_distance(
 								        source_cct_tabs, source_cct_with_borders, return_as="score", standardize_whitespaces=False
 								    )
 								    # Distance: standardized comparison must be strictly smaller.
 								    assert text_extraction.calculate_edit_distance(
 								        source_cct_tabs, source_cct_with_borders, return_as="distance", standardize_whitespaces=True
 								    ) < text_extraction.calculate_edit_distance(
 								        source_cct_tabs,
 								        source_cct_with_borders,
 								        return_as="distance",
 								        standardize_whitespaces=False,
 								    )
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words)

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								@pytest.mark.parametrize(
 								    ("text", "expected"),
 								    [
 								        (
 								            "The dog loved the cat, but the cat loved the cow",
 								            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
 								        ),
 								        (
 								            "Hello my name is H a r p e r, what's your name?",
 								            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
 								        ),
 								        (
 								            "I have a dog and a cat, I love my dog.",
 								            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
 								        ),
 								        (
 								            "My dog's hair is red, but the dogs' houses are blue.",
 								            {
 								                "my": 1,
 								                "dog's": 1,
 								                "hair": 1,
 								                "is": 1,
 								                "red": 1,
 								                "but": 1,
 								                "the": 1,
 								                "dogs'": 1,
 								                "houses": 1,
 								                "are": 1,
 								                "blue": 1,
 								            },
 								        ),
 								        (
 								            """Sometimes sentences have a dash - like this one!
-												feat: table evaluations for fixed html table generation (#3196)

Update to the evaluation script to handle correct HTML syntax for
tables.
See https://github.com/Unstructured-IO/unstructured-inference/pull/355
for details.

This change:
- modifies how HTML tables are transformed into the evaluation's internal
`cells` format
- fixes the indexing of the output (internal format cells) when HTML
cells use spans
											
										
										
											2024-06-14 11:03:27 +02:00
+								                    A hyphen connects 2 words with no gap: easy-peasy.""",
-												Feat: Bag of words for testing metric (#1650)

This PR adds the `bag_of_words` function to count the frequency of words
for evaluation.

**Testing**
```Python
from unstructured.cleaners.core import bag_of_words
string = "The dog loved the cat, but the cat loved the cow."

print(bag_of_words)

---------

Co-authored-by: Mallori Harrell <mallori@Malloris-MacBook-Pro.local>
Co-authored-by: Klaijan <klaijan@unstructured.io>
Co-authored-by: Shreya Nidadavolu <shreyanid9@gmail.com>
Co-authored-by: shreyanid <42684285+shreyanid@users.noreply.github.com>
											
										
										
											2023-10-10 13:46:01 -05:00
+								            {
 								                "sometimes": 1,
 								                "sentences": 1,
 								                "have": 1,
 								                "a": 2,
 								                "dash": 1,
 								                "like": 1,
 								                "this": 1,
 								                "one": 1,
 								                "hyphen": 1,
 								                "connects": 1,
 								                "2": 1,
 								                "words": 1,
 								                "with": 1,
 								                "no": 1,
 								                "gap": 1,
 								                "easy-peasy": 1,
 								            },
 								        ),
 								    ],
 								)
 								def test_bag_of_words(text, expected):
 								    assert text_extraction.bag_of_words(text) == expected
-												feat: calculate metric for percent of text missing (#1701)

### Summary
Missing text is a particularly important metric of quality for the
Unstructured library because information from the document is not being
captured and therefore not usable by downstream applications.

Add function to calculate the percent of text missing relative to the
source transcription. Function takes 2 text strings (output and source)
as input, and returns the percentage of text missing as a decimal.

### Technical Details
- The 2 input strings are both assumed to already contain clean and
concatenated text (CCT)
- Implementation compares the bags of words (frequency counts for each
word present in the text) of each input text
- Duplicated/extra text is not penalized
- Value is limited to the range [0, 1]

### Test
- Several edge cases are covered in the test function (missing text,
duplicated text, spaced out words, etc).
- Can test other cases or text inputs by calling the function with 2 CCT
strings as "output" and "source"
											
										
										
											2023-10-10 13:54:49 -07:00
-												Ml 384/whitespaces in cct (#3747)

This ticket ensures that CCT metric will not be sensitive to differences
in whitespace (including newline).
All whitespaces in string are changed to single space `" "` in both GT
and PRED before the metric is computed.

Additional changes in CHANGELOG due to auto-formatting.
											
										
										
											2024-10-24 15:02:34 +02:00
+								@pytest.mark.parametrize(
 								    ("text", "expected"),
 								    [
 								        (
 								            "The  dog\rloved the cat, but\t\n    the cat\tloved the\n cow\n\n",
 								            "The dog loved the cat, but the cat loved the cow",
 								        ),
 								        (
 								            "\n\nHello    my\tname\tis H a r p e r, \nwhat's your\vname?",
 								            "Hello my name is H a r p e r, what's your name?",
 								        ),
 								        (
 								            "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
 								            "I have a dog and a cat, I love my dog.",
 								        ),
 								        (
 								            """L     is for the way you look at me
 								            O    is for the only one I see
 								            V    is very, very extraordinary
 								            E    is even more than anyone that you adore can""",
 								            "L is for the way you look at me O is for the only one I see V is very, very extraordinary E is even more than anyone that you adore can",  # noqa: E501
 								        ),
 								        (
 								            """
 								            | Name    | Age | City         | Occupation     |
 								            |---------|-----|--------------|----------------|
 								            | Alice   | 30  | New York     | Engineer       |
 								            | Bob     | 25  | Los Angeles  | Designer       |
 								            | Charlie | 35  | Chicago      | Teacher        |
 								            | David   | 40  | San Francisco| Developer      |
 								            """,
 								            "| Name | Age | City | Occupation | |---------|-----|--------------|----------------| | Alice | 30 | New York | Engineer | | Bob | 25 | Los Angeles | Designer | | Charlie | 35 | Chicago | Teacher | | David | 40 | San Francisco| Developer |",  # noqa: E501
 								        ),
 								    ],
 								)
 								def test_prepare_string(text, expected):
 								    assert text_extraction.prepare_str(text, standardize_whitespaces=True) == expected
 								    assert text_extraction.prepare_str(text) == text
-												Feat: add quote standardization and update edit distance calculation

											
										
										
											2024-12-03 21:21:39 -08:00
+								@pytest.mark.parametrize(
 								    ("input_text", "expected_output"),
 								    [
-												Feat: enhance quote standardization with comprehensive Unicode coverage and update tests

											
										
										
											2024-12-04 11:33:03 -08:00
+								        # Complex sentences with standard quotes
 								        ('"The quick brown fox jumps over the lazy dog," said the narrator.',
 								         '"The quick brown fox jumps over the lazy dog," said the narrator.'),
 								        # Mixed quotes in longer sentences
 								        ('She said "Hello" and then whispered \'Goodbye\' before leaving.',
 								         'She said "Hello" and then whispered \'Goodbye\' before leaving.'),
 								        # Double low-9 quotes with complex content
 								        ('„To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
 								         '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.'),
 								        # Angle quotes with nested quotes
 								        ('«When he said "life is beautiful," I believed him» wrote Maria.',
 								         '"When he said "life is beautiful," I believed him" wrote Maria.'),
 								        # Heavy ornament quotes in dialogue
 								        ('❝Do you remember when we first met?❞ she asked with a smile.',
 								         '"Do you remember when we first met?" she asked with a smile.'),
 								        # Double prime quotes with punctuation
 								        ('〝The meeting starts at 10:00, don\'t be late!〟 announced the manager.',
 								         '"The meeting starts at 10:00, don\'t be late!" announced the manager.'),
 								        # Corner brackets with nested quotes
 								        ('「He told me "This is important" yesterday」, she explained.',
 								         '\'He told me "This is important" yesterday\', she explained.'),
 								        # White corner brackets with multiple sentences
 								        ('『The sun was setting. The birds were singing. It was peaceful.』',
 								         '\'The sun was setting. The birds were singing. It was peaceful.\''),
 								        # Vertical corner brackets with numbers and special characters
 								        ('﹂Meeting #123 @ 15:00 - Don\'t forget!﹁',
 								         '\'Meeting #123 @ 15:00 - Don\'t forget!\''),
 								        # Complex mixed quote types
 								        ('「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
 								         '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"'),
 								        # Quotes with multiple apostrophes
 								        ('It\'s John\'s book, isn\'t it?',
 								         "It's John's book, isn't it?"),
 								        # Single angle quotes with nested content
 								        ('‹Testing the system\'s capability for "quoted" text›',
 								         '\'Testing the system\'s capability for "quoted" text\''),
 								        # Heavy single ornament quotes with multiple sentences
 								        ('❛First sentence. Second sentence. Third sentence.❜',
 								         '\'First sentence. Second sentence. Third sentence.\''),
 								        # Mix of various quote types in complex text
 								        ('「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
 								         '\'Chapter 1\': "The Beginning" - "A new story" begins "today".')
-												Feat: add quote standardization and update edit distance calculation

											
										
										
											2024-12-03 21:21:39 -08:00
+								    ],
 								)
 								def test_standardize_quotes(input_text, expected_output):
 								    assert text_extraction.standardize_quotes(input_text) == expected_output
-												Ml 384/whitespaces in cct (#3747)

This ticket ensures that CCT metric will not be sensitive to differences
in whitespace (including newline).
All whitespaces in string are changed to single space `" "` in both GT
and PRED before the metric is computed.

Additional changes in CHANGELOG due to auto-formatting.
											
										
										
											2024-10-24 15:02:34 +02:00
-												feat: calculate metric for percent of text missing (#1701)

### Summary
Missing text is a particularly important metric of quality for the
Unstructured library because information from the document is not being
captured and therefore not usable by downstream applications.

Add function to calculate the percent of text missing relative to the
source transcription. Function takes 2 text strings (output and source)
as input, and returns the percentage of text missing as a decimal.

### Technical Details
- The 2 input strings are both assumed to already contain clean and
concatenated text (CCT)
- Implementation compares the bags of words (frequency counts for each
word present in the text) of each input text
- Duplicated/extra text is not penalized
- Value is limited to the range [0, 1]

### Test
- Several edge cases are covered in the test function (missing text,
duplicated text, spaced out words, etc).
- Can test other cases or text inputs by calling the function with 2 CCT
strings as "output" and "source"
											
										
										
											2023-10-10 13:54:49 -07:00
+								@pytest.mark.parametrize(
 								    ("output_text", "source_text", "expected_percentage"),
 								    [
 								        (
 								            "extra",
 								            "",
 ,
 								        ),
 								        (
 								            "",
 								            "Source text has a sentence.",
 ,
 								        ),
 								        (
 								            "The original s e n t e n c e is normal.",
 								            "The original sentence is normal...",
 .2,
 								        ),
 								        (
 								            "We saw 23% improvement in this quarter.",
 								            "We saw 23% improvement in sales this quarter.",
-												fix: avoid loop through None (#1975)

Fix this issue https://unstructured-ai.atlassian.net/browse/CORE-2455.
Adding logical check if the variable is not None.
											
										
										
											2023-11-01 16:50:34 -04:00
+.125,
-												feat: calculate metric for percent of text missing (#1701)

### Summary
Missing text is a particularly important metric of quality for the
Unstructured library because information from the document is not being
captured and therefore not usable by downstream applications.

Add function to calculate the percent of text missing relative to the
source transcription. Function takes 2 text strings (output and source)
as input, and returns the percentage of text missing as a decimal.

### Technical Details
- The 2 input strings are both assumed to already contain clean and
concatenated text (CCT)
- Implementation compares the bags of words (frequency counts for each
word present in the text) of each input text
- Duplicated/extra text is not penalized
- Value is limited to the range [0, 1]

### Test
- Several edge cases are covered in the test function (missing text,
duplicated text, spaced out words, etc).
- Can test other cases or text inputs by calling the function with 2 CCT
strings as "output" and "source"
											
										
										
											2023-10-10 13:54:49 -07:00
+								        ),
 								        (
 								            "no",
 								            "Is it possible to have more than everything missing?",
 ,
 								        ),
 								    ],
 								)
 								def test_calculate_percent_missing_text(output_text, source_text, expected_percentage):
 								    assert (
 								        text_extraction.calculate_percent_missing_text(output_text, source_text)
 								        == expected_percentage
 								    )
-												Add calculation of table related metrics based on table_as_cells (#2898)

This pull request adds metrics that are calculated based on
table_as_cells instead of text_as_html. This change is required for
comprehensive metrics calculation, as previously every predicted colspan
or rowspan was considered to be an incorrect prediction (even if
it was a correct prediction).

This change has to be merged after
https://github.com/Unstructured-IO/unstructured/pull/2892 which
introduces table_as_cells field
											
										
										
											2024-05-07 15:57:38 +02:00
-												feat: table evaluations for fixed html table generation (#3196)

Update to the evaluation script to handle correct HTML syntax for
tables.
See https://github.com/Unstructured-IO/unstructured-inference/pull/355
for details.

This change:
- modifies transforming HTML tables to evaluation internal `cells`
format
- fixes the indexing of the output (internal format cells) when HTML
cells use spans
											
										
										
											2024-06-14 11:03:27 +02:00
+								@pytest.mark.parametrize(
 								    ("table_as_cells", "expected_extraction"),
 								    [
 								        pytest.param(
 								            [
-												Add calculation of table related metrics based on table_as_cells (#2898)

This pull request add metrics that are calculated based on
table_as_cells instead of text_as_html. This change is required for
comprehensive metrics calculation, as previously every colspan or
rowspan predicted was considered to be an incorrect predicted (even if
it was correct prediction)

This change has to be merged after
https://github.com/Unstructured-IO/unstructured/pull/2892 which
introduces table_as_cells field
											
										
										
											2024-05-07 15:57:38 +02:00
+								                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
 								                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
 								            ],
-												feat: table evaluations for fixed html table generation (#3196)

Update to the evaluation script to handle correct HTML syntax for
tables.
See https://github.com/Unstructured-IO/unstructured-inference/pull/355
for details.

This change:
- modifies transforming HTML tables to evaluation internal `cells`
format
- fixes the indexing of the output (internal format cells) when HTML
cells use spans
											
										
										
											2024-06-14 11:03:27 +02:00
+								            [
 								                {"row_index": 0, "col_index": 0, "content": "Month A."},
 								                {"row_index": 1, "col_index": 0, "content": "22"},
 								            ],
 								            id="Simple table, 1 head cell, 1 body cell, no spans",
 								        ),
 								        pytest.param(
 								            [
 								                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
 								                {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
 								                {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
 								                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
 								                {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
 								                {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
 								                {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
 								                {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
 								                {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
 								            ],
 								            [
 								                {"row_index": 0, "col_index": 0, "content": "Month A."},
 								                {"row_index": 0, "col_index": 1, "content": "Month B."},
 								                {"row_index": 0, "col_index": 2, "content": "Month C."},
 								                {"row_index": 1, "col_index": 0, "content": "11"},
 								                {"row_index": 1, "col_index": 1, "content": "12"},
 								                {"row_index": 1, "col_index": 2, "content": "13"},
 								                {"row_index": 2, "col_index": 0, "content": "21"},
 								                {"row_index": 2, "col_index": 1, "content": "22"},
 								                {"row_index": 2, "col_index": 2, "content": "23"},
 								            ],
 								            id="Simple table, 3 head cell, 5 body cell, no spans",
 								        ),
 								        # +----------+---------------------+----------+
 								        # |          |       h1col23       |  h1col4  |
 								        # | h12col1  |----------+----------+----------|
 								        # |          |  h2col2  |       h2col34       |
 								        # |----------|----------+----------+----------+
 								        # |  r3col1  |  r3col2  |                     |
 								        # |----------+----------|      r34col34       |
 								        # |       r4col12       |                     |
 								        # +----------+----------+----------+----------+
 								        pytest.param(
 								            [
 								                {
 								                    "y": 0,
 								                    "x": 0,
 								                    "w": 2,
 								                    "h": 1,
 								                    "content": "h12col1",
 								                },
 								                {
 								                    "y": 0,
 								                    "x": 1,
 								                    "w": 1,
 								                    "h": 2,
 								                    "content": "h1col23",
 								                },
 								                {
 								                    "y": 0,
 								                    "x": 3,
 								                    "w": 1,
 								                    "h": 1,
 								                    "content": "h1col4",
 								                },
 								                {
 								                    "y": 1,
 								                    "x": 1,
 								                    "w": 1,
 								                    "h": 1,
 								                    "content": "h2col2",
 								                },
 								                {
 								                    "y": 1,
 								                    "x": 2,
 								                    "w": 1,
 								                    "h": 2,
 								                    "content": "h2col34",
 								                },
 								                {
 								                    "y": 2,
 								                    "x": 0,
 								                    "w": 1,
 								                    "h": 1,
 								                    "content": "r3col1",
 								                },
 								                {
 								                    "y": 2,
 								                    "x": 1,
 								                    "w": 1,
 								                    "h": 1,
 								                    "content": "r3col2",
 								                },
 								                {
 								                    "y": 2,
 								                    "x": 2,
 								                    "w": 2,
 								                    "h": 2,
 								                    "content": "r34col34",
 								                },
 								                {
 								                    "y": 3,
 								                    "x": 0,
 								                    "w": 1,
 								                    "h": 2,
 								                    "content": "r4col12",
 								                },
 								            ],
 								            [
 								                {
 								                    "row_index": 0,
 								                    "col_index": 0,
 								                    "content": "h12col1",
 								                },
 								                {
 								                    "row_index": 0,
 								                    "col_index": 1,
 								                    "content": "h1col23",
 								                },
 								                {
 								                    "row_index": 0,
 								                    "col_index": 3,
 								                    "content": "h1col4",
 								                },
 								                {
 								                    "row_index": 1,
 								                    "col_index": 1,
 								                    "content": "h2col2",
 								                },
 								                {
 								                    "row_index": 1,
 								                    "col_index": 2,
 								                    "content": "h2col34",
 								                },
 								                {
 								                    "row_index": 2,
 								                    "col_index": 0,
 								                    "content": "r3col1",
 								                },
 								                {
 								                    "row_index": 2,
 								                    "col_index": 1,
 								                    "content": "r3col2",
 								                },
 								                {
 								                    "row_index": 2,
 								                    "col_index": 2,
 								                    "content": "r34col34",
 								                },
 								                {
 								                    "row_index": 3,
 								                    "col_index": 0,
 								                    "content": "r4col12",
 								                },
 								            ],
 								            id="various spans, with 2 row header",
 								        ),
 								    ],
 								)
 								def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
 								    example_element = {
 								        "type": "Table",
 								        "metadata": {"table_as_cells": table_as_cells},
-												Add calculation of table related metrics based on table_as_cells (#2898)

This pull request add metrics that are calculated based on
table_as_cells instead of text_as_html. This change is required for
comprehensive metrics calculation, as previously every colspan or
rowspan predicted was considered to be an incorrect predicted (even if
it was correct prediction)

This change has to be merged after
https://github.com/Unstructured-IO/unstructured/pull/2892 which
introduces table_as_cells field
											
										
										
											2024-05-07 15:57:38 +02:00
+								    }
-												feat: table evaluations for fixed html table generation (#3196)

Update to the evaluation script to handle correct HTML syntax for
tables.
See https://github.com/Unstructured-IO/unstructured-inference/pull/355
for details.

This change:
- modifies transforming HTML tables to evaluation internal `cells`
format
- fixes the indexing of the output (internal format cells) when HTML
cells use spans
											
										
										
											2024-06-14 11:03:27 +02:00
+								    assert extract_cells_from_table_as_cells(example_element) == expected_extraction
-												Add calculation of table related metrics based on table_as_cells (#2898)

This pull request adds metrics that are calculated based on
table_as_cells instead of text_as_html. This change is required for
comprehensive metrics calculation, as previously every predicted colspan
or rowspan was considered to be an incorrect prediction (even if
it was a correct prediction).

This change has to be merged after
https://github.com/Unstructured-IO/unstructured/pull/2892 which
introduces table_as_cells field
											
										
										
											2024-05-07 15:57:38 +02:00
-												feat: table evaluations for fixed html table generation (#3196)

Update to the evaluation script to handle correct HTML syntax for
tables.
See https://github.com/Unstructured-IO/unstructured-inference/pull/355
for details.

This change:
- modifies transforming HTML tables to evaluation internal `cells`
format
- fixes the indexing of the output (internal format cells) when HTML
cells use spans
											
										
										
											2024-06-14 11:03:27 +02:00
+								@pytest.mark.parametrize(
 								    ("text_as_html", "expected_extraction"),
 								    [
 								        pytest.param(
 								            """
 								<table>
 								    <thead>
 								        <tr>
 								            <th>Month A.</th>
 								        </tr>
 								    </thead>
 								    <tbody>
 								        <tr>
 								            <td>22</td>
 								        </tr>
 								    </tbody>
 								</table>"
 								            """,
 								            [
 								                {"row_index": 0, "col_index": 0, "content": "Month A."},
 								                {"row_index": 1, "col_index": 0, "content": "22"},
 								            ],
 								            id="Simple table, 1 head cell, 1 body cell, no spans",
 								        ),
 								        pytest.param(
 								            """
 								<table>
 								    <thead>
 								        <tr>
 								            <th>Month A.</th>
 								            <th>Month B.</th>
 								            <th>Month C.</th>
 								        </tr>
 								    </thead>
 								    <tbody>
 								        <tr>
 								            <td>11</td>
 								            <td>12</td>
 								            <td>13</td>
 								        </tr>
 								        <tr>
 								            <td>21</td>
 								            <td>22</td>
 								            <td>23</td>
 								        </tr>
 								    </tbody>
 								</table>"
 								""",
 								            [
 								                {"row_index": 0, "col_index": 0, "content": "Month A."},
 								                {"row_index": 0, "col_index": 1, "content": "Month B."},
 								                {"row_index": 0, "col_index": 2, "content": "Month C."},
 								                {"row_index": 1, "col_index": 0, "content": "11"},
 								                {"row_index": 1, "col_index": 1, "content": "12"},
 								                {"row_index": 1, "col_index": 2, "content": "13"},
 								                {"row_index": 2, "col_index": 0, "content": "21"},
 								                {"row_index": 2, "col_index": 1, "content": "22"},
 								                {"row_index": 2, "col_index": 2, "content": "23"},
 								            ],
 								            id="Simple table, 3 head cell, 5 body cell, no spans",
 								        ),
 								        # +----------+---------------------+----------+
 								        # |          |       h1col23       |  h1col4  |
 								        # | h12col1  |----------+----------+----------|
 								        # |          |  h2col2  |       h2col34       |
 								        # |----------|----------+----------+----------+
 								        # |  r3col1  |  r3col2  |                     |
 								        # |----------+----------|      r34col34       |
 								        # |       r4col12       |                     |
 								        # +----------+----------+----------+----------+
 								        pytest.param(
 								            """
 								<table>
 								    <thead>
 								        <tr>
 								            <th rowspan="2">h12col1</th>
 								            <th colspan="2">h1col23</th>
 								            <th>h1col4</th>
 								        </tr>
 								        <tr>
 								            <th>h2col2</th>
 								            <th colspan="2">h2col34</th>
 								        </tr>
 								    </thead>
 								    <tbody>
 								        <tr>
 								            <td>r3col1</td>
 								            <td>r3col2</td>
 								            <td colspan="2" rowspan="2">r34col34</td>
 								        </tr>
 								        <tr>
 								            <td colspan="2">r4col12</td>
 								        </tr>
 								    </tbody>
 								</table>
 								""",
 								            [
 								                {
 								                    "row_index": 0,
 								                    "col_index": 0,
 								                    "content": "h12col1",
 								                },
 								                {
 								                    "row_index": 0,
 								                    "col_index": 1,
 								                    "content": "h1col23",
 								                },
 								                {
 								                    "row_index": 0,
 								                    "col_index": 3,
 								                    "content": "h1col4",
 								                },
 								                {
 								                    "row_index": 1,
 								                    "col_index": 1,
 								                    "content": "h2col2",
 								                },
 								                {
 								                    "row_index": 1,
 								                    "col_index": 2,
 								                    "content": "h2col34",
 								                },
 								                {
 								                    "row_index": 2,
 								                    "col_index": 0,
 								                    "content": "r3col1",
 								                },
 								                {
 								                    "row_index": 2,
 								                    "col_index": 1,
 								                    "content": "r3col2",
 								                },
 								                {
 								                    "row_index": 2,
 								                    "col_index": 2,
 								                    "content": "r34col34",
 								                },
 								                {
 								                    "row_index": 3,
 								                    "col_index": 0,
 								                    "content": "r4col12",
 								                },
 								            ],
 								            id="various spans, with 2 row header",
 								        ),
 								    ],
 								)
 								def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
 								    example_element = {
 								        "type": "Table",
 								        "metadata": {
 								            "text_as_html": text_as_html,
 								        },
 								    }
-												Add calculation of table related metrics based on table_as_cells (#2898)

This pull request add metrics that are calculated based on
table_as_cells instead of text_as_html. This change is required for
comprehensive metrics calculation, as previously every colspan or
rowspan predicted was considered to be an incorrect predicted (even if
it was correct prediction)

This change has to be merged after
https://github.com/Unstructured-IO/unstructured/pull/2892 which
introduces table_as_cells field
											
										
										
											2024-05-07 15:57:38 +02:00
+								    assert extract_cells_from_text_as_html(example_element) == expected_extraction
 								def test_cells_extraction_from_prediction_when_missing_prediction():
 								    """Both cell extractors are expected to return None when predictions are empty."""
 								    empty_table = {"type": "Table", "metadata": {"text_as_html": "", "table_as_cells": []}}
 								    # Same order as the metadata keys: HTML-based extractor first, then cells-based.
 								    for extractor in (extract_cells_from_text_as_html, extract_cells_from_table_as_cells):
 								        assert extractor(empty_table) is None
-												feat: expose converters deckerd -> html and back (#3233)

This PR exposes functions in the evaluation module for easy conversion
between tables in Deckerd and HTML formats, which are useful in
evaluation experiments.
											
										
										
											2024-06-19 09:03:38 +02:00
 								def _trim_html(html: str) -> str:
 								    html_lines = [line.strip() for line in html.split("\n") if line]
 								    return "".join(html_lines)
 								@pytest.mark.parametrize(
 								    "html_to_test",
 								    [
 								        """
 								<table>
 								    <thead>
 								        <tr>
 								            <th>Month A.</th>
 								        </tr>
 								    </thead>
 								    <tbody>
 								        <tr>
 								            <td>22</td>
 								        </tr>
 								    </tbody>
 								</table>
 								""",
 								        """
 								<table>
 								    <thead>
 								        <tr>
 								            <th>Month A.</th>
 								            <th>Month B.</th>
 								            <th>Month C.</th>
 								        </tr>
 								    </thead>
 								    <tbody>
 								        <tr>
 								            <td>11</td>
 								            <td>12</td>
 								            <td>13</td>
 								        </tr>
 								        <tr>
 								            <td>21</td>
 								            <td>22</td>
 								            <td>23</td>
 								        </tr>
 								    </tbody>
 								</table>
 								""",
 								        """
 								<table>
 								    <thead>
 								        <tr>
 								            <th rowspan="2">h12col1</th>
 								            <th colspan="2">h1col23</th>
 								            <th>h1col4</th>
 								        </tr>
 								        <tr>
 								            <th>h2col2</th>
 								            <th colspan="2">h2col34</th>
 								        </tr>
 								    </thead>
 								    <tbody>
 								        <tr>
 								            <td>r3col1</td>
 								            <td>r3col2</td>
 								            <td colspan="2" rowspan="2">r34col34</td>
 								        </tr>
 								        <tr>
 								            <td colspan="2">r4col12</td>
 								        </tr>
 								    </tbody>
 								</table>
 								""",
 								    ],
 								)
 								def test_deckerd_html_converter(html_to_test):
 								    """Check that HTML -> Deckerd -> HTML reproduces the whitespace-trimmed input."""
 								    round_tripped = deckerd_table_to_html(html_table_to_deckerd(html_to_test))
 								    # The fixture is pretty-printed, so it is trimmed before comparing.
 								    assert _trim_html(html_to_test) == round_tripped