# pyright: reportPrivateUsage=false
"""Unit-test suite for the `unstructured.common.html_table` module."""
from __future__ import annotations
import pytest
from lxml.html import fragment_fromstring
from unstructured.common.html_table import (
    HtmlCell,
    HtmlRow,
    HtmlTable,
    htmlify_matrix_of_cell_texts,
)
class Describe_htmlify_matrix_of_cell_texts:
    """Unit-test suite for `unstructured.common.html_table.htmlify_matrix_of_cell_texts()`."""
    # NOTE(review): the expected-value literals in this suite look like they have lost their HTML
    # markup (some are empty, some hold pipe-delimited text and break across physical lines) --
    # presumably an extraction artifact. Restore them from the upstream file before trusting
    # these assertions; as written the literals are not valid Python.
    def test_htmlify_matrix_handles_empty_cells(self):
        # -- two rows of three cell-texts each, several of which are the empty string --
        assert htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
            "
"
        )
    def test_htmlify_matrix_handles_special_characters(self):
        # -- one row: the first cell-text is `<>&"`, the second ends with a line-feed --
        # -- presumably the expected value showed how those are escaped/encoded; TODO confirm --
        assert htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
            ""
        )
    def test_htmlify_matrix_handles_multiple_rows_and_cells(self):
        # -- 2 x 2 matrix; the expected value is built from adjacent string literals --
        assert htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
            ""
            "| cell1 | cell2 | 
"
            "| cell3 | cell4 | 
"
            "
"
        )
    def test_htmlify_matrix_handles_empty_matrix(self):
        # -- an empty matrix (no rows) is expected to produce the empty string --
        assert htmlify_matrix_of_cell_texts([]) == ""
class DescribeHtmlTable:
    """Unit-test suite for `unstructured.common.html_table.HtmlTable`."""
    # NOTE(review): most HTML string literals in this suite appear to have had their markup
    # stripped (empty strings, pipe-delimited remnants, literals broken across physical lines) --
    # presumably an extraction artifact. The inputs and expected values must be restored from the
    # upstream file; the comments below describe only what the visible calls and asserts do.
    def it_can_construct_from_html_text(self):
        # -- `.from_html_text()` is used as an alternate constructor taking a `str` --
        html_table = HtmlTable.from_html_text("")
        assert isinstance(html_table, HtmlTable)
        # -- reaches into the private `_table` attribute to check the wrapped element's tag --
        assert html_table._table.tag == "table"
    # -- three input variants; going by the test name these presumably were a bare table, a
    # -- body-wrapped table, and an html-wrapped table (TODO confirm, literals are empty here) --
    @pytest.mark.parametrize(
        "html_text",
        [
            "",
            "",
            "",
        ],
    )
    def it_can_find_a_table_wrapped_in_an_html_or_body_element(self, html_text: str):
        html_table = HtmlTable.from_html_text(html_text)
        assert isinstance(html_table, HtmlTable)
        assert html_table._table.tag == "table"
    def but_it_raises_when_no_table_element_is_present_in_the_html(self):
        # -- `match` is applied as a regex search against the message of the raised ValueError --
        # -- NOTE(review): the empty back-quoted span in the pattern presumably named an element
        # -- tag before the markup was lost; verify against the message raised by the module --
        with pytest.raises(ValueError, match="`html_text` contains no `` element"):
            HtmlTable.from_html_text("| foobar | 
")
    def it_removes_any_attributes_present_on_the_table_element(self):
        # -- the single-quoted input literal presumably carried double-quoted attributes --
        html_table = HtmlTable.from_html_text(
            '',
        )
        assert html_table.html == ""
    # -- one case per row-grouping element named in the test name (presumably; literals lost) --
    @pytest.mark.parametrize(
        "html_text",
        [
            "",
            "",
            "",
        ],
    )
    def it_removes_any_thead_tbody_or_tfoot_elements_present_within_the_table_element(
        self, html_text: str
    ):
        # -- every parametrized input is compared against the same expected `.html` value --
        html_table = HtmlTable.from_html_text(html_text)
        assert html_table.html == ""
    def it_changes_any_th_elements_to_td_elements_for_cell_element_uniformity(self):
        html_table = HtmlTable.from_html_text(
            ""
        )
        assert html_table.html == (
            ""
        )
    def it_removes_any_extra_whitespace_between_elements_and_normalizes_whitespace_in_text(self):
        # -- input mixes line-feeds, tabs and runs of spaces both around and inside the text --
        html_table = HtmlTable.from_html_text(
            "\n  \n  \n    | \tabc   def\nghi\n | 
\n
\n  ",
        )
        assert html_table.html == ""
    def it_can_serialize_the_table_element_to_str_html_text(self):
        # -- constructs directly from an lxml element rather than via `.from_html_text()` --
        table = fragment_fromstring("")
        html_table = HtmlTable(table)
        assert html_table.html == ""
    def it_can_iterate_the_rows_in_the_table(self):
        html_table = HtmlTable.from_html_text(
            ""
            "  | abc | def | ghi | 
"
            "  | jkl | mno | pqr | 
"
            "  | stu | vwx | yz | 
"
            "
"
        )
        # -- `.iter_rows()` is consumed with `next()`, so it must return an iterator --
        row_iter = html_table.iter_rows()
        # -- first row --
        row = next(row_iter)
        assert isinstance(row, HtmlRow)
        assert row.html == "| abc | def | ghi | 
"
        # -- second row --
        row = next(row_iter)
        assert isinstance(row, HtmlRow)
        assert row.html == "| jkl | mno | pqr | 
"
        # -- third row --
        row = next(row_iter)
        assert isinstance(row, HtmlRow)
        assert row.html == "| stu | vwx | yz | 
"
        # -- iterator is exhausted after the third row --
        with pytest.raises(StopIteration):
            next(row_iter)
    def it_provides_access_to_the_clear_concatenated_text_of_the_table(self):
        # -- cell texts contain line-feeds, tabs and repeated spaces; some cells look empty --
        html_table = HtmlTable.from_html_text(
            ""
            "  | a\n b  c |  | def | 
|---|
"
            "  | gh \ti |  | \n jk l | 
"
            "  |  | m n op\n |  | 
"
            "
"
        )
        # -- expected `.text` is the cell texts joined by single spaces, whitespace collapsed --
        assert html_table.text == "a b c def gh i jk l m n op"
class DescribeHtmlRow:
    """Unit-test suite for `unstructured.common.html_table.HtmlRow`."""
    # NOTE(review): the row/cell HTML literals below appear to have lost their markup (only
    # pipe-delimited text remains and some literals break across physical lines) -- presumably an
    # extraction artifact; restore from the upstream file before relying on these assertions.
    def it_can_serialize_the_row_to_html(self):
        # -- an `HtmlRow` wraps an lxml element parsed with `fragment_fromstring()` --
        assert HtmlRow(fragment_fromstring("| a | b |  | 
")).html == (
            "| a | b |  | 
"
        )
    def it_can_iterate_the_cells_in_the_row(self):
        row = HtmlRow(fragment_fromstring("| a | b |  | 
"))
        # -- `.iter_cells()` is consumed with `next()`, so it must return an iterator --
        cell_iter = row.iter_cells()
        # -- first cell --
        cell = next(cell_iter)
        assert isinstance(cell, HtmlCell)
        assert cell.html == "a"
        # -- second cell --
        cell = next(cell_iter)
        assert isinstance(cell, HtmlCell)
        assert cell.html == " | b"
        # -- third cell --
        cell = next(cell_iter)
        assert isinstance(cell, HtmlCell)
        assert cell.html == " | "
        # -- iterator is exhausted after the third cell --
        with pytest.raises(StopIteration):
            next(cell_iter)
    def it_can_iterate_the_texts_of_the_cells_in_the_row(self):
        row = HtmlRow(fragment_fromstring(" | | a | b |  | 
"))
        text_iter = row.iter_cell_texts()
        # -- only two texts are expected before exhaustion; presumably cells without text are
        # -- skipped -- TODO confirm against `HtmlRow.iter_cell_texts()` --
        assert next(text_iter) == "a"
        assert next(text_iter) == "b"
        with pytest.raises(StopIteration):
            next(text_iter)
class DescribeHtmlCell:
    """Unit-test suite for `unstructured.common.html_table.HtmlCell`."""
    # NOTE(review): the cell HTML literals below look like they have lost their element markup,
    # and the final line of this class ends with a stray ` |` -- presumably extraction artifacts;
    # restore from the upstream file before relying on these assertions.
    def it_can_serialize_the_cell_to_html(self):
        # -- an `HtmlCell` wraps an lxml element parsed with `fragment_fromstring()` --
        assert HtmlCell(fragment_fromstring("a b c")).html == " | a b c"
    # -- each case pairs a cell's HTML with the `.text` value expected for it; the second case
    # -- expects the empty string --
    @pytest.mark.parametrize(
        ("cell_html", "expected_value"),
        [(" | Lorem ipsum", "Lorem ipsum"), (" | ", "")],
    )
    def it_knows_the_text_in_the_cell(self, cell_html: str, expected_value: str):
        assert HtmlCell(fragment_fromstring(cell_html)).text == expected_value |