# pyright: reportPrivateUsage=false

"""Unit-test suite for the `unstructured.common.html_table` module."""

# NOTE(review): This module previously had all HTML markup stripped out of its string
# literals (only the rendered cell text survived) and had been collapsed onto a handful of
# physical lines, so it no longer parsed as Python. The literal boundaries, row counts,
# cell text and escape sequences below follow what was still visible. Tag names, cell
# boundaries, attributes, wrapper elements, exact whitespace runs and the serialized form
# of empty cells (`<td/>`) were reconstructed from each test's name -- confirm them against
# the version-controlled original before trusting these assertions.

from __future__ import annotations

import pytest
from lxml.html import fragment_fromstring

from unstructured.common.html_table import (
    HtmlCell,
    HtmlRow,
    HtmlTable,
    htmlify_matrix_of_cell_texts,
)


class Describe_htmlify_matrix_of_cell_texts:
    """Unit-test suite for `unstructured.common.html_table.htmlify_matrix_of_cell_texts()`."""

    def test_htmlify_matrix_handles_empty_cells(self):
        # NOTE(review): empty cells are assumed to serialize as `<td/>` -- TODO confirm.
        assert htmlify_matrix_of_cell_texts([["cell1", "", "cell3"], ["", "cell5", ""]]) == (
            "<table>"
            "<tr><td>cell1</td><td/><td>cell3</td></tr>"
            "<tr><td/><td>cell5</td><td/></tr>"
            "</table>"
        )

    def test_htmlify_matrix_handles_special_characters(self):
        # NOTE(review): entity escaping and `<br/>` for the line-feed are presumed from the
        # test name and the rendered text that survived -- TODO confirm.
        assert htmlify_matrix_of_cell_texts([['<>&"', "newline\n"]]) == (
            "<table><tr><td>&lt;&gt;&amp;&quot;</td><td>newline<br/></td></tr></table>"
        )

    def test_htmlify_matrix_handles_multiple_rows_and_cells(self):
        assert htmlify_matrix_of_cell_texts([["cell1", "cell2"], ["cell3", "cell4"]]) == (
            "<table>"
            "<tr><td>cell1</td><td>cell2</td></tr>"
            "<tr><td>cell3</td><td>cell4</td></tr>"
            "</table>"
        )

    def test_htmlify_matrix_handles_empty_matrix(self):
        assert htmlify_matrix_of_cell_texts([]) == ""


class DescribeHtmlTable:
    """Unit-test suite for `unstructured.common.html_table.HtmlTable`."""

    def it_can_construct_from_html_text(self):
        html_table = HtmlTable.from_html_text("<table><tr><td>foobar</td></tr></table>")

        assert isinstance(html_table, HtmlTable)
        assert html_table._table.tag == "table"

    # NOTE(review): the wrapper elements of the three cases were not recoverable; these
    # are chosen to match the test name.
    @pytest.mark.parametrize(
        "html_text",
        [
            "<table><tr><td>foobar</td></tr></table>",
            "<body><table><tr><td>foobar</td></tr></table></body>",
            "<html><body><table><tr><td>foobar</td></tr></table></body></html>",
        ],
    )
    def it_can_find_a_table_wrapped_in_an_html_or_body_element(self, html_text: str):
        html_table = HtmlTable.from_html_text(html_text)

        assert isinstance(html_table, HtmlTable)
        assert html_table._table.tag == "table"

    def but_it_raises_when_no_table_element_is_present_in_the_html(self):
        # NOTE(review): the original input rendered as empty text, so it contained markup
        # only; the exact elements are a guess.
        with pytest.raises(ValueError, match="`html_text` contains no `<table>` element"):
            HtmlTable.from_html_text("<html><body></body></html>")

    def it_removes_any_attributes_present_on_the_table_element(self):
        # NOTE(review): attribute names/values are placeholders. The damaged source showed
        # two rendered lines for this fixture, so its real markup may differ from this
        # single-row table -- TODO confirm.
        html_table = HtmlTable.from_html_text(
            '<table border="1" class="dataframe"><tr><td>foobar</td></tr></table>',
        )

        assert html_table.html == "<table><tr><td>foobar</td></tr></table>"

    @pytest.mark.parametrize(
        "html_text",
        [
            "<table><thead><tr><td>foobar</td></tr></thead></table>",
            "<table><tbody><tr><td>foobar</td></tr></tbody></table>",
            "<table><tfoot><tr><td>foobar</td></tr></tfoot></table>",
        ],
    )
    def it_removes_any_thead_tbody_or_tfoot_elements_present_within_the_table_element(
        self, html_text: str
    ):
        html_table = HtmlTable.from_html_text(html_text)

        assert html_table.html == "<table><tr><td>foobar</td></tr></table>"

    def it_changes_any_th_elements_to_td_elements_for_cell_element_uniformity(self):
        # NOTE(review): which cells were `<th>` and whether the second row had an empty
        # leading cell could not be recovered.
        html_table = HtmlTable.from_html_text(
            "<table>"
            "  <tr><th>a</th><th>b</th></tr>"
            "  <tr><th>c</th></tr>"
            "</table>"
        )

        assert html_table.html == (
            "<table><tr><td>a</td><td>b</td></tr><tr><td>c</td></tr></table>"
        )

    def it_removes_any_extra_whitespace_between_elements_and_normalizes_whitespace_in_text(self):
        # NOTE(review): the `\n` / `\t` escapes and their positions follow the surviving
        # text; the lengths of the space runs are a guess.
        html_table = HtmlTable.from_html_text(
            "\n  <table>\n    <tr>\n      <td>\tabc   def\nghi </td>\n    </tr>\n  </table>\n  ",
        )

        assert html_table.html == "<table><tr><td>abc def ghi</td></tr></table>"

    def it_can_serialize_the_table_element_to_str_html_text(self):
        table = fragment_fromstring("<table><tr><td>foobar</td></tr></table>")
        html_table = HtmlTable(table)

        assert html_table.html == "<table><tr><td>foobar</td></tr></table>"

    def it_can_iterate_the_rows_in_the_table(self):
        # NOTE(review): each row survived only as concatenated text (e.g. "abcdefghi"); the
        # three-cell split is a guess.
        html_table = HtmlTable.from_html_text(
            "<table>"
            "  <tr><td>abc</td><td>def</td><td>ghi</td></tr>"
            "  <tr><td>jkl</td><td>mno</td><td>pqr</td></tr>"
            "  <tr><td>stu</td><td>vwx</td><td>yz</td></tr>"
            "</table>"
        )

        row_iter = html_table.iter_rows()

        row = next(row_iter)
        assert isinstance(row, HtmlRow)
        assert row.html == "<tr><td>abc</td><td>def</td><td>ghi</td></tr>"
        # --
        row = next(row_iter)
        assert isinstance(row, HtmlRow)
        assert row.html == "<tr><td>jkl</td><td>mno</td><td>pqr</td></tr>"
        # --
        row = next(row_iter)
        assert isinstance(row, HtmlRow)
        assert row.html == "<tr><td>stu</td><td>vwx</td><td>yz</td></tr>"
        # --
        with pytest.raises(StopIteration):
            next(row_iter)

    def it_provides_access_to_the_clear_concatenated_text_of_the_table(self):
        # NOTE(review): cell boundaries and space-run lengths are a guess; the per-row text
        # and escape sequences follow what survived.
        html_table = HtmlTable.from_html_text(
            "<table>"
            "  <tr><td> a\n b  c </td><td>def</td></tr>"
            "  <tr><td>gh \ti</td><td>\n jk  l </td></tr>"
            "  <tr><td> m n op\n</td></tr>"
            "</table>"
        )

        assert html_table.text == "a b c def gh i jk l m n op"


class DescribeHtmlRow:
    """Unit-test suite for `unstructured.common.html_table.HtmlRow`."""

    def it_can_serialize_the_row_to_html(self):
        assert HtmlRow(fragment_fromstring("<tr><td>a</td><td>b</td></tr>")).html == (
            "<tr><td>a</td><td>b</td></tr>"
        )

    def it_can_iterate_the_cells_in_the_row(self):
        # -- the third cell is empty; it rendered as "" in the surviving text --
        # NOTE(review): `<td/>` as the serialized form of an empty cell is assumed.
        row = HtmlRow(fragment_fromstring("<tr><td>a</td><td>b</td><td/></tr>"))

        cell_iter = row.iter_cells()

        cell = next(cell_iter)
        assert isinstance(cell, HtmlCell)
        assert cell.html == "<td>a</td>"
        # --
        cell = next(cell_iter)
        assert isinstance(cell, HtmlCell)
        assert cell.html == "<td>b</td>"
        # --
        cell = next(cell_iter)
        assert isinstance(cell, HtmlCell)
        assert cell.html == "<td/>"
        # --
        with pytest.raises(StopIteration):
            next(cell_iter)

    def it_can_iterate_the_texts_of_the_cells_in_the_row(self):
        # NOTE(review): the original row may also have carried a trailing empty cell; a
        # two-cell row is used so the test does not depend on how empty cells are handled.
        row = HtmlRow(fragment_fromstring("<tr><td>a</td><td>b</td></tr>"))

        text_iter = row.iter_cell_texts()

        assert next(text_iter) == "a"
        assert next(text_iter) == "b"
        with pytest.raises(StopIteration):
            next(text_iter)


class DescribeHtmlCell:
    """Unit-test suite for `unstructured.common.html_table.HtmlCell`."""

    def it_can_serialize_the_cell_to_html(self):
        assert HtmlCell(fragment_fromstring("<td>a b c</td>")).html == "<td>a b c</td>"

    @pytest.mark.parametrize(
        ("cell_html", "expected_value"),
        [("<td> Lorem ipsum </td>", "Lorem ipsum"), ("<td/>", "")],
    )
    def it_knows_the_text_in_the_cell(self, cell_html: str, expected_value: str):
        assert HtmlCell(fragment_fromstring(cell_html)).text == expected_value