mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-04 07:27:34 +00:00

This ticket ensures that the CCT metric is not sensitive to differences in whitespace (including newlines). Every run of whitespace in a string is replaced with a single space `" "` in both GT and PRED before the metric is computed. The additional changes in CHANGELOG are due to auto-formatting.
792 lines
24 KiB
Python
792 lines
24 KiB
Python
import re
|
|
|
|
import pytest
|
|
|
|
from unstructured.metrics import text_extraction
|
|
from unstructured.metrics.table.table_extraction import (
|
|
deckerd_table_to_html,
|
|
extract_cells_from_table_as_cells,
|
|
extract_cells_from_text_as_html,
|
|
html_table_to_deckerd,
|
|
)
|
|
from unstructured.partition.auto import partition
|
|
|
|
|
|
def test_calculate_edit_distance():
    """Check pinned scores for perturbed variants of one reference sentence.

    Each variant is compared against the reference with ``return_as="score"``;
    the score rounded to two decimals must equal the recorded value.
    """
    reference = "I like pizza. I like bagels."

    # (variant, expected rounded score), checked in this order.
    cases = [
        # identical text
        (reference, 1.0),
        # one word spelled out letter by letter (plus a misspelled last word)
        ("I like p i z z a . I like bagles.", 0.75),
        # every character separated by a single space
        (re.sub(r"\s+", " ", " ".join(reference)), 0.39),
        # all spaces removed
        (reference.replace(" ", ""), 0.64),
        # only the first sentence
        ("I like pizza.", 0.0),
        # one word missing
        ("I like pizza. I like .", 0.57),
        # one additional character
        ("I like pizza. I like beagles.", 0.89),
        # one duplicated word
        ("I like pizza pizza. I like bagels.", 0.79),
    ]

    for variant, expected_score in cases:
        score = text_extraction.calculate_edit_distance(variant, reference, return_as="score")
        assert round(score, 2) == expected_score
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "standardize_whitespaces", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", False, 0.78, 38),
        ("fake-text.txt", True, 0.92, 12),
    ],
)
def test_calculate_edit_distance_with_filename(
    filename, standardize_whitespaces, expected_score, expected_distance
):
    """Compare partitioned output of an example document with the raw text file.

    The reference text is always read from ``example-docs/fake-text.txt``; the
    partitioned elements of ``example-docs/{filename}`` are joined with newlines
    and compared against it, both as a score and as a distance.
    """
    with open("example-docs/fake-text.txt") as reference_file:
        source_cct = reference_file.read()

    partitioned = partition(filename=f"example-docs/{filename}")
    output_cct = "\n".join(str(element) for element in partitioned)

    # Same inputs, two output modes; "score" is computed before "distance".
    results = {
        mode: text_extraction.calculate_edit_distance(
            output_cct,
            source_cct,
            return_as=mode,
            standardize_whitespaces=standardize_whitespaces,
        )
        for mode in ("score", "distance")
    }
    score = results["score"]
    distance = results["distance"]

    assert 0 <= score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text1", "text2"),
    [
        (
            "The dog\rloved the cat, but\t\n the cat\tloved the\n cow",
            "The dog loved the cat, but the cat loved the cow",
        ),
        (
            "Hello my\tname\tis H a r p e r, \nwhat's your\vname?",
            "Hello my name is H a r p e r, what's your name?",
        ),
        (
            "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
            "I have a dog and a cat, I love my dog.",
        ),
        (
            "\n"
            "Name Age City Occupation\n"
            "Alice 30 New York Engineer\n"
            "Bob 25 Los Angeles Designer\n"
            "Charlie 35 Chicago Teacher\n"
            "David 40 San Francisco Developer\n",
            "\n"
            "Name\tAge\tCity\tOccupation\n"
            "Alice\t30\tNew York\tEngineer\n"
            "Bob\t25\tLos Angeles\tDesigner\n"
            "Charlie\t35\tChicago\tTeacher\n"
            "David\t40\tSan Francisco\tDeveloper\n",
        ),
        (
            "\n"
            "Name\tAge\tCity\tOccupation\n"
            "Alice\t30\tNew York\tEngineer\n"
            "Bob\t25\tLos Angeles\tDesigner\n"
            "Charlie\t35\tChicago\tTeacher\n"
            "David\t40\tSan Francisco\tDeveloper\n",
            "Name\tAge\tCity\tOccupation\n\n \n"
            "Alice\t30\tNew York\tEngineer\n"
            "Bob\t25\tLos Angeles\tDesigner\n"
            "Charlie\t35\tChicago\tTeacher\n"
            "David\t40\tSan Francisco\tDeveloper",
        ),
    ],
)
def test_calculate_edit_distance_with_various_whitespace_1(text1, text2):
    """Texts differing only in whitespace match exactly once whitespace is standardized.

    With ``standardize_whitespaces=True`` the pair must score 1.0 at distance 0;
    with ``standardize_whitespaces=False`` the same pair must score below 1.0 at
    a distance above 0.
    """

    def measure(return_as, standardize):
        return text_extraction.calculate_edit_distance(
            text1, text2, return_as=return_as, standardize_whitespaces=standardize
        )

    assert measure("score", True) == 1.0
    assert measure("distance", True) == 0
    assert measure("score", False) < 1.0
    assert measure("distance", False) > 0
|
|
|
|
|
|
def test_calculate_edit_distance_with_various_whitespace_2():
    """Standardizing whitespace must improve the match between two table layouts.

    A tab-separated table is compared with a pipe-bordered rendering of the same
    rows: the standardized score must be strictly higher, and the standardized
    distance strictly lower, than the non-standardized ones.
    """
    source_cct_tabs = (
        "\n"
        "Name\tAge\tCity\tOccupation\n"
        "Alice\t30\tNew York\tEngineer\n"
        "Bob\t25\tLos Angeles\tDesigner\n"
        "Charlie\t35\tChicago\tTeacher\n"
        "David\t40\tSan Francisco\tDeveloper\n"
    )
    source_cct_with_borders = (
        "\n"
        "\n"
        "| Name | Age | City | Occupation |\n"
        "|---------|-----|--------------|----------------|\n"
        "| Alice | 30 | New York | Engineer |\n"
        "| Bob | 25 | Los Angeles | Designer |\n"
        "| Charlie | 35 | Chicago | Teacher |\n"
        "| David | 40 | San Francisco| Developer |\n"
        "\n"
    )

    def measure(return_as, standardize):
        return text_extraction.calculate_edit_distance(
            source_cct_tabs,
            source_cct_with_borders,
            return_as=return_as,
            standardize_whitespaces=standardize,
        )

    score_standardized = measure("score", True)
    score_raw = measure("score", False)
    assert score_standardized > score_raw

    distance_standardized = measure("distance", True)
    distance_raw = measure("distance", False)
    assert distance_standardized < distance_raw
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog loved the cat, but the cat loved the cow",
            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
        ),
        (
            "Hello my name is H a r p e r, what's your name?",
            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
        ),
        (
            "I have a dog and a cat, I love my dog.",
            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
        ),
        (
            "My dog's hair is red, but the dogs' houses are blue.",
            {
                "my": 1, "dog's": 1, "hair": 1, "is": 1, "red": 1, "but": 1,
                "the": 1, "dogs'": 1, "houses": 1, "are": 1, "blue": 1,
            },
        ),
        (
            "Sometimes sentences have a dash - like this one!\n"
            "A hyphen connects 2 words with no gap: easy-peasy.",
            {
                "sometimes": 1, "sentences": 1, "have": 1, "a": 2, "dash": 1,
                "like": 1, "this": 1, "one": 1, "hyphen": 1, "connects": 1,
                "2": 1, "words": 1, "with": 1, "no": 1, "gap": 1, "easy-peasy": 1,
            },
        ),
    ],
)
def test_bag_of_words(text, expected):
    """``bag_of_words`` must return exactly the expected word-to-count mapping."""
    word_counts = text_extraction.bag_of_words(text)
    assert word_counts == expected
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog\rloved the cat, but\t\n the cat\tloved the\n cow\n\n",
            "The dog loved the cat, but the cat loved the cow",
        ),
        (
            "\n\nHello my\tname\tis H a r p e r, \nwhat's your\vname?",
            "Hello my name is H a r p e r, what's your name?",
        ),
        (
            "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
            "I have a dog and a cat, I love my dog.",
        ),
        (
            "L is for the way you look at me\n"
            "O is for the only one I see\n"
            "V is very, very extraordinary\n"
            "E is even more than anyone that you adore can",
            "L is for the way you look at me "
            "O is for the only one I see "
            "V is very, very extraordinary "
            "E is even more than anyone that you adore can",
        ),
        (
            "\n"
            "| Name | Age | City | Occupation |\n"
            "|---------|-----|--------------|----------------|\n"
            "| Alice | 30 | New York | Engineer |\n"
            "| Bob | 25 | Los Angeles | Designer |\n"
            "| Charlie | 35 | Chicago | Teacher |\n"
            "| David | 40 | San Francisco| Developer |\n",
            "| Name | Age | City | Occupation | "
            "|---------|-----|--------------|----------------| "
            "| Alice | 30 | New York | Engineer | "
            "| Bob | 25 | Los Angeles | Designer | "
            "| Charlie | 35 | Chicago | Teacher | "
            "| David | 40 | San Francisco| Developer |",
        ),
    ],
)
def test_prepare_string(text, expected):
    """``prepare_str`` standardizes whitespace only when asked to.

    With ``standardize_whitespaces=True`` the result must equal ``expected``;
    called with default arguments it must return the input unchanged.
    """
    standardized = text_extraction.prepare_str(text, standardize_whitespaces=True)
    assert standardized == expected

    untouched = text_extraction.prepare_str(text)
    assert untouched == text
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("output_text", "source_text", "expected_percentage"),
    [
        ("extra", "", 0),
        ("", "Source text has a sentence.", 1),
        (
            "The original s e n t e n c e is normal.",
            "The original sentence is normal...",
            0.2,
        ),
        (
            "We saw 23% improvement in this quarter.",
            "We saw 23% improvement in sales this quarter.",
            0.125,
        ),
        ("no", "Is it possible to have more than everything missing?", 1),
    ],
)
def test_calculate_percent_missing_text(output_text, source_text, expected_percentage):
    """``calculate_percent_missing_text`` must return the exact expected fraction."""
    missing = text_extraction.calculate_percent_missing_text(output_text, source_text)
    assert missing == expected_percentage
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("table_as_cells", "expected_extraction"),
    [
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
                {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
                {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
                {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
                {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
                {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
                {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        pytest.param(
            [
                {"y": 0, "x": 0, "w": 2, "h": 1, "content": "h12col1"},
                {"y": 0, "x": 1, "w": 1, "h": 2, "content": "h1col23"},
                {"y": 0, "x": 3, "w": 1, "h": 1, "content": "h1col4"},
                {"y": 1, "x": 1, "w": 1, "h": 1, "content": "h2col2"},
                {"y": 1, "x": 2, "w": 1, "h": 2, "content": "h2col34"},
                {"y": 2, "x": 0, "w": 1, "h": 1, "content": "r3col1"},
                {"y": 2, "x": 1, "w": 1, "h": 1, "content": "r3col2"},
                {"y": 2, "x": 2, "w": 2, "h": 2, "content": "r34col34"},
                {"y": 3, "x": 0, "w": 1, "h": 2, "content": "r4col12"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
    """Cells stored under ``metadata.table_as_cells`` are extracted as row/col/content dicts."""
    metadata = {"table_as_cells": table_as_cells}
    example_element = {"type": "Table", "metadata": metadata}

    extracted = extract_cells_from_table_as_cells(example_element)
    assert extracted == expected_extraction
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text_as_html", "expected_extraction"),
    [
        pytest.param(
            """
<table>
<thead>
<tr>
<th>Month A.</th>
</tr>
</thead>
<tbody>
<tr>
<td>22</td>
</tr>
</tbody>
</table>
""",
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            """
<table>
<thead>
<tr>
<th>Month A.</th>
<th>Month B.</th>
<th>Month C.</th>
</tr>
</thead>
<tbody>
<tr>
<td>11</td>
<td>12</td>
<td>13</td>
</tr>
<tr>
<td>21</td>
<td>22</td>
<td>23</td>
</tr>
</tbody>
</table>
""",
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        pytest.param(
            """
<table>
<thead>
<tr>
<th rowspan="2">h12col1</th>
<th colspan="2">h1col23</th>
<th>h1col4</th>
</tr>
<tr>
<th>h2col2</th>
<th colspan="2">h2col34</th>
</tr>
</thead>
<tbody>
<tr>
<td>r3col1</td>
<td>r3col2</td>
<td colspan="2" rowspan="2">r34col34</td>
</tr>
<tr>
<td colspan="2">r4col12</td>
</tr>
</tbody>
</table>
""",
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
    """Cells stored as HTML under ``metadata.text_as_html`` are extracted as row/col/content dicts.

    All fixtures are well-formed ``<table>`` documents, so the test exercises the
    cell extraction itself rather than any tolerance for trailing junk text.
    """
    example_element = {
        "type": "Table",
        "metadata": {"text_as_html": text_as_html},
    }

    extracted = extract_cells_from_text_as_html(example_element)
    assert extracted == expected_extraction
|
|
|
|
|
|
def test_cells_extraction_from_prediction_when_missing_prediction():
    """Both extractors must return ``None`` when the element carries empty table data."""
    empty_metadata = {"text_as_html": "", "table_as_cells": []}
    example_element = {"type": "Table", "metadata": empty_metadata}

    # Same element for both extractors, HTML-based one first.
    for extractor in (extract_cells_from_text_as_html, extract_cells_from_table_as_cells):
        assert extractor(example_element) is None
|
|
|
|
|
|
def _trim_html(html: str) -> str:
|
|
html_lines = [line.strip() for line in html.split("\n") if line]
|
|
return "".join(html_lines)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "html_to_test",
    [
        """
<table>
<thead>
<tr>
<th>Month A.</th>
</tr>
</thead>
<tbody>
<tr>
<td>22</td>
</tr>
</tbody>
</table>
""",
        """
<table>
<thead>
<tr>
<th>Month A.</th>
<th>Month B.</th>
<th>Month C.</th>
</tr>
</thead>
<tbody>
<tr>
<td>11</td>
<td>12</td>
<td>13</td>
</tr>
<tr>
<td>21</td>
<td>22</td>
<td>23</td>
</tr>
</tbody>
</table>
""",
        """
<table>
<thead>
<tr>
<th rowspan="2">h12col1</th>
<th colspan="2">h1col23</th>
<th>h1col4</th>
</tr>
<tr>
<th>h2col2</th>
<th colspan="2">h2col34</th>
</tr>
</thead>
<tbody>
<tr>
<td>r3col1</td>
<td>r3col2</td>
<td colspan="2" rowspan="2">r34col34</td>
</tr>
<tr>
<td colspan="2">r4col12</td>
</tr>
</tbody>
</table>
""",
    ],
)
def test_deckerd_html_converter(html_to_test):
    """HTML converted to the deckerd format and back must equal the trimmed input.

    The expected value is the input passed through ``_trim_html``.
    """
    round_tripped = deckerd_table_to_html(html_table_to_deckerd(html_to_test))
    assert _trim_html(html_to_test) == round_tripped
|