2023-10-06 21:21:14 -04:00
|
|
|
import re
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
2023-10-10 13:46:01 -05:00
|
|
|
from unstructured.metrics import text_extraction
|
2024-05-07 15:57:38 +02:00
|
|
|
from unstructured.metrics.table.table_extraction import (
|
2024-06-19 09:03:38 +02:00
|
|
|
deckerd_table_to_html,
|
2024-05-07 15:57:38 +02:00
|
|
|
extract_cells_from_table_as_cells,
|
|
|
|
extract_cells_from_text_as_html,
|
2024-06-19 09:03:38 +02:00
|
|
|
html_table_to_deckerd,
|
2024-05-07 15:57:38 +02:00
|
|
|
)
|
2023-10-06 21:21:14 -04:00
|
|
|
from unstructured.partition.auto import partition
|
|
|
|
|
|
|
|
|
|
|
|
def test_calculate_edit_distance():
    """Check the edit-distance score of several perturbed texts against one reference.

    Every case calls ``calculate_edit_distance(output, source, return_as="score")`` and
    compares the result, rounded to two decimals, with a fixed expected value.
    """
    source_cct = "I like pizza. I like bagels."

    # (perturbed output text, expected score rounded to 2 decimals)
    cases = [
        # Identical text.
        (source_cct, 1.0),
        # One word spelled out letter by letter.
        ("I like p i z z a . I like bagles.", 0.75),
        # A space inserted between every character, whitespace runs collapsed to one space.
        (re.sub(r"\s+", " ", " ".join(source_cct)), 0.39),
        # All spaces removed.
        (source_cct.replace(" ", ""), 0.64),
        # Only the first sentence.
        ("I like pizza.", 0.0),
        # Last word missing.
        ("I like pizza. I like .", 0.57),
        # Different last word.
        ("I like pizza. I like beagles.", 0.89),
        # One word duplicated.
        ("I like pizza pizza. I like bagels.", 0.79),
    ]

    for output_cct, expected_score in cases:
        score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
        assert round(score, 2) == expected_score
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", 0.78, 38),
    ],
)
def test_calculate_edit_distance_with_filename(filename, expected_score, expected_distance):
    """Partition an example document and compare its text with the raw file contents.

    The raw text of ``example-docs/<filename>`` is the reference. The partitioned elements,
    joined with newlines, form the output whose score and distance are checked.
    """
    source_path = f"example-docs/{filename}"

    # Read the reference from the parametrized file instead of a hard-coded path, so that
    # any additional parameter row is compared against its own source document.
    with open(source_path) as f:
        source_cct = f.read()

    elements = partition(filename=source_path)
    output_cct = "\n".join(str(el) for el in elements)

    score = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="score")
    distance = text_extraction.calculate_edit_distance(output_cct, source_cct, return_as="distance")

    # Range checks first, then the exact expected values.
    assert 0 <= score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance
|
2023-10-10 13:46:01 -05:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog loved the cat, but the cat loved the cow",
            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
        ),
        (
            "Hello my name is H a r p e r, what's your name?",
            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
        ),
        (
            "I have a dog and a cat, I love my dog.",
            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
        ),
        (
            "My dog's hair is red, but the dogs' houses are blue.",
            {
                "my": 1, "dog's": 1, "hair": 1, "is": 1, "red": 1, "but": 1,
                "the": 1, "dogs'": 1, "houses": 1, "are": 1, "blue": 1
            },
        ),
        (
            """Sometimes sentences have a dash - like this one!
            A hyphen connects 2 words with no gap: easy-peasy.""",
            {
                "sometimes": 1, "sentences": 1, "have": 1, "a": 2, "dash": 1, "like": 1,
                "this": 1, "one": 1, "hyphen": 1, "connects": 1, "2": 1, "words": 1,
                "with": 1, "no": 1, "gap": 1, "easy-peasy": 1
            },
        ),
    ],
)
def test_bag_of_words(text, expected):
    """``bag_of_words`` must return exactly the expected word-to-count mapping."""
    word_counts = text_extraction.bag_of_words(text)
    assert word_counts == expected
|
2023-10-10 13:54:49 -07:00
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("output_text", "source_text", "expected_percentage"),
    [
        ("extra", "", 0),
        ("", "Source text has a sentence.", 1),
        ("The original s e n t e n c e is normal.", "The original sentence is normal...", 0.2),
        (
            "We saw 23% improvement in this quarter.",
            "We saw 23% improvement in sales this quarter.",
            0.125,
        ),
        ("no", "Is it possible to have more than everything missing?", 1),
    ],
)
def test_calculate_percent_missing_text(output_text, source_text, expected_percentage):
    """``calculate_percent_missing_text(output, source)`` must equal the expected fraction."""
    missing = text_extraction.calculate_percent_missing_text(output_text, source_text)
    assert missing == expected_percentage
|
2024-05-07 15:57:38 +02:00
|
|
|
|
|
|
|
|
2024-06-14 11:03:27 +02:00
|
|
|
@pytest.mark.parametrize(
    ("table_as_cells", "expected_extraction"),
    [
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
                {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
                {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
                {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
                {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
                {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
                {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        #
        # NOTE(review): the "w"/"h" values of several spanning cells look transposed relative
        # to the diagram (e.g. h12col1 is drawn two rows tall but carries w=2, h=1). In the
        # cases here row_index/col_index simply mirror y/x -- confirm the spans are intended.
        pytest.param(
            [
                {"y": 0, "x": 0, "w": 2, "h": 1, "content": "h12col1"},
                {"y": 0, "x": 1, "w": 1, "h": 2, "content": "h1col23"},
                {"y": 0, "x": 3, "w": 1, "h": 1, "content": "h1col4"},
                {"y": 1, "x": 1, "w": 1, "h": 1, "content": "h2col2"},
                {"y": 1, "x": 2, "w": 1, "h": 2, "content": "h2col34"},
                {"y": 2, "x": 0, "w": 1, "h": 1, "content": "r3col1"},
                {"y": 2, "x": 1, "w": 1, "h": 1, "content": "r3col2"},
                {"y": 2, "x": 2, "w": 2, "h": 2, "content": "r34col34"},
                {"y": 3, "x": 0, "w": 1, "h": 2, "content": "r4col12"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
    """Cells extracted from ``metadata.table_as_cells`` must match the expected list."""
    table_metadata = {"table_as_cells": table_as_cells}
    element = {"type": "Table", "metadata": table_metadata}

    extracted = extract_cells_from_table_as_cells(element)

    assert extracted == expected_extraction
|
|
|
|
|
2024-05-07 15:57:38 +02:00
|
|
|
|
2024-06-14 11:03:27 +02:00
|
|
|
@pytest.mark.parametrize(
    ("text_as_html", "expected_extraction"),
    [
        # NOTE(review): the first two fixtures end with a stray '"' after </table>; it is
        # reproduced here unchanged -- confirm whether it is intentional.
        pytest.param(
            """
            <table>
              <thead>
                <tr>
                  <th>Month A.</th>
                </tr>
              </thead>
              <tbody>
                <tr>
                  <td>22</td>
                </tr>
              </tbody>
            </table>"
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            """
            <table>
              <thead>
                <tr>
                  <th>Month A.</th>
                  <th>Month B.</th>
                  <th>Month C.</th>
                </tr>
              </thead>
              <tbody>
                <tr>
                  <td>11</td>
                  <td>12</td>
                  <td>13</td>
                </tr>
                <tr>
                  <td>21</td>
                  <td>22</td>
                  <td>23</td>
                </tr>
              </tbody>
            </table>"
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        pytest.param(
            """
            <table>
              <thead>
                <tr>
                  <th rowspan="2">h12col1</th>
                  <th colspan="2">h1col23</th>
                  <th>h1col4</th>
                </tr>
                <tr>
                  <th>h2col2</th>
                  <th colspan="2">h2col34</th>
                </tr>
              </thead>
              <tbody>
                <tr>
                  <td>r3col1</td>
                  <td>r3col2</td>
                  <td colspan="2" rowspan="2">r34col34</td>
                </tr>
                <tr>
                  <td colspan="2">r4col12</td>
                </tr>
              </tbody>
            </table>
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
    """Cells extracted from ``metadata.text_as_html`` must match the expected list."""
    table_metadata = {"text_as_html": text_as_html}
    element = {"type": "Table", "metadata": table_metadata}

    extracted = extract_cells_from_text_as_html(element)

    assert extracted == expected_extraction
|
|
|
|
|
|
|
|
|
|
|
|
def test_cells_extraction_from_prediction_when_missing_prediction():
    """Both extractors must return ``None`` for a table element with empty predictions."""
    empty_metadata = {"text_as_html": "", "table_as_cells": []}
    element = {"type": "Table", "metadata": empty_metadata}

    # Same order as before: the HTML-based extractor first, then the cell-based one.
    for extract in (extract_cells_from_text_as_html, extract_cells_from_table_as_cells):
        assert extract(element) is None
|
2024-06-19 09:03:38 +02:00
|
|
|
|
|
|
|
|
|
|
|
def _trim_html(html: str) -> str:
    """Collapse multi-line HTML into a single string without per-line padding.

    The text is split on ``"\\n"``, empty lines are dropped, the remaining lines are
    stripped of leading and trailing whitespace, and the pieces are concatenated with
    no separator.
    """
    non_empty_lines = filter(None, html.split("\n"))
    return "".join(map(str.strip, non_empty_lines))
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.parametrize(
    "html_to_test",
    [
        """
        <table>
          <thead>
            <tr>
              <th>Month A.</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>22</td>
            </tr>
          </tbody>
        </table>
        """,
        """
        <table>
          <thead>
            <tr>
              <th>Month A.</th>
              <th>Month B.</th>
              <th>Month C.</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>11</td>
              <td>12</td>
              <td>13</td>
            </tr>
            <tr>
              <td>21</td>
              <td>22</td>
              <td>23</td>
            </tr>
          </tbody>
        </table>
        """,
        """
        <table>
          <thead>
            <tr>
              <th rowspan="2">h12col1</th>
              <th colspan="2">h1col23</th>
              <th>h1col4</th>
            </tr>
            <tr>
              <th>h2col2</th>
              <th colspan="2">h2col34</th>
            </tr>
          </thead>
          <tbody>
            <tr>
              <td>r3col1</td>
              <td>r3col2</td>
              <td colspan="2" rowspan="2">r34col34</td>
            </tr>
            <tr>
              <td colspan="2">r4col12</td>
            </tr>
          </tbody>
        </table>
        """,
    ],
)
def test_deckerd_html_converter(html_to_test):
    """Round-trip HTML through the deckerd converters and compare with the trimmed input.

    The HTML is converted with ``html_table_to_deckerd`` and back with
    ``deckerd_table_to_html``; the result must equal ``_trim_html(html_to_test)``.
    """
    round_tripped_html = deckerd_table_to_html(html_table_to_deckerd(html_to_test))
    expected_html = _trim_html(html_to_test)
    assert expected_html == round_tripped_html
|