2023-10-06 21:21:14 -04:00
import re
import pytest
2023-10-10 13:46:01 -05:00
from unstructured . metrics import text_extraction
2024-05-07 15:57:38 +02:00
from unstructured . metrics . table . table_extraction import (
2024-06-19 09:03:38 +02:00
deckerd_table_to_html ,
2024-05-07 15:57:38 +02:00
extract_cells_from_table_as_cells ,
extract_cells_from_text_as_html ,
2024-06-19 09:03:38 +02:00
html_table_to_deckerd ,
2024-05-07 15:57:38 +02:00
)
2023-10-06 21:21:14 -04:00
from unstructured . partition . auto import partition
def test_calculate_edit_distance():
    """Score several corrupted variants of one sentence pair against the reference text.

    Every case pairs an output string with the score, rounded to two decimals, that
    ``calculate_edit_distance(..., return_as="score")`` must produce when compared
    with ``source_cct``.
    """
    source_cct = "I like pizza. I like bagels."
    cases = [
        # identical text
        (source_cct, 1.0),
        # one word split into single characters
        ("I like p i z z a . I like bagles.", 0.75),
        # a space between every character, whitespace runs collapsed to one space
        (re.sub(r"\s+", " ", " ".join(source_cct)), 0.39),
        # every space removed
        (source_cct.replace(" ", ""), 0.64),
        # only the first sentence
        ("I like pizza.", 0.0),
        # one word missing
        ("I like pizza. I like .", 0.57),
        # one additional character
        ("I like pizza. I like beagles.", 0.89),
        # one duplicated word
        ("I like pizza pizza. I like bagels.", 0.79),
    ]

    for output_cct, expected_score in cases:
        score = text_extraction.calculate_edit_distance(
            output_cct,
            source_cct,
            return_as="score",
        )
        assert round(score, 2) == expected_score
@pytest.mark.parametrize(
    ("filename", "standardize_whitespaces", "expected_score", "expected_distance"),
    [
        ("fake-text.txt", False, 0.78, 38),
        ("fake-text.txt", True, 0.92, 12),
    ],
)
def test_calculate_edit_distance_with_filename(
    filename, standardize_whitespaces, expected_score, expected_distance
):
    """Partition an example document and compare its text with the reference text file.

    The partitioned elements are joined with newlines and compared against the raw
    contents of ``example-docs/fake-text.txt``, both as a score and as a distance.
    """
    with open("example-docs/fake-text.txt") as f:
        source_cct = f.read()

    partitioned = partition(filename=f"example-docs/{filename}")
    output_cct = "\n".join(str(element) for element in partitioned)

    results = {
        mode: text_extraction.calculate_edit_distance(
            output_cct,
            source_cct,
            return_as=mode,
            standardize_whitespaces=standardize_whitespaces,
        )
        for mode in ("score", "distance")
    }
    score = results["score"]
    distance = results["distance"]

    # Sanity bounds first, then the pinned values.
    assert 0 <= score <= 1.0
    assert distance >= 0
    assert round(score, 2) == expected_score
    assert distance == expected_distance
2023-10-10 13:46:01 -05:00
2024-10-24 15:02:34 +02:00
@pytest.mark.parametrize(
    ("text1", "text2"),
    [
        (
            "The dog\rloved the cat, but\t\n the cat\tloved the\n cow",
            "The dog loved the cat, but the cat loved the cow",
        ),
        (
            "Hello my\tname\tis H a r p e r,\nwhat's your\vname?",
            "Hello my name is H a r p e r, what's your name?",
        ),
        (
            "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
            "I have a dog and a cat, I love my dog.",
        ),
        (
            """
            Name Age City Occupation
            Alice 30 New York Engineer
            Bob 25 Los Angeles Designer
            Charlie 35 Chicago Teacher
            David 40 San Francisco Developer
            """,
            """
            Name\tAge\tCity\tOccupation
            Alice\t30\tNew York\tEngineer
            Bob\t25\tLos Angeles\tDesigner
            Charlie\t35\tChicago\tTeacher
            David\t40\tSan Francisco\tDeveloper
            """,
        ),
        (
            """
            Name\tAge\tCity\tOccupation
            Alice\t30\tNew York\tEngineer
            Bob\t25\tLos Angeles\tDesigner
            Charlie\t35\tChicago\tTeacher
            David\t40\tSan Francisco\tDeveloper
            """,
            "Name\tAge\tCity\tOccupation\n\n\nAlice\t30\tNew York\tEngineer\nBob\t25\tLos Angeles\tDesigner\nCharlie\t35\tChicago\tTeacher\nDavid\t40\tSan Francisco\tDeveloper",  # noqa: E501
        ),
    ],
)
def test_calculate_edit_distance_with_various_whitespace_1(text1, text2):
    """Texts that differ only in whitespace match exactly once whitespace is standardized.

    With ``standardize_whitespaces=True`` the pair must score 1.0 at distance 0; with
    ``standardize_whitespaces=False`` the same pair must score below 1.0 at a positive
    distance.
    """

    def compare(return_as, standardize):
        return text_extraction.calculate_edit_distance(
            text1, text2, return_as=return_as, standardize_whitespaces=standardize
        )

    assert compare("score", True) == 1.0
    assert compare("distance", True) == 0
    assert compare("score", False) < 1.0
    assert compare("distance", False) > 0
def test_calculate_edit_distance_with_various_whitespace_2():
    """Standardizing whitespace brings a tab-separated and a bordered table closer together.

    The two renderings are never identical (one has border characters), so only the
    relative ordering is asserted: a higher score and a smaller distance when
    ``standardize_whitespaces=True``.
    """
    source_cct_tabs = """
        Name\tAge\tCity\tOccupation
        Alice\t30\tNew York\tEngineer
        Bob\t25\tLos Angeles\tDesigner
        Charlie\t35\tChicago\tTeacher
        David\t40\tSan Francisco\tDeveloper
    """
    source_cct_with_borders = """
        | Name    | Age | City         | Occupation     |
        |---------|-----|--------------|----------------|
        | Alice   | 30  | New York     | Engineer       |
        | Bob     | 25  | Los Angeles  | Designer       |
        | Charlie | 35  | Chicago      | Teacher        |
        | David   | 40  | San Francisco| Developer      |
    """

    def compare(return_as, standardize):
        return text_extraction.calculate_edit_distance(
            source_cct_tabs,
            source_cct_with_borders,
            return_as=return_as,
            standardize_whitespaces=standardize,
        )

    assert compare("score", True) > compare("score", False)
    assert compare("distance", True) < compare("distance", False)
2023-10-10 13:46:01 -05:00
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog loved the cat, but the cat loved the cow",
            {"the": 4, "cat": 2, "loved": 2, "dog": 1, "but": 1, "cow": 1},
        ),
        (
            "Hello my name is H a r p e r, what's your name?",
            {"hello": 1, "my": 1, "name": 2, "is": 1, "what's": 1, "your": 1},
        ),
        (
            "I have a dog and a cat, I love my dog.",
            {"i": 2, "have": 1, "a": 2, "dog": 2, "and": 1, "cat": 1, "love": 1, "my": 1},
        ),
        (
            "My dog's hair is red, but the dogs' houses are blue.",
            {
                "my": 1,
                "dog's": 1,
                "hair": 1,
                "is": 1,
                "red": 1,
                "but": 1,
                "the": 1,
                "dogs'": 1,
                "houses": 1,
                "are": 1,
                "blue": 1,
            },
        ),
        (
            """Sometimes sentences have a dash - like this one!
            A hyphen connects 2 words with no gap: easy-peasy.""",
            {
                "sometimes": 1,
                "sentences": 1,
                "have": 1,
                "a": 2,
                "dash": 1,
                "like": 1,
                "this": 1,
                "one": 1,
                "hyphen": 1,
                "connects": 1,
                "2": 1,
                "words": 1,
                "with": 1,
                "no": 1,
                "gap": 1,
                "easy-peasy": 1,
            },
        ),
    ],
)
def test_bag_of_words(text, expected):
    """``bag_of_words`` must return exactly the expected word-to-count mapping."""
    word_counts = text_extraction.bag_of_words(text)
    assert word_counts == expected
2023-10-10 13:54:49 -07:00
2024-10-24 15:02:34 +02:00
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog\rloved the cat, but\t\n the cat\tloved the\n cow\n\n",
            "The dog loved the cat, but the cat loved the cow",
        ),
        (
            "\n\nHello my\tname\tis H a r p e r,\nwhat's your\vname?",
            "Hello my name is H a r p e r, what's your name?",
        ),
        (
            "I have a\t\n\tdog and a\tcat,\fI love my\n\n\n\ndog.",
            "I have a dog and a cat, I love my dog.",
        ),
        (
            """L is for the way you look at me
            O is for the only one I see
            V is very, very extraordinary
            E is even more than anyone that you adore can""",
            "L is for the way you look at me O is for the only one I see V is very, very extraordinary E is even more than anyone that you adore can",  # noqa: E501
        ),
        (
            """
            | Name    | Age | City         | Occupation     |
            |---------|-----|--------------|----------------|
            | Alice   | 30  | New York     | Engineer       |
            | Bob     | 25  | Los Angeles  | Designer       |
            | Charlie | 35  | Chicago      | Teacher        |
            | David   | 40  | San Francisco| Developer      |
            """,
            "| Name | Age | City | Occupation | |---------|-----|--------------|----------------| | Alice | 30 | New York | Engineer | | Bob | 25 | Los Angeles | Designer | | Charlie | 35 | Chicago | Teacher | | David | 40 | San Francisco| Developer |",  # noqa: E501
        ),
    ],
)
def test_prepare_string(text, expected):
    """``prepare_str`` standardizes whitespace only when asked to.

    With ``standardize_whitespaces=True`` the result must equal ``expected``; with the
    default arguments the input must come back unchanged.
    """
    standardized = text_extraction.prepare_str(text, standardize_whitespaces=True)
    assert standardized == expected

    untouched = text_extraction.prepare_str(text)
    assert untouched == text
2024-12-03 21:21:39 -08:00
@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        # Complex sentences with standard quotes
        (
            '"The quick brown fox jumps over the lazy dog," said the narrator.',
            '"The quick brown fox jumps over the lazy dog," said the narrator.',
        ),
        # Mixed quotes in longer sentences
        (
            'She said "Hello" and then whispered \'Goodbye\' before leaving.',
            'She said "Hello" and then whispered \'Goodbye\' before leaving.',
        ),
        # Double low-9 quotes with complex content
        (
            '„To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
            '"To be, or not to be, that is the question" - Shakespeare\'s famous quote.',
        ),
        # Angle quotes with nested quotes
        (
            '«When he said "life is beautiful," I believed him» wrote Maria.',
            '"When he said "life is beautiful," I believed him" wrote Maria.',
        ),
        # Heavy ornament quotes in dialogue
        (
            '❝Do you remember when we first met?❞ she asked with a smile.',
            '"Do you remember when we first met?" she asked with a smile.',
        ),
        # Double prime quotes with punctuation
        (
            '〝The meeting starts at 10:00, don\'t be late!〟 announced the manager.',
            '"The meeting starts at 10:00, don\'t be late!" announced the manager.',
        ),
        # Corner brackets with nested quotes
        (
            '「He told me "This is important" yesterday」, she explained.',
            '\'He told me "This is important" yesterday\', she explained.',
        ),
        # White corner brackets with multiple sentences
        (
            '『The sun was setting. The birds were singing. It was peaceful.』',
            '\'The sun was setting. The birds were singing. It was peaceful.\'',
        ),
        # Vertical corner brackets with numbers and special characters
        (
            '﹂Meeting #123 @ 15:00 - Don\'t forget!﹁',
            '\'Meeting #123 @ 15:00 - Don\'t forget!\'',
        ),
        # Complex mixed quote types
        (
            '「Hello」, ❝World❞, "Test", \'Example\', „Quote", «Final»',
            '\'Hello\', "World", "Test", \'Example\', "Quote", "Final"',
        ),
        # Quotes with multiple apostrophes
        (
            'It\'s John\'s book, isn\'t it?',
            "It's John's book, isn't it?",
        ),
        # Single angle quotes with nested content
        (
            '‹Testing the system\'s capability for "quoted" text›',
            '\'Testing the system\'s capability for "quoted" text\'',
        ),
        # Heavy single ornament quotes with multiple sentences
        (
            '❛First sentence. Second sentence. Third sentence.❜',
            '\'First sentence. Second sentence. Third sentence.\'',
        ),
        # Mix of various quote types in complex text
        (
            '「Chapter 1」: ❝The Beginning❞ - „A new story" begins «today».',
            '\'Chapter 1\': "The Beginning" - "A new story" begins "today".',
        ),
    ],
)
def test_standardize_quotes(input_text, expected_output):
    """``standardize_quotes`` must map typographic quote characters to ASCII quotes."""
    standardized = text_extraction.standardize_quotes(input_text)
    assert standardized == expected_output
2024-10-24 15:02:34 +02:00
2023-10-10 13:54:49 -07:00
@pytest.mark.parametrize(
    ("output_text", "source_text", "expected_percentage"),
    [
        # empty source text
        ("extra", "", 0),
        # empty output text
        ("", "Source text has a sentence.", 1),
        # one source word is spelled out letter by letter in the output
        (
            "The original s e n t e n c e is normal.",
            "The original sentence is normal...",
            0.2,
        ),
        # one source word is absent from the output
        (
            "We saw 23% improvement in this quarter.",
            "We saw 23% improvement in sales this quarter.",
            0.125,
        ),
        # no source word appears in the output
        ("no", "Is it possible to have more than everything missing?", 1),
    ],
)
def test_calculate_percent_missing_text(output_text, source_text, expected_percentage):
    """``calculate_percent_missing_text`` must return exactly the expected fraction."""
    percent_missing = text_extraction.calculate_percent_missing_text(output_text, source_text)
    assert percent_missing == expected_percentage
2024-05-07 15:57:38 +02:00
2024-06-14 11:03:27 +02:00
@pytest.mark.parametrize(
    ("table_as_cells", "expected_extraction"),
    [
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            [
                {"x": 0, "y": 0, "w": 1, "h": 1, "content": "Month A."},
                {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month B."},
                {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Month C."},
                {"x": 0, "y": 1, "w": 1, "h": 1, "content": "11"},
                {"x": 1, "y": 1, "w": 1, "h": 1, "content": "12"},
                {"x": 2, "y": 1, "w": 1, "h": 1, "content": "13"},
                {"x": 0, "y": 2, "w": 1, "h": 1, "content": "21"},
                {"x": 1, "y": 2, "w": 1, "h": 1, "content": "22"},
                {"x": 2, "y": 2, "w": 1, "h": 1, "content": "23"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        # NOTE(review): the "w"/"h" values below look transposed relative to the
        # diagram above; the expected extraction only carries x/y and content.
        pytest.param(
            [
                {"y": 0, "x": 0, "w": 2, "h": 1, "content": "h12col1"},
                {"y": 0, "x": 1, "w": 1, "h": 2, "content": "h1col23"},
                {"y": 0, "x": 3, "w": 1, "h": 1, "content": "h1col4"},
                {"y": 1, "x": 1, "w": 1, "h": 1, "content": "h2col2"},
                {"y": 1, "x": 2, "w": 1, "h": 2, "content": "h2col34"},
                {"y": 2, "x": 0, "w": 1, "h": 1, "content": "r3col1"},
                {"y": 2, "x": 1, "w": 1, "h": 1, "content": "r3col2"},
                {"y": 2, "x": 2, "w": 2, "h": 2, "content": "r34col34"},
                {"y": 3, "x": 0, "w": 1, "h": 2, "content": "r4col12"},
            ],
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_cells_table_extraction_from_prediction(table_as_cells, expected_extraction):
    """Cells listed under ``metadata.table_as_cells`` map to row/column-indexed cells."""
    metadata = {"table_as_cells": table_as_cells}
    example_element = {"type": "Table", "metadata": metadata}

    extracted = extract_cells_from_table_as_cells(example_element)
    assert extracted == expected_extraction
2024-05-07 15:57:38 +02:00
2024-06-14 11:03:27 +02:00
@pytest.mark.parametrize(
    ("text_as_html", "expected_extraction"),
    [
        # The first two fixtures previously ended with a stray `"` after `</table>`;
        # it was not part of the table markup and has been removed.
        pytest.param(
            """
            <table>
               <thead>
                  <tr>
                     <th>Month A.</th>
                  </tr>
               </thead>
               <tbody>
                  <tr>
                     <td>22</td>
                  </tr>
               </tbody>
            </table>
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 1, "col_index": 0, "content": "22"},
            ],
            id="Simple table, 1 head cell, 1 body cell, no spans",
        ),
        pytest.param(
            """
            <table>
               <thead>
                  <tr>
                     <th>Month A.</th>
                     <th>Month B.</th>
                     <th>Month C.</th>
                  </tr>
               </thead>
               <tbody>
                  <tr>
                     <td>11</td>
                     <td>12</td>
                     <td>13</td>
                  </tr>
                  <tr>
                     <td>21</td>
                     <td>22</td>
                     <td>23</td>
                  </tr>
               </tbody>
            </table>
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "Month A."},
                {"row_index": 0, "col_index": 1, "content": "Month B."},
                {"row_index": 0, "col_index": 2, "content": "Month C."},
                {"row_index": 1, "col_index": 0, "content": "11"},
                {"row_index": 1, "col_index": 1, "content": "12"},
                {"row_index": 1, "col_index": 2, "content": "13"},
                {"row_index": 2, "col_index": 0, "content": "21"},
                {"row_index": 2, "col_index": 1, "content": "22"},
                {"row_index": 2, "col_index": 2, "content": "23"},
            ],
            id="Simple table, 3 head cell, 5 body cell, no spans",
        ),
        # +----------+---------------------+----------+
        # |          |       h1col23       |  h1col4  |
        # | h12col1  |----------+----------+----------|
        # |          |  h2col2  |       h2col34       |
        # |----------|----------+----------+----------+
        # |  r3col1  |  r3col2  |                     |
        # |----------+----------|      r34col34       |
        # |       r4col12       |                     |
        # +----------+----------+----------+----------+
        pytest.param(
            """
            <table>
               <thead>
                  <tr>
                     <th rowspan="2">h12col1</th>
                     <th colspan="2">h1col23</th>
                     <th>h1col4</th>
                  </tr>
                  <tr>
                     <th>h2col2</th>
                     <th colspan="2">h2col34</th>
                  </tr>
               </thead>
               <tbody>
                  <tr>
                     <td>r3col1</td>
                     <td>r3col2</td>
                     <td colspan="2" rowspan="2">r34col34</td>
                  </tr>
                  <tr>
                     <td colspan="2">r4col12</td>
                  </tr>
               </tbody>
            </table>
            """,
            [
                {"row_index": 0, "col_index": 0, "content": "h12col1"},
                {"row_index": 0, "col_index": 1, "content": "h1col23"},
                {"row_index": 0, "col_index": 3, "content": "h1col4"},
                {"row_index": 1, "col_index": 1, "content": "h2col2"},
                {"row_index": 1, "col_index": 2, "content": "h2col34"},
                {"row_index": 2, "col_index": 0, "content": "r3col1"},
                {"row_index": 2, "col_index": 1, "content": "r3col2"},
                {"row_index": 2, "col_index": 2, "content": "r34col34"},
                {"row_index": 3, "col_index": 0, "content": "r4col12"},
            ],
            id="various spans, with 2 row header",
        ),
    ],
)
def test_html_table_extraction_from_prediction(text_as_html, expected_extraction):
    """HTML stored under ``metadata.text_as_html`` maps to row/column-indexed cells.

    Each expected cell carries the row and column index of the cell's top-left
    position, so cells covered by a ``rowspan``/``colspan`` do not appear twice.
    """
    example_element = {
        "type": "Table",
        "metadata": {
            "text_as_html": text_as_html,
        },
    }
    extracted = extract_cells_from_text_as_html(example_element)
    assert extracted == expected_extraction
def test_cells_extraction_from_prediction_when_missing_prediction():
    """Both extractors return ``None`` when the element carries empty table predictions."""
    empty_metadata = {"text_as_html": "", "table_as_cells": []}
    example_element = {"type": "Table", "metadata": empty_metadata}

    for extract in (extract_cells_from_text_as_html, extract_cells_from_table_as_cells):
        assert extract(example_element) is None
2024-06-19 09:03:38 +02:00
def _trim_html ( html : str ) - > str :
html_lines = [ line . strip ( ) for line in html . split ( " \n " ) if line ]
return " " . join ( html_lines )
@pytest.mark.parametrize(
    "html_to_test",
    [
        """
        <table>
            <thead>
                <tr>
                    <th>Month A.</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>22</td>
                </tr>
            </tbody>
        </table>
        """,
        """
        <table>
            <thead>
                <tr>
                    <th>Month A.</th>
                    <th>Month B.</th>
                    <th>Month C.</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>11</td>
                    <td>12</td>
                    <td>13</td>
                </tr>
                <tr>
                    <td>21</td>
                    <td>22</td>
                    <td>23</td>
                </tr>
            </tbody>
        </table>
        """,
        """
        <table>
            <thead>
                <tr>
                    <th rowspan="2">h12col1</th>
                    <th colspan="2">h1col23</th>
                    <th>h1col4</th>
                </tr>
                <tr>
                    <th>h2col2</th>
                    <th colspan="2">h2col34</th>
                </tr>
            </thead>
            <tbody>
                <tr>
                    <td>r3col1</td>
                    <td>r3col2</td>
                    <td colspan="2" rowspan="2">r34col34</td>
                </tr>
                <tr>
                    <td colspan="2">r4col12</td>
                </tr>
            </tbody>
        </table>
        """,
    ],
)
def test_deckerd_html_converter(html_to_test):
    """Converting HTML to the deckerd format and back must reproduce the original markup.

    The fixture is written indented for readability, so it is flattened with
    ``_trim_html`` before being compared with the regenerated HTML.
    """
    round_tripped_html = deckerd_table_to_html(html_table_to_deckerd(html_to_test))
    expected_html = _trim_html(html_to_test)
    assert expected_html == round_tripped_html