2025-03-19 17:44:49 +00:00
import unittest
2025-03-19 21:06:52 +00:00
2025-03-19 17:44:49 +00:00
from olmocr . bench . tests import (
2025-03-19 18:01:53 +00:00
BaselineTest ,
BasePDFTest ,
MathTest ,
TableTest ,
2025-03-19 17:44:49 +00:00
TestChecked ,
2025-03-19 18:01:53 +00:00
TestType ,
TextOrderTest ,
TextPresenceTest ,
2025-03-19 17:44:49 +00:00
ValidationError ,
normalize_text ,
2025-04-02 20:25:16 +00:00
parse_html_tables ,
parse_markdown_tables ,
2025-03-19 17:44:49 +00:00
)
class TestNormalizeText(unittest.TestCase):
    """Exercise ``normalize_text`` on whitespace, typography, markdown and ``<br>`` input."""

    def test_whitespace_normalization(self):
        """Test that tabs, newlines and repeated blanks are normalized."""
        input_text = " This has \t multiple spaces \n and \n newlines "
        expected = " This has multiple spaces and newlines "
        self.assertEqual(normalize_text(input_text), expected)

    def test_character_replacement(self):
        """Test that curly quotes and an em dash are replaced with ASCII equivalents."""
        input_text = " This has ' fancy ' “quotes” and—dashes "
        expected = " This has ' fancy ' \" quotes \" and-dashes "
        self.assertEqual(normalize_text(input_text), expected)

    def test_markdown1(self):
        """Test that single-asterisk emphasis markers are removed."""
        input_text = " this is *bold* "
        expected = " this is bold "
        self.assertEqual(normalize_text(input_text), expected)

    def test_markdown2(self):
        """Test underscore and asterisk emphasis; the unbalanced trailing underscore survives."""
        input_text = " _italic__ is *bold* "
        expected = " italic_ is bold "
        self.assertEqual(normalize_text(input_text), expected)

    def test_empty_input(self):
        """Empty-input case: the output equals the input literal."""
        self.assertEqual(normalize_text(" "), " ")

    def test_brs(self):
        """Test that ``<br>`` and ``<br/>`` normalize the same way as a newline."""
        self.assertEqual(normalize_text(" Hello<br>everyone "), " Hello everyone ")
        self.assertEqual(normalize_text(" Hello<br>everyone "), normalize_text(" Hello \n everyone "))
        self.assertEqual(normalize_text(" Hello<br/>everyone "), " Hello everyone ")
        self.assertEqual(normalize_text(" Hello<br/>everyone "), normalize_text(" Hello \n everyone "))

    def test_two_stars(self):
        """Test a citation mixing ``**bold**`` and ``*italic*``; the en dash becomes a hyphen."""
        self.assertEqual(
            normalize_text(
                " **Georges V.** (2007) – *Le Forez du VIe au IVe millénaire av. J.-C. Territoires, identités et stratégies des sociétés humaines du Massif central dans le bassin amont de la Loire (France)*, thèse de doctorat, université de Bourgogne, Dijon, 2 vol., 435 p. "
            ),
            " Georges V. (2007) - Le Forez du VIe au IVe millénaire av. J.-C. Territoires, identités et stratégies des sociétés humaines du Massif central dans le bassin amont de la Loire (France), thèse de doctorat, université de Bourgogne, Dijon, 2 vol., 435 p. ",
        )
2025-03-19 17:44:49 +00:00
class TestBasePDFTest(unittest.TestCase):
    """Construction and validation checks for ``BasePDFTest``."""

    def test_valid_initialization(self):
        """A well-formed test keeps its arguments and exposes the defaults."""
        subject = BasePDFTest(
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=TestType.BASELINE.value,
        )
        self.assertEqual(subject.pdf, " test.pdf ")
        self.assertEqual(subject.page, 1)
        self.assertEqual(subject.id, " test_id ")
        self.assertEqual(subject.type, TestType.BASELINE.value)
        # Fields that were not passed in.
        self.assertEqual(subject.max_diffs, 0)
        self.assertIsNone(subject.checked)
        self.assertIsNone(subject.url)

    def test_empty_pdf(self):
        """The empty-PDF case is rejected with ValidationError."""
        self.assertRaises(
            ValidationError,
            BasePDFTest,
            pdf=" ",
            page=1,
            id=" test_id ",
            type=TestType.BASELINE.value,
        )

    def test_empty_id(self):
        """The empty-ID case is rejected with ValidationError."""
        self.assertRaises(
            ValidationError,
            BasePDFTest,
            pdf=" test.pdf ",
            page=1,
            id=" ",
            type=TestType.BASELINE.value,
        )

    def test_negative_max_diffs(self):
        """A negative max_diffs is rejected with ValidationError."""
        self.assertRaises(
            ValidationError,
            BasePDFTest,
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=TestType.BASELINE.value,
            max_diffs=-1,
        )

    def test_invalid_test_type(self):
        """An unknown type string is rejected with ValidationError."""
        self.assertRaises(
            ValidationError,
            BasePDFTest,
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=" invalid_type ",
        )

    def test_run_method_not_implemented(self):
        """Calling run() on the base class raises NotImplementedError."""
        subject = BasePDFTest(
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=TestType.BASELINE.value,
        )
        self.assertRaises(NotImplementedError, subject.run, " content ")

    def test_checked_enum(self):
        """A TestChecked member passed as ``checked`` is stored unchanged."""
        subject = BasePDFTest(
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=TestType.BASELINE.value,
            checked=TestChecked.VERIFIED,
        )
        self.assertEqual(subject.checked, TestChecked.VERIFIED)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
class TestTextPresenceTest(unittest.TestCase):
    """Tests for ``TextPresenceTest`` covering the PRESENT and ABSENT test types."""

    @staticmethod
    def _build(test_type, text, **options):
        """Create a ``TextPresenceTest`` using the pdf/page/id fixture values shared by these tests."""
        return TextPresenceTest(pdf=" test.pdf ", page=1, id=" test_id ", type=test_type.value, text=text, **options)

    def test_valid_present_test(self):
        """Test that a valid PRESENT test initializes correctly."""
        test = self._build(TestType.PRESENT, " test text ")
        self.assertEqual(test.text, " test text ")
        self.assertTrue(test.case_sensitive)
        self.assertIsNone(test.first_n)
        self.assertIsNone(test.last_n)

    def test_valid_absent_test(self):
        """Test that a valid ABSENT test initializes correctly."""
        test = self._build(TestType.ABSENT, " test text ", case_sensitive=False)
        self.assertEqual(test.text, " test text ")
        self.assertFalse(test.case_sensitive)

    def test_empty_text(self):
        """Test that the empty-text case raises ValidationError."""
        with self.assertRaises(ValidationError):
            self._build(TestType.PRESENT, " ")

    def test_present_text_exact_match(self):
        """Test that PRESENT returns True for an exact match."""
        test = self._build(TestType.PRESENT, " target text ")
        result, _ = test.run(" This is some target text in a document ")
        self.assertTrue(result)

    def test_present_text_not_found(self):
        """Test that PRESENT returns False, and names the text, when it is not found."""
        test = self._build(TestType.PRESENT, " missing text ")
        result, explanation = test.run(" This document doesn ' t have the target ")
        self.assertFalse(result)
        self.assertIn(" missing text ", explanation)

    def test_present_text_with_max_diffs(self):
        """Test that PRESENT with max_diffs accepts a near match."""
        test = self._build(TestType.PRESENT, " target text ", max_diffs=2)
        result, _ = test.run(" This is some targett textt in a document ")
        self.assertTrue(result)

    def test_absent_text_found(self):
        """Test that ABSENT returns False, and names the text, when it is found."""
        test = self._build(TestType.ABSENT, " target text ")
        result, explanation = test.run(" This is some target text in a document ")
        self.assertFalse(result)
        self.assertIn(" target text ", explanation)

    def test_absent_text_found_diffs(self):
        """Test ABSENT with max_diffs=2: up to two extra characters still count as found, three do not."""
        test = self._build(TestType.ABSENT, " target text ", max_diffs=2)
        # Exact match, then one and two extra characters: all still "found".
        for content in (
            " This is some target text in a document ",
            " This is some targett text in a document ",
            " This is some targettt text in a document ",
        ):
            result, _ = test.run(content)
            self.assertFalse(result, content)
        # Three extra characters exceeds max_diffs, so the text counts as absent.
        result, _ = test.run(" This is some targetttt text in a document ")
        self.assertTrue(result)

    def test_absent_text_not_found(self):
        """Test that ABSENT returns True when the text is not found."""
        test = self._build(TestType.ABSENT, " missing text ")
        result, _ = test.run(" This document doesn ' t have the target ")
        self.assertTrue(result)

    def test_case_insensitive_present(self):
        """Test that case_sensitive=False works for a PRESENT test."""
        test = self._build(TestType.PRESENT, " TARGET TEXT ", case_sensitive=False)
        result, _ = test.run(" This is some target text in a document ")
        self.assertTrue(result)

    def test_case_insensitive_absent(self):
        """Test that case_sensitive=False works for an ABSENT test."""
        test = self._build(TestType.ABSENT, " TARGET TEXT ", case_sensitive=False)
        result, _ = test.run(" This is some target text in a document ")
        self.assertFalse(result)

    def test_first_n_limit(self):
        """Test that first_n restricts matching to the start of the content."""
        test = self._build(TestType.PRESENT, " beginning ", first_n=20)
        result, _ = test.run(" beginning of text, but not the end ")
        self.assertTrue(result)

        # Text beyond first_n must not be matched.
        test = self._build(TestType.PRESENT, " end ", first_n=20)
        result, _ = test.run(" beginning of text, but not the end ")
        self.assertFalse(result)

    def test_last_n_limit(self):
        """Test that last_n restricts matching to the end of the content."""
        test = self._build(TestType.PRESENT, " end ", last_n=20)
        result, _ = test.run(" beginning of text, but not the end ")
        self.assertTrue(result)

        # Text before the last_n window must not be matched.
        test = self._build(TestType.PRESENT, " beginning ", last_n=20)
        result, _ = test.run(" beginning of text, but not the end ")
        self.assertFalse(result)

    def test_both_first_and_last_n(self):
        """Test that first_n and last_n can be combined."""
        test = self._build(TestType.PRESENT, " beginning ", first_n=15, last_n=10)
        result, _ = test.run(" beginning of text, middle part, but not the end ")
        self.assertTrue(result)

        # Text that only appears in the middle must not be found.
        test = self._build(TestType.PRESENT, " middle ", first_n=15, last_n=10)
        result, _ = test.run(" beginning of text, middle part, but not the end ")
        self.assertFalse(result)

    def test_unicode_normalized_forms(self):
        """Test that a precomposed e-acute matches both itself and e + combining accent."""
        test = self._build(TestType.PRESENT, " I like to eat at a caf \u00e9 ")
        result, _ = test.run(" I like to eat at a caf \u00e9 ")
        self.assertTrue(result)
        result, _ = test.run(" I like to eat at a cafe \u0301 ")
        self.assertTrue(result)
2025-03-19 17:44:49 +00:00
class TestTextOrderTest(unittest.TestCase):
    """Tests for ``TextOrderTest``, which checks that one text appears before another."""

    @staticmethod
    def _build(before, after, **options):
        """Create an ORDER-type ``TextOrderTest`` using the shared pdf/page/id fixture values."""
        return TextOrderTest(pdf=" test.pdf ", page=1, id=" test_id ", type=TestType.ORDER.value, before=before, after=after, **options)

    def test_valid_initialization(self):
        """Test that valid initialization stores ``before`` and ``after``."""
        test = self._build(" first text ", " second text ")
        self.assertEqual(test.before, " first text ")
        self.assertEqual(test.after, " second text ")

    def test_invalid_test_type(self):
        """Test that a non-ORDER test type raises ValidationError."""
        with self.assertRaises(ValidationError):
            TextOrderTest(pdf=" test.pdf ", page=1, id=" test_id ", type=TestType.PRESENT.value, before=" first text ", after=" second text ")

    def test_empty_before(self):
        """Test that the empty ``before`` case raises ValidationError."""
        with self.assertRaises(ValidationError):
            self._build(" ", " second text ")

    def test_empty_after(self):
        """Test that the empty ``after`` case raises ValidationError."""
        with self.assertRaises(ValidationError):
            self._build(" first text ", " ")

    def test_correct_order(self):
        """Test that correct order returns True."""
        test = self._build(" first ", " second ")
        result, _ = test.run(" This has first and then second in correct order ")
        self.assertTrue(result)

    def test_incorrect_order(self):
        """Test that incorrect order returns False."""
        test = self._build(" second ", " first ")
        result, _ = test.run(" This has first and then second in correct order ")
        self.assertFalse(result)

    def test_before_not_found(self):
        """Test that a missing ``before`` text returns False."""
        test = self._build(" missing ", " present ")
        result, _ = test.run(" This text has present but not the other word ")
        self.assertFalse(result)

    def test_after_not_found(self):
        """Test that a missing ``after`` text returns False."""
        test = self._build(" present ", " missing ")
        result, _ = test.run(" This text has present but not the other word ")
        self.assertFalse(result)

    def test_max_diffs(self):
        """Test that max_diffs lets slightly misspelled texts match."""
        test = self._build(" first ", " second ", max_diffs=1)
        result, _ = test.run(" This has firsst and then secand in correct order ")
        self.assertTrue(result)

    def test_multiple_occurrences(self):
        """Test that repeated occurrences of the texts are handled."""
        test = self._build(" target ", " target ")
        result, _ = test.run(" This has target and then target again ")
        self.assertTrue(result)

        # The reversed pair still passes here: A comes before B, but the
        # first B also comes before the second A.
        test = self._build(" B ", " A ")
        result, _ = test.run(" A B A B ")
        self.assertTrue(result)
class TestTableTest ( unittest . TestCase ) :
""" Test the TableTest class """
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def setUp(self):
    """Provide the same 3x3 table as an HTML fixture and as a markdown fixture."""
    # HTML rendition: one header row followed by rows A and B.
    self.html_table = """
< table >
< tr >
< th > Header 1 < / th >
< th > Header 2 < / th >
< th > Header 3 < / th >
< / tr >
< tr >
< td > Cell A1 < / td >
< td > Cell A2 < / td >
< td > Cell A3 < / td >
< / tr >
< tr >
< td > Cell B1 < / td >
< td > Cell B2 < / td >
< td > Cell B3 < / td >
< / tr >
< / table >
"""

    # Markdown rendition of the same cells.
    self.markdown_table = """
| Header 1 | Header 2 | Header 3 |
| - - - - - - - - | - - - - - - - - | - - - - - - - - |
| Cell A1 | Cell A2 | Cell A3 |
| Cell B1 | Cell B2 | Cell B3 |
"""
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_valid_initialization(self):
    """A TABLE test keeps its cell text and leaves every relationship field at its default."""
    case = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" target cell ",
    )
    self.assertEqual(case.cell, " target cell ")
    # Relationship fields were not supplied, so each should hold the default.
    for field_name in ("up", "down", "left", "right", "top_heading", "left_heading"):
        self.assertEqual(getattr(case, field_name), " ")
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_invalid_test_type(self):
    """Building a TableTest with a non-TABLE type raises ValidationError."""
    self.assertRaises(
        ValidationError,
        TableTest,
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.PRESENT.value,
        cell=" target cell ",
    )
2025-03-19 17:44:49 +00:00
def test_parse_markdown_tables(self):
    """parse_markdown_tables yields one 3x3 table from the markdown fixture."""
    # Constructed as in the original test; the instance itself is not used afterwards.
    _case = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    parsed = parse_markdown_tables(self.markdown_table)
    self.assertEqual(len(parsed), 1)

    grid = parsed[0].data
    self.assertEqual(grid.shape, (3, 3))  # header row plus two data rows, three columns
    self.assertEqual(grid[0, 0], " Header 1 ")
    self.assertEqual(grid[1, 1], " Cell A2 ")
    self.assertEqual(grid[2, 2], " Cell B3 ")
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_parse_html_tables(self):
    """parse_html_tables yields one 3x3 table from the HTML fixture."""
    # Constructed as in the original test; the instance itself is not used afterwards.
    _case = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    parsed = parse_html_tables(self.html_table)
    self.assertEqual(len(parsed), 1)

    grid = parsed[0].data
    self.assertEqual(grid.shape, (3, 3))  # header row plus two data rows, three columns
    self.assertEqual(grid[0, 0], " Header 1 ")
    self.assertEqual(grid[1, 1], " Cell A2 ")
    self.assertEqual(grid[2, 2], " Cell B3 ")
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_match_cell(self):
    """A cell that exists in the markdown fixture is found."""
    lookup = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    passed, _ = lookup.run(self.markdown_table)
    self.assertTrue(passed)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_cell_not_found(self):
    """A cell that is not in the table fails with a 'No cell matching' explanation."""
    lookup = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Missing Cell ",
    )
    passed, reason = lookup.run(self.markdown_table)
    self.assertFalse(passed)
    self.assertIn(" No cell matching ", reason)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_up_relationship(self):
    """The ``up`` neighbour of Cell A2 is checked for a correct and an incorrect value."""
    shared = dict(pdf=" test.pdf ", page=1, id=" test_id ", type=TestType.TABLE.value, cell=" Cell A2 ")

    # Correct neighbour above the cell.
    matching = TableTest(up=" Header 2 ", **shared)
    passed, _ = matching.run(self.markdown_table)
    self.assertTrue(passed)

    # Wrong neighbour: the run fails and the explanation reports the mismatch.
    mismatched = TableTest(up=" Wrong Header ", **shared)
    passed, reason = mismatched.run(self.markdown_table)
    self.assertFalse(passed)
    self.assertIn(" doesn ' t match expected ", reason)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_down_relationship(self):
    """The ``down`` neighbour of Cell A2 is checked for a correct and an incorrect value."""
    shared = dict(pdf=" test.pdf ", page=1, id=" test_id ", type=TestType.TABLE.value, cell=" Cell A2 ")

    # Correct neighbour below the cell.
    matching = TableTest(down=" Cell B2 ", **shared)
    passed, _ = matching.run(self.markdown_table)
    self.assertTrue(passed)

    # Wrong neighbour: the run fails and the explanation reports the mismatch.
    mismatched = TableTest(down=" Wrong Cell ", **shared)
    passed, reason = mismatched.run(self.markdown_table)
    self.assertFalse(passed)
    self.assertIn(" doesn ' t match expected ", reason)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_left_relationship(self):
    """The ``left`` neighbour of Cell A2 is checked for a correct and an incorrect value."""
    shared = dict(pdf=" test.pdf ", page=1, id=" test_id ", type=TestType.TABLE.value, cell=" Cell A2 ")

    # Correct neighbour to the left of the cell.
    matching = TableTest(left=" Cell A1 ", **shared)
    passed, _ = matching.run(self.markdown_table)
    self.assertTrue(passed)

    # Wrong neighbour: the run fails and the explanation reports the mismatch.
    mismatched = TableTest(left=" Wrong Cell ", **shared)
    passed, reason = mismatched.run(self.markdown_table)
    self.assertFalse(passed)
    self.assertIn(" doesn ' t match expected ", reason)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_right_relationship(self):
    """The ``right`` neighbour of Cell A2 is checked for a correct and an incorrect value."""
    shared = dict(pdf=" test.pdf ", page=1, id=" test_id ", type=TestType.TABLE.value, cell=" Cell A2 ")

    # Correct neighbour to the right of the cell.
    matching = TableTest(right=" Cell A3 ", **shared)
    passed, _ = matching.run(self.markdown_table)
    self.assertTrue(passed)

    # Wrong neighbour: the run fails and the explanation reports the mismatch.
    mismatched = TableTest(right=" Wrong Cell ", **shared)
    passed, reason = mismatched.run(self.markdown_table)
    self.assertFalse(passed)
    self.assertIn(" doesn ' t match expected ", reason)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_top_heading_relationship(self):
    """The ``top_heading`` of Cell B2 is checked for a correct and an incorrect value."""
    shared = dict(pdf=" test.pdf ", page=1, id=" test_id ", type=TestType.TABLE.value, cell=" Cell B2 ")

    # Cell B2 sits in the column headed by Header 2.
    matching = TableTest(top_heading=" Header 2 ", **shared)
    passed, _ = matching.run(self.markdown_table)
    self.assertTrue(passed)

    # Wrong heading: the run fails and the explanation reports the mismatch.
    mismatched = TableTest(top_heading=" Wrong Header ", **shared)
    passed, reason = mismatched.run(self.markdown_table)
    self.assertFalse(passed)
    self.assertIn(" doesn ' t match expected ", reason)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_left_heading_relationship(self):
    """The ``left_heading`` of Cell A3 is checked for a correct and an incorrect value."""
    shared = dict(pdf=" test.pdf ", page=1, id=" test_id ", type=TestType.TABLE.value, cell=" Cell A3 ")

    # Cell A3 sits in the row that starts with Cell A1.
    matching = TableTest(left_heading=" Cell A1 ", **shared)
    passed, _ = matching.run(self.markdown_table)
    self.assertTrue(passed)

    # Wrong heading: the run fails and the explanation reports the mismatch.
    mismatched = TableTest(left_heading=" Wrong Cell ", **shared)
    passed, reason = mismatched.run(self.markdown_table)
    self.assertFalse(passed)
    self.assertIn(" doesn ' t match expected ", reason)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_multiple_relationships(self):
    """All four neighbours of Cell A2 are checked together; one wrong neighbour fails the run."""
    shared = dict(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
        up=" Header 2 ",
        down=" Cell B2 ",
        right=" Cell A3 ",
    )

    # Every neighbour is correct.
    all_correct = TableTest(left=" Cell A1 ", **shared)
    passed, _ = all_correct.run(self.markdown_table)
    self.assertTrue(passed)

    # Only ``left`` is wrong; that alone must make the run fail.
    one_wrong = TableTest(left=" Wrong Cell ", **shared)
    passed, reason = one_wrong.run(self.markdown_table)
    self.assertFalse(passed)
    self.assertIn(" doesn ' t match expected ", reason)
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_no_tables_found(self):
    """Plain text without any table fails with the fixed 'No tables found' explanation."""
    lookup = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    passed, reason = lookup.run(" This is plain text with no tables ")
    self.assertFalse(passed)
    self.assertEqual(reason, " No tables found in the content ")
2025-03-19 18:01:53 +00:00
2025-03-19 17:44:49 +00:00
def test_fuzzy_matching(self):
    """With max_diffs=1 a cell that is one character short still matches."""
    # Misspell the target cell inside the markdown fixture.
    table_with_typo = self.markdown_table.replace(" Cell A2 ", " Cel A2 ")
    lookup = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
        max_diffs=1,
    )
    passed, _ = lookup.run(table_with_typo)
    self.assertTrue(passed)
2025-03-19 18:01:53 +00:00
2025-03-19 17:53:45 +00:00
def test_with_stripped_content(self):
    """The cell is still found after leading/trailing whitespace is stripped from the table."""
    lookup = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    # Run against the fixture with its surrounding whitespace removed.
    result, explanation = lookup.run(self.markdown_table.strip())
    self.assertTrue(result, f" Table test failed with stripped content: { explanation } ")
2025-03-19 18:01:53 +00:00
2025-03-19 18:01:02 +00:00
def test_table_at_end_of_file(self):
    """A table that is the last thing in the content is still detected."""
    # Leading prose, then the stripped table so nothing follows its final row.
    document = "".join((" Some text before the table. \n ", self.markdown_table.strip()))
    lookup = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    result, explanation = lookup.run(document)
    self.assertTrue(result, f" Table at end of file not detected: { explanation } ")
2025-03-19 18:01:53 +00:00
2025-03-19 18:01:02 +00:00
def test_table_at_end_with_no_trailing_newline(self):
    """A table whose trailing whitespace has been removed is still detected."""
    lookup = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    # rstrip() drops whatever whitespace follows the last table row.
    result, explanation = lookup.run(self.markdown_table.rstrip())
    self.assertTrue(result, f" Table without trailing newline not detected: { explanation } ")
2025-03-19 18:01:53 +00:00
2025-03-19 18:01:02 +00:00
def test_table_at_end_with_extra_spaces(self):
    """Verify the table is detected when every line carries extra trailing whitespace."""
    # Split the fixture into lines, pad each one on the right, and re-join.
    padded_lines = [row + " " for row in self.markdown_table.split(" \n ")]
    document = " \n ".join(padded_lines)

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    passed, explanation = checker.run(document)

    self.assertTrue(passed, f" Table with extra spaces not detected: { explanation } ")
2025-03-19 18:01:53 +00:00
2025-03-19 18:01:02 +00:00
def test_table_at_end_with_mixed_whitespace(self):
    """Verify the table is detected when mixed whitespace follows its last row."""
    # Leading text, the stripped fixture table, then a whitespace tail.
    leading_text = " Some text before the table. \n "
    whitespace_tail = " \t "
    document = leading_text + self.markdown_table.strip() + whitespace_tail

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    passed, explanation = checker.run(document)

    self.assertTrue(passed, f" Table with mixed whitespace not detected: { explanation } ")
def test_malformed_table_at_end(self):
    """Verify a table with irregular pipe placement at the end of the content is still detected."""
    # Header, separator and final row are missing their closing pipe.
    document = """
Some text before the table .
| Header 1 | Header 2 | Header 3
| - - - - - - - - | - - - - - - - - | - - - - - - - -
| Cell A1 | Cell A2 | Cell A3 |
| Cell B1 | Cell B2 | Cell B3 """

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    passed, explanation = checker.run(document)

    self.assertTrue(passed, f" Malformed table at end not detected: { explanation } ")
2025-03-19 18:01:53 +00:00
2025-03-19 18:01:02 +00:00
def test_incomplete_table_at_end(self):
    """Verify a table lacking its separator row is still detected when its rows are valid."""
    # No header/body separator row is present in this table.
    document = """
Some text before the table .
| Header 1 | Header 2 | Header 3 |
| Cell A1 | Cell A2 | Cell A3 |
| Cell B1 | Cell B2 | Cell B3 | """

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    passed, explanation = checker.run(document)

    self.assertTrue(passed, f" Incomplete table at end not detected: { explanation } ")
2025-03-19 18:01:53 +00:00
2025-03-19 18:01:02 +00:00
def test_table_with_excessive_blank_lines_at_end(self):
    """Verify the table is detected when a long run of blank lines follows it."""
    # The unmodified fixture table followed by many newline characters.
    blank_tail = " \n \n \n \n \n \n \n \n \n \n "
    document = self.markdown_table + blank_tail

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    passed, explanation = checker.run(document)

    self.assertTrue(passed, f" Table with blank lines at end not detected: { explanation } ")
2025-03-19 18:01:53 +00:00
2025-03-19 18:01:02 +00:00
def test_table_at_end_after_long_text(self):
    """Verify the table is detected when it follows a very long run of text."""
    # One sentence repeated 100 times, then the stripped fixture table.
    filler = " Lorem ipsum dolor sit amet, " * 100
    document = filler + " \n " + self.markdown_table.strip()

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    passed, explanation = checker.run(document)

    self.assertTrue(passed, f" Table after long text not detected: { explanation } ")
2025-03-19 18:01:53 +00:00
2025-03-19 18:01:02 +00:00
def test_valid_table_at_eof_without_newline(self):
    """Verify a well-formed table ending the content without a trailing newline is detected."""
    # strip() removes the leading newline and any trailing whitespace,
    # so the last table row is the end of the content.
    document = """
| Header 1 | Header 2 | Header 3 |
| - - - - - - - - | - - - - - - - - | - - - - - - - - |
| Cell A1 | Cell A2 | Cell A3 |
| Cell B1 | Cell B2 | Cell B3 | """.strip()

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Cell A2 ",
    )
    passed, explanation = checker.run(document)

    self.assertTrue(passed, f" Valid table at EOF without newline not detected: { explanation } ")
2025-03-19 17:44:49 +00:00
2025-03-19 18:29:42 +00:00
def test_normalizing(self):
    """Verify a cell is matched under a top heading that is supplied with an embedded line break."""
    survey_table = """ | Question - – Satisfaction on scale of 10 | Response | Resident Sample | Business Sample |
| - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | - - - - - - - - - - | - - - - - - - - - - - - - - - - | - - - - - - - - - - - - - - - - - |
| Planning for and managing residential , commercial and industrial development | Rating of 8 , 9 or 10 | 13 % | 11 % |
| | Average rating | 6.4 | 5.7 |
| | Don ’ t know responses | 11 % | 6 % |
| Environmental protection , support for green projects ( e . g . green grants , building retrofits programs , zero waste ) | Rating of 8 , 9 or 10 | 35 % | 34 % |
| | Average rating | 8.0 | 7.5 |
| | Don ’ t know responses | 8 % | 6 % |
| Providing and maintaining parks and green spaces | Rating of 8 , 9 or 10 | 42 % | 41 % |
| | Average rating | 7.7 | 7.3 |
| | Don ’ t know responses | 1 % | 1 % | """

    # The expected heading contains a newline while the table header does not.
    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" 6 % ",
        top_heading=" Business \n Sample ",
    )
    passed, explanation = checker.run(survey_table)

    self.assertTrue(passed, explanation)
2025-03-19 18:46:07 +00:00
def test_mathematical_minus(self):
    """Verify a cell written with a mathematical minus sign matches an expectation using a plain hyphen."""
    stats_table = """ | Response | Chinese experimenter | White experimenter |
| - - - - - - - - - - | - - - - - - - - - - - - - - - - - - - - - - | - - - - - - - - - - - - - - - - - - - - |
| | Divided attention | Full attention | Divided attention | Full attention |
| Nonverbal | − .34 ( .22 ) | .54 * ( .17 ) | .12 ( .27 ) | − .20 ( .24 ) |
| Verbal | − .25 ( .23 ) | .36 ( .20 ) | .12 ( .27 ) | − .34 ( .22 ) |
"""

    # The table uses the Unicode minus character; the expected cell uses "-".
    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" -.34 (.22) ",
    )
    passed, explanation = checker.run(stats_table)

    self.assertTrue(passed, explanation)
2025-03-19 20:42:04 +00:00
def test_markdown_marker(self):
    """Verify the `up` relation: the cell 9 must have POINTS EARNED directly above it."""
    points_table = """ | CATEGORY | POINTS EARNED |
| - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | - - - - - - - - - - - - - - - - - - |
| Sustainable Sites | 9 |
| Water Efficiency | 3 |
| Energy & Atmosphere | 12 |
| Materials & Resources | 6 |
| Indoor Environmental Quality | 11 |
| Innovation & Design Process | 5 |
| TOTAL | 46 | """

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" 9 ",
        up=" POINTS EARNED ",
    )
    passed, explanation = checker.run(points_table)

    self.assertTrue(passed, explanation)
def test_diffs(self):
    """Exercise `max_diffs` on the `left` relation: one neighbour text must be rejected, a closer one accepted."""
    points_table = """ | CATEGORY | POINTS EARNED |
| - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | - - - - - - - - - - - - - - - - - - |
| Sustainable Sites | 9 |
| Water Efficiency | 3 |
| Energy & Atmosphere | 12 |
| Materials & Resources | 6 |
| Indoor Environmental Quality | 11 |
| Innovation & Design Process | 5 |
| TOTAL | 46 | """

    # Heavily abbreviated left neighbour: expected to fail with max_diffs=2.
    too_different = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" 9 ",
        left=" Sustl Sie ",
        max_diffs=2,
    )
    passed, explanation = too_different.run(points_table)
    self.assertFalse(passed, explanation)

    # Left neighbour missing only its final letter: expected to pass with max_diffs=2.
    close_enough = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" 9 ",
        left=" Sustainable Site ",
        max_diffs=2,
    )
    passed, explanation = close_enough.run(points_table)
    self.assertTrue(passed, explanation)
2025-03-19 18:57:00 +00:00
2025-03-19 20:42:04 +00:00
def test_markdown_marker2(self):
    """Verify the `up` relation is rejected for a neighbour value that is not above the cell.

    The table below has cells whose text is broken across physical lines.
    """
    concentration_table = """ | Concentration
level | [ CO ] | [ SO2 ] | [ NOx ] |
| - - - - - - - - - - - - - - - - - - - - - - - - | - - - - - - - - - - - | - - - - - - - | - - - - - - - - - - |
| Control | 0 μM | 0 μM | 0 nM |
| Low | 250
μM | 8 μM | 0.002 nM |
| Medium | 625 μM | 20 μM | 0.005 nM |
| High | 1250 μM | 40 μM | 0.01 nM | """

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" 20 μM ",
        up=" .002 nM ",
    )
    passed, explanation = checker.run(concentration_table)

    self.assertFalse(passed, explanation)
def test_marker3(self):
    """Verify the `left_heading` check is rejected for a table whose row labels are split across lines."""
    sleep_table = """ | | N | Minimum | Maximum | Gemiddelde | Sd |
| - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - | - - - - - - - | - - - - - - - - - | - - - - - - - - - | - - - - - - - - - - - - | - - - - - |
| Slaapkwaliteit tijdens
gewone nachten | 2017 | 1 , 0 | 6 , 0 | 3 , 9 | 1 , 0 |
| Slaapkwaliteit tijdens
consignatiediensten | 19816 | 1 , 0 | 6 , 0 | 2 , 8 | 1 , 2 |
"""

    expected_heading = " Slaapkwaliteit tijdens \n consignatiediensten "
    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" 2,8 ",
        left_heading=expected_heading,
    )
    passed, explanation = checker.run(sleep_table)

    self.assertFalse(passed, explanation)
def test_big_table(self):
    """Verify the `down` relation across rowspan cells in an HTML table, with up to 5 diffs allowed."""
    survey_html = """ <table>
< caption > Base : Resident respondents ( n = 1 , 315 ) and Business respondents ( n = 397 ) < / caption >
< thead >
< tr >
< th > Question – Satisfaction on scale of 10 < / th >
< th > Response < / th >
< th > Resident Sample < / th >
< th > Business Sample < / th >
< / tr >
< / thead >
< tbody >
< ! - - First category - - >
< tr class = " category-row " >
< td rowspan = " 3 " > Planning for and managing residential , commercial and industrial development < / td >
< td > Rating of 8 , 9 or 10 < / td >
< td > 13 % < / td >
< td > 11 % < / td >
< / tr >
< tr >
< td class = " subcategory " > Average rating < / td >
< td > 6.4 < / td >
< td > 5.7 < / td >
< / tr >
< tr >
< td class = " subcategory " > Don ' t know responses</td>
< td > 11 % < / td >
< td > 6 % < / td >
< / tr >
< ! - - Second category - - >
< tr class = " category-row " >
< td rowspan = " 3 " > Environmental protection , support for green projects ( e . g . green grants , building retrofits programs , zero waste ) < / td >
< td > Rating of 8 , 9 or 10 < / td >
< td > 35 % < / td >
< td > 34 % < / td >
< / tr >
< tr >
< td class = " subcategory " > Average rating < / td >
< td > 8.0 < / td >
< td > 7.5 < / td >
< / tr >
< tr >
< td class = " subcategory " > Don ' t know responses</td>
< td > 8 % < / td >
< td > 6 % < / td >
< / tr >
< ! - - Third category - - >
< tr class = " category-row " >
< td rowspan = " 3 " > Providing and maintaining parks and green spaces < / td >
< td > Rating of 8 , 9 or 10 < / td >
< td > 42 % < / td >
< td > 41 % < / td >
< / tr >
< tr >
< td class = " subcategory " > Average rating < / td >
< td > 7.7 < / td >
< td > 7.3 < / td >
< / tr >
< tr >
< td class = " subcategory " > Don ' t know responses</td>
< td > 1 % < / td >
< td > 1 % < / td >
< / tr >
< / tbody >
< / table >
"""

    # Both cells span three rows; the expected neighbour text carries
    # embedded line breaks that the table text does not.
    first_category = " Planning for and managing residential, commercial and industrial development "
    second_category = " Environmental protection, \n support for green projects \n (e.g. green grants, \n building retrofits programs, \n zero waste) "

    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        max_diffs=5,
        cell=first_category,
        down=second_category,
    )
    passed, explanation = checker.run(survey_html)

    self.assertTrue(passed, explanation)
def test_html_rowspans_colspans(self):
    """Check neighbour and heading relations in an HTML table that uses rowspan and colspan.

    Each case is a set of TableTest keyword arguments plus whether the
    check is expected to pass; cases run in order and stop at the first
    failing assertion.
    """
    sales_html = """ <table>
< thead >
< tr >
< th rowspan = " 2 " > Product Category < / th >
< th rowspan = " 2 " > Product Subcategory < / th >
< th colspan = " 4 " > Quarterly Sales ( $ 000 s ) < / th >
< th rowspan = " 2 " > Annual Total < / th >
< / tr >
< tr >
< th > Q1 < / th >
< th > Q2 < / th >
< th > Q3 < / th >
< th > Q4 < / th >
< / tr >
< / thead >
< tbody >
< tr class = " category " >
< td rowspan = " 4 " > Electronics < / td >
< td > Smartphones < / td >
< td > 245 < / td >
< td > 278 < / td >
< td > 312 < / td >
< td > 389 < / td >
< td > 1 , 224 < / td >
< / tr >
< tr class = " subcategory " >
< td > Laptops < / td >
< td > 187 < / td >
< td > 192 < / td >
< td > 243 < / td >
< td > 297 < / td >
< td > 919 < / td >
< / tr >
< tr class = " subcategory " >
< td > Tablets < / td >
< td > 95 < / td >
< td > 123 < / td >
< td > 135 < / td >
< td > 156 < / td >
< td > 509 < / td >
< / tr >
< tr class = " subcategory " >
< td > Accessories < / td >
< td > 64 < / td >
< td > 72 < / td >
< td > 87 < / td >
< td > 105 < / td >
< td > 328 < / td >
< / tr >
< tr class = " category " >
< td rowspan = " 3 " > Home Appliances < / td >
< td > Refrigerators < / td >
< td > 132 < / td >
< td > 145 < / td >
< td > 151 < / td >
< td > 162 < / td >
< td > 590 < / td >
< / tr >
< tr class = " subcategory " >
< td > Washing Machines < / td >
< td > 98 < / td >
< td > 112 < / td >
< td > 127 < / td >
< td > 143 < / td >
< td > 480 < / td >
< / tr >
< tr class = " subcategory " >
< td > Microwaves < / td >
< td > 54 < / td >
< td > 67 < / td >
< td > 72 < / td >
< td > 84 < / td >
< td > 277 < / td >
< / tr >
< tr class = " category " >
< td rowspan = " 3 " > Furniture < / td >
< td > Sofas < / td >
< td > 112 < / td >
< td > 128 < / td >
< td > 134 < / td >
< td > 142 < / td >
< td > 516 < / td >
< / tr >
< tr class = " subcategory " >
< td > Tables < / td >
< td > 87 < / td >
< td > 95 < / td >
< td > 103 < / td >
< td > 124 < / td >
< td > 409 < / td >
< / tr >
< tr class = " subcategory " >
< td > Chairs < / td >
< td > 76 < / td >
< td > 84 < / td >
< td > 92 < / td >
< td > 110 < / td >
< td > 362 < / td >
< / tr >
< tr class = " total " >
< td colspan = " 2 " > Quarterly Totals < / td >
< td > 1 , 150 < / td >
< td > 1 , 296 < / td >
< td > 1 , 456 < / td >
< td > 1 , 712 < / td >
< td > 5 , 614 < / td >
< / tr >
< / tbody >
< / table > """

    # (relation keyword arguments, expected outcome), in execution order.
    cases = [
        # A rowspan cell counts as the left neighbour of every row it covers.
        ({"cell": " Refrigerators ", "left": " Home Appliances "}, True),
        ({"cell": " Washing Machines ", "left": " Home Appliances "}, True),
        ({"cell": " Microwaves ", "left": " Home Appliances "}, True),
        # Top headings, including the colspan heading over Q1-Q4.
        ({"cell": " Sofas ", "top_heading": " Product Subcategory "}, True),
        ({"cell": " 135 ", "top_heading": " Q3 "}, True),
        ({"cell": " 135 ", "top_heading": " Quarterly Sales ($000s) "}, True),
        ({"cell": " 1,712 ", "top_heading": " Quarterly Sales ($000s) "}, True),
        # Headings of other columns must not match.
        ({"cell": " 135 ", "top_heading": " Q2 "}, False),
        ({"cell": " 135 ", "top_heading": " Q1 "}, False),
        ({"cell": " 135 ", "top_heading": " Q4 "}, False),
        ({"cell": " Home Appliances ", "top_heading": " Product Category "}, True),
        ({"cell": " Washing Machines ", "top_heading": " Product Category "}, False),
        ({"cell": " Washing Machines ", "top_heading": " Q3 "}, False),
        ({"cell": " Washing Machines ", "top_heading": " Quarterly Sales ($000s) "}, False),
        # A rowspan cell has a right neighbour in each row it covers.
        ({"cell": " Electronics ", "right": " Laptops "}, True),
        ({"cell": " Electronics ", "right": " Accessories "}, True),
    ]

    for relation_kwargs, should_pass in cases:
        checker = TableTest(
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=TestType.TABLE.value,
            **relation_kwargs,
        )
        passed, explanation = checker.run(sales_html)
        verify = self.assertTrue if should_pass else self.assertFalse
        verify(passed, explanation)

    # TODO Skipping these for now
    # test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Quarterly Sales ($000s)", down="Q2")
    # result, explanation = test.run(table)
    # self.assertTrue(result, explanation)

    # test = TableTest(pdf="test.pdf", page=1, id="test_id", type=TestType.TABLE.value, cell="Q2", up="Quarterly Sales ($000s)")
    # result, explanation = test.run(table)
    # self.assertTrue(result, explanation)
2025-03-19 23:06:18 +00:00
2025-03-19 20:42:04 +00:00
def test_multiple_markdown_tables(self):
    """Check that cells can be located and verified in each of two markdown tables in one document."""
    document = """
# First Table
| Name | Age | Role |
| - - - - | - - - | - - - - |
| John | 28 | Developer |
| Jane | 32 | Designer |
| Bob | 45 | Manager |
Some text between tables . . .
# Second Table
| Department | Budget | Employees |
| - - - - - - - - - - | - - - - - - | - - - - - - - - - |
| Engineering | 1.2 M | 15 |
| Design | 0.8 M | 8 |
| Marketing | 1.5 M | 12 |
| HR | 0.5 M | 5 |
"""

    # Every case is expected to pass; they run in this order.
    cases = [
        # Cells in the first table
        {"cell": " John ", "right": " 28 "},
        {"cell": " 32 ", "left": " Jane "},
        # Cells in the second table
        {"cell": " Engineering ", "right": " 1.2M "},
        {"cell": " 12 ", "left": " 1.5M "},
        # Top headings resolved within each cell's own table
        {"cell": " Bob ", "top_heading": " Name "},
        {"cell": " HR ", "top_heading": " Department "},
    ]

    for relation_kwargs in cases:
        checker = TableTest(
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=TestType.TABLE.value,
            **relation_kwargs,
        )
        passed, explanation = checker.run(document)
        self.assertTrue(passed, explanation)
def test_multiple_html_tables(self):
    """Check that cells can be located and verified in each of two HTML tables in one document."""
    document = """
< h1 > First Table < / h1 >
< table >
< thead >
< tr >
< th > Country < / th >
< th > Capital < / th >
< th > Population < / th >
< / tr >
< / thead >
< tbody >
< tr >
< td > USA < / td >
< td > Washington DC < / td >
< td > 331 M < / td >
< / tr >
< tr >
< td > France < / td >
< td > Paris < / td >
< td > 67 M < / td >
< / tr >
< tr >
< td > Japan < / td >
< td > Tokyo < / td >
< td > 126 M < / td >
< / tr >
< / tbody >
< / table >
< p > Some text between tables . . . < / p >
< h1 > Second Table < / h1 >
< table >
< thead >
< tr >
< th > Company < / th >
< th > Industry < / th >
< th > Revenue < / th >
< th > Employees < / th >
< / tr >
< / thead >
< tbody >
< tr >
< td > ABC Corp < / td >
< td > Technology < / td >
< td > $ 5 B < / td >
< td > 10 , 000 < / td >
< / tr >
< tr >
< td > XYZ Inc < / td >
< td > Healthcare < / td >
< td > $ 2.5 B < / td >
< td > 8 , 500 < / td >
< / tr >
< tr >
< td > Acme Co < / td >
< td > Manufacturing < / td >
< td > $ 1.8 B < / td >
< td > 15 , 000 < / td >
< / tr >
< tr >
< td > Global LLC < / td >
< td > Finance < / td >
< td > $ 3.2 B < / td >
< td > 6 , 200 < / td >
< / tr >
< / tbody >
< / table >
"""

    # Every case is expected to pass; they run in this order.
    cases = [
        # Cells in the first table
        {"cell": " USA ", "right": " Washington DC "},
        {"cell": " 126M ", "left": " Tokyo "},
        # Cells in the second table
        {"cell": " XYZ Inc ", "right": " Healthcare "},
        {"cell": " 15,000 ", "left": " $1.8B "},
        # Top headings resolved within each cell's own table
        {"cell": " Tokyo ", "top_heading": " Capital "},
        {"cell": " Finance ", "top_heading": " Industry "},
    ]

    for relation_kwargs in cases:
        checker = TableTest(
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=TestType.TABLE.value,
            **relation_kwargs,
        )
        passed, explanation = checker.run(document)
        self.assertTrue(passed, explanation)
def test_mixed_markdown_and_html_tables(self):
    """Check that cells can be verified when one document holds both a markdown and an HTML table."""
    document = """
# Markdown Table
| Product | Price | Quantity |
| - - - - - - - | - - - - - | - - - - - - - - |
| Apple | $ 1.20 | 100 |
| Orange | $ 0.80 | 150 |
| Banana | $ 0.60 | 200 |
< h1 > HTML Table < / h1 >
< table >
< tr >
< th > Month < / th >
< th > Income < / th >
< th > Expenses < / th >
< th > Profit < / th >
< / tr >
< tr >
< td > January < / td >
< td > $ 10 , 000 < / td >
< td > $ 8 , 000 < / td >
< td > $ 2 , 000 < / td >
< / tr >
< tr >
< td > February < / td >
< td > $ 12 , 000 < / td >
< td > $ 9 , 500 < / td >
< td > $ 2 , 500 < / td >
< / tr >
< tr >
< td > March < / td >
< td > $ 15 , 000 < / td >
< td > $ 10 , 200 < / td >
< td > $ 4 , 800 < / td >
< / tr >
< / table >
"""

    # Every case is expected to pass; they run in this order.
    cases = [
        # Cell in the markdown table
        {"cell": " Orange ", "right": " $0.80 "},
        # Cell in the HTML table
        {"cell": " February ", "right": " $12,000 "},
        # Top headings: one from each table
        {"cell": " 100 ", "top_heading": " Quantity "},
        {"cell": " $4,800 ", "top_heading": " Profit "},
    ]

    for relation_kwargs in cases:
        checker = TableTest(
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=TestType.TABLE.value,
            **relation_kwargs,
        )
        passed, explanation = checker.run(document)
        self.assertTrue(passed, explanation)
2025-03-19 21:06:52 +00:00
def test_br_tags_replacement(self):
    """Verify a cell whose text is split by <br/> tags matches the same text written on one line."""
    html_with_breaks = """ <table>
< tr >
< th > Header 1 < / th >
< th > Header 2 < / th >
< / tr >
< tr >
< td > Line 1 < br / > Line 2 < br / > Line 3 < / td >
< td > Single line < / td >
< / tr >
< / table > """

    # The expected cell text separates the three parts with plain spaces.
    checker = TableTest(
        pdf=" test.pdf ",
        page=1,
        id=" test_id ",
        type=TestType.TABLE.value,
        cell=" Line 1 Line 2 Line 3 ",
    )
    passed, explanation = checker.run(html_with_breaks)

    self.assertTrue(passed, explanation)
def test_real_complicated_table(self):
    """Check top-heading lookups in a large HTML table with a three-row header.

    The header mixes colspan, rowspan, <br>, <sup> and non-breaking-space
    entities. All four lookups below are expected to pass.
    """
    study_html = """ <table>
< thead >
< tr >
< th colspan = " 7 " > Table 1 & nbsp ; & nbsp ; Differences in diagnoses , gender and family status for participants with a suicide attempt and those without a suicide attempt within the 12 - month follow - up interval < / th >
< / tr >
< tr class = " header-row " >
< th rowspan = " 2 " > < / th >
< th colspan = " 2 " > Participants with no < br > suicide attempt < br > ( n = 132 ) < sup > a < / sup > < / th >
< th colspan = " 2 " > Participants with a < br > suicide attempt < br > ( n = 43 ) < sup > b < / sup > < / th >
< th colspan = " 3 " > < / th >
< / tr >
< tr class = " header-row " >
< th > n < / th >
< th > % < / th >
< th > n < / th >
< th > % < / th >
< th > χ < sup > 2 < / sup > < / th >
< th > d . f . < / th >
< th > P < / th >
< / tr >
< / thead >
< tbody >
< tr >
< td class = " section-header " > ICD - 10 diagnoses < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F0 < / td >
< td > 1 < / td >
< td > 0.76 < / td >
< td > 0 < / td >
< td > 0.00 < / td >
< td > 0.00 < / td >
< td > 1 < / td >
< td > 1.00 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F1 < / td >
< td > 17 < / td >
< td > 12.88 < / td >
< td > 12 < / td >
< td > 27.91 < / td >
< td > 4.39 < / td >
< td > 1 < / td >
< td > 0.04 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F2 < / td >
< td > 1 < / td >
< td > 0.76 < / td >
< td > 0 < / td >
< td > 0.00 < / td >
< td > 0.00 < / td >
< td > 1 < / td >
< td > 1.00 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F3 < / td >
< td > 106 < / td >
< td > 80.30 < / td >
< td > 31 < / td >
< td > 72.09 < / td >
< td > 0.74 < / td >
< td > 1 < / td >
< td > 0.39 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F4 < / td >
< td > 42 < / td >
< td > 31.82 < / td >
< td > 17 < / td >
< td > 39.53 < / td >
< td > 0.61 < / td >
< td > 1 < / td >
< td > 0.43 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F5 < / td >
< td > 5 < / td >
< td > 3.79 < / td >
< td > 5 < / td >
< td > 11.63 < / td >
< td > 2.44 < / td >
< td > 1 < / td >
< td > 0.12 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F6 < / td >
< td > 20 < / td >
< td > 15.15 < / td >
< td > 19 < / td >
< td > 44.19 < / td >
< td > 14.48 < / td >
< td > 1 < / td >
< td > 0.00 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F7 < / td >
< td > 0 < / td >
< td > 0.00 < / td >
< td > 0 < / td >
< td > 0.00 < / td >
< td > — < / td >
< td > — < / td >
< td > — < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F8 < / td >
< td > 1 < / td >
< td > 0.76 < / td >
< td > 0 < / td >
< td > 0.00 < / td >
< td > 0.00 < / td >
< td > 1 < / td >
< td > 1.00 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; F9 < / td >
< td > 2 < / td >
< td > 1.52 < / td >
< td > 1 < / td >
< td > 2.33 < / td >
< td > 0.00 < / td >
< td > 1 < / td >
< td > 1.00 < / td >
< / tr >
< tr >
< td class = " section-header " > Gender < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< td > 3.09 < / td >
< td > 2 < / td >
< td > 0.21 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; Female < / td >
< td > 75 < / td >
< td > 56.8 < / td >
< td > 24 < / td >
< td > 55.8 < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; Male < / td >
< td > 57 < / td >
< td > 43.2 < / td >
< td > 18 < / td >
< td > 41.9 < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; Diverse < / td >
< td > 0 < / td >
< td > 0 < / td >
< td > 1 < / td >
< td > 2.3 < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< / tr >
< tr >
< td class = " section-header " > Family status < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< td > 4.87 < / td >
< td > 4 < / td >
< td > 0.30 < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; Single < / td >
< td > 55 < / td >
< td > 41.7 < / td >
< td > 14 < / td >
< td > 32.6 < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; Partnership < / td >
< td > 25 < / td >
< td > 18.9 < / td >
< td > 9 < / td >
< td > 20.9 < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; Married < / td >
< td > 27 < / td >
< td > 20.5 < / td >
< td > 5 < / td >
< td > 11.6 < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; Divorced < / td >
< td > 20 < / td >
< td > 15.2 < / td >
< td > 11 < / td >
< td > 25.6 < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< / tr >
< tr >
< td > & nbsp ; & nbsp ; Widowed < / td >
< td > 1 < / td >
< td > 0.8 < / td >
< td > 1 < / td >
< td > 2.3 < / td >
< td > < / td >
< td > < / td >
< td > < / td >
< / tr >
< / tbody >
< tfoot >
< tr >
< td colspan = " 8 " class = " footnote " >
F0 : Organic , including symptomatic , mental disorders ; F1 : Mental and behavioural disorders due to psychoactive substance use ; F2 : Schizophrenia , schizotypal and delusional disorders ; F3 : affective disorders ; F4 : Neurotic , stress - related and somatoform disorders ; F5 : Behavioural syndromes associated with physiological disturbances and physical factors ; F6 : Disorders of adult personality and behaviour ; F7 : Mental retardation ; F8 : Disorders of psychological development ; F9 : Behavioural and emotional disorders with onset usually occurring in childhood and adolescence . < br >
a . 75.43 % of the total sample with full information on suicide reattempts within the entire 12 - month follow - up interval . < br >
b . 24.57 % of the total sample with full information on suicide reattempts within the entire 12 - month follow - up interval .
< / td >
< / tr >
< / tfoot >
< / table > """

    # (cell text, expected top heading), in execution order.
    heading_cases = [
        (" 4.39 ", " χ2 "),
        (" 12.88 ", " % "),
        # Account for the superscript in the header
        (" 12.88 ", " Participants with no suicide attempt (n = 132)a "),
        (
            " 12.88 ",
            " Table 1 Differences in diagnoses, gender and family status for participants with a suicide attempt and those without a suicide attempt within the 12-month follow-up interval ",
        ),
    ]

    for cell_text, heading_text in heading_cases:
        checker = TableTest(
            pdf=" test.pdf ",
            page=1,
            id=" test_id ",
            type=TestType.TABLE.value,
            cell=cell_text,
            top_heading=heading_text,
        )
        passed, explanation = checker.run(study_html)
        self.assertTrue(passed, explanation)
2025-03-19 20:42:04 +00:00
2025-03-19 17:44:49 +00:00
class TestBaselineTest(unittest.TestCase):
    """Test the BaselineTest class.

    Each test builds a BaselineTest for page 1 of ``test.pdf`` and checks the
    ``(result, explanation)`` pair returned by ``run()`` for one kind of
    content: normal text, whitespace only, repeated text, or text containing
    characters the baseline check rejects.
    """

    def test_valid_initialization(self):
        """Test that valid initialization works and keeps ``max_repeats``."""
        test = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value, max_repeats=50)
        self.assertEqual(test.max_repeats, 50)

    def test_non_empty_content(self):
        """Test that non-empty content passes."""
        test = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
        result, explanation = test.run("This is some normal content")
        # Surface the explanation on failure so the rejection reason is visible.
        self.assertTrue(result, explanation)

    def test_empty_content(self):
        """Test that whitespace-only content fails."""
        test = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
        result, explanation = test.run(" \n \t ")
        self.assertFalse(result)
        self.assertIn("no alpha numeric characters", explanation)

    def test_repeating_content(self):
        """Test that highly repeating content fails."""
        test = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value, max_repeats=2)
        # Create highly repeating content - repeat "abc" many times
        repeating_content = "abc" * 10
        result, explanation = test.run(repeating_content)
        self.assertFalse(result)
        self.assertIn("repeating", explanation)

    def test_content_with_disallowed_characters(self):
        """Test that content with disallowed characters fails."""
        test = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
        result, explanation = test.run("This has Chinese characters: 你好")
        self.assertFalse(result)
        self.assertIn("disallowed characters", explanation)

    def test_content_with_emoji(self):
        """Test that content with emoji fails and the emoji is reported."""
        test = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
        result, explanation = test.run("This has emoji: 😊")
        self.assertFalse(result)
        self.assertIn("disallowed characters", explanation)
        self.assertIn("😊", explanation)

    def test_content_with_mandarin(self):
        """Test that CJK characters embedded in Latin text still fail."""
        test = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
        result, explanation = test.run("asdfasdfas維基百科/中文asdfw")
        self.assertFalse(result)
        self.assertIn("disallowed characters", explanation)

    def test_valid_content(self):
        """Test that valid content passes all checks."""
        test = BaselineTest(pdf="test.pdf", page=1, id="test_id", type=TestType.BASELINE.value)
        content = "This is some normal content with proper English letters and no suspicious repetition."
        result, explanation = test.run(content)
        self.assertTrue(result, explanation)


class TestMathTest(unittest.TestCase):
    """Test the MathTest class.

    Covers construction/validation of MathTest and the ``(result, explanation)``
    pair returned by ``run()`` for matching and non-matching equations.

    The tests deliberately do not wrap their bodies in ``try/except Exception``:
    that would also catch the AssertionError raised by the assertions
    themselves and replace the real traceback with a generic failure message.
    unittest already reports unexpected exceptions as errors.
    """

    def test_valid_initialization(self):
        """Test that valid initialization works and keeps the ``math`` value."""
        test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
        self.assertEqual(test.math, "a + b = c")

    def test_invalid_test_type(self):
        """Test that invalid test type raises ValidationError."""
        with self.assertRaises(ValidationError):
            MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.PRESENT.value, math="a + b = c")

    def test_empty_math(self):
        """Test that empty math raises ValidationError."""
        with self.assertRaises(ValidationError):
            MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="")

    def test_exact_math_match(self):
        """Test exact match of math equation."""
        test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
        # Test content with exact math match
        content = "Here is an equation: $$a + b = c$$"
        result, explanation = test.run(content)
        self.assertTrue(result, explanation)

    def test_rendered_math_match(self):
        """Test rendered match of math equation."""
        test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
        # Test content with different but equivalent math
        content = "Here is an equation: $$a+b=c$$"
        result, explanation = test.run(content)
        self.assertTrue(result, explanation)

    def test_no_math_match(self):
        """Test no match of math equation."""
        test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
        # Test content with no matching math
        content = "Here is an equation: $$x + y = z$$"
        result, explanation = test.run(content)
        self.assertFalse(result)
        self.assertIn("No match found", explanation)

    def test_different_math_delimiters(self):
        """Test different math delimiters."""
        test = MathTest(pdf="test.pdf", page=1, id="test_id", type=TestType.MATH.value, math="a + b = c")
        # Test different delimiters
        delimiters = [
            "$$a + b = c$$",  # $$...$$
            "$a + b = c$",  # $...$
            "\\(a + b = c\\)",  # \(...\)
            "\\[a + b = c\\]",  # \[...\]
        ]

        for delim in delimiters:
            # subTest reports each failing delimiter individually instead of
            # stopping at the first one.
            with self.subTest(delimiter=delim):
                content = f"Here is an equation: {delim}"
                result, explanation = test.run(content)
                self.assertTrue(result, explanation)


2025-03-19 17:44:49 +00:00
# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()