From c4dcc4ded4002d81c8cad05e815ea19277478b8f Mon Sep 17 00:00:00 2001 From: Jake Poznanski Date: Fri, 24 Oct 2025 22:15:25 +0000 Subject: [PATCH] Some small table test gen fixes --- olmocr/bench/synth/mine_html_templates.py | 24 +++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py index ab411fb..9a0ae52 100644 --- a/olmocr/bench/synth/mine_html_templates.py +++ b/olmocr/bench/synth/mine_html_templates.py @@ -913,33 +913,33 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand if rowcol in table_data.up_relations and len(table_data.up_relations[rowcol]) > 0: relation = random_gen.choice(list(table_data.up_relations[rowcol])) - if len(table_data.cell_text[relation].strip()) > 1: - test_data["up"] = table_data.cell_text[relation] + if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]: + test_data["up"] = normalize_text(table_data.cell_text[relation]) if rowcol in table_data.down_relations and len(table_data.down_relations[rowcol]) > 0: relation = random_gen.choice(list(table_data.down_relations[rowcol])) - if len(table_data.cell_text[relation].strip()) > 1: - test_data["down"] = table_data.cell_text[relation] + if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]: + test_data["down"] = normalize_text(table_data.cell_text[relation]) if rowcol in table_data.left_relations and len(table_data.left_relations[rowcol]) > 0: relation = random_gen.choice(list(table_data.left_relations[rowcol])) - if len(table_data.cell_text[relation].strip()) > 1: - test_data["left"] = table_data.cell_text[relation] + if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]: + test_data["left"] = normalize_text(table_data.cell_text[relation]) if rowcol in table_data.right_relations and len(table_data.right_relations[rowcol]) > 0: relation = random_gen.choice(list(table_data.right_relations[rowcol])) - if len(table_data.cell_text[relation].strip()) > 1: - test_data["right"] = table_data.cell_text[relation] + if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]: + test_data["right"] = normalize_text(table_data.cell_text[relation]) if len(table_data.left_heading_relations(*rowcol)) > 0: relation = random_gen.choice(list(table_data.left_heading_relations(*rowcol))) - if len(table_data.cell_text[relation].strip()) > 1: - test_data["left_heading"] = table_data.cell_text[relation] + if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]: + test_data["left_heading"] = normalize_text(table_data.cell_text[relation]) if len(table_data.top_heading_relations(*rowcol)) > 0: relation = random_gen.choice(list(table_data.top_heading_relations(*rowcol))) - if len(table_data.cell_text[relation].strip()) > 1: - test_data["top_heading"] = table_data.cell_text[relation] + if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]: + test_data["top_heading"] = normalize_text(table_data.cell_text[relation]) # Only add the test if we have at least one relation if any(x in test_data for x in ["up", "down", "left", "right", "top_heading", "left_heading"]):