From c4dcc4ded4002d81c8cad05e815ea19277478b8f Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Fri, 24 Oct 2025 22:15:25 +0000
Subject: [PATCH] Table test gen: skip multi-line cells, normalize relation text

---
 olmocr/bench/synth/mine_html_templates.py | 24 +++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py
index ab411fb..9a0ae52 100644
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -913,33 +913,33 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
 
             if rowcol in table_data.up_relations and len(table_data.up_relations[rowcol]) > 0:
                 relation = random_gen.choice(list(table_data.up_relations[rowcol]))
-                if len(table_data.cell_text[relation].strip()) > 1:
-                    test_data["up"] = table_data.cell_text[relation]
+                if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]:
+                    test_data["up"] = normalize_text(table_data.cell_text[relation])
 
             if rowcol in table_data.down_relations and len(table_data.down_relations[rowcol]) > 0:
                 relation = random_gen.choice(list(table_data.down_relations[rowcol]))
-                if len(table_data.cell_text[relation].strip()) > 1:
-                    test_data["down"] = table_data.cell_text[relation]
+                if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]:
+                    test_data["down"] = normalize_text(table_data.cell_text[relation])
 
             if rowcol in table_data.left_relations and len(table_data.left_relations[rowcol]) > 0:
                 relation = random_gen.choice(list(table_data.left_relations[rowcol]))
-                if len(table_data.cell_text[relation].strip()) > 1:
-                    test_data["left"] = table_data.cell_text[relation]
+                if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]:
+                    test_data["left"] = normalize_text(table_data.cell_text[relation])
 
             if rowcol in table_data.right_relations and len(table_data.right_relations[rowcol]) > 0:
                 relation = random_gen.choice(list(table_data.right_relations[rowcol]))
-                if len(table_data.cell_text[relation].strip()) > 1:
-                    test_data["right"] = table_data.cell_text[relation]
+                if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]:
+                    test_data["right"] = normalize_text(table_data.cell_text[relation])
 
             if len(table_data.left_heading_relations(*rowcol)) > 0:
                 relation = random_gen.choice(list(table_data.left_heading_relations(*rowcol)))
-                if len(table_data.cell_text[relation].strip()) > 1:
-                    test_data["left_heading"] = table_data.cell_text[relation]
+                if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]:
+                    test_data["left_heading"] = normalize_text(table_data.cell_text[relation])
 
             if len(table_data.top_heading_relations(*rowcol)) > 0:
                 relation = random_gen.choice(list(table_data.top_heading_relations(*rowcol)))
-                if len(table_data.cell_text[relation].strip()) > 1:
-                    test_data["top_heading"] = table_data.cell_text[relation]
+                if len(table_data.cell_text[relation].strip()) > 1 and "\n" not in table_data.cell_text[relation]:
+                    test_data["top_heading"] = normalize_text(table_data.cell_text[relation])
 
             # Only add the test if we have at least one relation
             if any(x in test_data for x in ["up", "down", "left", "right", "top_heading", "left_heading"]):