From 90a7443b2bd80ab33bf124f50bf16b9935a90c6d Mon Sep 17 00:00:00 2001
From: Jake Poznanski <jakep@allenai.org>
Date: Mon, 25 Aug 2025 20:41:04 +0000
Subject: [PATCH] Working to clean up miner script

---
 olmocr/bench/synth/mine_html_templates.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py
index c72595c..1650079 100644
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -787,6 +787,10 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
     # Convert HTML to markdown to get cleaner text for presence and ordering tests
     markdown_content = html_to_markdown_with_frontmatter(html_content)
     
+    # Remove any raw HTML <table> blocks from the markdown content.
+    # Tables can persist in markdown as raw HTML, and we want to exclude them (nested tables are not handled).
+    markdown_content = re.sub(r'<table[^>]*>.*?</table>', '', markdown_content, flags=re.DOTALL | re.IGNORECASE)
+    
     # Extract just the content part (after frontmatter)
     markdown_lines = markdown_content.split('\n')
     content_start_idx = 0
@@ -814,7 +818,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, verb
                 sentence_str = sentence_str.strip()
 
                 if sentence_str:
-                    # Skip HTML table content that might still be in markdown
+                    # Skip HTML content that might still be in markdown
                     if not sentence_str.startswith('<') and not sentence_str.endswith('>'):
                         # Skip image placeholders - match any markdown image syntax ![...](...)
                         if re.search(r'!\[.*?\]\(.*?\)', sentence_str):
@@ -1039,9 +1043,9 @@ async def process_pdf(pdf_info, args, client, pdf_filter=None):
             return None
 
         # Create output directories
-        html_dir = os.path.join(args.output_dir, "html")
-        pdfs_dir = os.path.join(args.output_dir, "pdfs")
-        training_dir = os.path.join(args.output_dir, "training")
+        html_dir = os.path.join(args.output_dir, "html", args.name)
+        pdfs_dir = os.path.join(args.output_dir, "pdfs", args.name)
+        training_dir = os.path.join(args.output_dir, "training", args.name)
         bench_data_dir = os.path.join(args.output_dir, "bench_data")
         bench_synthetic_dir = os.path.join(bench_data_dir, "pdfs", args.name)
         claude_original_dir = os.path.join(bench_data_dir, "claude_original", args.name)