diff --git a/olmocr/bench/synth/mine_html_templates.py b/olmocr/bench/synth/mine_html_templates.py
index 6e16220..afb1ac9 100644
--- a/olmocr/bench/synth/mine_html_templates.py
+++ b/olmocr/bench/synth/mine_html_templates.py
@@ -729,6 +729,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
"type": TestType.TABLE.value,
"cell": cell_text,
"max_diffs": 0,
+ "ignore_markdown_tables": True,
}
# Check cell up
@@ -948,6 +949,7 @@ def generate_tests_from_html(html_content: str, pdf_id: str, page_num: int, rand
"type": "math",
"math": equation,
"max_diffs": 0,
+ "ignore_dollar_delimited": True,
}
)
diff --git a/olmocr/bench/tests.py b/olmocr/bench/tests.py
index e763e5f..c4d621f 100644
--- a/olmocr/bench/tests.py
+++ b/olmocr/bench/tests.py
@@ -633,6 +633,8 @@ class TableTest(BasePDFTest):
top_heading: str = ""
left_heading: str = ""
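+ # When True, markdown tables in the candidate text are not parsed; only HTML tables are checked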
+ ignore_markdown_tables: bool = False
+
def __post_init__(self):
super().__post_init__()
if self.type != TestType.TABLE.value:
@@ -670,8 +672,9 @@ class TableTest(BasePDFTest):
threshold = max(0.5, threshold)
# Parse tables based on content_type
- md_tables = parse_markdown_tables(content)
- tables_to_check.extend(md_tables)
+ if not self.ignore_markdown_tables:
+ md_tables = parse_markdown_tables(content)
+ tables_to_check.extend(md_tables)
html_tables = parse_html_tables(content)
tables_to_check.extend(html_tables)
@@ -926,6 +929,8 @@ class BaselineTest(BasePDFTest):
class MathTest(BasePDFTest):
math: str
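+ # When True, $...$ and $$...$$ spans in the candidate text are not treated as math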
+ ignore_dollar_delimited: bool = False
+
def __post_init__(self):
super().__post_init__()
if self.type != TestType.MATH.value:
@@ -941,12 +946,16 @@ class MathTest(BasePDFTest):
def run(self, content: str) -> Tuple[bool, str]:
# Store both the search pattern and the full pattern to replace
patterns = [
- (r"\$\$(.+?)\$\$", r"\$\$(.+?)\$\$"), # $$...$$
(r"\\\((.+?)\\\)", r"\\\((.+?)\\\)"), # \(...\)
(r"\\\[(.+?)\\\]", r"\\\[(.+?)\\\]"), # \[...\]
- (r"\$(.+?)\$", r"\$(.+?)\$"), # $...$
]
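+ # Optionally also match display ($$...$$) and inline ($...$) dollar-delimited math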
+ if not self.ignore_dollar_delimited:
+ patterns.extend([
+ (r"\$\$(.+?)\$\$", r"\$\$(.+?)\$\$"), # $$...$$
+ (r"\$(.+?)\$", r"\$(.+?)\$"), # $...$])
+ ])
+
equations = []
modified_content = content
diff --git a/scripts/check_contamination.py b/scripts/check_contamination.py
index e4a270b..4c2c521 100755
--- a/scripts/check_contamination.py
+++ b/scripts/check_contamination.py
@@ -28,6 +28,7 @@ import sqlite3
import argparse
from pathlib import Path
import re
+import os
def get_bench_urls(bench_data_dir):
@@ -70,7 +71,125 @@ def local_path_to_short_hash(local_path):
return None
-def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
+def find_and_handle_contaminated_files(metadata_jsonl_path, contaminated_pdf_ids, delete_mode=False):
+ """Find and optionally delete files related to contaminated PDFs.
+
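+ Args:
+ metadata_jsonl_path: Path to the metadata JSONL; output directories are derived from its location
+ contaminated_pdf_ids: PDF IDs whose generated files should be removed
+ delete_mode: If False (the default), only report what would be deleted
+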
+ Returns:
+ List of files that were deleted or would be deleted
+ """
+ # Get the base directory from metadata jsonl path
+ metadata_dir = Path(metadata_jsonl_path).parent
+ output_dir = metadata_dir.parent # Go up one level from metadata directory
+
+ # Get the name from the metadata jsonl filename (e.g., "synthetic" from "synthetic.jsonl")
+ name = Path(metadata_jsonl_path).stem
+
+ files_to_delete = []
+
+ for pdf_id in contaminated_pdf_ids:
+ # Pattern for files related to this pdf_id
+ # Based on mine_html_templates.py, the files are named with pattern:
+ # {pdf_id}_page{page_num}.{extension}
+
+ # Find HTML files
+ html_dir = output_dir / "html" / name
+ if html_dir.exists():
+ for html_file in html_dir.glob(f"{pdf_id}_page*.html"):
+ files_to_delete.append(html_file)
+
+ # Find PDF files (both original and rendered)
+ pdfs_dir = output_dir / "pdfs" / name
+ if pdfs_dir.exists():
+ for pdf_file in pdfs_dir.glob(f"{pdf_id}_page*.pdf"):
+ files_to_delete.append(pdf_file)
+
+ # Find markdown files in training directory
+ training_dir = output_dir / "training" / name
+ if training_dir.exists():
+ for md_file in training_dir.glob(f"{pdf_id}_page*.md"):
+ files_to_delete.append(md_file)
+ # Also check for PDF symlinks
+ for pdf_link in training_dir.glob(f"{pdf_id}_page*.pdf"):
+ files_to_delete.append(pdf_link)
+
+ # Find files in bench_data directory
+ bench_data_dir = output_dir / "bench_data"
+
+ # Check synthetic PDFs subdirectory
+ bench_synthetic_dir = bench_data_dir / "pdfs" / name
+ if bench_synthetic_dir.exists():
+ for pdf_file in bench_synthetic_dir.glob(f"{pdf_id}_page*.pdf"):
+ files_to_delete.append(pdf_file)
+
+ # Check claude_original subdirectory
+ claude_original_dir = bench_data_dir / "claude_original" / name
+ if claude_original_dir.exists():
+ for md_file in claude_original_dir.glob(f"{pdf_id}_page*.md"):
+ files_to_delete.append(md_file)
+
+ # Remove tests from bench_data JSONL file
+ jsonl_file = bench_data_dir / f"{name}.jsonl"
+ if jsonl_file.exists():
+ # Read all tests
+ remaining_tests = []
+ removed_tests = 0
+
+ with open(jsonl_file, 'r') as f:
+ for line in f:
+ try:
+ test = json.loads(line)
+ # Check if this test belongs to a contaminated PDF
+ # Test PDFs are in format "{name}/{pdf_id}_page{page_num}.pdf"
+ test_pdf = test.get('pdf', '')
+ is_contaminated = False
+ for pdf_id in contaminated_pdf_ids:
+ if f"{pdf_id}_page" in test_pdf:
+ is_contaminated = True
+ removed_tests += 1
+ break
+
+ if not is_contaminated:
+ remaining_tests.append(test)
+ except json.JSONDecodeError:
+ continue
+
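+ # Only rewrite the JSONL when at least one contaminated test was filtered out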
+ if removed_tests > 0:
+ if delete_mode:
+ # Rewrite the file without contaminated tests
+ with open(jsonl_file, 'w') as f:
+ for test in remaining_tests:
+ f.write(json.dumps(test) + '\n')
+ print(f"Removed {removed_tests} tests from {jsonl_file}")
+ else:
+ print(f"Would remove {removed_tests} tests from {jsonl_file}")
+
+ # Print summary of files to delete
+ if files_to_delete:
+ print(f"\n{'Deleting' if delete_mode else 'Would delete'} {len(files_to_delete)} files:")
+ deleted_count = 0
+ for file_path in sorted(files_to_delete):
+ relative_path = file_path.relative_to(output_dir) if output_dir in file_path.parents else file_path
+ print(f" - {relative_path}")
+
+ # Actually delete if in delete mode
+ if delete_mode:
+ try:
+ if file_path.is_symlink() or file_path.exists():
+ file_path.unlink()
+ deleted_count += 1
+ except Exception as e:
+ print(f" Error deleting: {e}")
+
+
+ if delete_mode:
+ print(f"\nSuccessfully deleted {len(files_to_delete)} files")
+ else:
+ print(f"\nTo actually delete these files, run with --delete flag")
+ else:
+ print("\nNo files found to delete")
+
+ return files_to_delete
+
+
+def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path, delete_mode=False):
"""Main function to check for contamination between bench data and training data."""
print(f"Checking contamination...")
print(f"Bench data directory: {bench_data_dir}")
@@ -173,26 +292,85 @@ def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
# Step 4: Check for contamination
print("Step 4: Checking for contamination...")
contaminated_urls = bench_urls.intersection(real_urls)
-
+
+ # Track which PDF IDs are contaminated (including those with blank URLs)
+ contaminated_pdf_ids = set()
+
+ # Add PDF IDs with blank URLs to contaminated set
+ for entry in blank_url_entries:
+ pdf_id = entry.get('pdf_id', 'N/A')
+ if pdf_id != 'N/A':
+ contaminated_pdf_ids.add(pdf_id)
+
if contaminated_urls:
- print(f"\n⚠️ CONTAMINATION DETECTED! Found {len(contaminated_urls)} matching URLs:")
- for url in sorted(contaminated_urls)[:10]: # Show first 10
- print(f" - {url}")
- if len(contaminated_urls) > 10:
- print(f" ... and {len(contaminated_urls) - 10} more")
+ # Find the pdf_ids that correspond to contaminated URLs
+ for metadata_entry in metadata_entries:
+ source_url = metadata_entry.get('source_url', '')
+ pdf_id = metadata_entry.get('pdf_id', 'N/A')
+ pdf_hash = None
+
+ # Process URL to get hash
+ if source_url.startswith("s3://"):
+ pdf_hash = s3_url_to_hash(source_url)
+ elif source_url.startswith("./"):
+ short_hash = local_path_to_short_hash(source_url)
+ if short_hash:
+ conn_temp = sqlite3.connect(sqlite_db_path)
+ cursor_temp = conn_temp.cursor()
+ cursor_temp.execute("SELECT full_hash FROM substr_to_full_hash WHERE pdf_hash = ?", (short_hash,))
+ result = cursor_temp.fetchone()
+ if result:
+ pdf_hash = result[0]
+ conn_temp.close()
+
+ # If we have a hash, look up the real URI
+ if pdf_hash:
+ conn_temp = sqlite3.connect(sqlite_db_path)
+ cursor_temp = conn_temp.cursor()
+ cursor_temp.execute("SELECT uri FROM pdf_mapping WHERE pdf_hash = ?", (pdf_hash,))
+ result = cursor_temp.fetchone()
+ conn_temp.close()
+
+ if result and result[0] and result[0] in contaminated_urls:
+ contaminated_pdf_ids.add(pdf_id)
+
+ # Check if we have any contamination (URL matches or blank URLs)
+ total_contaminated = len(contaminated_urls) + len(blank_url_entries)
+
+ if total_contaminated > 0:
+ print(f"\n⚠️ CONTAMINATION DETECTED!")
+ if contaminated_urls:
+ print(f" - Found {len(contaminated_urls)} matching URLs")
+ if blank_url_entries:
+ print(f" - Found {len(blank_url_entries)} entries with blank URLs (treated as contaminated)")
+ print(f" - Total contaminated PDF IDs: {len(contaminated_pdf_ids)}")
+
+ if contaminated_urls:
+ print(f"\nMatching URLs (first 10):")
+ for url in sorted(contaminated_urls)[:10]:
+ print(f" - {url}")
+ if len(contaminated_urls) > 10:
+ print(f" ... and {len(contaminated_urls) - 10} more")
+
+ # Handle file deletion/dry run
+ if contaminated_pdf_ids:
+ print(f"\nProcessing files for {len(contaminated_pdf_ids)} contaminated PDFs...")
+ find_and_handle_contaminated_files(metadata_jsonl_path, contaminated_pdf_ids, delete_mode)
else:
- print("\n✅ No contamination detected. Bench URLs and training URLs are disjoint.")
-
+ print("\n✅ No contamination detected. Bench URLs and training URLs are disjoint, and no blank URLs found.")
+
# Print summary statistics
print(f"\nSummary:")
print(f" Bench URLs: {len(bench_urls)}")
print(f" Training URLs (mapped): {len(real_urls)}")
print(f" Contaminated URLs: {len(contaminated_urls)}")
+ print(f" Blank URL entries: {len(blank_url_entries)}")
+ print(f" Total contaminated: {total_contaminated}")
if bench_urls:
contamination_rate = (len(contaminated_urls) / len(bench_urls)) * 100
print(f" Contamination rate: {contamination_rate:.2f}%")
-
- return len(contaminated_urls)
+
+ return total_contaminated
def main():
@@ -211,7 +389,12 @@ def main():
"sqlite_db",
help="Path to SQLite database with pdf_mapping table"
)
-
+ parser.add_argument(
+ "--delete",
+ action="store_true",
+ help="Delete contaminated files (default is dry run)"
+ )
+
args = parser.parse_args()
# Validate paths
@@ -231,7 +414,8 @@ def main():
contaminated_count = check_contamination(
args.bench_data_dir,
args.metadata_jsonl,
- args.sqlite_db
+ args.sqlite_db,
+ delete_mode=args.delete
)
# Return non-zero exit code if contamination found