mirror of https://github.com/allenai/olmocr.git
synced 2025-11-16 18:39:29 +00:00

Mix contamination checker script

This commit is contained in:
parent 9818797fbc
commit 1197c35808
@@ -84,17 +84,17 @@ def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
     # Step 2: Read metadata JSONL and process source URLs
     print("Step 2: Processing metadata JSONL...")
-    source_urls = []
+    metadata_entries = []
     with open(metadata_jsonl_path, 'r') as f:
         for line_num, line in enumerate(f, 1):
             try:
                 data = json.loads(line)
                 if 'source_url' in data:
-                    source_urls.append(data['source_url'])
+                    metadata_entries.append(data)
             except json.JSONDecodeError:
                 print(f"Warning: Could not parse line {line_num}")

-    print(f"Found {len(source_urls)} source URLs in metadata\n")
+    print(f"Found {len(metadata_entries)} entries with source URLs in metadata\n")

     # Step 3: Map URLs to hashes and query database
     print("Step 3: Mapping URLs and querying database...")
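For context, a small runnable sketch of the parsing loop after this change; the records are invented, and only the 'source_url' and 'pdf_id' keys are attested elsewhere in this diff:

import json

# Illustrative metadata.jsonl records; only the 'source_url' and
# 'pdf_id' keys appear in this diff, the values are made up.
sample_lines = [
    '{"pdf_id": "0001", "source_url": "s3://bucket/docs/0001.pdf"}',
    '{"pdf_id": "0002", "source_url": "/local/path/0002.pdf"}',
    'not valid json',
]

metadata_entries = []
for line_num, line in enumerate(sample_lines, 1):
    try:
        data = json.loads(line)
        if 'source_url' in data:
            # Keep the whole record instead of just the URL string,
            # so pdf_id stays available for later reporting.
            metadata_entries.append(data)
    except json.JSONDecodeError:
        print(f"Warning: Could not parse line {line_num}")

print(f"Found {len(metadata_entries)} entries with source URLs in metadata")

Carrying the full dict through the loop is what lets the new blank-URL report below print pdf_id alongside the URL.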
@@ -106,8 +106,11 @@ def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
     s3_count = 0
     local_count = 0
     empty_result_count = 0
+    blank_url_entries = []  # Store entries with blank URLs

-    for source_url in source_urls:
+    for metadata_entry in metadata_entries:
+        source_url = metadata_entry.get('source_url')
+        pdf_id = metadata_entry.get('pdf_id', 'N/A')
         pdf_hash = None

         # Handle S3 URLs
@@ -134,6 +137,12 @@ def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
             # Check if the looked up URL is empty/blank
             if result[0] == "" or result[0] is None:
                 empty_result_count += 1
+                blank_url_entries.append({
+                    'pdf_id': pdf_id,
+                    'source_url': source_url,
+                    'pdf_hash': pdf_hash,
+                    'db_result': result[0]
+                })
             else:
                 real_urls.add(result[0])
         else:
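The result tuple consumed here comes from a SQLite lookup that sits outside this hunk. A minimal sketch of what such a lookup could look like, assuming a hypothetical pdf_mapping(pdf_hash, real_url) table; the actual schema is not shown in this diff:

import sqlite3

# Hypothetical schema and query; the table and column names are
# assumptions, not taken from the actual script.
def lookup_real_url(conn: sqlite3.Connection, pdf_hash: str):
    cur = conn.execute(
        "SELECT real_url FROM pdf_mapping WHERE pdf_hash = ?",
        (pdf_hash,),
    )
    return cur.fetchone()

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE pdf_mapping (pdf_hash TEXT, real_url TEXT)")
conn.execute("INSERT INTO pdf_mapping VALUES ('abc123', '')")
conn.execute("INSERT INTO pdf_mapping VALUES ('def456', 'http://example.com/a.pdf')")

# A blank real_url like ('',) is what triggers the new bookkeeping above,
# while a populated one feeds real_urls; None means the hash is unmapped.
print(lookup_real_url(conn, "abc123"))   # ('',)
print(lookup_real_url(conn, "def456"))   # ('http://example.com/a.pdf',)
print(lookup_real_url(conn, "missing"))  # None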
@@ -150,6 +159,17 @@ def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
     if unmapped_count > 0:
         print(f"Warning: {unmapped_count} URLs could not be mapped\n")

+    # Print entries with blank URLs
+    if blank_url_entries:
+        print(f"\n⚠️ Entries with blank URLs ({len(blank_url_entries)} total):")
+        for entry in blank_url_entries[:20]:  # Show first 20
+            print(f"  PDF ID: {entry['pdf_id']}")
+            print(f"  Source URL: {entry['source_url']}")
+            print(f"  PDF Hash: {entry['pdf_hash']}")
+            print(f"  DB Result: {repr(entry['db_result'])}")
+        if len(blank_url_entries) > 20:
+            print(f"  ... and {len(blank_url_entries) - 20} more entries with blank URLs\n")
+
     # Step 4: Check for contamination
     print("Step 4: Checking for contamination...")
     contaminated_urls = bench_urls.intersection(real_urls)
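The final check is a plain set intersection between benchmark URLs and the real URLs recovered from the database. A toy illustration of that step, plus a hypothetical invocation; the URLs and paths are placeholders, and only the parameter names come from the function signature in the hunk headers:

# Toy illustration of the Step 4 logic; URLs are invented.
bench_urls = {"http://example.com/a.pdf", "http://example.com/b.pdf"}
real_urls = {"http://example.com/b.pdf", "http://example.com/c.pdf"}

contaminated_urls = bench_urls.intersection(real_urls)
print(contaminated_urls)  # {'http://example.com/b.pdf'}

# Hypothetical call; the signature is taken from the hunk headers,
# the argument values are placeholders.
# check_contamination(
#     bench_data_dir="bench_data/",
#     metadata_jsonl_path="metadata.jsonl",
#     sqlite_db_path="pdf_mapping.db",
# )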