mirror of https://github.com/allenai/olmocr.git
synced 2025-11-16 18:39:29 +00:00

Mix contamination checker script

This commit is contained in:
parent 9818797fbc
commit 1197c35808
@@ -84,17 +84,17 @@ def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
     # Step 2: Read metadata JSONL and process source URLs
     print("Step 2: Processing metadata JSONL...")
-    source_urls = []
+    metadata_entries = []
     with open(metadata_jsonl_path, 'r') as f:
         for line_num, line in enumerate(f, 1):
             try:
                 data = json.loads(line)
                 if 'source_url' in data:
-                    source_urls.append(data['source_url'])
+                    metadata_entries.append(data)
             except json.JSONDecodeError:
                 print(f"Warning: Could not parse line {line_num}")

-    print(f"Found {len(source_urls)} source URLs in metadata\n")
+    print(f"Found {len(metadata_entries)} entries with source URLs in metadata\n")

     # Step 3: Map URLs to hashes and query database
     print("Step 3: Mapping URLs and querying database...")
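For context, a small runnable sketch of the parsing loop after this change; the records are invented, and only the 'source_url' and 'pdf_id' keys are attested elsewhere in this diff:

import json

# Illustrative metadata.jsonl records; only the 'source_url' and
# 'pdf_id' keys appear in this diff, the values are made up.
sample_lines = [
    '{"pdf_id": "0001", "source_url": "s3://bucket/docs/0001.pdf"}',
    '{"pdf_id": "0002", "source_url": "/local/path/0002.pdf"}',
    'not valid json',
]

metadata_entries = []
for line_num, line in enumerate(sample_lines, 1):
    try:
        data = json.loads(line)
        if 'source_url' in data:
            # Keep the whole record instead of just the URL string,
            # so pdf_id stays available for later reporting.
            metadata_entries.append(data)
    except json.JSONDecodeError:
        print(f"Warning: Could not parse line {line_num}")

print(f"Found {len(metadata_entries)} entries with source URLs in metadata")

Carrying the full dict through the loop is what lets the new blank-URL report below print pdf_id alongside the URL.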
@@ -106,8 +106,11 @@ def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
     s3_count = 0
     local_count = 0
     empty_result_count = 0
+    blank_url_entries = []  # Store entries with blank URLs

-    for source_url in source_urls:
+    for metadata_entry in metadata_entries:
+        source_url = metadata_entry.get('source_url')
+        pdf_id = metadata_entry.get('pdf_id', 'N/A')
         pdf_hash = None

         # Handle S3 URLs
@@ -134,6 +137,12 @@ def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
             # Check if the looked up URL is empty/blank
             if result[0] == "" or result[0] is None:
                 empty_result_count += 1
+                blank_url_entries.append({
+                    'pdf_id': pdf_id,
+                    'source_url': source_url,
+                    'pdf_hash': pdf_hash,
+                    'db_result': result[0]
+                })
             else:
                 real_urls.add(result[0])
         else:
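The result tuple consumed here comes from a SQLite lookup that sits outside this hunk. A minimal sketch of what such a lookup could look like, assuming a hypothetical pdf_mapping(pdf_hash, real_url) table; the actual schema is not shown in this diff:

import sqlite3

# Hypothetical schema and query; the table and column names are
# assumptions, not taken from the actual script.
def lookup_real_url(conn: sqlite3.Connection, pdf_hash: str):
    cur = conn.execute(
        "SELECT real_url FROM pdf_mapping WHERE pdf_hash = ?",
        (pdf_hash,),
    )
    return cur.fetchone()

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE pdf_mapping (pdf_hash TEXT, real_url TEXT)")
conn.execute("INSERT INTO pdf_mapping VALUES ('abc123', '')")
conn.execute("INSERT INTO pdf_mapping VALUES ('def456', 'http://example.com/a.pdf')")

# A blank real_url like ('',) is what triggers the new bookkeeping above,
# while a populated one feeds real_urls; None means the hash is unmapped.
print(lookup_real_url(conn, "abc123"))   # ('',)
print(lookup_real_url(conn, "def456"))   # ('http://example.com/a.pdf',)
print(lookup_real_url(conn, "missing"))  # None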
@@ -150,6 +159,17 @@ def check_contamination(bench_data_dir, metadata_jsonl_path, sqlite_db_path):
     if unmapped_count > 0:
         print(f"Warning: {unmapped_count} URLs could not be mapped\n")

+    # Print entries with blank URLs
+    if blank_url_entries:
+        print(f"\n⚠️ Entries with blank URLs ({len(blank_url_entries)} total):")
+        for entry in blank_url_entries[:20]:  # Show first 20
+            print(f"  PDF ID: {entry['pdf_id']}")
+            print(f"  Source URL: {entry['source_url']}")
+            print(f"  PDF Hash: {entry['pdf_hash']}")
+            print(f"  DB Result: {repr(entry['db_result'])}")
+        if len(blank_url_entries) > 20:
+            print(f"  ... and {len(blank_url_entries) - 20} more entries with blank URLs\n")
+
     # Step 4: Check for contamination
     print("Step 4: Checking for contamination...")
     contaminated_urls = bench_urls.intersection(real_urls)
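The final check is a plain set intersection between benchmark URLs and the real URLs recovered from the database. A toy illustration of that step, plus a hypothetical invocation; the URLs and paths are placeholders, and only the parameter names come from the function signature in the hunk headers:

# Toy illustration of the Step 4 logic; URLs are invented.
bench_urls = {"http://example.com/a.pdf", "http://example.com/b.pdf"}
real_urls = {"http://example.com/b.pdf", "http://example.com/c.pdf"}

contaminated_urls = bench_urls.intersection(real_urls)
print(contaminated_urls)  # {'http://example.com/b.pdf'}

# Hypothetical call; the signature is taken from the hunk headers,
# the argument values are placeholders.
# check_contamination(
#     bench_data_dir="bench_data/",
#     metadata_jsonl_path="metadata.jsonl",
#     sqlite_db_path="pdf_mapping.db",
# )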