Mirror of https://github.com/allenai/olmocr.git, synced 2025-10-14 01:32:31 +00:00
dedupe script: check for URL overlap between local bench JSONL files and the olmOCR-mix-0225 Hugging Face dataset
This commit is contained in:
parent e06fd622c3
commit b3b405d077
olmocr/bench/scripts/url_matcher.py (executable file, 132 additions)
@@ -0,0 +1,132 @@
#!/usr/bin/env python
"""Check for URL overlap between local bench JSONL files and the olmOCR-mix-0225 Hugging Face dataset."""
import argparse
import glob
import json
import os
from collections import defaultdict

from datasets import load_dataset


def extract_urls_from_jsonl(file_path):
    """Extract URLs from a JSONL file, along with minimal per-URL context."""
    urls = set()
    url_to_data = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError:
                print(f"Warning: Could not parse JSON from line in {file_path}")
                continue
            if data.get("url"):
                url = data["url"]
                urls.add(url)
                # Store minimal context for each URL
                url_to_data[url] = {
                    "id": data.get("id", ""),
                    "type": data.get("type", ""),
                    "page": data.get("page", ""),
                }
    return urls, url_to_data


def main():
    parser = argparse.ArgumentParser(description="Check for URL matches between local files and Hugging Face dataset")
    parser.add_argument("--local-dir", default="/home/ubuntu/olmocr/olmOCR-bench/bench_data",
                        help="Directory containing local JSONL files")
    parser.add_argument("--output", default="url_matches.json",
                        help="Output file for results")
    args = parser.parse_args()

    # Step 1: Get all local JSONL files
    local_jsonl_files = glob.glob(os.path.join(args.local_dir, "*.jsonl"))
    print(f"Found {len(local_jsonl_files)} local JSONL files.")

    # Step 2: Extract URLs from local files
    local_urls = {}
    all_local_urls = set()
    url_metadata = defaultdict(list)

    for file_path in local_jsonl_files:
        file_name = os.path.basename(file_path)
        urls, url_data = extract_urls_from_jsonl(file_path)
        local_urls[file_name] = urls
        all_local_urls.update(urls)

        # Store metadata with file information
        for url, data in url_data.items():
            url_metadata[url].append({"file": file_name, **data})

    print(f"Extracted {len(all_local_urls)} unique URLs from local files.")

    # Step 3: Load Hugging Face dataset
    print("Loading Hugging Face dataset...")
    try:
        dataset_documents = load_dataset("allenai/olmOCR-mix-0225", "00_documents")
        dataset_books = load_dataset("allenai/olmOCR-mix-0225", "01_books")

        # Step 4: Extract URLs from both subsets of the Hugging Face dataset
        hf_urls = set()
        for dataset in (dataset_documents, dataset_books):
            for split in dataset:
                for item in dataset[split]:
                    if item.get("url"):
                        hf_urls.add(item["url"])

        print(f"Extracted {len(hf_urls)} unique URLs from Hugging Face dataset.")

        # Step 5: Find matches
        matches = all_local_urls.intersection(hf_urls)

        # Step 6: Group matches by local file, attaching per-URL metadata
        matches_by_file = {}
        match_details = []

        for file_name, urls in local_urls.items():
            file_matches = urls.intersection(hf_urls)
            if file_matches:
                matches_by_file[file_name] = list(file_matches)
                # Add detailed metadata for each match
                for url in file_matches:
                    for entry in url_metadata.get(url, []):
                        match_details.append({"url": url, "metadata": entry})

        # Print summary
        print(f"Found {len(matches)} matching URLs between local files and Hugging Face dataset.")
        for file_name, file_matches in matches_by_file.items():
            total = len(local_urls[file_name])
            match_percentage = (len(file_matches) / total) * 100 if total else 0
            print(f"{file_name}: {len(file_matches)}/{total} matches ({match_percentage:.2f}%)")

        # Save results
        result = {
            "total_local_urls": len(all_local_urls),
            "total_hf_urls": len(hf_urls),
            "total_matches": len(matches),
            "matches_by_file": matches_by_file,
            "match_details": match_details,
        }
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2)
        print(f"Results saved to {args.output}")

    except Exception as e:
        print(f"Error loading or processing Hugging Face dataset: {e}")


if __name__ == "__main__":
    main()
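For reference, a minimal invocation sketch. Both flags default to the values shown (per the argparse setup above), so they can be omitted:

    python olmocr/bench/scripts/url_matcher.py \
        --local-dir /home/ubuntu/olmocr/olmOCR-bench/bench_data \
        --output url_matches.json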
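The resulting url_matches.json follows the shape of the result dict built in main(); the values below are illustrative placeholders, not real bench data:

    {
      "total_local_urls": 1024,
      "total_hf_urls": 250000,
      "total_matches": 2,
      "matches_by_file": {
        "example_tests.jsonl": ["https://example.com/paper.pdf"]
      },
      "match_details": [
        {
          "url": "https://example.com/paper.pdf",
          "metadata": {"file": "example_tests.jsonl", "id": "0001", "type": "present", "page": 1}
        }
      ]
    }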