dedupe script

Jake Poznanski 2025-05-12 17:02:35 +00:00
parent e06fd622c3
commit b3b405d077

#!/usr/bin/env python
import argparse
import glob
import json
import os

from datasets import load_dataset


def extract_urls_from_jsonl(file_path):
    """Extract URLs from a JSONL file."""
    urls = set()
    url_to_data = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line.strip())
                if 'url' in data and data['url']:
                    url = data['url']
                    urls.add(url)
                    # Store minimal context for each URL
                    url_to_data[url] = {
                        'id': data.get('id', ''),
                        'type': data.get('type', ''),
                        'page': data.get('page', '')
                    }
            except json.JSONDecodeError:
                print(f"Warning: Could not parse JSON from line in {file_path}")
                continue
    return urls, url_to_data
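
# For reference, each record in a local JSONL file is expected to look roughly
# like this (illustrative values; only 'url' is required, the other fields
# default to '' when absent):
#   {"url": "https://example.com/doc.pdf", "id": "doc_0001", "type": "pdf", "page": 1}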


def main():
    parser = argparse.ArgumentParser(description='Check for URL matches between local files and Hugging Face dataset')
    parser.add_argument('--local-dir', default='/home/ubuntu/olmocr/olmOCR-bench/bench_data',
                        help='Directory containing local JSONL files')
    parser.add_argument('--output', default='url_matches.json',
                        help='Output file for results')
    args = parser.parse_args()
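
    # Example invocation (the script's filename is hypothetical; both flags
    # fall back to the defaults defined above):
    #   python dedupe.py --local-dir /home/ubuntu/olmocr/olmOCR-bench/bench_data --output url_matches.json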

    # Step 1: Get all local JSONL files
    local_jsonl_files = glob.glob(os.path.join(args.local_dir, "*.jsonl"))
    print(f"Found {len(local_jsonl_files)} local JSONL files.")

    # Step 2: Extract URLs from local files
    local_urls = {}
    all_local_urls = set()
    url_metadata = {}
    for file_path in local_jsonl_files:
        file_name = os.path.basename(file_path)
        urls, url_data = extract_urls_from_jsonl(file_path)
        local_urls[file_name] = urls
        all_local_urls.update(urls)
        # Store metadata with file information
        for url, data in url_data.items():
            if url not in url_metadata:
                url_metadata[url] = []
            url_metadata[url].append({
                "file": file_name,
                **data
            })
    print(f"Extracted {len(all_local_urls)} unique URLs from local files.")

    # Step 3: Load Hugging Face dataset
    print("Loading Hugging Face dataset...")
    try:
        dataset_documents = load_dataset("allenai/olmOCR-mix-0225", "00_documents")
        dataset_books = load_dataset("allenai/olmOCR-mix-0225", "01_books")

        # Step 4: Extract URLs from Hugging Face dataset
        hf_urls = set()
        for split in dataset_documents:
            for item in dataset_documents[split]:
                if 'url' in item and item['url']:
                    hf_urls.add(item['url'])
        for split in dataset_books:
            for item in dataset_books[split]:
                if 'url' in item and item['url']:
                    hf_urls.add(item['url'])
        print(f"Extracted {len(hf_urls)} unique URLs from Hugging Face dataset.")

        # Step 5: Find matches
        matches = all_local_urls.intersection(hf_urls)

        # Step 6: Group matches by local file with metadata
        matches_by_file = {}
        match_details = []
        for file_name, urls in local_urls.items():
            file_matches = urls.intersection(hf_urls)
            if file_matches:
                matches_by_file[file_name] = list(file_matches)
                # Add detailed metadata for each match
                for url in file_matches:
                    if url in url_metadata:
                        for entry in url_metadata[url]:
                            match_details.append({
                                "url": url,
                                "metadata": entry
                            })
        # Print summary
        print(f"Found {len(matches)} matching URLs between local files and Hugging Face dataset.")
        for file_name, file_matches in matches_by_file.items():
            match_percentage = (len(file_matches) / len(local_urls[file_name])) * 100 if local_urls[file_name] else 0
            print(f"{file_name}: {len(file_matches)}/{len(local_urls[file_name])} matches ({match_percentage:.2f}%)")

        # Save results
        result = {
            "total_local_urls": len(all_local_urls),
            "total_hf_urls": len(hf_urls),
            "total_matches": len(matches),
            "matches_by_file": matches_by_file,
            "match_details": match_details
        }
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)
        print(f"Results saved to {args.output}")
    except Exception as e:
        print(f"Error loading or processing Hugging Face dataset: {e}")


if __name__ == "__main__":
    main()
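
# The resulting url_matches.json contains: total_local_urls, total_hf_urls,
# total_matches, matches_by_file (file name -> list of overlapping URLs), and
# match_details (one entry per URL/file occurrence with its id/type/page metadata).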