LOC downloader

parent 6123d4452b
commit ed3820c0c7
@@ -12,7 +12,7 @@ import argparse
 import requests
 import img2pdf
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Set
 from tqdm import tqdm
 import time
 import hashlib
@@ -80,7 +80,36 @@ def get_safe_filename(item_id: str) -> str:
     return safe_name
 
 
-def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True) -> None:
+def scan_existing_outputs(output_dir: Path) -> Set[str]:
+    """Scan output directory to find all already processed assets."""
+    processed_assets = set()
+
+    if not output_dir.exists():
+        return processed_assets
+
+    # Scan each dataset subdirectory
+    for dataset_dir in output_dir.iterdir():
+        if not dataset_dir.is_dir():
+            continue
+
+        # Find all pairs of .pdf and .md files
+        pdf_files = {f.stem for f in dataset_dir.glob("*.pdf")}
+        md_files = {f.stem for f in dataset_dir.glob("*.md")}
+
+        # Only consider fully processed (both pdf and md exist)
+        complete_files = pdf_files.intersection(md_files)
+
+        # Verify files are not empty
+        for filename in complete_files:
+            pdf_path = dataset_dir / f"{filename}.pdf"
+            md_path = dataset_dir / f"{filename}.md"
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed_assets.add(filename)
+
+    return processed_assets
+
+
+def process_csv_file(csv_path: Path, output_dir: Path, processed_assets: Set[str], skip_cleanup: bool = True) -> None:
     """Process a single CSV file containing LOC transcription data."""
     csv_name = csv_path.stem
     dataset_output_dir = output_dir / csv_name
@@ -89,77 +118,120 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
     print(f"\nProcessing {csv_path.name}")
 
     # Read CSV
-    with open(csv_path, 'r', encoding='utf-8') as f:
+    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
         reader = csv.DictReader(f)
         rows = list(reader)
 
+    # Count already processed items for this CSV
+    already_done = sum(1 for row in rows
+                       if 'Asset' in row and get_safe_filename(row['Asset']) in processed_assets)
+
+    if already_done > 0:
+        print(f" Skipping {already_done} already processed items")
+
     # Process each row
     processed = 0
     skipped = 0
+    newly_processed = 0
+
+    # Create temp directory for downloads
+    temp_dir = dataset_output_dir / 'temp'
+    temp_dir.mkdir(exist_ok=True)
 
     for row in tqdm(rows, desc=f"Processing {csv_name}"):
         # Check required fields
-        if not all(key in row for key in ['ItemId', 'DownloadUrl', 'Transcription']):
-            print(f"Skipping row - missing required fields")
+        if not all(key in row for key in ['Asset', 'DownloadUrl', 'Transcription']):
             skipped += 1
             continue
 
-        item_id = row['ItemId']
+        asset = row['Asset']
         download_url = row['DownloadUrl']
         transcription = row['Transcription']
 
-        if not item_id or not download_url or not transcription:
+        if not asset or not download_url or not transcription:
            skipped += 1
            continue
 
-        # Create safe filename
-        safe_filename = get_safe_filename(item_id)
+        # Create safe filename using Asset column
+        safe_filename = get_safe_filename(asset)
+
+        # Skip if already processed
+        if safe_filename in processed_assets:
+            processed += 1
+            continue
 
         # Define output paths
         pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
         md_path = dataset_output_dir / f"{safe_filename}.md"
 
-        # Skip if already processed
+        # Double-check if files already exist on disk
         if pdf_path.exists() and md_path.exists():
-            processed += 1
-            continue
-
-        # Create temp directory for downloads
-        temp_dir = dataset_output_dir / 'temp'
-        temp_dir.mkdir(exist_ok=True)
-
-        # Download image
-        image_path = temp_dir / f"{safe_filename}.jpg"
-        if download_image(download_url, image_path):
-            # Convert to PDF
-            if convert_image_to_pdf(image_path, pdf_path):
-                # Clean up transcription if needed (skipping for now)
-                if skip_cleanup:
-                    cleaned_transcription = transcription
-                else:
-                    # TODO: Add transcription cleanup using GPT-4o
-                    cleaned_transcription = transcription
-
-                # Create markdown file
-                create_markdown_file(cleaned_transcription, md_path)
-                # Verify files are not empty
-                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
-                    processed += 1
-
-            # Clean up temp image
-            image_path.unlink(missing_ok=True)
-        else:
-            skipped += 1
+            # Verify files are not empty
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed += 1
+                processed_assets.add(safe_filename)
+                continue
+            else:
+                # Remove empty files to reprocess
+                pdf_path.unlink(missing_ok=True)
+                md_path.unlink(missing_ok=True)
+
+        # Process the item
+        try:
+            # Download image
+            image_path = temp_dir / f"{safe_filename}.jpg"
+
+            if not download_image(download_url, image_path):
+                raise Exception(f"Failed to download image")
+
+            # Convert to PDF
+            if not convert_image_to_pdf(image_path, pdf_path):
+                raise Exception(f"Failed to convert image to PDF")
+
+            # Clean up transcription if needed (skipping for now)
+            if skip_cleanup:
+                cleaned_transcription = transcription
+            else:
+                # TODO: Add transcription cleanup using GPT-4o
+                cleaned_transcription = transcription
+
+            # Create markdown file
+            create_markdown_file(cleaned_transcription, md_path)
+
+            # Verify both files exist and are non-empty
+            if pdf_path.exists() and md_path.exists():
+                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                    processed += 1
+                    newly_processed += 1
+                    processed_assets.add(safe_filename)
+
+                    # Clean up temp image
+                    image_path.unlink(missing_ok=True)
+                else:
+                    raise Exception("Output files are empty")
+            else:
+                raise Exception("Output files were not created")
+
+        except Exception as e:
+            print(f"\nError processing {asset}: {e}")
+            skipped += 1
+            # Clean up any partial files
+            pdf_path.unlink(missing_ok=True)
+            md_path.unlink(missing_ok=True)
+            if 'image_path' in locals():
+                image_path.unlink(missing_ok=True)
 
     # Clean up temp directory
     temp_dir = dataset_output_dir / 'temp'
     if temp_dir.exists():
         # Remove any remaining temp files
         for temp_file in temp_dir.glob('*'):
             temp_file.unlink(missing_ok=True)
         try:
             temp_dir.rmdir()
         except:
             pass
 
-    print(f"Completed {csv_name}: {processed} processed, {skipped} skipped")
+    print(f"Completed {csv_name}: {processed} total processed ({newly_processed} new), {skipped} skipped")
@@ -193,11 +265,19 @@ def main():
 
     print(f"Found {len(csv_files)} CSV files to process")
 
+    # Scan existing outputs to avoid reprocessing
+    print("Scanning existing outputs...")
+    processed_assets = scan_existing_outputs(output_dir)
+
+    if processed_assets:
+        print(f"Found {len(processed_assets)} already processed items")
+
     # Process each CSV file
     for csv_file in csv_files:
-        process_csv_file(csv_file, output_dir, args.skip_cleanup)
+        process_csv_file(csv_file, output_dir, processed_assets, args.skip_cleanup)
 
     print(f"\nAll processing complete. Output saved to {output_dir}")
+    print(f"Total items processed: {len(processed_assets)}")
 
 
 if __name__ == '__main__':
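
The hunks above call download_image, convert_image_to_pdf, and create_markdown_file, which are defined elsewhere in the script and are not part of this commit. The sketch below is only an illustration of what such helpers could look like, assuming requests for the download and img2pdf for the conversion (the two imports visible at the top of the file); the names and signatures come from the call sites, while the bodies are illustrative assumptions, not the script's actual implementation.

# Illustrative sketches only; the real helpers live outside the hunks shown above.
import img2pdf
import requests
from pathlib import Path


def download_image(url: str, dest: Path, timeout: int = 30) -> bool:
    """Fetch an image from the LOC download URL into dest; True on success (assumed behavior)."""
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        dest.write_bytes(resp.content)
        return True
    except requests.RequestException:
        return False


def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool:
    """Wrap a single image into a one-page PDF with img2pdf; True on success (assumed behavior)."""
    try:
        pdf_path.write_bytes(img2pdf.convert(str(image_path)))
        return True
    except Exception:
        return False


def create_markdown_file(text: str, md_path: Path) -> None:
    """Write the transcription text to the .md file paired with the PDF (assumed behavior)."""
    md_path.write_text(text, encoding='utf-8')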
||||
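
The resume rule this commit introduces (a stem counts as already processed only when both a non-empty .pdf and a non-empty .md exist in the same dataset directory) can be exercised on its own. The check below runs against a throwaway directory; scan_like_commit is a hypothetical stand-in that mirrors scan_existing_outputs for demonstration and is not part of the commit.

# Hypothetical check of the resume/skip rule: only stems with BOTH a non-empty
# .pdf and a non-empty .md should count as already processed.
import tempfile
from pathlib import Path


def scan_like_commit(output_dir: Path) -> set:
    done = set()
    for dataset_dir in output_dir.iterdir():
        if not dataset_dir.is_dir():
            continue
        pdf_stems = {f.stem for f in dataset_dir.glob("*.pdf")}
        md_stems = {f.stem for f in dataset_dir.glob("*.md")}
        for stem in pdf_stems & md_stems:
            if (dataset_dir / f"{stem}.pdf").stat().st_size > 0 and \
               (dataset_dir / f"{stem}.md").stat().st_size > 0:
                done.add(stem)
    return done


with tempfile.TemporaryDirectory() as tmp:
    ds = Path(tmp) / "dataset1"
    ds.mkdir()
    (ds / "a.pdf").write_bytes(b"%PDF-1.4")   # complete, non-empty pair -> counted
    (ds / "a.md").write_text("transcription")
    (ds / "b.pdf").write_bytes(b"%PDF-1.4")   # missing .md -> ignored
    (ds / "c.pdf").write_bytes(b"")           # empty pair -> ignored, will be reprocessed
    (ds / "c.md").write_text("")
    assert scan_like_commit(Path(tmp)) == {"a"}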