mirror of https://github.com/allenai/olmocr.git (synced 2025-11-10 23:50:43 +00:00)

commit ed3820c0c7 ("LOC downloader")
parent 6123d4452b
@@ -12,7 +12,7 @@ import argparse
 import requests
 import img2pdf
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Set
 from tqdm import tqdm
 import time
 import hashlib
@@ -80,7 +80,36 @@ def get_safe_filename(item_id: str) -> str:
     return safe_name


-def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True) -> None:
+def scan_existing_outputs(output_dir: Path) -> Set[str]:
+    """Scan output directory to find all already processed assets."""
+    processed_assets = set()
+
+    if not output_dir.exists():
+        return processed_assets
+
+    # Scan each dataset subdirectory
+    for dataset_dir in output_dir.iterdir():
+        if not dataset_dir.is_dir():
+            continue
+
+        # Find all pairs of .pdf and .md files
+        pdf_files = {f.stem for f in dataset_dir.glob("*.pdf")}
+        md_files = {f.stem for f in dataset_dir.glob("*.md")}
+
+        # Only consider fully processed (both pdf and md exist)
+        complete_files = pdf_files.intersection(md_files)
+
+        # Verify files are not empty
+        for filename in complete_files:
+            pdf_path = dataset_dir / f"{filename}.pdf"
+            md_path = dataset_dir / f"{filename}.md"
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed_assets.add(filename)
+
+    return processed_assets
+
+
+def process_csv_file(csv_path: Path, output_dir: Path, processed_assets: Set[str], skip_cleanup: bool = True) -> None:
     """Process a single CSV file containing LOC transcription data."""
     csv_name = csv_path.stem
     dataset_output_dir = output_dir / csv_name
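The scan above keys the resume set by the stems that get_safe_filename produces, so the names recorded here match the ones the processing loop checks later. get_safe_filename itself is not part of this diff (only its tail is visible in the hunk header); a minimal sketch of such a helper, assuming it simply replaces characters that are unsafe in filenames (the real script may do more, for example hashing overly long identifiers with the imported hashlib):

def get_safe_filename(item_id: str) -> str:
    # Hypothetical sketch: keep alphanumerics, dashes, underscores and dots,
    # replace everything else so the result is a safe file stem.
    safe_name = "".join(c if c.isalnum() or c in "-_." else "_" for c in item_id)
    return safe_name or "unnamed"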
@@ -89,50 +118,76 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
     print(f"\nProcessing {csv_path.name}")

     # Read CSV
-    with open(csv_path, 'r', encoding='utf-8') as f:
+    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
         reader = csv.DictReader(f)
         rows = list(reader)

+    # Count already processed items for this CSV
+    already_done = sum(1 for row in rows
+                       if 'Asset' in row and get_safe_filename(row['Asset']) in processed_assets)
+
+    if already_done > 0:
+        print(f" Skipping {already_done} already processed items")
+
     # Process each row
     processed = 0
     skipped = 0
+    newly_processed = 0
-    for row in tqdm(rows, desc=f"Processing {csv_name}"):
-        # Check required fields
-        if not all(key in row for key in ['ItemId', 'DownloadUrl', 'Transcription']):
-            print(f"Skipping row - missing required fields")
-            skipped += 1
-            continue
-
-        item_id = row['ItemId']
-        download_url = row['DownloadUrl']
-        transcription = row['Transcription']
-
-        if not item_id or not download_url or not transcription:
-            skipped += 1
-            continue
-
-        # Create safe filename
-        safe_filename = get_safe_filename(item_id)
-
-        # Define output paths
-        pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
-        md_path = dataset_output_dir / f"{safe_filename}.md"
-
-        # Skip if already processed
-        if pdf_path.exists() and md_path.exists():
-            processed += 1
-            continue
-
     # Create temp directory for downloads
     temp_dir = dataset_output_dir / 'temp'
     temp_dir.mkdir(exist_ok=True)

+    for row in tqdm(rows, desc=f"Processing {csv_name}"):
+        # Check required fields
+        if not all(key in row for key in ['Asset', 'DownloadUrl', 'Transcription']):
+            skipped += 1
+            continue
+
+        asset = row['Asset']
+        download_url = row['DownloadUrl']
+        transcription = row['Transcription']
+
+        if not asset or not download_url or not transcription:
+            skipped += 1
+            continue
+
+        # Create safe filename using Asset column
+        safe_filename = get_safe_filename(asset)
+
+        # Skip if already processed
+        if safe_filename in processed_assets:
+            processed += 1
+            continue
+
+        # Define output paths
+        pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
+        md_path = dataset_output_dir / f"{safe_filename}.md"
+
+        # Double-check if files already exist on disk
+        if pdf_path.exists() and md_path.exists():
+            # Verify files are not empty
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed += 1
+                processed_assets.add(safe_filename)
+                continue
+            else:
+                # Remove empty files to reprocess
+                pdf_path.unlink(missing_ok=True)
+                md_path.unlink(missing_ok=True)
+
+        # Process the item
+        try:
             # Download image
             image_path = temp_dir / f"{safe_filename}.jpg"
-            if download_image(download_url, image_path):
+            if not download_image(download_url, image_path):
+                raise Exception(f"Failed to download image")
+
             # Convert to PDF
-            if convert_image_to_pdf(image_path, pdf_path):
+            if not convert_image_to_pdf(image_path, pdf_path):
+                raise Exception(f"Failed to convert image to PDF")
+
             # Clean up transcription if needed (skipping for now)
             if skip_cleanup:
                 cleaned_transcription = transcription
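The rewritten loop treats a False return from download_image or convert_image_to_pdf as a hard failure and turns it into an exception, so the except block in the next hunk can clean up any partial output. Neither helper is shown in this diff; a minimal sketch of what they might look like, assuming download_image fetches with requests and convert_image_to_pdf wraps img2pdf (both modules are imported at the top of the script), with hypothetical bodies:

from pathlib import Path

import img2pdf
import requests


def download_image(url: str, dest: Path) -> bool:
    # Hypothetical sketch: fetch the image over HTTP and write it to dest.
    try:
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        dest.write_bytes(resp.content)
        return dest.stat().st_size > 0
    except requests.RequestException:
        return False


def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool:
    # Hypothetical sketch: wrap a single image into a one-page PDF with img2pdf.
    try:
        pdf_path.write_bytes(img2pdf.convert(str(image_path)))
        return True
    except Exception:
        return False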
@@ -142,24 +197,41 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True

             # Create markdown file
             create_markdown_file(cleaned_transcription, md_path)

+            # Verify both files exist and are non-empty
+            if pdf_path.exists() and md_path.exists():
+                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
                     processed += 1
+                    newly_processed += 1
+                    processed_assets.add(safe_filename)

                     # Clean up temp image
                     image_path.unlink(missing_ok=True)
                 else:
-                    skipped += 1
+                    raise Exception("Output files are empty")
             else:
+                raise Exception("Output files were not created")
+
+        except Exception as e:
+            print(f"\nError processing {asset}: {e}")
             skipped += 1
+            # Clean up any partial files
+            pdf_path.unlink(missing_ok=True)
+            md_path.unlink(missing_ok=True)
+            if 'image_path' in locals():
+                image_path.unlink(missing_ok=True)

     # Clean up temp directory
-    temp_dir = dataset_output_dir / 'temp'
     if temp_dir.exists():
+        # Remove any remaining temp files
+        for temp_file in temp_dir.glob('*'):
+            temp_file.unlink(missing_ok=True)
         try:
             temp_dir.rmdir()
         except:
             pass

-    print(f"Completed {csv_name}: {processed} processed, {skipped} skipped")
+    print(f"Completed {csv_name}: {processed} total processed ({newly_processed} new), {skipped} skipped")


 def main():
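The verification step above only counts a row as processed when both the PDF and the markdown file exist and are non-empty, which assumes create_markdown_file (defined elsewhere in the script, not shown in this diff) writes the transcription out directly. A minimal sketch under that assumption:

from pathlib import Path


def create_markdown_file(text: str, md_path: Path) -> None:
    # Hypothetical sketch: persist the transcription as a UTF-8 markdown file.
    md_path.write_text(text, encoding="utf-8")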
@@ -193,11 +265,19 @@ def main():

     print(f"Found {len(csv_files)} CSV files to process")

+    # Scan existing outputs to avoid reprocessing
+    print("Scanning existing outputs...")
+    processed_assets = scan_existing_outputs(output_dir)
+
+    if processed_assets:
+        print(f"Found {len(processed_assets)} already processed items")
+
     # Process each CSV file
     for csv_file in csv_files:
-        process_csv_file(csv_file, output_dir, args.skip_cleanup)
+        process_csv_file(csv_file, output_dir, processed_assets, args.skip_cleanup)

     print(f"\nAll processing complete. Output saved to {output_dir}")
+    print(f"Total items processed: {len(processed_assets)}")


 if __name__ == '__main__':