mirror of https://github.com/allenai/olmocr.git

LOC downloader

commit ed3820c0c7
parent 6123d4452b
@@ -12,7 +12,7 @@ import argparse
 import requests
 import img2pdf
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Set
 from tqdm import tqdm
 import time
 import hashlib
@@ -80,7 +80,36 @@ def get_safe_filename(item_id: str) -> str:
     return safe_name
 
 
-def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True) -> None:
+def scan_existing_outputs(output_dir: Path) -> Set[str]:
+    """Scan output directory to find all already processed assets."""
+    processed_assets = set()
+
+    if not output_dir.exists():
+        return processed_assets
+
+    # Scan each dataset subdirectory
+    for dataset_dir in output_dir.iterdir():
+        if not dataset_dir.is_dir():
+            continue
+
+        # Find all pairs of .pdf and .md files
+        pdf_files = {f.stem for f in dataset_dir.glob("*.pdf")}
+        md_files = {f.stem for f in dataset_dir.glob("*.md")}
+
+        # Only consider fully processed (both pdf and md exist)
+        complete_files = pdf_files.intersection(md_files)
+
+        # Verify files are not empty
+        for filename in complete_files:
+            pdf_path = dataset_dir / f"{filename}.pdf"
+            md_path = dataset_dir / f"{filename}.md"
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed_assets.add(filename)
+
+    return processed_assets
+
+
+def process_csv_file(csv_path: Path, output_dir: Path, processed_assets: Set[str], skip_cleanup: bool = True) -> None:
     """Process a single CSV file containing LOC transcription data."""
     csv_name = csv_path.stem
     dataset_output_dir = output_dir / csv_name
@@ -89,50 +118,76 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
     print(f"\nProcessing {csv_path.name}")
 
     # Read CSV
-    with open(csv_path, 'r', encoding='utf-8') as f:
+    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
         reader = csv.DictReader(f)
         rows = list(reader)
 
+    # Count already processed items for this CSV
+    already_done = sum(1 for row in rows
+                      if 'Asset' in row and get_safe_filename(row['Asset']) in processed_assets)
+
+    if already_done > 0:
+        print(f"  Skipping {already_done} already processed items")
+
     # Process each row
     processed = 0
     skipped = 0
-
-    for row in tqdm(rows, desc=f"Processing {csv_name}"):
-        # Check required fields
-        if not all(key in row for key in ['ItemId', 'DownloadUrl', 'Transcription']):
-            print(f"Skipping row - missing required fields")
-            skipped += 1
-            continue
-
-        item_id = row['ItemId']
-        download_url = row['DownloadUrl']
-        transcription = row['Transcription']
-
-        if not item_id or not download_url or not transcription:
-            skipped += 1
-            continue
-
-        # Create safe filename
-        safe_filename = get_safe_filename(item_id)
-
-        # Define output paths
-        pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
-        md_path = dataset_output_dir / f"{safe_filename}.md"
-
-        # Skip if already processed
-        if pdf_path.exists() and md_path.exists():
-            processed += 1
-            continue
+    newly_processed = 0
 
     # Create temp directory for downloads
     temp_dir = dataset_output_dir / 'temp'
    temp_dir.mkdir(exist_ok=True)
 
+    for row in tqdm(rows, desc=f"Processing {csv_name}"):
+        # Check required fields
+        if not all(key in row for key in ['Asset', 'DownloadUrl', 'Transcription']):
+            skipped += 1
+            continue
+
+        asset = row['Asset']
+        download_url = row['DownloadUrl']
+        transcription = row['Transcription']
+
+        if not asset or not download_url or not transcription:
+            skipped += 1
+            continue
+
+        # Create safe filename using Asset column
+        safe_filename = get_safe_filename(asset)
+
+        # Skip if already processed
+        if safe_filename in processed_assets:
+            processed += 1
+            continue
+
+        # Define output paths
+        pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
+        md_path = dataset_output_dir / f"{safe_filename}.md"
+
+        # Double-check if files already exist on disk
+        if pdf_path.exists() and md_path.exists():
+            # Verify files are not empty
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed += 1
+                processed_assets.add(safe_filename)
+                continue
+            else:
+                # Remove empty files to reprocess
+                pdf_path.unlink(missing_ok=True)
+                md_path.unlink(missing_ok=True)
+
         # Process the item
         try:
             # Download image
             image_path = temp_dir / f"{safe_filename}.jpg"
-        if download_image(download_url, image_path):
+
+            if not download_image(download_url, image_path):
+                raise Exception(f"Failed to download image")
 
             # Convert to PDF
-            if convert_image_to_pdf(image_path, pdf_path):
+            if not convert_image_to_pdf(image_path, pdf_path):
+                raise Exception(f"Failed to convert image to PDF")
 
             # Clean up transcription if needed (skipping for now)
             if skip_cleanup:
                 cleaned_transcription = transcription
@@ -142,24 +197,41 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
 
             # Create markdown file
             create_markdown_file(cleaned_transcription, md_path)
 
+            # Verify both files exist and are non-empty
             if pdf_path.exists() and md_path.exists():
-                processed += 1
+                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                    processed += 1
+                    newly_processed += 1
+                    processed_assets.add(safe_filename)
+
+                    # Clean up temp image
+                    image_path.unlink(missing_ok=True)
+                else:
+                    raise Exception("Output files are empty")
             else:
-                skipped += 1
+                raise Exception("Output files were not created")
 
         except Exception as e:
+            print(f"\nError processing {asset}: {e}")
             skipped += 1
+            # Clean up any partial files
+            pdf_path.unlink(missing_ok=True)
+            md_path.unlink(missing_ok=True)
+            if 'image_path' in locals():
+                image_path.unlink(missing_ok=True)
 
     # Clean up temp directory
     temp_dir = dataset_output_dir / 'temp'
     if temp_dir.exists():
         # Remove any remaining temp files
         for temp_file in temp_dir.glob('*'):
             temp_file.unlink(missing_ok=True)
         try:
             temp_dir.rmdir()
         except:
             pass
 
-    print(f"Completed {csv_name}: {processed} processed, {skipped} skipped")
+    print(f"Completed {csv_name}: {processed} total processed ({newly_processed} new), {skipped} skipped")
@@ -193,11 +265,19 @@ def main():
 
     print(f"Found {len(csv_files)} CSV files to process")
 
+    # Scan existing outputs to avoid reprocessing
+    print("Scanning existing outputs...")
+    processed_assets = scan_existing_outputs(output_dir)
+
+    if processed_assets:
+        print(f"Found {len(processed_assets)} already processed items")
+
     # Process each CSV file
     for csv_file in csv_files:
-        process_csv_file(csv_file, output_dir, args.skip_cleanup)
+        process_csv_file(csv_file, output_dir, processed_assets, args.skip_cleanup)
 
     print(f"\nAll processing complete. Output saved to {output_dir}")
+    print(f"Total items processed: {len(processed_assets)}")
 
 
 if __name__ == '__main__':
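For context on the resume logic this commit adds: scan_existing_outputs treats an asset as done only when a .pdf and a .md with the same stem both exist and both are non-empty. Below is a minimal, self-contained sketch of that completeness rule; the directory and file names are made up for illustration, and the rule is re-implemented inline rather than imported from the script.

from pathlib import Path
import tempfile

# Toy output tree: one complete pair, one pair with an empty .md, one orphan .pdf.
# All names here are hypothetical.
dataset_dir = Path(tempfile.mkdtemp()) / "dataset1"
dataset_dir.mkdir(parents=True)
(dataset_dir / "done.pdf").write_bytes(b"%PDF-1.4")
(dataset_dir / "done.md").write_text("transcription text")
(dataset_dir / "empty.pdf").write_bytes(b"%PDF-1.4")
(dataset_dir / "empty.md").write_text("")              # empty file -> not complete
(dataset_dir / "orphan.pdf").write_bytes(b"%PDF-1.4")  # missing .md -> not complete

# Same rule as scan_existing_outputs: both files present, both non-empty.
pdf_stems = {f.stem for f in dataset_dir.glob("*.pdf")}
md_stems = {f.stem for f in dataset_dir.glob("*.md")}
complete = {
    stem for stem in pdf_stems & md_stems
    if (dataset_dir / f"{stem}.pdf").stat().st_size > 0
    and (dataset_dir / f"{stem}.md").stat().st_size > 0
}
print(complete)  # -> {'done'}

On a re-run, only "done" would be counted as processed; per the new double-check branch in process_csv_file, the empty pair is deleted and retried, and the orphan is redone from scratch.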