diff --git a/olmocr/data/prepare_loc_transcripts.py b/olmocr/data/prepare_loc_transcripts.py
index 24cc8d4..95cb010 100644
--- a/olmocr/data/prepare_loc_transcripts.py
+++ b/olmocr/data/prepare_loc_transcripts.py
@@ -12,7 +12,7 @@ import argparse
 import requests
 import img2pdf
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Set
 from tqdm import tqdm
 import time
 import hashlib
@@ -80,7 +80,36 @@ def get_safe_filename(item_id: str) -> str:
     return safe_name


-def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True) -> None:
+def scan_existing_outputs(output_dir: Path) -> Set[str]:
+    """Scan output directory to find all already processed assets."""
+    processed_assets = set()
+
+    if not output_dir.exists():
+        return processed_assets
+
+    # Scan each dataset subdirectory
+    for dataset_dir in output_dir.iterdir():
+        if not dataset_dir.is_dir():
+            continue
+
+        # Find all pairs of .pdf and .md files
+        pdf_files = {f.stem for f in dataset_dir.glob("*.pdf")}
+        md_files = {f.stem for f in dataset_dir.glob("*.md")}
+
+        # Only consider fully processed (both pdf and md exist)
+        complete_files = pdf_files.intersection(md_files)
+
+        # Verify files are not empty
+        for filename in complete_files:
+            pdf_path = dataset_dir / f"{filename}.pdf"
+            md_path = dataset_dir / f"{filename}.md"
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed_assets.add(filename)
+
+    return processed_assets
+
+
+def process_csv_file(csv_path: Path, output_dir: Path, processed_assets: Set[str], skip_cleanup: bool = True) -> None:
     """Process a single CSV file containing LOC transcription data."""
     csv_name = csv_path.stem
     dataset_output_dir = output_dir / csv_name
@@ -89,77 +118,120 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
     print(f"\nProcessing {csv_path.name}")

     # Read CSV
-    with open(csv_path, 'r', encoding='utf-8') as f:
+    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
         reader = csv.DictReader(f)
         rows = list(reader)

+    # Count already processed items for this CSV
+    already_done = sum(1 for row in rows
+                       if 'Asset' in row and get_safe_filename(row['Asset']) in processed_assets)
+
+    if already_done > 0:
+        print(f" Skipping {already_done} already processed items")
+
     # Process each row
     processed = 0
     skipped = 0
+    newly_processed = 0
+
+    # Create temp directory for downloads
+    temp_dir = dataset_output_dir / 'temp'
+    temp_dir.mkdir(exist_ok=True)

     for row in tqdm(rows, desc=f"Processing {csv_name}"):
         # Check required fields
-        if not all(key in row for key in ['ItemId', 'DownloadUrl', 'Transcription']):
-            print(f"Skipping row - missing required fields")
+        if not all(key in row for key in ['Asset', 'DownloadUrl', 'Transcription']):
             skipped += 1
             continue

-        item_id = row['ItemId']
+        asset = row['Asset']
         download_url = row['DownloadUrl']
         transcription = row['Transcription']

-        if not item_id or not download_url or not transcription:
+        if not asset or not download_url or not transcription:
             skipped += 1
             continue

-        # Create safe filename
-        safe_filename = get_safe_filename(item_id)
+        # Create safe filename using Asset column
+        safe_filename = get_safe_filename(asset)
+
+        # Skip if already processed
+        if safe_filename in processed_assets:
+            processed += 1
+            continue

         # Define output paths
         pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
         md_path = dataset_output_dir / f"{safe_filename}.md"

-        # Skip if already processed
+        # Double-check if files already exist on disk
         if pdf_path.exists() and md_path.exists():
-            processed += 1
-            continue
-
-        # Create temp directory for downloads
-        temp_dir = dataset_output_dir / 'temp'
-        temp_dir.mkdir(exist_ok=True)
-
-        # Download image
-        image_path = temp_dir / f"{safe_filename}.jpg"
-        if download_image(download_url, image_path):
-            # Convert to PDF
-            if convert_image_to_pdf(image_path, pdf_path):
-                # Clean up transcription if needed (skipping for now)
-                if skip_cleanup:
-                    cleaned_transcription = transcription
-                else:
-                    # TODO: Add transcription cleanup using GPT-4o
-                    cleaned_transcription = transcription
-
-                # Create markdown file
-                create_markdown_file(cleaned_transcription, md_path)
+            # Verify files are not empty
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
                 processed += 1
-
-                # Clean up temp image
-                image_path.unlink(missing_ok=True)
+                processed_assets.add(safe_filename)
+                continue
             else:
-                skipped += 1
-        else:
+                # Remove empty files to reprocess
+                pdf_path.unlink(missing_ok=True)
+                md_path.unlink(missing_ok=True)
+
+        # Process the item
+        try:
+            # Download image
+            image_path = temp_dir / f"{safe_filename}.jpg"
+
+            if not download_image(download_url, image_path):
+                raise Exception(f"Failed to download image")
+
+            # Convert to PDF
+            if not convert_image_to_pdf(image_path, pdf_path):
+                raise Exception(f"Failed to convert image to PDF")
+
+            # Clean up transcription if needed (skipping for now)
+            if skip_cleanup:
+                cleaned_transcription = transcription
+            else:
+                # TODO: Add transcription cleanup using GPT-4o
+                cleaned_transcription = transcription
+
+            # Create markdown file
+            create_markdown_file(cleaned_transcription, md_path)
+
+            # Verify both files exist and are non-empty
+            if pdf_path.exists() and md_path.exists():
+                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                    processed += 1
+                    newly_processed += 1
+                    processed_assets.add(safe_filename)
+
+                    # Clean up temp image
+                    image_path.unlink(missing_ok=True)
+                else:
+                    raise Exception("Output files are empty")
+            else:
+                raise Exception("Output files were not created")
+
+        except Exception as e:
+            print(f"\nError processing {asset}: {e}")
             skipped += 1
+            # Clean up any partial files
+            pdf_path.unlink(missing_ok=True)
+            md_path.unlink(missing_ok=True)
+            if 'image_path' in locals():
+                image_path.unlink(missing_ok=True)

     # Clean up temp directory
-    temp_dir = dataset_output_dir / 'temp'
     if temp_dir.exists():
+        # Remove any remaining temp files
+        for temp_file in temp_dir.glob('*'):
+            temp_file.unlink(missing_ok=True)
         try:
             temp_dir.rmdir()
         except:
             pass

-    print(f"Completed {csv_name}: {processed} processed, {skipped} skipped")
+    print(f"Completed {csv_name}: {processed} total processed ({newly_processed} new), {skipped} skipped")


 def main():
@@ -193,11 +265,19 @@ def main():
     print(f"Found {len(csv_files)} CSV files to process")

+    # Scan existing outputs to avoid reprocessing
+    print("Scanning existing outputs...")
+    processed_assets = scan_existing_outputs(output_dir)
+
+    if processed_assets:
+        print(f"Found {len(processed_assets)} already processed items")
+
     # Process each CSV file
     for csv_file in csv_files:
-        process_csv_file(csv_file, output_dir, args.skip_cleanup)
+        process_csv_file(csv_file, output_dir, processed_assets, args.skip_cleanup)

     print(f"\nAll processing complete. Output saved to {output_dir}")
+    print(f"Total items processed: {len(processed_assets)}")


 if __name__ == '__main__':
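
A minimal sketch (not part of the patch) of how the resume behavior of `scan_existing_outputs` can be exercised. The module path comes from the diff header; the directory layout, stub contents, and item names here are assumptions for illustration. Only stems with a non-empty `.pdf`/`.md` pair should be reported as already processed:

```python
# Hypothetical usage sketch; paths and stub data are made up for illustration.
import tempfile
from pathlib import Path

from olmocr.data.prepare_loc_transcripts import scan_existing_outputs

with tempfile.TemporaryDirectory() as tmp:
    dataset_dir = Path(tmp) / "my_dataset"
    dataset_dir.mkdir()

    # Complete, non-empty pair: counted as processed.
    (dataset_dir / "item_001.pdf").write_bytes(b"%PDF-1.4 stub")
    (dataset_dir / "item_001.md").write_text("transcription stub")

    # Missing .md, or empty files: ignored, so these would be reprocessed.
    (dataset_dir / "item_002.pdf").write_bytes(b"%PDF-1.4 stub")
    (dataset_dir / "item_003.pdf").write_bytes(b"")
    (dataset_dir / "item_003.md").write_text("")

    assert scan_existing_outputs(Path(tmp)) == {"item_001"}
```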