LOC downloader

parent 6123d4452b
commit ed3820c0c7
@@ -12,7 +12,7 @@ import argparse
 import requests
 import img2pdf
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Set
 from tqdm import tqdm
 import time
 import hashlib
@@ -80,7 +80,36 @@ def get_safe_filename(item_id: str) -> str:
     return safe_name
 
 
-def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True) -> None:
+def scan_existing_outputs(output_dir: Path) -> Set[str]:
+    """Scan output directory to find all already processed assets."""
+    processed_assets = set()
+
+    if not output_dir.exists():
+        return processed_assets
+
+    # Scan each dataset subdirectory
+    for dataset_dir in output_dir.iterdir():
+        if not dataset_dir.is_dir():
+            continue
+
+        # Find all pairs of .pdf and .md files
+        pdf_files = {f.stem for f in dataset_dir.glob("*.pdf")}
+        md_files = {f.stem for f in dataset_dir.glob("*.md")}
+
+        # Only consider fully processed (both pdf and md exist)
+        complete_files = pdf_files.intersection(md_files)
+
+        # Verify files are not empty
+        for filename in complete_files:
+            pdf_path = dataset_dir / f"{filename}.pdf"
+            md_path = dataset_dir / f"{filename}.md"
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed_assets.add(filename)
+
+    return processed_assets
+
+
+def process_csv_file(csv_path: Path, output_dir: Path, processed_assets: Set[str], skip_cleanup: bool = True) -> None:
     """Process a single CSV file containing LOC transcription data."""
     csv_name = csv_path.stem
     dataset_output_dir = output_dir / csv_name
@@ -89,77 +118,120 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
     print(f"\nProcessing {csv_path.name}")
 
     # Read CSV
-    with open(csv_path, 'r', encoding='utf-8') as f:
+    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
         reader = csv.DictReader(f)
         rows = list(reader)
 
+    # Count already processed items for this CSV
+    already_done = sum(1 for row in rows
+                       if 'Asset' in row and get_safe_filename(row['Asset']) in processed_assets)
+
+    if already_done > 0:
+        print(f" Skipping {already_done} already processed items")
+
     # Process each row
     processed = 0
     skipped = 0
+    newly_processed = 0
+
+    # Create temp directory for downloads
+    temp_dir = dataset_output_dir / 'temp'
+    temp_dir.mkdir(exist_ok=True)
 
     for row in tqdm(rows, desc=f"Processing {csv_name}"):
         # Check required fields
-        if not all(key in row for key in ['ItemId', 'DownloadUrl', 'Transcription']):
-            print(f"Skipping row - missing required fields")
+        if not all(key in row for key in ['Asset', 'DownloadUrl', 'Transcription']):
             skipped += 1
             continue
 
-        item_id = row['ItemId']
+        asset = row['Asset']
         download_url = row['DownloadUrl']
         transcription = row['Transcription']
 
-        if not item_id or not download_url or not transcription:
+        if not asset or not download_url or not transcription:
            skipped += 1
            continue
 
-        # Create safe filename
-        safe_filename = get_safe_filename(item_id)
+        # Create safe filename using Asset column
+        safe_filename = get_safe_filename(asset)
+
+        # Skip if already processed
+        if safe_filename in processed_assets:
+            processed += 1
+            continue
 
         # Define output paths
         pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
         md_path = dataset_output_dir / f"{safe_filename}.md"
 
-        # Skip if already processed
+        # Double-check if files already exist on disk
         if pdf_path.exists() and md_path.exists():
-            processed += 1
-            continue
-
-        # Create temp directory for downloads
-        temp_dir = dataset_output_dir / 'temp'
-        temp_dir.mkdir(exist_ok=True)
-
-        # Download image
-        image_path = temp_dir / f"{safe_filename}.jpg"
-        if download_image(download_url, image_path):
-            # Convert to PDF
-            if convert_image_to_pdf(image_path, pdf_path):
-                # Clean up transcription if needed (skipping for now)
-                if skip_cleanup:
-                    cleaned_transcription = transcription
-                else:
-                    # TODO: Add transcription cleanup using GPT-4o
-                    cleaned_transcription = transcription
-
-                # Create markdown file
-                create_markdown_file(cleaned_transcription, md_path)
-                # Verify files are not empty
-                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
-                    processed += 1
-
-            # Clean up temp image
-            image_path.unlink(missing_ok=True)
-        else:
-            skipped += 1
+            # Verify files are not empty
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed += 1
+                processed_assets.add(safe_filename)
+                continue
+            else:
+                # Remove empty files to reprocess
+                pdf_path.unlink(missing_ok=True)
+                md_path.unlink(missing_ok=True)
+
+        # Process the item
+        try:
+            # Download image
+            image_path = temp_dir / f"{safe_filename}.jpg"
+
+            if not download_image(download_url, image_path):
+                raise Exception(f"Failed to download image")
+
+            # Convert to PDF
+            if not convert_image_to_pdf(image_path, pdf_path):
+                raise Exception(f"Failed to convert image to PDF")
+
+            # Clean up transcription if needed (skipping for now)
+            if skip_cleanup:
+                cleaned_transcription = transcription
+            else:
+                # TODO: Add transcription cleanup using GPT-4o
+                cleaned_transcription = transcription
+
+            # Create markdown file
+            create_markdown_file(cleaned_transcription, md_path)
+
+            # Verify both files exist and are non-empty
+            if pdf_path.exists() and md_path.exists():
+                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                    processed += 1
+                    newly_processed += 1
+                    processed_assets.add(safe_filename)
+
+                    # Clean up temp image
+                    image_path.unlink(missing_ok=True)
+                else:
+                    raise Exception("Output files are empty")
+            else:
+                raise Exception("Output files were not created")
+
+        except Exception as e:
+            print(f"\nError processing {asset}: {e}")
+            skipped += 1
+            # Clean up any partial files
+            pdf_path.unlink(missing_ok=True)
+            md_path.unlink(missing_ok=True)
+            if 'image_path' in locals():
+                image_path.unlink(missing_ok=True)
 
     # Clean up temp directory
     temp_dir = dataset_output_dir / 'temp'
     if temp_dir.exists():
         # Remove any remaining temp files
         for temp_file in temp_dir.glob('*'):
             temp_file.unlink(missing_ok=True)
         try:
             temp_dir.rmdir()
         except:
             pass
 
-    print(f"Completed {csv_name}: {processed} processed, {skipped} skipped")
+    print(f"Completed {csv_name}: {processed} total processed ({newly_processed} new), {skipped} skipped")
@@ -193,11 +265,19 @@ def main():
 
     print(f"Found {len(csv_files)} CSV files to process")
 
+    # Scan existing outputs to avoid reprocessing
+    print("Scanning existing outputs...")
+    processed_assets = scan_existing_outputs(output_dir)
+
+    if processed_assets:
+        print(f"Found {len(processed_assets)} already processed items")
+
     # Process each CSV file
     for csv_file in csv_files:
-        process_csv_file(csv_file, output_dir, args.skip_cleanup)
+        process_csv_file(csv_file, output_dir, processed_assets, args.skip_cleanup)
 
     print(f"\nAll processing complete. Output saved to {output_dir}")
+    print(f"Total items processed: {len(processed_assets)}")
 
 
 if __name__ == '__main__':
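
The hunks above call download_image, convert_image_to_pdf, and create_markdown_file, which are defined elsewhere in the script and are not part of this commit. The sketch below is only an illustration of what such helpers could look like, assuming requests for the download and img2pdf for the conversion (the two imports visible at the top of the file); the names and signatures come from the call sites, while the bodies are illustrative assumptions, not the script's actual implementation.

# Illustrative sketches only; the real helpers live outside the hunks shown above.
import img2pdf
import requests
from pathlib import Path


def download_image(url: str, dest: Path, timeout: int = 30) -> bool:
    """Fetch an image from the LOC download URL into dest; True on success (assumed behavior)."""
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        dest.write_bytes(resp.content)
        return True
    except requests.RequestException:
        return False


def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool:
    """Wrap a single image into a one-page PDF with img2pdf; True on success (assumed behavior)."""
    try:
        pdf_path.write_bytes(img2pdf.convert(str(image_path)))
        return True
    except Exception:
        return False


def create_markdown_file(text: str, md_path: Path) -> None:
    """Write the transcription text to the .md file paired with the PDF (assumed behavior)."""
    md_path.write_text(text, encoding='utf-8')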
||||
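
The resume rule this commit introduces (a stem counts as already processed only when both a non-empty .pdf and a non-empty .md exist in the same dataset directory) can be exercised on its own. The check below runs against a throwaway directory; scan_like_commit is a hypothetical stand-in that mirrors scan_existing_outputs for demonstration and is not part of the commit.

# Hypothetical check of the resume/skip rule: only stems with BOTH a non-empty
# .pdf and a non-empty .md should count as already processed.
import tempfile
from pathlib import Path


def scan_like_commit(output_dir: Path) -> set:
    done = set()
    for dataset_dir in output_dir.iterdir():
        if not dataset_dir.is_dir():
            continue
        pdf_stems = {f.stem for f in dataset_dir.glob("*.pdf")}
        md_stems = {f.stem for f in dataset_dir.glob("*.md")}
        for stem in pdf_stems & md_stems:
            if (dataset_dir / f"{stem}.pdf").stat().st_size > 0 and \
               (dataset_dir / f"{stem}.md").stat().st_size > 0:
                done.add(stem)
    return done


with tempfile.TemporaryDirectory() as tmp:
    ds = Path(tmp) / "dataset1"
    ds.mkdir()
    (ds / "a.pdf").write_bytes(b"%PDF-1.4")   # complete, non-empty pair -> counted
    (ds / "a.md").write_text("transcription")
    (ds / "b.pdf").write_bytes(b"%PDF-1.4")   # missing .md -> ignored
    (ds / "c.pdf").write_bytes(b"")           # empty pair -> ignored, will be reprocessed
    (ds / "c.md").write_text("")
    assert scan_like_commit(Path(tmp)) == {"a"}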