mirror of https://github.com/allenai/olmocr.git (synced 2025-11-10 23:50:43 +00:00)

commit ed3820c0c7 ("LOC downloader")
parent 6123d4452b
@@ -12,7 +12,7 @@ import argparse
 import requests
 import img2pdf
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Set
 from tqdm import tqdm
 import time
 import hashlib
@@ -80,7 +80,36 @@ def get_safe_filename(item_id: str) -> str:
     return safe_name


-def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True) -> None:
+def scan_existing_outputs(output_dir: Path) -> Set[str]:
+    """Scan output directory to find all already processed assets."""
+    processed_assets = set()
+
+    if not output_dir.exists():
+        return processed_assets
+
+    # Scan each dataset subdirectory
+    for dataset_dir in output_dir.iterdir():
+        if not dataset_dir.is_dir():
+            continue
+
+        # Find all pairs of .pdf and .md files
+        pdf_files = {f.stem for f in dataset_dir.glob("*.pdf")}
+        md_files = {f.stem for f in dataset_dir.glob("*.md")}
+
+        # Only consider fully processed (both pdf and md exist)
+        complete_files = pdf_files.intersection(md_files)
+
+        # Verify files are not empty
+        for filename in complete_files:
+            pdf_path = dataset_dir / f"{filename}.pdf"
+            md_path = dataset_dir / f"{filename}.md"
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed_assets.add(filename)
+
+    return processed_assets
+
+
+def process_csv_file(csv_path: Path, output_dir: Path, processed_assets: Set[str], skip_cleanup: bool = True) -> None:
     """Process a single CSV file containing LOC transcription data."""
     csv_name = csv_path.stem
     dataset_output_dir = output_dir / csv_name
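The scan above keys the resume set by the stems that get_safe_filename produces, so the names recorded here match the ones the processing loop checks later. get_safe_filename itself is not part of this diff (only its tail is visible in the hunk header); a minimal sketch of such a helper, assuming it simply replaces characters that are unsafe in filenames (the real script may do more, for example hashing overly long identifiers with the imported hashlib):

def get_safe_filename(item_id: str) -> str:
    # Hypothetical sketch: keep alphanumerics, dashes, underscores and dots,
    # replace everything else so the result is a safe file stem.
    safe_name = "".join(c if c.isalnum() or c in "-_." else "_" for c in item_id)
    return safe_name or "unnamed"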
@@ -89,50 +118,76 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
     print(f"\nProcessing {csv_path.name}")

     # Read CSV
-    with open(csv_path, 'r', encoding='utf-8') as f:
+    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
         reader = csv.DictReader(f)
         rows = list(reader)

+    # Count already processed items for this CSV
+    already_done = sum(1 for row in rows
+                       if 'Asset' in row and get_safe_filename(row['Asset']) in processed_assets)
+
+    if already_done > 0:
+        print(f" Skipping {already_done} already processed items")
+
     # Process each row
     processed = 0
     skipped = 0
+    newly_processed = 0
-    for row in tqdm(rows, desc=f"Processing {csv_name}"):
-        # Check required fields
-        if not all(key in row for key in ['ItemId', 'DownloadUrl', 'Transcription']):
-            print(f"Skipping row - missing required fields")
-            skipped += 1
-            continue
-
-        item_id = row['ItemId']
-        download_url = row['DownloadUrl']
-        transcription = row['Transcription']
-
-        if not item_id or not download_url or not transcription:
-            skipped += 1
-            continue
-
-        # Create safe filename
-        safe_filename = get_safe_filename(item_id)
-
-        # Define output paths
-        pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
-        md_path = dataset_output_dir / f"{safe_filename}.md"
-
-        # Skip if already processed
-        if pdf_path.exists() and md_path.exists():
-            processed += 1
-            continue
-
     # Create temp directory for downloads
     temp_dir = dataset_output_dir / 'temp'
     temp_dir.mkdir(exist_ok=True)

+    for row in tqdm(rows, desc=f"Processing {csv_name}"):
+        # Check required fields
+        if not all(key in row for key in ['Asset', 'DownloadUrl', 'Transcription']):
+            skipped += 1
+            continue
+
+        asset = row['Asset']
+        download_url = row['DownloadUrl']
+        transcription = row['Transcription']
+
+        if not asset or not download_url or not transcription:
+            skipped += 1
+            continue
+
+        # Create safe filename using Asset column
+        safe_filename = get_safe_filename(asset)
+
+        # Skip if already processed
+        if safe_filename in processed_assets:
+            processed += 1
+            continue
+
+        # Define output paths
+        pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
+        md_path = dataset_output_dir / f"{safe_filename}.md"
+
+        # Double-check if files already exist on disk
+        if pdf_path.exists() and md_path.exists():
+            # Verify files are not empty
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed += 1
+                processed_assets.add(safe_filename)
+                continue
+            else:
+                # Remove empty files to reprocess
+                pdf_path.unlink(missing_ok=True)
+                md_path.unlink(missing_ok=True)
+
+        # Process the item
+        try:
             # Download image
             image_path = temp_dir / f"{safe_filename}.jpg"
-            if download_image(download_url, image_path):
+            if not download_image(download_url, image_path):
+                raise Exception(f"Failed to download image")
+
             # Convert to PDF
-            if convert_image_to_pdf(image_path, pdf_path):
+            if not convert_image_to_pdf(image_path, pdf_path):
+                raise Exception(f"Failed to convert image to PDF")
+
             # Clean up transcription if needed (skipping for now)
             if skip_cleanup:
                 cleaned_transcription = transcription
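The rewritten loop treats a False return from download_image or convert_image_to_pdf as a hard failure and turns it into an exception, so the except block in the next hunk can clean up any partial output. Neither helper is shown in this diff; a minimal sketch of what they might look like, assuming download_image fetches with requests and convert_image_to_pdf wraps img2pdf (both modules are imported at the top of the script), with hypothetical bodies:

from pathlib import Path

import img2pdf
import requests


def download_image(url: str, dest: Path) -> bool:
    # Hypothetical sketch: fetch the image over HTTP and write it to dest.
    try:
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        dest.write_bytes(resp.content)
        return dest.stat().st_size > 0
    except requests.RequestException:
        return False


def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool:
    # Hypothetical sketch: wrap a single image into a one-page PDF with img2pdf.
    try:
        pdf_path.write_bytes(img2pdf.convert(str(image_path)))
        return True
    except Exception:
        return False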
@@ -142,24 +197,41 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True

             # Create markdown file
             create_markdown_file(cleaned_transcription, md_path)

+            # Verify both files exist and are non-empty
+            if pdf_path.exists() and md_path.exists():
+                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
                     processed += 1
+                    newly_processed += 1
+                    processed_assets.add(safe_filename)

                     # Clean up temp image
                     image_path.unlink(missing_ok=True)
                 else:
-                    skipped += 1
+                    raise Exception("Output files are empty")
             else:
+                raise Exception("Output files were not created")
+
+        except Exception as e:
+            print(f"\nError processing {asset}: {e}")
             skipped += 1
+            # Clean up any partial files
+            pdf_path.unlink(missing_ok=True)
+            md_path.unlink(missing_ok=True)
+            if 'image_path' in locals():
+                image_path.unlink(missing_ok=True)

     # Clean up temp directory
-    temp_dir = dataset_output_dir / 'temp'
     if temp_dir.exists():
+        # Remove any remaining temp files
+        for temp_file in temp_dir.glob('*'):
+            temp_file.unlink(missing_ok=True)
         try:
             temp_dir.rmdir()
         except:
             pass

-    print(f"Completed {csv_name}: {processed} processed, {skipped} skipped")
+    print(f"Completed {csv_name}: {processed} total processed ({newly_processed} new), {skipped} skipped")


 def main():
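The verification step above only counts a row as processed when both the PDF and the markdown file exist and are non-empty, which assumes create_markdown_file (defined elsewhere in the script, not shown in this diff) writes the transcription out directly. A minimal sketch under that assumption:

from pathlib import Path


def create_markdown_file(text: str, md_path: Path) -> None:
    # Hypothetical sketch: persist the transcription as a UTF-8 markdown file.
    md_path.write_text(text, encoding="utf-8")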
@@ -193,11 +265,19 @@ def main():

     print(f"Found {len(csv_files)} CSV files to process")

+    # Scan existing outputs to avoid reprocessing
+    print("Scanning existing outputs...")
+    processed_assets = scan_existing_outputs(output_dir)
+
+    if processed_assets:
+        print(f"Found {len(processed_assets)} already processed items")
+
     # Process each CSV file
     for csv_file in csv_files:
-        process_csv_file(csv_file, output_dir, args.skip_cleanup)
+        process_csv_file(csv_file, output_dir, processed_assets, args.skip_cleanup)

     print(f"\nAll processing complete. Output saved to {output_dir}")
+    print(f"Total items processed: {len(processed_assets)}")


 if __name__ == '__main__':