LOC downloader

Jake Poznanski 2025-08-28 17:38:20 +00:00
parent 6123d4452b
commit ed3820c0c7


@@ -12,7 +12,7 @@ import argparse
 import requests
 import img2pdf
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Set
 from tqdm import tqdm
 import time
 import hashlib
@@ -80,7 +80,36 @@ def get_safe_filename(item_id: str) -> str:
     return safe_name
 
-def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True) -> None:
+def scan_existing_outputs(output_dir: Path) -> Set[str]:
+    """Scan output directory to find all already processed assets."""
+    processed_assets = set()
+
+    if not output_dir.exists():
+        return processed_assets
+
+    # Scan each dataset subdirectory
+    for dataset_dir in output_dir.iterdir():
+        if not dataset_dir.is_dir():
+            continue
+
+        # Find all pairs of .pdf and .md files
+        pdf_files = {f.stem for f in dataset_dir.glob("*.pdf")}
+        md_files = {f.stem for f in dataset_dir.glob("*.md")}
+
+        # Only consider fully processed (both pdf and md exist)
+        complete_files = pdf_files.intersection(md_files)
+
+        # Verify files are not empty
+        for filename in complete_files:
+            pdf_path = dataset_dir / f"{filename}.pdf"
+            md_path = dataset_dir / f"{filename}.md"
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed_assets.add(filename)
+
+    return processed_assets
+
+def process_csv_file(csv_path: Path, output_dir: Path, processed_assets: Set[str], skip_cleanup: bool = True) -> None:
     """Process a single CSV file containing LOC transcription data."""
     csv_name = csv_path.stem
     dataset_output_dir = output_dir / csv_name
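
The resume contract introduced above: an asset counts as already processed only when its dataset subdirectory contains both a .pdf and a .md with the same stem, and neither file is empty. A minimal sketch of that behavior against a throwaway directory (the item names here are hypothetical, not from the script):

    import tempfile
    from pathlib import Path

    with tempfile.TemporaryDirectory() as tmp:
        dataset_dir = Path(tmp) / "dataset1"
        dataset_dir.mkdir()
        (dataset_dir / "item-a.pdf").write_bytes(b"%PDF-1.4 stub")  # non-empty pair
        (dataset_dir / "item-a.md").write_text("transcription")
        (dataset_dir / "item-b.pdf").write_bytes(b"stub")           # pdf without md
        (dataset_dir / "item-c.pdf").write_bytes(b"")               # empty pair
        (dataset_dir / "item-c.md").write_text("")
        print(scan_existing_outputs(Path(tmp)))                     # expected: {'item-a'}
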
@@ -89,77 +118,120 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
     print(f"\nProcessing {csv_path.name}")
 
     # Read CSV
-    with open(csv_path, 'r', encoding='utf-8') as f:
+    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
         reader = csv.DictReader(f)
         rows = list(reader)
 
+    # Count already processed items for this CSV
+    already_done = sum(1 for row in rows
+                       if 'Asset' in row and get_safe_filename(row['Asset']) in processed_assets)
+    if already_done > 0:
+        print(f" Skipping {already_done} already processed items")
+
     # Process each row
     processed = 0
     skipped = 0
+    newly_processed = 0
+
+    # Create temp directory for downloads
+    temp_dir = dataset_output_dir / 'temp'
+    temp_dir.mkdir(exist_ok=True)
 
     for row in tqdm(rows, desc=f"Processing {csv_name}"):
         # Check required fields
-        if not all(key in row for key in ['ItemId', 'DownloadUrl', 'Transcription']):
-            print(f"Skipping row - missing required fields")
+        if not all(key in row for key in ['Asset', 'DownloadUrl', 'Transcription']):
             skipped += 1
             continue
 
-        item_id = row['ItemId']
+        asset = row['Asset']
         download_url = row['DownloadUrl']
         transcription = row['Transcription']
 
-        if not item_id or not download_url or not transcription:
+        if not asset or not download_url or not transcription:
             skipped += 1
             continue
 
-        # Create safe filename
-        safe_filename = get_safe_filename(item_id)
+        # Create safe filename using Asset column
+        safe_filename = get_safe_filename(asset)
+
+        # Skip if already processed
+        if safe_filename in processed_assets:
+            processed += 1
+            continue
 
         # Define output paths
         pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
         md_path = dataset_output_dir / f"{safe_filename}.md"
 
-        # Skip if already processed
+        # Double-check if files already exist on disk
         if pdf_path.exists() and md_path.exists():
-            processed += 1
-            continue
-
-        # Create temp directory for downloads
-        temp_dir = dataset_output_dir / 'temp'
-        temp_dir.mkdir(exist_ok=True)
-
-        # Download image
-        image_path = temp_dir / f"{safe_filename}.jpg"
-        if download_image(download_url, image_path):
-            # Convert to PDF
-            if convert_image_to_pdf(image_path, pdf_path):
-                # Clean up transcription if needed (skipping for now)
-                if skip_cleanup:
-                    cleaned_transcription = transcription
-                else:
-                    # TODO: Add transcription cleanup using GPT-4o
-                    cleaned_transcription = transcription
-
-                # Create markdown file
-                create_markdown_file(cleaned_transcription, md_path)
-
-                # Verify files are not empty
-                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
-                    processed += 1
-                    # Clean up temp image
-                    image_path.unlink(missing_ok=True)
-                else:
-                    skipped += 1
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed += 1
+                processed_assets.add(safe_filename)
+                continue
             else:
-                skipped += 1
-        else:
-            skipped += 1
+                # Remove empty files to reprocess
+                pdf_path.unlink(missing_ok=True)
+                md_path.unlink(missing_ok=True)
+
+        # Process the item
+        try:
+            # Download image
+            image_path = temp_dir / f"{safe_filename}.jpg"
+            if not download_image(download_url, image_path):
+                raise Exception(f"Failed to download image")
+
+            # Convert to PDF
+            if not convert_image_to_pdf(image_path, pdf_path):
+                raise Exception(f"Failed to convert image to PDF")
+
+            # Clean up transcription if needed (skipping for now)
+            if skip_cleanup:
+                cleaned_transcription = transcription
+            else:
+                # TODO: Add transcription cleanup using GPT-4o
+                cleaned_transcription = transcription
+
+            # Create markdown file
+            create_markdown_file(cleaned_transcription, md_path)
+
+            # Verify both files exist and are non-empty
+            if pdf_path.exists() and md_path.exists():
+                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                    processed += 1
+                    newly_processed += 1
+                    processed_assets.add(safe_filename)
+                    # Clean up temp image
+                    image_path.unlink(missing_ok=True)
+                else:
+                    raise Exception("Output files are empty")
+            else:
+                raise Exception("Output files were not created")
+
+        except Exception as e:
+            print(f"\nError processing {asset}: {e}")
+            skipped += 1
+            # Clean up any partial files
+            pdf_path.unlink(missing_ok=True)
+            md_path.unlink(missing_ok=True)
+            if 'image_path' in locals():
+                image_path.unlink(missing_ok=True)
 
     # Clean up temp directory
-    temp_dir = dataset_output_dir / 'temp'
     if temp_dir.exists():
         # Remove any remaining temp files
         for temp_file in temp_dir.glob('*'):
             temp_file.unlink(missing_ok=True)
         try:
             temp_dir.rmdir()
         except:
             pass
 
-    print(f"Completed {csv_name}: {processed} processed, {skipped} skipped")
+    print(f"Completed {csv_name}: {processed} total processed ({newly_processed} new), {skipped} skipped")
 
 def main():
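
One detail in the except block above: image_path is only bound once the download step is reached, so the cleanup guards its unlink with 'image_path' in locals() to avoid a NameError when a row fails earlier. A self-contained sketch of the same pattern:

    from pathlib import Path

    def process_one(fail_early: bool) -> None:
        try:
            if fail_early:
                raise RuntimeError("failed before image_path was assigned")
            image_path = Path("temp") / "item.jpg"
            raise RuntimeError("failed after image_path was assigned")
        except RuntimeError as e:
            print(f"Error: {e}")
            # Without the locals() check, the fail_early case would raise NameError here.
            if 'image_path' in locals():
                image_path.unlink(missing_ok=True)

    process_one(fail_early=True)   # unlink safely skipped
    process_one(fail_early=False)  # unlink runs; missing_ok tolerates the absent file
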
@@ -193,11 +265,19 @@ def main():
     print(f"Found {len(csv_files)} CSV files to process")
 
+    # Scan existing outputs to avoid reprocessing
+    print("Scanning existing outputs...")
+    processed_assets = scan_existing_outputs(output_dir)
+    if processed_assets:
+        print(f"Found {len(processed_assets)} already processed items")
+
     # Process each CSV file
     for csv_file in csv_files:
-        process_csv_file(csv_file, output_dir, args.skip_cleanup)
+        process_csv_file(csv_file, output_dir, processed_assets, args.skip_cleanup)
 
     print(f"\nAll processing complete. Output saved to {output_dir}")
+    print(f"Total items processed: {len(processed_assets)}")
 
 if __name__ == '__main__':
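
Taken together, the resume flow mirrors this call pattern from main() (a minimal sketch; the downloads/ and csvs/ paths are placeholders, not from the script):

    from pathlib import Path

    output_dir = Path("downloads")                   # placeholder output root
    csv_files = sorted(Path("csvs").glob("*.csv"))   # placeholder CSV location

    # One upfront scan builds the shared set of finished assets...
    processed_assets = scan_existing_outputs(output_dir)

    # ...and each CSV is processed against (and mutates) that same set, so an
    # asset finished in an earlier run or an earlier CSV is never re-downloaded.
    for csv_file in csv_files:
        process_csv_file(csv_file, output_dir, processed_assets, skip_cleanup=True)

    print(f"Total items processed: {len(processed_assets)}")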