LOC downloader

Jake Poznanski 2025-08-28 17:38:20 +00:00
parent 6123d4452b
commit ed3820c0c7


@@ -12,7 +12,7 @@ import argparse
 import requests
 import img2pdf
 from pathlib import Path
-from typing import List, Dict
+from typing import List, Dict, Set
 from tqdm import tqdm
 import time
 import hashlib
@@ -80,7 +80,36 @@ def get_safe_filename(item_id: str) -> str:
     return safe_name

-def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True) -> None:
+def scan_existing_outputs(output_dir: Path) -> Set[str]:
+    """Scan output directory to find all already processed assets."""
+    processed_assets = set()
+
+    if not output_dir.exists():
+        return processed_assets
+
+    # Scan each dataset subdirectory
+    for dataset_dir in output_dir.iterdir():
+        if not dataset_dir.is_dir():
+            continue
+
+        # Find all pairs of .pdf and .md files
+        pdf_files = {f.stem for f in dataset_dir.glob("*.pdf")}
+        md_files = {f.stem for f in dataset_dir.glob("*.md")}
+
+        # Only consider fully processed (both pdf and md exist)
+        complete_files = pdf_files.intersection(md_files)
+
+        # Verify files are not empty
+        for filename in complete_files:
+            pdf_path = dataset_dir / f"{filename}.pdf"
+            md_path = dataset_dir / f"{filename}.md"
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed_assets.add(filename)
+
+    return processed_assets
+
+def process_csv_file(csv_path: Path, output_dir: Path, processed_assets: Set[str], skip_cleanup: bool = True) -> None:
     """Process a single CSV file containing LOC transcription data."""
     csv_name = csv_path.stem
     dataset_output_dir = output_dir / csv_name
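The resume index intersects the .pdf and .md stems per dataset directory, so an asset only counts as done when both halves of the pair exist and are non-empty. get_safe_filename is referenced throughout but its body sits outside this diff; a minimal sketch of the kind of sanitizer the call sites assume (hypothetical, not the committed implementation):

```python
import re

def get_safe_filename(item_id: str) -> str:
    # Hypothetical sketch: the call sites only require a stable,
    # filesystem-safe stem derived from the Asset string.
    safe_name = re.sub(r'[^A-Za-z0-9._-]', '_', item_id)
    # Trim to a conservative length so paths stay portable
    return safe_name[:200]
```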
@@ -89,50 +118,76 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
     print(f"\nProcessing {csv_path.name}")

     # Read CSV
-    with open(csv_path, 'r', encoding='utf-8') as f:
+    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
         reader = csv.DictReader(f)
         rows = list(reader)

+    # Count already processed items for this CSV
+    already_done = sum(1 for row in rows
+                       if 'Asset' in row and get_safe_filename(row['Asset']) in processed_assets)
+    if already_done > 0:
+        print(f"  Skipping {already_done} already processed items")
+
     # Process each row
     processed = 0
     skipped = 0
+    newly_processed = 0

-    for row in tqdm(rows, desc=f"Processing {csv_name}"):
-        # Check required fields
-        if not all(key in row for key in ['ItemId', 'DownloadUrl', 'Transcription']):
-            print(f"Skipping row - missing required fields")
-            skipped += 1
-            continue
-
-        item_id = row['ItemId']
-        download_url = row['DownloadUrl']
-        transcription = row['Transcription']
-
-        if not item_id or not download_url or not transcription:
-            skipped += 1
-            continue
-
-        # Create safe filename
-        safe_filename = get_safe_filename(item_id)
-
-        # Define output paths
-        pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
-        md_path = dataset_output_dir / f"{safe_filename}.md"
-
-        # Skip if already processed
-        if pdf_path.exists() and md_path.exists():
-            processed += 1
-            continue
-
     # Create temp directory for downloads
     temp_dir = dataset_output_dir / 'temp'
     temp_dir.mkdir(exist_ok=True)

+    for row in tqdm(rows, desc=f"Processing {csv_name}"):
+        # Check required fields
+        if not all(key in row for key in ['Asset', 'DownloadUrl', 'Transcription']):
+            skipped += 1
+            continue
+
+        asset = row['Asset']
+        download_url = row['DownloadUrl']
+        transcription = row['Transcription']
+
+        if not asset or not download_url or not transcription:
+            skipped += 1
+            continue
+
+        # Create safe filename using Asset column
+        safe_filename = get_safe_filename(asset)
+
+        # Skip if already processed
+        if safe_filename in processed_assets:
+            processed += 1
+            continue
+
+        # Define output paths
+        pdf_path = dataset_output_dir / f"{safe_filename}.pdf"
+        md_path = dataset_output_dir / f"{safe_filename}.md"
+
+        # Double-check if files already exist on disk
+        if pdf_path.exists() and md_path.exists():
+            # Verify files are not empty
+            if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
+                processed += 1
+                processed_assets.add(safe_filename)
+                continue
+            else:
+                # Remove empty files to reprocess
+                pdf_path.unlink(missing_ok=True)
+                md_path.unlink(missing_ok=True)
+
+        # Process the item
+        try:
             # Download image
             image_path = temp_dir / f"{safe_filename}.jpg"
-        if download_image(download_url, image_path):
+            if not download_image(download_url, image_path):
+                raise Exception(f"Failed to download image")

             # Convert to PDF
-            if convert_image_to_pdf(image_path, pdf_path):
+            if not convert_image_to_pdf(image_path, pdf_path):
+                raise Exception(f"Failed to convert image to PDF")

             # Clean up transcription if needed (skipping for now)
             if skip_cleanup:
                 cleaned_transcription = transcription
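download_image and convert_image_to_pdf are also outside this diff; the new code only relies on them returning a falsy value on failure. Given the requests and img2pdf imports at the top of the file, plausible sketches (assumptions, not the committed helpers):

```python
import requests
import img2pdf
from pathlib import Path

def download_image(url: str, dest: Path) -> bool:
    # Hypothetical sketch: fetch the image and report success/failure,
    # matching the bool contract the loop above depends on.
    try:
        resp = requests.get(url, timeout=60)
        resp.raise_for_status()
        dest.write_bytes(resp.content)
        return dest.stat().st_size > 0
    except requests.RequestException:
        return False

def convert_image_to_pdf(image_path: Path, pdf_path: Path) -> bool:
    # Hypothetical sketch: img2pdf.convert returns the PDF as bytes.
    try:
        pdf_path.write_bytes(img2pdf.convert(str(image_path)))
        return True
    except Exception:
        return False
```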
@@ -142,24 +197,41 @@ def process_csv_file(csv_path: Path, output_dir: Path, skip_cleanup: bool = True
             # Create markdown file
             create_markdown_file(cleaned_transcription, md_path)

+            # Verify both files exist and are non-empty
+            if pdf_path.exists() and md_path.exists():
+                if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
                     processed += 1
+                    newly_processed += 1
+                    processed_assets.add(safe_filename)

                     # Clean up temp image
                     image_path.unlink(missing_ok=True)
                 else:
-                skipped += 1
+                    raise Exception("Output files are empty")
             else:
+                raise Exception("Output files were not created")
+
+        except Exception as e:
+            print(f"\nError processing {asset}: {e}")
             skipped += 1
+            # Clean up any partial files
+            pdf_path.unlink(missing_ok=True)
+            md_path.unlink(missing_ok=True)
+            if 'image_path' in locals():
+                image_path.unlink(missing_ok=True)

     # Clean up temp directory
-    temp_dir = dataset_output_dir / 'temp'
     if temp_dir.exists():
+        # Remove any remaining temp files
+        for temp_file in temp_dir.glob('*'):
+            temp_file.unlink(missing_ok=True)
         try:
             temp_dir.rmdir()
         except:
             pass

-    print(f"Completed {csv_name}: {processed} processed, {skipped} skipped")
+    print(f"Completed {csv_name}: {processed} total processed ({newly_processed} new), {skipped} skipped")

 def main():
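Raising inside the try for every failure mode (failed download, failed conversion, empty or missing outputs) funnels all cleanup into the single except block, so a crashed item never leaves a half-written pair behind to fool the resume scan. create_markdown_file is likewise unchanged by this commit; from its call site it presumably amounts to something like (hypothetical sketch):

```python
from pathlib import Path

def create_markdown_file(text: str, md_path: Path) -> None:
    # Hypothetical sketch assumed from the call site:
    # transcription text in, .md file out.
    md_path.write_text(text, encoding='utf-8')
```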
@@ -193,11 +265,19 @@ def main():
     print(f"Found {len(csv_files)} CSV files to process")

+    # Scan existing outputs to avoid reprocessing
+    print("Scanning existing outputs...")
+    processed_assets = scan_existing_outputs(output_dir)
+    if processed_assets:
+        print(f"Found {len(processed_assets)} already processed items")
+
     # Process each CSV file
     for csv_file in csv_files:
-        process_csv_file(csv_file, output_dir, args.skip_cleanup)
+        process_csv_file(csv_file, output_dir, processed_assets, args.skip_cleanup)

     print(f"\nAll processing complete. Output saved to {output_dir}")
+    print(f"Total items processed: {len(processed_assets)}")

 if __name__ == '__main__':
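A quick way to sanity-check the resume semantics, assuming the snippet runs in the same module as scan_existing_outputs and that the layout is output_dir/<csv stem>/<asset>.pdf plus .md, as in the code above:

```python
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    dataset = Path(tmp) / "some_dataset"   # hypothetical dataset dir
    dataset.mkdir()
    (dataset / "done.pdf").write_bytes(b"%PDF-1.4")  # non-empty pair -> counted
    (dataset / "done.md").write_text("transcript")
    (dataset / "partial.pdf").write_bytes(b"%PDF-1.4")  # no .md -> not counted
    (dataset / "empty.pdf").write_bytes(b"")            # empty pair -> not counted
    (dataset / "empty.md").write_text("")
    print(scan_existing_outputs(Path(tmp)))  # expected: {'done'}
```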