Clean downloads

This commit is contained in:
Jake Poznanski 2025-08-28 18:03:52 +00:00
parent 3d3d184f25
commit f426826850

View File

@@ -101,11 +101,11 @@ def scan_existing_outputs(output_dir: Path) -> Set[str]:
# Only consider fully processed (both pdf and md exist) # Only consider fully processed (both pdf and md exist)
complete_files = pdf_files.intersection(md_files) complete_files = pdf_files.intersection(md_files)
# Verify files are not empty # Verify PDF files are not empty (md can be empty)
for filename in complete_files: for filename in complete_files:
pdf_path = dataset_dir / f"{filename}.pdf" pdf_path = dataset_dir / f"{filename}.pdf"
md_path = dataset_dir / f"{filename}.md" md_path = dataset_dir / f"{filename}.md"
if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0: if pdf_path.stat().st_size > 0: # Only PDF needs to be non-empty
processed_assets.add(filename) processed_assets.add(filename)
return processed_assets return processed_assets
@@ -124,6 +124,11 @@ def process_single_item(
if not all(key in row for key in ['Asset', 'DownloadUrl']): if not all(key in row for key in ['Asset', 'DownloadUrl']):
return ('', False, 'Missing required fields') return ('', False, 'Missing required fields')
# Check AssetStatus is completed
asset_status = row.get('AssetStatus', '')
if asset_status != 'completed':
return (row.get('Asset', ''), False, f'AssetStatus is not completed: {asset_status}')
asset = row['Asset'] asset = row['Asset']
download_url = row['DownloadUrl'] download_url = row['DownloadUrl']
transcription = row.get('Transcription', '') # Allow empty transcription transcription = row.get('Transcription', '') # Allow empty transcription
@@ -145,13 +150,13 @@ def process_single_item(
# Double-check if files already exist on disk # Double-check if files already exist on disk
if pdf_path.exists() and md_path.exists(): if pdf_path.exists() and md_path.exists():
# Verify files are not empty # Verify PDF is not empty (md can be empty)
if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0: if pdf_path.stat().st_size > 0:
with processed_lock: with processed_lock:
processed_assets.add(safe_filename) processed_assets.add(safe_filename)
return (asset, True, None) return (asset, True, None)
else: else:
# Remove empty files to reprocess # Remove files to reprocess if PDF is empty
pdf_path.unlink(missing_ok=True) pdf_path.unlink(missing_ok=True)
md_path.unlink(missing_ok=True) md_path.unlink(missing_ok=True)
@@ -180,9 +185,9 @@ def process_single_item(
# Create markdown file # Create markdown file
create_markdown_file(cleaned_transcription, md_path) create_markdown_file(cleaned_transcription, md_path)
# Verify both files exist and are non-empty # Verify both files exist (md can be empty, pdf should not be)
if pdf_path.exists() and md_path.exists(): if pdf_path.exists() and md_path.exists():
if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0: if pdf_path.stat().st_size > 0: # Only PDF needs to be non-empty
with processed_lock: with processed_lock:
processed_assets.add(safe_filename) processed_assets.add(safe_filename)
@@ -190,7 +195,7 @@ def process_single_item(
image_path.unlink(missing_ok=True) image_path.unlink(missing_ok=True)
return (asset, True, None) return (asset, True, None)
else: else:
raise Exception("Output files are empty") raise Exception("PDF file is empty")
else: else:
raise Exception("Output files were not created") raise Exception("Output files were not created")