diff --git a/olmocr/data/prepare_loc_transcripts.py b/olmocr/data/prepare_loc_transcripts.py index ffffa37..c1198a5 100644 --- a/olmocr/data/prepare_loc_transcripts.py +++ b/olmocr/data/prepare_loc_transcripts.py @@ -101,11 +101,11 @@ def scan_existing_outputs(output_dir: Path) -> Set[str]: # Only consider fully processed (both pdf and md exist) complete_files = pdf_files.intersection(md_files) - # Verify files are not empty + # Verify PDF files are not empty (md can be empty) for filename in complete_files: pdf_path = dataset_dir / f"{filename}.pdf" md_path = dataset_dir / f"{filename}.md" - if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0: + if pdf_path.stat().st_size > 0: # Only PDF needs to be non-empty processed_assets.add(filename) return processed_assets @@ -124,6 +124,11 @@ def process_single_item( if not all(key in row for key in ['Asset', 'DownloadUrl']): return ('', False, 'Missing required fields') + # Check AssetStatus is completed + asset_status = row.get('AssetStatus', '') + if asset_status != 'completed': + return (row.get('Asset', ''), False, f'AssetStatus is not completed: {asset_status}') + asset = row['Asset'] download_url = row['DownloadUrl'] transcription = row.get('Transcription', '') # Allow empty transcription @@ -145,13 +150,13 @@ def process_single_item( # Double-check if files already exist on disk if pdf_path.exists() and md_path.exists(): - # Verify files are not empty - if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0: + # Verify PDF is not empty (md can be empty) + if pdf_path.stat().st_size > 0: with processed_lock: processed_assets.add(safe_filename) return (asset, True, None) else: - # Remove empty files to reprocess + # Remove files to reprocess if PDF is empty pdf_path.unlink(missing_ok=True) md_path.unlink(missing_ok=True) @@ -180,9 +185,9 @@ def process_single_item( # Create markdown file create_markdown_file(cleaned_transcription, md_path) - # Verify both files exist and are non-empty + # Verify both files exist (md can be empty, pdf should not be) if pdf_path.exists() and md_path.exists(): - if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0: + if pdf_path.stat().st_size > 0: # Only PDF needs to be non-empty with processed_lock: processed_assets.add(safe_filename) @@ -190,7 +195,7 @@ def process_single_item( image_path.unlink(missing_ok=True) return (asset, True, None) else: - raise Exception("Output files are empty") + raise Exception("PDF file is empty") else: raise Exception("Output files were not created")