mirror of
https://github.com/allenai/olmocr.git
synced 2025-10-16 18:52:50 +00:00
Clean downloads
This commit is contained in:
parent
3d3d184f25
commit
f426826850
@@ -101,11 +101,11 @@ def scan_existing_outputs(output_dir: Path) -> Set[str]:
|
|||||||
# Only consider fully processed (both pdf and md exist)
|
# Only consider fully processed (both pdf and md exist)
|
||||||
complete_files = pdf_files.intersection(md_files)
|
complete_files = pdf_files.intersection(md_files)
|
||||||
|
|
||||||
# Verify files are not empty
|
# Verify PDF files are not empty (md can be empty)
|
||||||
for filename in complete_files:
|
for filename in complete_files:
|
||||||
pdf_path = dataset_dir / f"{filename}.pdf"
|
pdf_path = dataset_dir / f"{filename}.pdf"
|
||||||
md_path = dataset_dir / f"{filename}.md"
|
md_path = dataset_dir / f"{filename}.md"
|
||||||
if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
|
if pdf_path.stat().st_size > 0: # Only PDF needs to be non-empty
|
||||||
processed_assets.add(filename)
|
processed_assets.add(filename)
|
||||||
|
|
||||||
return processed_assets
|
return processed_assets
|
||||||
@@ -124,6 +124,11 @@ def process_single_item(
|
|||||||
if not all(key in row for key in ['Asset', 'DownloadUrl']):
|
if not all(key in row for key in ['Asset', 'DownloadUrl']):
|
||||||
return ('', False, 'Missing required fields')
|
return ('', False, 'Missing required fields')
|
||||||
|
|
||||||
|
# Check AssetStatus is completed
|
||||||
|
asset_status = row.get('AssetStatus', '')
|
||||||
|
if asset_status != 'completed':
|
||||||
|
return (row.get('Asset', ''), False, f'AssetStatus is not completed: {asset_status}')
|
||||||
|
|
||||||
asset = row['Asset']
|
asset = row['Asset']
|
||||||
download_url = row['DownloadUrl']
|
download_url = row['DownloadUrl']
|
||||||
transcription = row.get('Transcription', '') # Allow empty transcription
|
transcription = row.get('Transcription', '') # Allow empty transcription
|
||||||
@@ -145,13 +150,13 @@ def process_single_item(
|
|||||||
|
|
||||||
# Double-check if files already exist on disk
|
# Double-check if files already exist on disk
|
||||||
if pdf_path.exists() and md_path.exists():
|
if pdf_path.exists() and md_path.exists():
|
||||||
# Verify files are not empty
|
# Verify PDF is not empty (md can be empty)
|
||||||
if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
|
if pdf_path.stat().st_size > 0:
|
||||||
with processed_lock:
|
with processed_lock:
|
||||||
processed_assets.add(safe_filename)
|
processed_assets.add(safe_filename)
|
||||||
return (asset, True, None)
|
return (asset, True, None)
|
||||||
else:
|
else:
|
||||||
# Remove empty files to reprocess
|
# Remove files to reprocess if PDF is empty
|
||||||
pdf_path.unlink(missing_ok=True)
|
pdf_path.unlink(missing_ok=True)
|
||||||
md_path.unlink(missing_ok=True)
|
md_path.unlink(missing_ok=True)
|
||||||
|
|
||||||
@@ -180,9 +185,9 @@ def process_single_item(
|
|||||||
# Create markdown file
|
# Create markdown file
|
||||||
create_markdown_file(cleaned_transcription, md_path)
|
create_markdown_file(cleaned_transcription, md_path)
|
||||||
|
|
||||||
# Verify both files exist and are non-empty
|
# Verify both files exist (md can be empty, pdf should not be)
|
||||||
if pdf_path.exists() and md_path.exists():
|
if pdf_path.exists() and md_path.exists():
|
||||||
if pdf_path.stat().st_size > 0 and md_path.stat().st_size > 0:
|
if pdf_path.stat().st_size > 0: # Only PDF needs to be non-empty
|
||||||
with processed_lock:
|
with processed_lock:
|
||||||
processed_assets.add(safe_filename)
|
processed_assets.add(safe_filename)
|
||||||
|
|
||||||
@@ -190,7 +195,7 @@ def process_single_item(
|
|||||||
image_path.unlink(missing_ok=True)
|
image_path.unlink(missing_ok=True)
|
||||||
return (asset, True, None)
|
return (asset, True, None)
|
||||||
else:
|
else:
|
||||||
raise Exception("Output files are empty")
|
raise Exception("PDF file is empty")
|
||||||
else:
|
else:
|
||||||
raise Exception("Output files were not created")
|
raise Exception("Output files were not created")
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user