mirror of
https://github.com/allenai/olmocr.git
synced 2025-11-11 07:58:10 +00:00
Cleanup stuff
This commit is contained in:
parent
bade86fe91
commit
e6cff25b6b
@ -251,24 +251,50 @@ for i, arg in enumerate(modified_args):
|
|||||||
grpo_cmd += " " + " ".join(filtered_args)
|
grpo_cmd += " " + " ".join(filtered_args)
|
||||||
|
|
||||||
# Create a bash script as a single command string with S3 sync
|
# Create a bash script as a single command string with S3 sync
|
||||||
s3_sync_commands = ""
|
# Prepare S3 sync command for embedding in cleanup function
|
||||||
if local_output_dir and s3_output_path:
|
if local_output_dir and s3_output_path:
|
||||||
s3_sync_commands = f"""
|
s3_sync_cmd = f"s5cmd sync '{local_output_dir}/*' '{s3_output_path}/'"
|
||||||
|
else:
|
||||||
# Sync checkpoints to S3
|
s3_sync_cmd = None
|
||||||
echo 'Syncing checkpoints to S3...'
|
|
||||||
if [ -d "{local_output_dir}" ]; then
|
|
||||||
echo "Syncing from {local_output_dir} to {s3_output_path}"
|
|
||||||
s5cmd sync '{local_output_dir}/*' '{s3_output_path}/'
|
|
||||||
echo 'S3 sync completed successfully'
|
|
||||||
else
|
|
||||||
echo 'Warning: Output directory {local_output_dir} not found, skipping S3 sync'
|
|
||||||
fi
|
|
||||||
"""
|
|
||||||
|
|
||||||
bash_script = f"""
|
bash_script = f"""
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
# Define cleanup function that will always run
|
||||||
|
cleanup() {{
|
||||||
|
EXIT_CODE=$?
|
||||||
|
echo "Running cleanup (exit code: $EXIT_CODE)..."
|
||||||
|
|
||||||
|
# Kill VLLM server if it's still running
|
||||||
|
if [ ! -z "$VLLM_PID" ]; then
|
||||||
|
echo "Killing VLLM server (PID: $VLLM_PID)..."
|
||||||
|
kill $VLLM_PID || true
|
||||||
|
echo "VLLM server stopped."
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Always sync to S3 if output directory exists
|
||||||
|
echo "Checking for outputs to sync to S3..."
|
||||||
|
if [ -d "{local_output_dir if local_output_dir else '/tmp/checkpoints'}" ]; then
|
||||||
|
echo "Output directory exists, syncing to S3..."
|
||||||
|
{f'echo "Syncing from {local_output_dir} to {s3_output_path}"' if s3_sync_cmd else ''}
|
||||||
|
{s3_sync_cmd if s3_sync_cmd else 'echo "No S3 sync configured"'}
|
||||||
|
{f'echo "S3 sync completed"' if s3_sync_cmd else ''}
|
||||||
|
else
|
||||||
|
echo "No output directory found, skipping S3 sync"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ $EXIT_CODE -eq 0 ]; then
|
||||||
|
echo "Script completed successfully"
|
||||||
|
else
|
||||||
|
echo "Script failed with exit code: $EXIT_CODE"
|
||||||
|
fi
|
||||||
|
|
||||||
|
exit $EXIT_CODE
|
||||||
|
}}
|
||||||
|
|
||||||
|
# Set trap to run cleanup on EXIT (covers both success and failure)
|
||||||
|
trap cleanup EXIT
|
||||||
|
|
||||||
# Setup commands
|
# Setup commands
|
||||||
{" && ".join(setup_commands)}
|
{" && ".join(setup_commands)}
|
||||||
|
|
||||||
@ -298,11 +324,7 @@ done
|
|||||||
echo 'Starting GRPO training on GPUs {training_gpu_str}...'
|
echo 'Starting GRPO training on GPUs {training_gpu_str}...'
|
||||||
{grpo_cmd}
|
{grpo_cmd}
|
||||||
|
|
||||||
# Cleanup
|
echo 'Training completed successfully!'
|
||||||
echo 'Training completed. Killing VLLM server...'
|
|
||||||
kill $VLLM_PID || true
|
|
||||||
echo 'VLLM server stopped.'
|
|
||||||
{s3_sync_commands}
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Create single task spec
|
# Create single task spec
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user