olmocr/scripts/qwen2vl-2b-gantry.sh

46 lines
1.4 KiB
Bash
Raw Permalink Normal View History

2024-09-23 14:04:22 -07:00
#!/usr/bin/env bash
# Submits the qwen2vl-2b fine-tuning job to Beaker through gantry.
# Needs `jq` on the PATH of the machine this script is run from.

# -e: stop on the first failing command      -u: unset variables are errors
# -x: trace every command as it runs         pipefail: a pipeline fails if any stage fails
set -euxo pipefail

# jq is used later to read the account name out of beaker's JSON output.
if ! command -v jq > /dev/null 2>&1; then
  echo "jq could not be found. Please install it." >&2
  # Non-zero status so callers can tell the job was never submitted.
  exit 1
fi

# Arguments appended to the training command. Single quotes keep the inner
# double quotes and ${BEAKER_USER_ID} literal here, so the variable is
# expanded by the shell that runs the training command rather than by this
# script.
EXTRA_ARGS='-c olmocr/train/config/qwen2vl-2b.yaml --num_proc 64 --save.path "s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/${BEAKER_USER_ID}"'
# The job description and task name are this script's file name without ".sh".
run_name=$(basename "$0" .sh)

# Alternative flags left commented out for reference:
# --cluster 'ai2/jupiter*' \
# --cluster 'ai2/pluto*' \
# --cluster 'ai2/allennlp-cirrascale' \
# --priority high \

# Short cluster name; used both in the --cluster pattern and to pick the
# scripts/beaker/<cluster>-ib.sh file sourced inside the job.
CLUSTER='jupiter'

# Resolve the Beaker account name before building the command line. A failed
# command substitution inside an argument list does not stop the script, so
# an empty or "null" value is rejected here explicitly.
beaker_user_id=$(beaker account whoami --format json | jq -cr '.[0].name')
if [[ -z "${beaker_user_id}" || "${beaker_user_id}" == "null" ]]; then
  echo "Could not determine the Beaker account name from 'beaker account whoami'." >&2
  exit 1
fi

# \${BEAKER_ASSIGNED_GPU_COUNT} is escaped so it is expanded inside the job,
# not by this script; ${CLUSTER} and ${EXTRA_ARGS} are expanded here.
gantry run \
  --description "${run_name}" \
  --task-name "${run_name}" \
  --allow-dirty \
  --host-networking \
  --workspace ai2/oe-data-pdf \
  --beaker-image 'jakep/jakep-pdf-finetunev1.2' \
  --venv 'base' \
  --pip gantry-requirements.txt \
  --priority normal \
  --gpus 8 \
  --preemptible \
  --cluster "ai2/${CLUSTER}*" \
  --budget ai2/oe-data \
  --env LOG_FILTER_TYPE=local_rank0_only \
  --env OMP_NUM_THREADS=8 \
  --env "BEAKER_USER_ID=${beaker_user_id}" \
  --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
  --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
  --env-secret WANDB_API_KEY=JAKE_WANDB_API_KEY \
  --shared-memory 10GiB \
  --yes \
  -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m olmocr.train.train ${EXTRA_ARGS}"