Put LR back; save larger checkpoints to Weka to prevent timeouts

This commit is contained in:
Jake Poznanski 2024-10-17 19:46:25 +00:00
parent e141c91e5e
commit 529d51d57d
2 changed files with 2 additions and 2 deletions

View File

@ -46,7 +46,7 @@ hparams:
gradient_accumulation_steps: 4
gradient_checkpointing: true
clip_grad_norm: 1.0
learning_rate: 3e-4
learning_rate: 1e-4
max_steps: 10000
pad_multiple_of: 16
log_every_steps: 10

View File

@ -10,7 +10,7 @@ then
fi
EXTRA_ARGS="-c pdelfin/train/config/qwen2vl-7b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
EXTRA_ARGS="-c pdelfin/train/config/qwen2vl-7b.yaml --num_proc 64 --save.path \"/data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
run_name=$(basename "$0" .sh)