diff --git a/pdelfin/train/config/molmo-o-lora-8192.yaml b/pdelfin/train/config/molmo-o-lora-8192.yaml index 9222d28..3b8e5c0 100644 --- a/pdelfin/train/config/molmo-o-lora-8192.yaml +++ b/pdelfin/train/config/molmo-o-lora-8192.yaml @@ -46,6 +46,7 @@ hparams: eval_batch_size: 1 gradient_accumulation_steps: 4 gradient_checkpointing: true + find_unused_parameters: true clip_grad_norm: 1.0 learning_rate: 1e-4 max_steps: 10000 diff --git a/pdelfin/train/config/molmo-o-lora.yaml b/pdelfin/train/config/molmo-o-lora.yaml index 1fefbbb..e6b9e70 100644 --- a/pdelfin/train/config/molmo-o-lora.yaml +++ b/pdelfin/train/config/molmo-o-lora.yaml @@ -46,6 +46,7 @@ hparams: eval_batch_size: 1 gradient_accumulation_steps: 4 gradient_checkpointing: true + find_unused_parameters: true clip_grad_norm: 1.0 learning_rate: 1e-4 max_steps: 10000 diff --git a/scripts/molmo-7b-lora-gantry.sh b/scripts/molmo-7b-lora-gantry.sh index 9fc21ef..db71e37 100755 --- a/scripts/molmo-7b-lora-gantry.sh +++ b/scripts/molmo-7b-lora-gantry.sh @@ -22,8 +22,8 @@ run_name=$(basename "$0" .sh) CLUSTER='jupiter' gantry run \ - --description "${run_name}"\ - --task-name "${run_name}"\ + --description "${run_name}-4096"\ + --task-name "${run_name}-4096"\ --allow-dirty \ --host-networking \ --workspace ai2/oe-data-model-based-cleanup \ @@ -32,7 +32,6 @@ gantry run \ --pip gantry-requirements.txt \ --priority high \ --gpus 8 \ - --preemptible \ --cluster "ai2/${CLUSTER}*" \ --budget ai2/oe-data \ --weka "oe-data-default:/data" \