diff --git a/pdelfin/train/config/molmo-o-lora-8192.yaml b/pdelfin/train/config/molmo-o-lora-8192.yaml
new file mode 100644
index 0000000..9222d28
--- /dev/null
+++ b/pdelfin/train/config/molmo-o-lora-8192.yaml
@@ -0,0 +1,88 @@
+model:
+  name_or_path: allenai/Molmo-7B-O-0924
+  arch: causal
+  use_flash_attn: true
+
+wandb:
+  project: pdelfin
+  entity: ai2-llm
+
+generate:
+  max_length: 8192
+
+train_data:
+  seed: 1337
+  cache_location: /data/jakep/pdfdata/pdelfin_cache
+  sources:
+    - name: openai_batch_data_v5_1_train
+      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_train_done/*.json
+      target_longest_image_dim: [1024]
+      target_anchor_text_len: [6000]
+    - name: openai_batch_data_v5_1_iabooks_train
+      response_glob_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_train_done/*.json
+      target_longest_image_dim: [1024]
+      target_anchor_text_len: [6000]
+
+valid_data:
+  cache_location: /data/jakep/pdfdata/pdelfin_cache
+  metric_for_best_model: openai_batch_data_v5_1_eval_loss
+  sources:
+    # These tend to be small, so it's no big deal to load them from S3
+    - name: openai_batch_data_v5_1_eval
+      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
+      target_longest_image_dim: [1024]
+      target_anchor_text_len: [6000]
+    - name: openai_batch_data_v5_1_eval
+      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
+      target_longest_image_dim: [1024]
+      target_anchor_text_len: [6000]
+
+
+
+
+# Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
+hparams:
+  batch_size: 1
+  eval_batch_size: 1
+  gradient_accumulation_steps: 4
+  gradient_checkpointing: true
+  clip_grad_norm: 1.0
+  learning_rate: 1e-4
+  max_steps: 10000
+  pad_multiple_of: 16
+  log_every_steps: 10
+  eval_every_steps: 100
+  optim: adamw_torch
+  lr_scheduler: cosine
+  weight_decay: 0.01
+  warmup_ratio: 0.03
+
+# From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
+lora:
+  rank: 32
+  alpha: 32
+  dropout: 0.05
+  task_type: CAUSAL_LM
+  target_modules:
+    # attention and feed-forward layers in the main transformer
+    - att_proj
+    - ff_proj
+    - attn_out
+    - ff_out
+    # vision transformer attention and FF
+    - attention.wq
+    - attention.wk
+    - attention.wv
+    - attention.wo
+    - feed_forward.w1
+    - feed_forward.w2
+    # vision image projector
+    - vision_backbone.image_projector.w1
+    - vision_backbone.image_projector.w2
+    - vision_backbone.image_projector.w3
+
+save:
+  path: s3://ai2-oe-data/jakep/experiments/molmo-o-0924/v1/models/
+  save_every_steps: 1000
+
+max_workers: 10
\ No newline at end of file
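
For reference, here is a minimal sketch of how the `lora` block in this config could be mapped onto a LoRA adapter. Loading the YAML directly and using the Hugging Face `peft` library are assumptions for illustration only; the actual wiring lives in pdelfin's training code.

```python
# Sketch only: assumes the config is plain YAML and that LoRA is applied via `peft`.
# Field names and target module names are taken from the config above.
import yaml
from peft import LoraConfig

with open("pdelfin/train/config/molmo-o-lora-8192.yaml") as f:
    cfg = yaml.safe_load(f)

lora_cfg = LoraConfig(
    r=cfg["lora"]["rank"],                        # 32
    lora_alpha=cfg["lora"]["alpha"],              # 32
    lora_dropout=cfg["lora"]["dropout"],          # 0.05
    task_type=cfg["lora"]["task_type"],           # CAUSAL_LM
    target_modules=cfg["lora"]["target_modules"], # LLM, ViT, and projector modules
)

# The adapter would then be attached to the base model with
# peft.get_peft_model(model, lora_cfg) before training starts.
```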