From fb4e585e9fd5a97eedf554b3f76a7ffc3de329e6 Mon Sep 17 00:00:00 2001
From: Jake Poznanski
Date: Tue, 8 Oct 2024 15:20:37 +0000
Subject: [PATCH] Trying out non-lora training

---
 pdelfin/train/config/qwen2vl-7b.yaml | 48 +++++++++++-----------------
 scripts/qwen2vl-7b-gantry.sh         |  2 +-
 scripts/qwen2vl-7b-lora-gantry.sh    | 47 +++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 31 deletions(-)
 create mode 100755 scripts/qwen2vl-7b-lora-gantry.sh

diff --git a/pdelfin/train/config/qwen2vl-7b.yaml b/pdelfin/train/config/qwen2vl-7b.yaml
index d973f5e..49d4842 100644
--- a/pdelfin/train/config/qwen2vl-7b.yaml
+++ b/pdelfin/train/config/qwen2vl-7b.yaml
@@ -28,21 +28,24 @@ generate:
 train_data:
   seed: 1337
   sources:
-    - name: openai_batch_data_v2
-      query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl
-      response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json
-      backend:
-        - openai
-      size: 100_000
+    # These tend to be really big, so it's only practical to host them as parquets on weka; otherwise you may OOM or never finish dataloading
+    - name: openai_batch_data_v5_1_train
+      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_parquet/*.parquet
+    - name: openai_batch_data_v5_1_train
+      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_parquet/*.parquet
 
 valid_data:
+  metric_for_best_model: openai_batch_data_v5_1_eval_loss
   sources:
-    - name: openai_batch_data_eval_mini
-      query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_eval_mini/*.jsonl
-      response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_eval_mini/*.json
-      backend:
-        - openai
-      size: 100_000
+    # These tend to be small, so loading them from S3 is no big deal
+    - name: openai_batch_data_v5_1_eval
+      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl
+      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
+    - name: openai_batch_data_v5_1_iabooks_eval
+      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_eval/*.jsonl
+      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
+
+
 
 # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
 hparams:
@@ -52,30 +55,15 @@ hparams:
   gradient_checkpointing: false
   clip_grad_norm: 1.0
   learning_rate: 3e-4
-  max_steps: 5000
+  max_steps: 9000
   pad_multiple_of: 16
-  log_every_steps: 50
-  eval_every_steps: 500
+  log_every_steps: 10
+  eval_every_steps: 100
   optim: adamw_torch
   lr_scheduler: cosine
   weight_decay: 0.01
   warmup_ratio: 0.03
 
-# From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
-# Disable LORA for now, because we want the visual network to get trained too
-# lora:
-#   rank: 32
-#   alpha: 32
-#   dropout: 0.05
-#   task_type: causal_lm
-#   target_modules:
-#     - q_proj
-#     - k_proj
-#     - v_proj
-#     - o_proj
-#     - gate_proj
-#     - up_proj
-#     - down_proj
 
 save:
   path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
diff --git a/scripts/qwen2vl-7b-gantry.sh b/scripts/qwen2vl-7b-gantry.sh
index e4eb3be..6f22253 100755
--- a/scripts/qwen2vl-7b-gantry.sh
+++ b/scripts/qwen2vl-7b-gantry.sh
@@ -44,4 +44,4 @@ gantry run \
     --env-secret WANDB_API_KEY=WANDB_API_KEY \
     --shared-memory 10GiB \
     --yes \
-    -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
\ No newline at end of file
+    -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --use_fsdp --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
\ No newline at end of file
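Note: the FSDP change above is driven entirely by accelerate launch flags. For reference, a minimal sketch of an equivalent accelerate config file, untested, with num_machines/num_processes assumed for a single 8-GPU node to match the --gpus 8 in the script below:

    # fsdp-config.yaml (hypothetical) -- mirrors the launch flags above
    compute_environment: LOCAL_MACHINE
    distributed_type: FSDP
    mixed_precision: bf16
    num_machines: 1
    num_processes: 8
    fsdp_config:
      fsdp_offload_params: false
      fsdp_sharding_strategy: FULL_SHARD
      fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP

The same run could then be launched with "accelerate launch --config_file fsdp-config.yaml -m pdelfin.train.train ..." instead of passing each FSDP flag on the command line.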
diff --git a/scripts/qwen2vl-7b-lora-gantry.sh b/scripts/qwen2vl-7b-lora-gantry.sh
new file mode 100755
index 0000000..e4eb3be
--- /dev/null
+++ b/scripts/qwen2vl-7b-lora-gantry.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# check if jq is installed
+if ! command -v jq &> /dev/null
+then
+    echo "jq could not be found. Please install it."
+    exit 1
+fi
+
+
+EXTRA_ARGS="-c pdelfin/train/config/qwen2vl-7b-lora.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
+
+run_name=$(basename "$0" .sh)
+
+# --cluster 'ai2/jupiter*' \
+# --cluster 'ai2/pluto*' \
+# --cluster 'ai2/allennlp-cirrascale' \
+# --priority high \
+
+CLUSTER='jupiter'
+
+gantry run \
+    --description "${run_name}"\
+    --task-name "${run_name}"\
+    --allow-dirty \
+    --host-networking \
+    --workspace ai2/oe-data-model-based-cleanup \
+    --beaker-image 'jakep/jakep-pdf-finetunev1.1' \
+    --venv 'base' \
+    --pip gantry-requirements.txt \
+    --priority high \
+    --gpus 8 \
+    --preemptible \
+    --cluster "ai2/${CLUSTER}*" \
+    --budget ai2/oe-data \
+    --weka "oe-data-default:/data" \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
+    --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --yes \
+    -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
\ No newline at end of file
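Note: the new LoRA script points at pdelfin/train/config/qwen2vl-7b-lora.yaml, which is not included in this patch. Going by the commented-out block this patch removes from qwen2vl-7b.yaml, its lora section presumably looks something like the following sketch (values copied from the removed comments; everything else assumed to match qwen2vl-7b.yaml):

    # lora section of qwen2vl-7b-lora.yaml (hypothetical reconstruction)
    lora:
      rank: 32
      alpha: 32
      dropout: 0.05
      task_type: causal_lm
      target_modules:
        - q_proj
        - k_proj
        - v_proj
        - o_proj
        - gate_proj
        - up_proj
        - down_proj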