diff --git a/pdelfin/train/config/qwen2vl-2b.yaml b/pdelfin/train/config/qwen2vl-2b.yaml
index d77d48c..14d7da0 100644
--- a/pdelfin/train/config/qwen2vl-2b.yaml
+++ b/pdelfin/train/config/qwen2vl-2b.yaml
@@ -1,6 +1,7 @@
 model:
   name_or_path: Qwen/Qwen2-VL-2B-Instruct
   arch: causal
+  use_flash_attn: true
 
 wandb:
   project: pdelfin
@@ -48,7 +49,7 @@ hparams:
   batch_size: 1
   eval_batch_size: 1
   gradient_accumulation_steps: 4
-  gradient_checkpointing: true
+  gradient_checkpointing: false
   clip_grad_norm: 1.0
   learning_rate: 3e-4
   max_steps: 200
@@ -79,4 +80,4 @@ save:
   path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
   save_every_steps: 100
 
-max_workers: 1
\ No newline at end of file
+max_workers: 30
\ No newline at end of file
diff --git a/pdelfin/train/train.py b/pdelfin/train/train.py
index 22843ff..7e09dcc 100644
--- a/pdelfin/train/train.py
+++ b/pdelfin/train/train.py
@@ -1,14 +1,3 @@
-# Step 1, load the data
-# Probably, we want to see just a folder with openai batch input jsonls, plus the batch output jsonls
-# TODO: Figure out hyperparameters for image sizing
-# Step 2. Load those prompts through and do a forward pass to calculate the loss
-
-# Step 3. Add hugging face accelerate for training
-
-# Step 4. Checkpointing code, both saving and reloading to restart
-
-# Step 5. Move over from interactive session to gantry launch script
-
 import os
 import json
 import base64
@@ -121,8 +110,6 @@ def run_train(config: TrainConfig):
 
     run_name = RunName.get(config)
 
-    accelerator = accelerate.Accelerator()
-
     setup_environment(aws_config=config.aws, wandb_config=config.wandb, WANDB_RUN_GROUP=run_name.group)
 
     dataset = make_dataset(
@@ -133,7 +120,8 @@
 
     model = Qwen2VLForConditionalGeneration.from_pretrained(
-        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.bfloat16, device_map="auto"
+        "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.bfloat16, device_map="auto",
+        _attn_implementation="flash_attention_2" if config.model.use_flash_attn else None
     )
     processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
@@ -187,8 +175,7 @@
         save_steps=config.save.save_every_steps,
         warmup_steps=config.hparams.warmup_steps,
         warmup_ratio=config.hparams.warmup_ratio,
-        bf16=accelerator.mixed_precision == "bf16",
-        fp16=accelerator.mixed_precision == "fp16",
+        bf16=True,
         label_names=["labels"], # fix from https://github.com/huggingface/transformers/issues/22885
         max_grad_norm=config.hparams.clip_grad_norm,
         remove_unused_columns=False,
@@ -219,13 +206,20 @@
     trainer.train() # pyright: ignore
 
     with get_local_dir(join_path("", save_path, "best")) as best_dir:
+        if config.lora is not None:
+            logger.info("Merging LoRA adapters into the base model...")
+            model = model.merge_and_unload()
+            logger.info("LoRA adapters merged successfully.")
+
         model.save_pretrained(best_dir)
         logger.info("Saved best model to %s", best_dir)
 
+
     # Uncomment to test speed of data loader
-    # train_dataloader = DataLoader(train_ds, batch_size=1, num_workers=2, shuffle=False)
+    # train_dataloader = DataLoader(formatted_dataset["train"], batch_size=1, num_workers=4, shuffle=False)
     # for entry in tqdm(train_dataloader):
     #     print("Step!")
+    #     model.forward(**{k: v.to("cuda:0") for (k,v) in entry.items()})
 
 
 def main():
diff --git a/scripts/beaker/jupiter-ib.sh b/scripts/beaker/jupiter-ib.sh
new file mode 100644
index 0000000..9e7d983
--- /dev/null
+++ b/scripts/beaker/jupiter-ib.sh
@@ -0,0 +1,2 @@
+set -ex
+export NCCL_DEBUG=INFO NCCL_SOCKET_IFNAME=ib NCCL_IB_HCA="^=mlx5_bond_0"
\ No newline at end of file
diff --git a/scripts/qwen2vl-2b-gantry.sh b/scripts/qwen2vl-2b-gantry.sh
new file mode 100644
index 0000000..069043e
--- /dev/null
+++ b/scripts/qwen2vl-2b-gantry.sh
@@ -0,0 +1,45 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# check if jq is installed
+if ! command -v jq &> /dev/null
+then
+    echo "jq could not be found. Please install it."
+    exit
+fi
+
+
+EXTRA_ARGS="-c pdelfin/train/config/qwen2vl-2b.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
+
+run_name=$(basename "$0" .sh)
+
+# --cluster 'ai2/jupiter*' \
+# --cluster 'ai2/pluto*' \
+# --cluster 'ai2/allennlp-cirrascale' \
+# --priority high \
+
+CLUSTER='jupiter'
+
+gantry run \
+    --description "${run_name}"\
+    --task-name "${run_name}"\
+    --allow-dirty \
+    --host-networking \
+    --workspace ai2/oe-data-model-based-cleanup \
+    --beaker-image 'lucas/refine-axelot-vllm' \
+    --venv 'base' \
+    --priority high \
+    --gpus 8 \
+    --preemptible \
+    --cluster "ai2/${CLUSTER}*" \
+    --budget ai2/oe-data \
+    --env LOG_FILTER_TYPE=local_rank0_only \
+    --env OMP_NUM_THREADS=8 \
+    --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
+    --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
+    --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
+    --env-secret WANDB_API_KEY=WANDB_API_KEY \
+    --shared-memory 10GiB \
+    --yes \
+    -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
\ No newline at end of file