Mirror of https://github.com/allenai/olmocr.git (synced 2025-12-13 08:11:22 +00:00)

commit fb4e585e9f
parent ec09408ca9

    Trying out non-lora training
@@ -28,21 +28,24 @@ generate:
 train_data:
   seed: 1337
   sources:
-    - name: openai_batch_data_v2
-      query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_v2/*.jsonl
-      response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_v2/*.json
-      backend:
-        - openai
-      size: 100_000
+    # These tend to be really big, so it's only practical to host them as parquets on weka, otherwise you may OOM or just never finish dataloading
+    - name: openai_batch_data_v5_1_train
+      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_parquet/*.parquet
+    - name: openai_batch_data_v5_1_train
+      parquet_path: /data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_parquet/*.parquet
 
 valid_data:
+  metric_for_best_model: openai_batch_data_v5_1_eval_loss
   sources:
-    - name: openai_batch_data_eval_mini
-      query_glob_path: s3://ai2-oe-data/jakep/openai_batch_data_eval_mini/*.jsonl
-      response_glob_path: s3://ai2-oe-data/jakep/openai_batch_done_eval_mini/*.json
-      backend:
-        - openai
-      size: 100_000
+    # These tend to be small, so you can load from s3 it's no big deal
+    - name: openai_batch_data_v5_1_eval
+      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_eval/*.jsonl
+      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_eval/*.json
+    - name: openai_batch_data_v5_1_iabooks_eval
+      query_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_data_v5_1_iabooks_eval/*.jsonl
+      response_glob_path: s3://ai2-oe-data/jakep/pdfdata/openai_batch_done_v5_1_iabooks_eval/*.json
 
 # Mostly pulled from https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.sh
 hparams:
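The comment on the new train sources is the practical reason for the switch from JSONL-on-S3 to parquet-on-weka: the v5.1 training sets are large enough that pulling raw JSONL over the network and materializing it in memory either OOMs or never finishes dataloading, while a parquet glob on a local weka mount can be scanned in bounded batches. A minimal sketch of that kind of streaming read, assuming pyarrow; the actual pdelfin dataloader is not part of this diff, so the batch size and downstream handling are illustrative assumptions:

# Minimal sketch: scan a parquet glob in bounded record batches instead of
# loading everything at once. The glob comes from parquet_path above; the
# batch size and downstream processing are illustrative assumptions.
import glob

import pyarrow.dataset as ds

files = sorted(glob.glob("/data/jakep/pdfdata/openai_batch_data_v5_1_parquet/*.parquet"))
dataset = ds.dataset(files, format="parquet")

for batch in dataset.to_batches(batch_size=1024):
    rows = batch.to_pylist()
    # ... hand `rows` to tokenization / collation here ...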
@@ -52,30 +55,15 @@ hparams:
   gradient_checkpointing: false
   clip_grad_norm: 1.0
   learning_rate: 3e-4
-  max_steps: 5000
+  max_steps: 9000
   pad_multiple_of: 16
-  log_every_steps: 50
-  eval_every_steps: 500
+  log_every_steps: 10
+  eval_every_steps: 100
   optim: adamw_torch
   lr_scheduler: cosine
   weight_decay: 0.01
   warmup_ratio: 0.03
 
-# From https://github.com/QwenLM/Qwen2/blob/main/examples/sft/finetune.py
-# Disable LORA for now, because we want the visual network to get trained too
-# lora:
-#   rank: 32
-#   alpha: 32
-#   dropout: 0.05
-#   task_type: causal_lm
-#   target_modules:
-#     - q_proj
-#     - k_proj
-#     - v_proj
-#     - o_proj
-#     - gate_proj
-#     - up_proj
-#     - down_proj
 
 save:
   path: s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/
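The block deleted at the end of this hunk is the commented-out LoRA adapter configuration; with this commit the run is a full finetune, so the visual encoder and all attention/MLP projections are updated directly rather than through low-rank adapters. For reference, the removed settings correspond roughly to the following PEFT configuration. This is a sketch only: whether pdelfin wires LoRA through the peft library in exactly this way is an assumption.

# Sketch of what the removed `lora:` block would look like as a peft config.
# Rank/alpha/dropout/target modules are taken from the deleted lines; the
# wiring via get_peft_model is an assumption for illustration.
from peft import LoraConfig, TaskType, get_peft_model

lora_config = LoraConfig(
    r=32,                         # rank: 32
    lora_alpha=32,                # alpha: 32
    lora_dropout=0.05,            # dropout: 0.05
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)
# model = get_peft_model(base_model, lora_config)
# Adapters only would train here, which is exactly why the block was dropped:
# the vision tower and base weights need to be trained too.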
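The remaining hyperparameters describe a fairly standard recipe: AdamW, a cosine learning-rate schedule with a linear warmup over 3% of the 9,000 training steps (about 270 steps), weight decay of 0.01, and gradient clipping at norm 1.0. A sketch of how those values typically translate into optimizer and scheduler objects; the trainer in pdelfin.train.train is not shown in this diff, so this exact wiring is an assumption:

# Sketch: AdamW + cosine-with-warmup, using the values from the hparams block.
# A tiny stand-in model is used so the snippet runs on its own.
import torch
from transformers import get_cosine_schedule_with_warmup

max_steps = 9000
learning_rate = 3e-4
warmup_ratio = 0.03      # -> int(0.03 * 9000) = 270 warmup steps
weight_decay = 0.01
clip_grad_norm = 1.0

model = torch.nn.Linear(16, 16)  # stand-in for the Qwen2-VL model
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(warmup_ratio * max_steps),
    num_training_steps=max_steps,
)

# Inside the training loop, gradients are clipped before each optimizer step:
# torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
# optimizer.step(); scheduler.step(); optimizer.zero_grad()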
@@ -44,4 +44,4 @@ gantry run \
   --env-secret WANDB_API_KEY=WANDB_API_KEY \
   --shared-memory 10GiB \
   --yes \
-  -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
+  -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --use_fsdp --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --fsdp_offload_params false --fsdp_sharding_strategy FULL_SHARD --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"
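The only change to the launch command is switching accelerate to FullyShardedDataParallel: full sharding of parameters, gradients, and optimizer state across the 8 GPUs, transformer-block auto-wrapping, and no CPU offload. Without LoRA the whole 7B model (vision tower included) is being trained, so plain DDP would have to hold a complete copy of the parameters and optimizer state on every GPU. A rough programmatic equivalent of those flags follows; this is a sketch meant to run under a distributed launcher, exact plugin arguments vary by accelerate version, and pdelfin.train.train builds its own Accelerator:

# Sketch of the FSDP setup implied by --use_fsdp --fsdp_sharding_strategy FULL_SHARD
# --fsdp_offload_params false --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP.
# The accelerate launcher normally injects this configuration itself; building
# the plugin by hand is only for illustration.
from accelerate import Accelerator, FullyShardedDataParallelPlugin
from torch.distributed.fsdp import CPUOffload, ShardingStrategy

fsdp_plugin = FullyShardedDataParallelPlugin(
    sharding_strategy=ShardingStrategy.FULL_SHARD,  # shard params, grads, and optimizer state
    cpu_offload=CPUOffload(offload_params=False),   # keep shards on the GPUs
    # TRANSFORMER_BASED_WRAP: each transformer block becomes its own FSDP unit,
    # which the launch flag configures via the auto-wrap policy.
)

accelerator = Accelerator(mixed_precision="bf16", fsdp_plugin=fsdp_plugin)
# model, optimizer, and dataloaders would then go through accelerator.prepare(...)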
scripts/qwen2vl-7b-lora-gantry.sh (new executable file, 47 lines)
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+
+set -ex
+
+# check if jq is installed
+if ! command -v jq &> /dev/null
+then
+    echo "jq could not be found. Please install it."
+    exit
+fi
+
+
+EXTRA_ARGS="-c pdelfin/train/config/qwen2vl-7b-lora.yaml --num_proc 64 --save.path \"s3://ai2-oe-data/jakep/experiments/qwen2vl-pdf/v1/models/\${BEAKER_USER_ID}\""
+
+run_name=$(basename "$0" .sh)
+
+# --cluster 'ai2/jupiter*' \
+# --cluster 'ai2/pluto*' \
+# --cluster 'ai2/allennlp-cirrascale' \
+# --priority high \
+
+CLUSTER='jupiter'
+
+gantry run \
+  --description "${run_name}"\
+  --task-name "${run_name}"\
+  --allow-dirty \
+  --host-networking \
+  --workspace ai2/oe-data-model-based-cleanup \
+  --beaker-image 'jakep/jakep-pdf-finetunev1.1' \
+  --venv 'base' \
+  --pip gantry-requirements.txt \
+  --priority high \
+  --gpus 8 \
+  --preemptible \
+  --cluster "ai2/${CLUSTER}*" \
+  --budget ai2/oe-data \
+  --weka "oe-data-default:/data" \
+  --env LOG_FILTER_TYPE=local_rank0_only \
+  --env OMP_NUM_THREADS=8 \
+  --env BEAKER_USER_ID=$(beaker account whoami --format json | jq '.[0].name' -cr) \
+  --env-secret AWS_ACCESS_KEY_ID=S2_AWS_ACCESS_KEY_ID \
+  --env-secret AWS_SECRET_ACCESS_KEY=S2_AWS_SECRET_ACCESS_KEY \
+  --env-secret WANDB_API_KEY=WANDB_API_KEY \
+  --shared-memory 10GiB \
+  --yes \
+  -- /bin/bash -c "source scripts/beaker/${CLUSTER}-ib.sh && accelerate launch --multi_gpu --num_processes \${BEAKER_ASSIGNED_GPU_COUNT} --mixed_precision bf16 -m pdelfin.train.train ${EXTRA_ARGS}"