export WANDB_MODE=disabled
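
# Train on a mix of retrieval, STS, classification, and clustering example data.
# The *-no_in_batch_neg folders hold classification/clustering data that is
# meant to be trained without in-batch negatives.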
train_data="\
    ../example_data/retrieval \
    ../example_data/sts/sts.jsonl \
    ../example_data/classification-no_in_batch_neg \
    ../example_data/clustering-no_in_batch_neg "

# set large epochs and small batch size for testing
num_train_epochs=4
per_device_train_batch_size=2

# set num_gpus to 2 for testing
num_gpus=2
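
# Fall back to the default Hugging Face Hub cache location if HF_HUB_CACHE is unset.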
if [ -z "$HF_HUB_CACHE" ]; then
    export HF_HUB_CACHE="$HOME/.cache/huggingface/hub"
fi
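
# Model: fine-tune BAAI/bge-large-en-v1.5, an encoder-only (BERT-style) embedder.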
model_args="\
    --model_name_or_path BAAI/bge-large-en-v1.5 \
    --cache_dir $HF_HUB_CACHE \
"
data_args="\
    --train_data $train_data \
    --cache_path ~/.cache \
    --train_group_size 8 \
    --query_max_len 512 \
    --passage_max_len 512 \
    --pad_to_multiple_of 8 \
    --query_instruction_for_retrieval 'Represent this sentence for searching relevant passages: ' \
    --query_instruction_format '{}{}' \
    --knowledge_distillation True \
    --same_dataset_within_batch True \
    --small_threshold 0 \
    --drop_threshold 0 \
"
training_args="\
    --output_dir ./test_encoder_only_base_bge-large-en-v1.5_sd \
    --overwrite_output_dir \
    --learning_rate 1e-5 \
    --fp16 \
    --num_train_epochs $num_train_epochs \
    --per_device_train_batch_size $per_device_train_batch_size \
    --dataloader_drop_last True \
    --warmup_ratio 0.1 \
    --gradient_checkpointing \
    --deepspeed ../../ds_stage0.json \
    --logging_steps 1 \
    --save_steps 1000 \
    --negatives_cross_device \
    --temperature 0.02 \
    --sentence_pooling_method cls \
    --normalize_embeddings True \
    --kd_loss_type kl_div \
"
cmd="torchrun --nproc_per_node $num_gpus \
    -m FlagEmbedding.finetune.embedder.encoder_only.base \
    $model_args \
    $data_args \
    $training_args \
"

echo $cmd
eval $cmd
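
# Example invocation (the filename below is illustrative, not from the repo):
#   bash finetune_bge_large_en_v1.5_sd.sh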