mirror of https://github.com/FlagOpen/FlagEmbedding.git (synced 2025-06-27 02:39:58 +00:00)
update embedder finetune examples
commit 4e13ad380e (parent d34f86192d)
@@ -111,7 +111,7 @@ class DecoderOnlyEmbedderICLRunner(AbsEmbedderRunner):
             self.training_args.per_device_train_batch_size = 1
             self.training_args.dataloader_num_workers = 0  # avoid multi-processing
         else:
-            raise NotImplementedError("Only support `same_dataset_within_batch` for now.")
+            raise NotImplementedError("Only support `same_dataset_within_batch` for `DecoderOnlyEmbedderICLRunner`.")
         return train_dataset

     def run(self):
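As the error message above states, this runner only supports the `same_dataset_within_batch` path. A minimal sketch of the data flags that enable it, taken from the *_same_dataset.sh example scripts added in this commit (the zero thresholds are simply the testing defaults used there):

# flags copied from the *_same_dataset.sh examples below
data_args="\
    --same_dataset_within_batch True \
    --small_threshold 0 \
    --drop_threshold 0 \
"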
@@ -37,10 +37,7 @@ data_args="\
    --pad_to_multiple_of 8 \
    --query_instruction_for_retrieval 'Given a query, retrieve passages that are relevant to the query.' \
    --query_instruction_format '<instruct>{}\n<query>{}' \
-    --knowledge_distillation True \
-    --same_dataset_within_batch True \
-    --small_threshold 0 \
-    --drop_threshold 0 \
+    --knowledge_distillation False \
"

training_args="\
examples/finetune/embedder/decoder_only/base_same_dataset.sh (new file, 74 lines)
@@ -0,0 +1,74 @@
export WANDB_MODE=disabled

train_data="\
    ../example_data/retrieval \
    ../example_data/sts/sts.jsonl \
    ../example_data/classification-no_in_batch_neg \
    ../example_data/clustering-no_in_batch_neg "

# set large epochs and small batch size for testing
num_train_epochs=4
per_device_train_batch_size=2

# set num_gpus to 2 for testing
num_gpus=2

if [ -z "$HF_HUB_CACHE" ]; then
    export HF_HUB_CACHE="$HOME/.cache/huggingface/hub"
fi

model_args="\
    --model_name_or_path BAAI/bge-multilingual-gemma2 \
    --cache_dir $HF_HUB_CACHE \
    --use_lora True \
    --lora_rank 32 \
    --lora_alpha 64 \
    --target_modules q_proj k_proj v_proj o_proj gate_proj down_proj up_proj \
    --additional_special_tokens '<instruct>' '<query>' \
    --save_merged_lora_model True \
"

data_args="\
    --train_data $train_data \
    --cache_path ~/.cache \
    --train_group_size 8 \
    --query_max_len 512 \
    --passage_max_len 512 \
    --pad_to_multiple_of 8 \
    --query_instruction_for_retrieval 'Given a query, retrieve passages that are relevant to the query.' \
    --query_instruction_format '<instruct>{}\n<query>{}' \
    --knowledge_distillation True \
    --same_dataset_within_batch True \
    --small_threshold 0 \
    --drop_threshold 0 \
"

training_args="\
    --output_dir ./test_decoder_only_base_bge-multilingual-gemma2_sd \
    --overwrite_output_dir \
    --learning_rate 1e-4 \
    --fp16 \
    --num_train_epochs $num_train_epochs \
    --per_device_train_batch_size $per_device_train_batch_size \
    --dataloader_drop_last True \
    --warmup_ratio 0.1 \
    --gradient_checkpointing \
    --deepspeed ../../ds_stage1.json \
    --logging_steps 1 \
    --save_steps 1000 \
    --negatives_cross_device \
    --temperature 0.02 \
    --sentence_pooling_method last_token \
    --normalize_embeddings True \
    --kd_loss_type m3_kd_loss \
"

cmd="torchrun --nproc_per_node $num_gpus \
    -m FlagEmbedding.finetune.embedder.decoder_only.base \
    $model_args \
    $data_args \
    $training_args \
"

echo $cmd
eval $cmd
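The relative paths in the script (../example_data, ../../ds_stage1.json) assume it is launched from its own directory; a minimal sketch of running it, assuming the repository has been cloned and the referenced example data and DeepSpeed config are in place:

cd examples/finetune/embedder/decoder_only
bash base_same_dataset.sh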
@@ -48,7 +48,7 @@ data_args="\
"

training_args="\
-    --output_dir ./test_decoder_only_base_bge-en-icl \
+    --output_dir ./test_decoder_only_base_bge-en-icl_sd \
    --overwrite_output_dir \
    --learning_rate 1e-4 \
    --fp16 \
@@ -31,10 +31,7 @@ data_args="\
    --pad_to_multiple_of 8 \
    --query_instruction_for_retrieval 'Represent this sentence for searching relevant passages: ' \
    --query_instruction_format '{}{}' \
-    --knowledge_distillation True \
-    --same_dataset_within_batch True \
-    --small_threshold 0 \
-    --drop_threshold 0 \
+    --knowledge_distillation False \
"

training_args="\
examples/finetune/embedder/encoder_only/base_same_dataset.sh (new file, 68 lines)
@@ -0,0 +1,68 @@
export WANDB_MODE=disabled

train_data="\
    ../example_data/retrieval \
    ../example_data/sts/sts.jsonl \
    ../example_data/classification-no_in_batch_neg \
    ../example_data/clustering-no_in_batch_neg "

# set large epochs and small batch size for testing
num_train_epochs=4
per_device_train_batch_size=2

# set num_gpus to 2 for testing
num_gpus=2

if [ -z "$HF_HUB_CACHE" ]; then
    export HF_HUB_CACHE="$HOME/.cache/huggingface/hub"
fi

model_args="\
    --model_name_or_path BAAI/bge-large-en-v1.5 \
    --cache_dir $HF_HUB_CACHE \
"

data_args="\
    --train_data $train_data \
    --cache_path ~/.cache \
    --train_group_size 8 \
    --query_max_len 512 \
    --passage_max_len 512 \
    --pad_to_multiple_of 8 \
    --query_instruction_for_retrieval 'Represent this sentence for searching relevant passages: ' \
    --query_instruction_format '{}{}' \
    --knowledge_distillation True \
    --same_dataset_within_batch True \
    --small_threshold 0 \
    --drop_threshold 0 \
"

training_args="\
    --output_dir ./test_encoder_only_base_bge-large-en-v1.5_sd \
    --overwrite_output_dir \
    --learning_rate 1e-5 \
    --fp16 \
    --num_train_epochs $num_train_epochs \
    --per_device_train_batch_size $per_device_train_batch_size \
    --dataloader_drop_last True \
    --warmup_ratio 0.1 \
    --gradient_checkpointing \
    --deepspeed ../../ds_stage0.json \
    --logging_steps 1 \
    --save_steps 1000 \
    --negatives_cross_device \
    --temperature 0.02 \
    --sentence_pooling_method cls \
    --normalize_embeddings True \
    --kd_loss_type kl_div \
"

cmd="torchrun --nproc_per_node $num_gpus \
    -m FlagEmbedding.finetune.embedder.encoder_only.base \
    $model_args \
    $data_args \
    $training_args \
"

echo $cmd
eval $cmd
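As the train_data list at the top of each script shows, --train_data takes a whitespace-separated mix of directories of training files and individual .jsonl files; a minimal sketch with hypothetical paths standing in for the example data:

# hypothetical paths; each entry is either a directory of training files or a single .jsonl file
train_data="\
    /data/my_retrieval_corpus \
    /data/my_sts_pairs.jsonl "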
@@ -29,10 +29,7 @@ data_args="\
    --query_max_len 512 \
    --passage_max_len 512 \
    --pad_to_multiple_of 8 \
-    --knowledge_distillation True \
-    --same_dataset_within_batch True \
-    --small_threshold 0 \
-    --drop_threshold 0 \
+    --knowledge_distillation False \
"

training_args="\
examples/finetune/embedder/encoder_only/m3_same_dataset.sh (new file, 70 lines)
@@ -0,0 +1,70 @@
export WANDB_MODE=disabled

train_data="\
    ../example_data/retrieval \
    ../example_data/sts/sts.jsonl \
    ../example_data/classification-no_in_batch_neg \
    ../example_data/clustering-no_in_batch_neg "

# set large epochs and small batch size for testing
num_train_epochs=4
per_device_train_batch_size=2

# set num_gpus to 2 for testing
num_gpus=2

if [ -z "$HF_HUB_CACHE" ]; then
    export HF_HUB_CACHE="$HOME/.cache/huggingface/hub"
fi

model_args="\
    --model_name_or_path BAAI/bge-m3 \
    --cache_dir $HF_HUB_CACHE \
"

data_args="\
    --train_data $train_data \
    --cache_path ~/.cache \
    --train_group_size 8 \
    --query_max_len 512 \
    --passage_max_len 512 \
    --pad_to_multiple_of 8 \
    --knowledge_distillation True \
    --same_dataset_within_batch True \
    --small_threshold 0 \
    --drop_threshold 0 \
"

training_args="\
    --output_dir ./test_encoder_only_m3_bge-m3_sd \
    --overwrite_output_dir \
    --learning_rate 1e-5 \
    --fp16 \
    --num_train_epochs $num_train_epochs \
    --per_device_train_batch_size $per_device_train_batch_size \
    --dataloader_drop_last True \
    --warmup_ratio 0.1 \
    --gradient_checkpointing \
    --deepspeed ../../ds_stage0.json \
    --logging_steps 1 \
    --save_steps 1000 \
    --negatives_cross_device \
    --temperature 0.02 \
    --sentence_pooling_method cls \
    --normalize_embeddings True \
    --kd_loss_type m3_kd_loss \
    --unified_finetuning True \
    --use_self_distill True \
    --fix_encoder False \
    --self_distill_start_step 0 \
"

cmd="torchrun --nproc_per_node $num_gpus \
    -m FlagEmbedding.finetune.embedder.encoder_only.m3 \
    $model_args \
    $data_args \
    $training_args \
"

echo $cmd
eval $cmd