Update README

This commit is contained in:
cfli 2024-10-29 16:56:11 +08:00
parent 34e9c21654
commit 6c71f3fc71
10 changed files with 26 additions and 32 deletions

View File

@ -167,6 +167,9 @@ class AbsEmbedder(ABC):
):
if instruction is None: instruction = self.instruction
if instruction_format is None: instruction_format = self.instruction_format
if batch_size is None: batch_size = self.batch_size
if max_length is None: max_length = self.passage_max_length
if convert_to_numpy is None: convert_to_numpy = self.convert_to_numpy
if instruction is not None:
if isinstance(sentences, str):

View File

@ -8,7 +8,7 @@ eval_args="\
--domains arxiv \
--languages en \
--splits dev test \
--output_dir /share/jianlv/evaluation/air_bench/search_results \
--output_dir ./air_bench/search_results \
--search_top_k 1000 --rerank_top_k 100 \
--cache_dir $HF_HUB_CACHE \
--overwrite False \

View File

@ -2,24 +2,21 @@ if [ -z "$HF_HUB_CACHE" ]; then
export HF_HUB_CACHE="$HOME/.cache/huggingface/hub"
fi
HF_HUB_CACHE="/share/chaofan/code/FlagEmbedding_update/data/BEIR"
HF_MODEL_CACHE="/share/shared_models"
dataset_names="fiqa arguana cqadupstack"
eval_args="\
--eval_name beir \
--dataset_dir /share/chaofan/code/FlagEmbedding_update/data/beir \
--dataset_dir ./beir/data \
--dataset_names $dataset_names \
--splits test dev \
--corpus_embd_save_dir /share/chaofan/code/FlagEmbedding_update/data/beir/corpus_embd \
--output_dir /share/chaofan/code/FlagEmbedding_update/data/beir/search_results \
--corpus_embd_save_dir ./beir/corpus_embd \
--output_dir ./beir/search_results \
--search_top_k 1000 --rerank_top_k 100 \
--cache_path $HF_HUB_CACHE \
--overwrite False \
--k_values 10 100 \
--eval_output_method markdown \
--eval_output_path /share/chaofan/code/FlagEmbedding_update/data/beir/beir_eval_results.md \
--eval_output_path ./beir/beir_eval_results.md \
--eval_metrics ndcg_at_10 recall_at_100 \
"

View File

@ -6,17 +6,17 @@ dataset_names="bn hi sw te th yo"
eval_args="\
--eval_name miracl \
--dataset_dir /share/jianlv/evaluation/miracl/data \
--dataset_dir ./miracl/data \
--dataset_names $dataset_names \
--splits dev \
--corpus_embd_save_dir /share/jianlv/evaluation/miracl/corpus_embd \
--output_dir /share/jianlv/evaluation/miracl/search_results \
--corpus_embd_save_dir ./miracl/corpus_embd \
--output_dir ./miracl/search_results \
--search_top_k 1000 --rerank_top_k 100 \
--cache_path $HF_HUB_CACHE \
--overwrite False \
--k_values 10 100 \
--eval_output_method markdown \
--eval_output_path /share/jianlv/evaluation/miracl/miracl_eval_results.md \
--eval_output_path ./miracl/miracl_eval_results.md \
--eval_metrics ndcg_at_10 recall_at_100 \
"

View File

@ -6,17 +6,17 @@ dataset_names="en zh_cn"
eval_args="\
--eval_name mkqa \
--dataset_dir /share/jianlv/evaluation/mkqa/data \
--dataset_dir ./mkqa/data \
--dataset_names $dataset_names \
--splits test \
--corpus_embd_save_dir /share/jianlv/evaluation/mkqa/corpus_embd \
--output_dir /share/jianlv/evaluation/mkqa/search_results \
--corpus_embd_save_dir ./mkqa/corpus_embd \
--output_dir ./mkqa/search_results \
--search_top_k 1000 --rerank_top_k 100 \
--cache_path $HF_HUB_CACHE \
--overwrite False \
--k_values 20 \
--eval_output_method markdown \
--eval_output_path /share/jianlv/evaluation/mkqa/mkqa_eval_results.md \
--eval_output_path ./mkqa/mkqa_eval_results.md \
--eval_metrics qa_recall_at_20 \
"

View File

@ -6,17 +6,17 @@ dataset_names="hi"
eval_args="\
--eval_name mldr \
--dataset_dir /share/jianlv/evaluation/mldr/data \
--dataset_dir ./mldr/data \
--dataset_names $dataset_names \
--splits test \
--corpus_embd_save_dir /share/jianlv/evaluation/mldr/corpus_embd \
--output_dir /share/jianlv/evaluation/mldr/search_results \
--corpus_embd_save_dir ./mldr/corpus_embd \
--output_dir ./mldr/search_results \
--search_top_k 1000 --rerank_top_k 100 \
--cache_path $HF_HUB_CACHE \
--overwrite False \
--k_values 10 100 \
--eval_output_method markdown \
--eval_output_path /share/jianlv/evaluation/mldr/mldr_eval_results.md \
--eval_output_path ./mldr/mldr_eval_results.md \
--eval_metrics ndcg_at_10 \
"

View File

@ -2,23 +2,21 @@ if [ -z "$HF_HUB_CACHE" ]; then
export HF_HUB_CACHE="$HOME/.cache/huggingface/hub"
fi
HF_HUB_CACHE="/share/shared_models"
dataset_names="passage"
eval_args="\
--eval_name msmarco \
--dataset_dir /share/chaofan/code/FlagEmbedding_update/data/msmarco \
--dataset_dir ./msmarco/data \
--dataset_names $dataset_names \
--splits dev \
--corpus_embd_save_dir /share/chaofan/code/FlagEmbedding_update/data/msmarco/corpus_embd \
--output_dir /share/chaofan/code/FlagEmbedding_update/data/msmarco/search_results \
--corpus_embd_save_dir ./msmarco/corpus_embd \
--output_dir ./msmarco/search_results \
--search_top_k 1000 --rerank_top_k 100 \
--cache_path $HF_HUB_CACHE \
--overwrite True \
--k_values 10 100 \
--eval_output_method markdown \
--eval_output_path /share/chaofan/code/FlagEmbedding_update/data/msmarco/msmarco_eval_results.md \
--eval_output_path ./msmarco/msmarco_eval_results.md \
--eval_metrics ndcg_at_10 recall_at_100 \
"

View File

@ -2,17 +2,15 @@ if [ -z "$HF_HUB_CACHE" ]; then
export HF_HUB_CACHE="$HOME/.cache/huggingface/hub"
fi
HF_HUB_CACHE="/share/shared_models"
languages="eng"
tasks="NFCorpus BiorxivClusteringS2S SciDocsRR"
eval_args="\
--eval_name mteb \
--output_dir /share/chaofan/code/FlagEmbedding_update/data/mteb/search_results \
--output_dir ./mteb/search_results \
--languages $languages \
--tasks $tasks \
--eval_output_path /share/chaofan/code/FlagEmbedding_update/data/mteb/mteb_eval_results.json
--eval_output_path ./mteb/mteb_eval_results.json
"
model_args="\

View File

@ -60,7 +60,6 @@ training_args="\
"
cmd="torchrun --nproc_per_node $num_gpus \
--master_port=4567 \
-m FlagEmbedding.finetune.reranker.decoder_only.base \
$model_args \
$data_args \

View File

@ -65,7 +65,6 @@ training_args="\
"
cmd="torchrun --nproc_per_node $num_gpus \
--master_port=4567 \
-m FlagEmbedding.finetune.reranker.decoder_only.layerwise \
$model_args \
$data_args \