evaluation docs

ZiyiXia 2024-11-14 13:38:02 +08:00
parent 7caa5989a7
commit d1c3b3fa9a
32 changed files with 744 additions and 0 deletions

View File

@ -4,6 +4,9 @@ from typing import List, Optional
@dataclass
class AIRBenchEvalModelArgs:
"""
Evaluation Model arguments for AIR Bench.
"""
embedder_name_or_path: str = field(
metadata={"help": "The embedder name or path.", "required": True}
)

View File

@ -10,6 +10,13 @@ from .arguments import AIRBenchEvalArgs, AIRBenchEvalModelArgs
class AIRBenchEvalRunner:
"""
Evaluation runner for AIR Bench.
Args:
eval_args (AIRBenchEvalArgs): :class:`AIRBenchEvalArgs` object with the evaluation arguments.
model_args (AIRBenchEvalModelArgs): :class:`AIRBenchEvalModelArgs` object with the model arguments.
"""
def __init__(
self,
eval_args: AIRBenchEvalArgs,
@ -22,6 +29,12 @@ class AIRBenchEvalRunner:
self.retriever, self.reranker = self.load_retriever_and_reranker()
def load_retriever_and_reranker(self) -> Tuple[EvalDenseRetriever, Union[EvalReranker, None]]:
"""Load retriever and reranker for evaluation
Returns:
Tuple[EvalDenseRetriever, Union[EvalReranker, None]]: A :class:EvalDenseRetriever object for retrieval, and a
:class:EvalReranker object if reranker provided.
"""
embedder, reranker = AbsEvalRunner.get_models(self.model_args)
retriever = EvalDenseRetriever(
embedder,
@ -33,6 +46,9 @@ class AIRBenchEvalRunner:
return retriever, reranker
def run(self):
"""
Run the whole evaluation.
"""
evaluation = AIRBench(
benchmark_version=self.eval_args.benchmark_version,
task_types=self.eval_args.task_types,

View File

@ -5,6 +5,9 @@ from FlagEmbedding.abc.evaluation.arguments import AbsEvalArgs
@dataclass
class BEIREvalArgs(AbsEvalArgs):
"""
Argument class for BEIR evaluation.
"""
use_special_instructions: bool = field(
default=False, metadata={"help": "Whether to use specific instructions in `prompts.py` for evaluation. Default: False"}
)

View File

@ -13,15 +13,42 @@ logger = logging.getLogger(__name__)
class BEIREvalDataLoader(AbsEvalDataLoader):
"""
Data loader class for BEIR.
"""
def available_dataset_names(self) -> List[str]:
"""
Get the available dataset names.
Returns:
List[str]: All the available dataset names.
"""
return ['arguana', 'climate-fever', 'cqadupstack', 'dbpedia-entity', 'fever', 'fiqa', 'hotpotqa', 'msmarco', 'nfcorpus', 'nq', 'quora', 'scidocs', 'scifact', 'trec-covid', 'webis-touche2020']
def available_sub_dataset_names(self, dataset_name: Optional[str] = None) -> List[str]:
"""
Get the available sub-dataset names.
Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
Returns:
List[str]: All the available sub-dataset names.
"""
if dataset_name == 'cqadupstack':
return ['android', 'english', 'gaming', 'gis', 'mathematica', 'physics', 'programmers', 'stats', 'tex', 'unix', 'webmasters', 'wordpress']
return None
def available_splits(self, dataset_name: Optional[str] = None) -> List[str]:
"""
Get the available splits.
Args:
dataset_name (Optional[str], optional): Dataset name. Defaults to ``None``.
Returns:
List[str]: All the available splits for the dataset.
"""
if dataset_name == 'msmarco':
return ['dev']
return ['test']
@ -32,6 +59,16 @@ class BEIREvalDataLoader(AbsEvalDataLoader):
sub_dataset_name: Optional[str] = None,
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the corpus dataset from HF.
Args:
dataset_name (str): Name of the dataset.
sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of corpus.
"""
if dataset_name != 'cqadupstack':
corpus = datasets.load_dataset(
'BeIR/{d}'.format(d=dataset_name),
@ -94,6 +131,17 @@ class BEIREvalDataLoader(AbsEvalDataLoader):
split: str = 'dev',
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the qrels from HF.
Args:
dataset_name (str): Name of the dataset.
sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): Split of the dataset. Defaults to ``'dev'``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of qrel.
"""
if dataset_name != 'cqadupstack':
qrels = datasets.load_dataset(
'BeIR/{d}-qrels'.format(d=dataset_name),
@ -168,6 +216,17 @@ class BEIREvalDataLoader(AbsEvalDataLoader):
split: str = 'test',
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the queries from HF.
Args:
dataset_name (str): Name of the dataset.
sub_dataset_name (Optional[str]): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): Split of the dataset. Defaults to ``'test'``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of queries.
"""
qrels = self.load_qrels(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name, split=split)
if dataset_name != 'cqadupstack':
@ -230,6 +289,15 @@ class BEIREvalDataLoader(AbsEvalDataLoader):
return datasets.DatasetDict(queries_dict)
def load_corpus(self, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None) -> datasets.DatasetDict:
"""Load the corpus from the dataset.
Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
"""
if self.dataset_dir is not None:
if dataset_name is None:
save_dir = self.dataset_dir
@ -240,6 +308,19 @@ class BEIREvalDataLoader(AbsEvalDataLoader):
return self._load_remote_corpus(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name)
def load_qrels(self, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
"""Load the qrels from the dataset.
Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): The split to load relevance from. Defaults to ``'test'``.
Raises:
ValueError
Returns:
datasets.DatasetDict: A dict of relevance of query and document.
"""
if self.dataset_dir is not None:
if dataset_name is None:
save_dir = self.dataset_dir
@ -256,6 +337,19 @@ class BEIREvalDataLoader(AbsEvalDataLoader):
return self._load_remote_qrels(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name, split=split)
def load_queries(self, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
"""Load the queries from the dataset.
Args:
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): The split to load queries from. Defaults to ``'test'``.
Raises:
ValueError
Returns:
datasets.DatasetDict: A dict of queries with id as key, query text as value.
"""
if self.dataset_dir is not None:
if dataset_name is None:
save_dir = self.dataset_dir
@ -272,6 +366,16 @@ class BEIREvalDataLoader(AbsEvalDataLoader):
return self._load_remote_queries(dataset_name=dataset_name, sub_dataset_name=sub_dataset_name, split=split)
def _load_local_corpus(self, save_dir: str, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None) -> datasets.DatasetDict:
"""Load corpus from local dataset.
Args:
save_dir (str): Local directory where the corpus is saved.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: A dict of corpus with id as key, title and text as value.
"""
if sub_dataset_name is None:
corpus_path = os.path.join(save_dir, 'corpus.jsonl')
else:
@ -291,6 +395,20 @@ class BEIREvalDataLoader(AbsEvalDataLoader):
return datasets.DatasetDict(corpus)
def _load_local_qrels(self, save_dir: str, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
"""Load relevance from local dataset.
Args:
save_dir (str): Local directory where the qrels are saved.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): Split to load from the local dataset. Defaults to ``'test'``.
Raises:
ValueError: If the split is not found in the dataset.
Returns:
datasets.DatasetDict: A dict of relevance of query and document.
"""
checked_split = self.check_splits(split)
if len(checked_split) == 0:
raise ValueError(f"Split {split} not found in the dataset.")
@ -318,6 +436,20 @@ class BEIREvalDataLoader(AbsEvalDataLoader):
return datasets.DatasetDict(qrels)
def _load_local_queries(self, save_dir: str, dataset_name: Optional[str] = None, sub_dataset_name: Optional[str] = None, split: str = 'test') -> datasets.DatasetDict:
"""Load queries from local dataset.
Args:
save_dir (str): Local directory where the queries are saved.
dataset_name (Optional[str], optional): Name of the dataset. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
split (str, optional): Split to load from the local dataset. Defaults to ``'test'``.
Raises:
ValueError: If the split is not found in the dataset.
Returns:
datasets.DatasetDict: A dict of queries with id as key, query text as value.
"""
checked_split = self.check_splits(split)
if len(checked_split) == 0:
raise ValueError(f"Split {split} not found in the dataset.")
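
Taken together, these loaders form a small API for pulling BEIR data. Below is a minimal, illustrative sketch of how they might be called; the import path and any constructor arguments beyond ``eval_name`` and ``dataset_dir`` are assumptions, not verified against the library.

.. code:: python

    # Hedged sketch: import path and default constructor arguments are assumed.
    from FlagEmbedding.evaluation.beir import BEIREvalDataLoader

    loader = BEIREvalDataLoader(eval_name="beir", dataset_dir="./beir/data")

    print(loader.available_dataset_names())    # ['arguana', ..., 'webis-touche2020']
    print(loader.available_splits("msmarco"))  # ['dev']

    # Load one dataset's corpus, queries, and relevance judgements.
    corpus = loader.load_corpus(dataset_name="fiqa")
    queries = loader.load_queries(dataset_name="fiqa", split="test")
    qrels = loader.load_qrels(dataset_name="fiqa", split="test")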

View File

@ -10,6 +10,9 @@ logger = logging.getLogger(__name__)
class BEIREvaluator(AbsEvaluator):
"""
Evaluator class of BEIR
"""
def check_data_info(
self,
data_info: Dict[str, str],
@ -19,6 +22,23 @@ class BEIREvaluator(AbsEvaluator):
dataset_name: Optional[str] = None,
sub_dataset_name: Optional[str] = None,
):
"""Check the validity of data info.
Args:
data_info (Dict[str, str]): The loaded data info to be checked.
model_name (str): Name of model used.
reranker_name (str): Name of reranker used.
split (str): Split used in searching.
dataset_name (Optional[str], optional): Name of dataset used. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
Raises:
ValueError: eval_name mismatch
ValueError: model_name or reranker_name mismatch
ValueError: split mismatch
ValueError: dataset_name mismatch
ValueError: sub_dataset_name mismatch
"""
if data_info["eval_name"] != self.eval_name:
raise ValueError(
f'eval_name mismatch: {data_info["eval_name"]} vs {self.eval_name}'
@ -317,11 +337,21 @@ class BEIREvaluator(AbsEvaluator):
self.output_eval_results_to_json(reranker_eval_results, eval_results_save_path)
if reranker is not None:
reranker.stop_multi_process_pool()
def evaluate_results(
self,
search_results_save_dir: str,
k_values: List[int] = [1, 3, 5, 10, 100, 1000]
):
"""Compute metrics according to the results in the directory.
Args:
search_results_save_dir (str): Path to the search results.
k_values (List[int], optional): Cutoffs. Defaults to :data:`[1, 3, 5, 10, 100, 1000]`.
Returns:
dict: Evaluation results.
"""
eval_results_dict = {}
cqadupstack_results = None
cqadupstack_num = 0
@ -386,6 +416,18 @@ class BEIREvaluator(AbsEvaluator):
dataset_name: Optional[str] = None,
sub_dataset_name: Optional[str] = None,
):
"""Save the metadata and search results into a file.
Args:
eval_name (str): The experiment name of current evaluation.
model_name (str): Name of model used.
reranker_name (str): Name of reranker used.
search_results (Dict[str, Dict[str, float]]): Dictionary of search results.
output_path (str): Output path to write the results.
split (str): Split used in searching.
dataset_name (Optional[str], optional): Name of dataset used. Defaults to ``None``.
sub_dataset_name (Optional[str], optional): Name of the sub-dataset. Defaults to ``None``.
"""
data = {
"eval_name": eval_name,
"model_name": model_name,

View File

@ -9,7 +9,13 @@ logger = logging.getLogger(__name__)
class BEIREvalRunner(AbsEvalRunner):
"""
Runner class of BEIR evaluation.
"""
def run(self):
"""
Run the whole evaluation.
"""
if self.eval_args.dataset_names is None:
dataset_names = self.data_loader.available_dataset_names()
else:
@ -54,6 +60,11 @@ class BEIREvalRunner(AbsEvalRunner):
)
def load_data_loader(self) -> BEIREvalDataLoader:
"""Load the data loader
Returns:
BEIREvalDataLoader: BEIR data loader object.
"""
data_loader = BEIREvalDataLoader(
eval_name=self.eval_args.eval_name,
dataset_dir=self.eval_args.dataset_dir,
@ -64,6 +75,11 @@ class BEIREvalRunner(AbsEvalRunner):
return data_loader
def load_evaluator(self) -> BEIREvaluator:
"""Load the evaluator for evaluation
Returns:
BEIREvaluator: The BEIR evaluator to run the evaluation.
"""
evaluator = BEIREvaluator(
eval_name=self.eval_args.eval_name,
data_loader=self.data_loader,

View File

@ -11,10 +11,28 @@ logger = logging.getLogger(__name__)
class MLDREvalDataLoader(AbsEvalDataLoader):
"""
Data loader class for MLDR.
"""
def available_dataset_names(self) -> List[str]:
"""
Get the available dataset names.
Returns:
List[str]: All the available dataset names.
"""
return ["ar", "de", "en", "es", "fr", "hi", "it", "ja", "ko", "pt", "ru", "th", "zh"]
def available_splits(self, dataset_name: Optional[str] = None) -> List[str]:
"""
Get the available splits.
Args:
dataset_name (Optional[str], optional): Dataset name. Defaults to ``None``.
Returns:
List[str]: All the available splits for the dataset.
"""
return ["train", "dev", "test"]
def _load_remote_corpus(
@ -22,6 +40,15 @@ class MLDREvalDataLoader(AbsEvalDataLoader):
dataset_name: str,
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the corpus dataset from HF.
Args:
dataset_name (str): Name of the dataset.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of corpus.
"""
corpus = datasets.load_dataset(
"Shitao/MLDR", f"corpus-{dataset_name}",
cache_dir=self.cache_dir,
@ -53,6 +80,16 @@ class MLDREvalDataLoader(AbsEvalDataLoader):
split: str = "test",
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the qrels from HF.
Args:
dataset_name (str): Name of the dataset.
split (str, optional): Split of the dataset. Defaults to ``'test'``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of qrel.
"""
qrels_data = datasets.load_dataset(
"Shitao/MLDR", dataset_name,
cache_dir=self.cache_dir,
@ -108,6 +145,16 @@ class MLDREvalDataLoader(AbsEvalDataLoader):
split: str = "test",
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the queries from HF.
Args:
dataset_name (str): Name of the dataset.
split (str, optional): Split of the dataset. Defaults to ``'test'``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of queries.
"""
queries_data = datasets.load_dataset(
"Shitao/MLDR", dataset_name,
cache_dir=self.cache_dir,

View File

@ -4,7 +4,15 @@ from .data_loader import MLDREvalDataLoader
class MLDREvalRunner(AbsEvalRunner):
"""
Evaluation runner of MLDR.
"""
def load_data_loader(self) -> MLDREvalDataLoader:
"""Load the data loader instance by args.
Returns:
MLDREvalDataLoader: The MLDR data loader instance.
"""
data_loader = MLDREvalDataLoader(
eval_name=self.eval_args.eval_name,
dataset_dir=self.eval_args.dataset_dir,

View File

@ -11,10 +11,28 @@ logger = logging.getLogger(__name__)
class MSMARCOEvalDataLoader(AbsEvalDataLoader):
"""
Data loader class for MSMARCO.
"""
def available_dataset_names(self) -> List[str]:
"""
Get the available dataset names.
Returns:
List[str]: All the available dataset names.
"""
return ["passage", "document"]
def available_splits(self, dataset_name: Optional[str] = None) -> List[str]:
"""
Get the available splits.
Args:
dataset_name (Optional[str], optional): Dataset name. Defaults to ``None``.
Returns:
List[str]: All the available splits for the dataset.
"""
return ["dev", "dl19", "dl20"]
def _load_remote_corpus(
@ -22,6 +40,15 @@ class MSMARCOEvalDataLoader(AbsEvalDataLoader):
dataset_name: str,
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the corpus dataset from HF.
Args:
dataset_name (str): Name of the dataset.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of corpus.
"""
if dataset_name == 'passage':
corpus = datasets.load_dataset(
'Tevatron/msmarco-passage-corpus',
@ -80,6 +107,16 @@ class MSMARCOEvalDataLoader(AbsEvalDataLoader):
split: str = 'dev',
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the qrels from HF.
Args:
dataset_name (str): Name of the dataset.
split (str, optional): Split of the dataset. Defaults to ``'dev'``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of qrel.
"""
if dataset_name == 'passage':
if split == 'dev':
qrels = datasets.load_dataset(
@ -164,6 +201,16 @@ class MSMARCOEvalDataLoader(AbsEvalDataLoader):
split: str = 'test',
save_dir: Optional[str] = None
) -> datasets.DatasetDict:
"""Load the queries from HF.
Args:
dataset_name (str): Name of the dataset.
split (str, optional): Split of the dataset. Defaults to ``'test'``.
save_dir (Optional[str], optional): Directory to save the dataset. Defaults to ``None``.
Returns:
datasets.DatasetDict: Loaded datasets instance of queries.
"""
if split == 'dev':
if dataset_name == 'passage':
queries = datasets.load_dataset(

View File

@ -4,7 +4,15 @@ from .data_loader import MSMARCOEvalDataLoader
class MSMARCOEvalRunner(AbsEvalRunner):
"""
Evaluation runner of MSMARCO.
"""
def load_data_loader(self) -> MSMARCOEvalDataLoader:
"""Load the data loader instance by args.
Returns:
MSMARCOEvalDataLoader: The MSMARCO data loader instance.
"""
data_loader = MSMARCOEvalDataLoader(
eval_name=self.eval_args.eval_name,
dataset_dir=self.eval_args.dataset_dir,

View File

@ -6,6 +6,9 @@ from FlagEmbedding.abc.evaluation.arguments import AbsEvalArgs
@dataclass
class MTEBEvalArgs(AbsEvalArgs):
"""
Argument class for MTEB evaluation.
"""
languages: List[str] = field(
default=None, metadata={"help": "Languages to evaluate. Default: eng"}
)

View File

@ -20,6 +20,9 @@ def ensure_dir(file_path):
os.makedirs(directory)
class MTEBEvalRunner(AbsEvalRunner):
"""
Evaluation runner of MTEB.
"""
def __init__(
self,
eval_args: MTEBEvalArgs,
@ -31,6 +34,11 @@ class MTEBEvalRunner(AbsEvalRunner):
self.retriever, self.reranker = self.load_retriever_and_reranker()
def load_retriever_and_reranker(self) -> Tuple[MTEBEvalDenseRetriever, Union[MTEBEvalReranker, None]]:
"""Load the retriever and reranker
Returns:
Tuple[MTEBEvalDenseRetriever, Union[MTEBEvalReranker, None]]: The retriever and reranker instances.
"""
embedder, reranker = self.get_models(self.model_args)
retriever = MTEBEvalDenseRetriever(
embedder,
@ -42,6 +50,15 @@ class MTEBEvalRunner(AbsEvalRunner):
return retriever, reranker
def read_results(self, output_folder, tasks):
"""Read the evaluation results from directory.
Args:
output_folder (str): Path to the directory with results.
tasks (list): List of MTEB tasks.
Returns:
dict: The results of all the tasks.
"""
tasks_results = {}
task_types = list(set([t.metadata.type for t in tasks]))
for t_type in task_types:
@ -77,6 +94,12 @@ class MTEBEvalRunner(AbsEvalRunner):
return tasks_results
def output_json(self, tasks_results, save_file):
"""Save the tasks results into a json file.
Args:
tasks_results (dict): The task results.
save_file (str): Path to a file to save the results.
"""
all_results = 0
all_results_num = 0
cqa_results = 0
@ -110,6 +133,9 @@ class MTEBEvalRunner(AbsEvalRunner):
json.dump(new_results, f)
def run(self):
"""
Run the evaluation.
"""
task_types = self.eval_args.task_types
tasks = self.eval_args.tasks
languages = self.eval_args.languages

View File

@ -3,28 +3,67 @@ from FlagEmbedding.abc.evaluation import EvalDenseRetriever, EvalReranker
class MTEBEvalDenseRetriever(EvalDenseRetriever):
"""
Child class of :class:`EvalDenseRetriever` for MTEB dense retrieval.
"""
def __init__(self, embedder, **kwargs):
super().__init__(embedder, **kwargs)
def set_examples(self, examples_for_task: Optional[List[dict]] = None):
"""Set examples for the model.
Args:
examples_for_task (Optional[List[dict]], optional): Examples for the task. Defaults to None.
"""
self.embedder.set_examples(examples_for_task)
def set_instruction(self, instruction: Optional[str] = None):
"""Set the instruction to use for the embedding model.
Args:
instruction (Optional[str], optional): The query instruction for retrieval. Defaults to ``None``.
"""
self.embedder.query_instruction_for_retrieval = instruction
def get_instruction(self):
"""Get the instruction of embedding model.
Returns:
str: Instruction
"""
return self.embedder.query_instruction_for_retrieval
def set_normalize_embeddings(self, normalize_embeddings: bool = True):
"""Set whether normalize the output embeddings
Args:
normalize_embeddings (bool, optional): Boolean to control whether or not normalize the embeddings. Defaults to ``True``.
"""
self.embedder.normalize_embeddings = normalize_embeddings
def encode_queries(self, queries: List[str], **kwargs):
"""Encode input queries.
Args:
queries (List[str]): Input queries.
Returns:
Union[np.ndarray, torch.Tensor]: Query embeddings.
"""
emb = self.embedder.encode_queries(queries)
if isinstance(emb, dict):
emb = emb["dense_vecs"]
return emb
def encode_corpus(self, corpus: List[Dict[str, str]], **kwargs):
"""Encode input corpus.
Args:
corpus (List[Dict[str, str]]): Input corpus.
Returns:
Union[np.ndarray, torch.Tensor]: Corpus embeddings.
"""
if isinstance(corpus[0], dict):
input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
else:
@ -35,6 +74,14 @@ class MTEBEvalDenseRetriever(EvalDenseRetriever):
return emb
def encode(self, corpus: List[Dict[str, str]], **kwargs):
"""Encode the imput.
Args:
corpus (List[Dict[str, str]]): Input corpus or sentences.
Returns:
Union[np.ndarray, torch.Tensor]: Corpus embeddings.
"""
if isinstance(corpus[0], dict):
input_texts = ['{} {}'.format(doc.get('title', ''), doc['text']).strip() for doc in corpus]
else:
@ -45,5 +92,8 @@ class MTEBEvalDenseRetriever(EvalDenseRetriever):
return emb
class MTEBEvalReranker(EvalReranker):
"""
Child class of :class:`EvalReranker` for rerankers in MTEB.
"""
def __init__(self, reranker, **kwargs):
super().__init__(reranker, **kwargs)
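
A brief, hedged sketch of how the dense-retriever wrapper above might be used directly; ``FlagModel`` is assumed as the underlying embedder, and the parent class's optional constructor arguments are assumed to take their defaults.

.. code:: python

    # Illustrative sketch only; FlagModel and the default kwargs are assumptions.
    from FlagEmbedding import FlagModel
    from FlagEmbedding.evaluation.mteb import MTEBEvalDenseRetriever

    embedder = FlagModel("BAAI/bge-large-en-v1.5")
    retriever = MTEBEvalDenseRetriever(embedder)

    # Configure the query instruction and output normalization.
    retriever.set_instruction("Represent this sentence for searching relevant passages: ")
    retriever.set_normalize_embeddings(True)

    q_emb = retriever.encode_queries(["What is a dense retriever?"])
    d_emb = retriever.encode_corpus([{"title": "", "text": "A dense retriever embeds queries and documents into vectors."}])
    print(q_emb.shape, d_emb.shape)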

View File

@ -0,0 +1,42 @@
AIR-Bench
=========
`AIR-Bench <https://github.com/AIR-Bench/AIR-Bench>`_ (Automated heterogeneous Information Retrieval Benchmark) is a dynamic (actively being updated) benchmark for information retrieval.
The benchmark currently contains two versions. Note that the test data is generated by LLMs without human intervention.
This makes it easier and faster to extend the evaluation to new domains, and it also makes it impossible for any model to have the test data covered in its training set.
You can evaluate a model's performance on AIR-Bench by running our provided shell script:
.. code:: bash
chmod +x ./examples/evaluation/air_bench/eval_air_bench.sh
./examples/evaluation/air_bench/eval_air_bench.sh
Or by running:
.. code:: bash
python -m FlagEmbedding.evaluation.air_bench \
--benchmark_version AIR-Bench_24.05 \
--task_types qa long-doc \
--domains arxiv \
--languages en \
--splits dev test \
--output_dir ./air_bench/search_results \
--search_top_k 1000 \
--rerank_top_k 100 \
--cache_dir /root/.cache/huggingface/hub \
--overwrite False \
--embedder_name_or_path BAAI/bge-m3 \
--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
--devices cuda:0 cuda:1 \
--model_cache_dir /root/.cache/huggingface/hub \
--reranker_max_length 1024
Change the embedder, reranker, devices, and cache directory to your preference.
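
The same evaluation can also be driven from Python. The sketch below is illustrative only: it assumes that the ``AIRBenchEvalArgs`` dataclass is exported from the same module as the runner and that its fields mirror the CLI flags shown above.

.. code:: python

    # Hedged sketch: field names are assumed to mirror the CLI flags above.
    from FlagEmbedding.evaluation.air_bench import (
        AIRBenchEvalArgs,       # assumed to be exported from this module
        AIRBenchEvalModelArgs,
        AIRBenchEvalRunner,
    )

    eval_args = AIRBenchEvalArgs(
        benchmark_version="AIR-Bench_24.05",
        task_types=["qa", "long-doc"],
        domains=["arxiv"],
        languages=["en"],
        splits=["dev", "test"],
        output_dir="./air_bench/search_results",
    )
    model_args = AIRBenchEvalModelArgs(
        embedder_name_or_path="BAAI/bge-m3",
        reranker_name_or_path="BAAI/bge-reranker-v2-m3",  # assumed field name
    )

    runner = AIRBenchEvalRunner(eval_args=eval_args, model_args=model_args)
    runner.run()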
.. toctree::
:hidden:
airbench/arguments
airbench/runner

View File

@ -0,0 +1,4 @@
arguments
=========
.. autoclass:: FlagEmbedding.evaluation.air_bench.AIRBenchEvalModelArgs

View File

@ -0,0 +1,4 @@
runner
======
.. autoclass:: FlagEmbedding.evaluation.air_bench.AIRBenchEvalRunner

View File

@ -0,0 +1,48 @@
BEIR
====
`BEIR <https://github.com/beir-cellar/beir>`_ (Benchmarking-IR) is a heterogeneous evaluation benchmark for information retrieval.
It is designed for evaluating the performance of NLP-based retrieval models and is widely used in research on modern embedding models.
You can evaluate a model's performance on the BEIR benchmark by running our provided shell script:
.. code:: bash
chmod +x ./examples/evaluation/beir/eval_beir.sh
./examples/evaluation/beir/eval_beir.sh
Or by running:
.. code:: bash
python -m FlagEmbedding.evaluation.beir \
--eval_name beir \
--dataset_dir ./beir/data \
--dataset_names fiqa arguana cqadupstack \
--splits test dev \
--corpus_embd_save_dir ./beir/corpus_embd \
--output_dir ./beir/search_results \
--search_top_k 1000 \
--rerank_top_k 100 \
--cache_path /root/.cache/huggingface/hub \
--overwrite False \
--k_values 10 100 \
--eval_output_method markdown \
--eval_output_path ./beir/beir_eval_results.md \
--eval_metrics ndcg_at_10 recall_at_100 \
--ignore_identical_ids True \
--embedder_name_or_path BAAI/bge-large-en-v1.5 \
--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
--devices cuda:0 cuda:1 \
--cache_dir \
--reranker_max_length 1024
Change the embedder, devices, and cache directory to your preference.
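
If the search results already exist on disk, the metrics can also be recomputed programmatically with the evaluator. This is a minimal sketch; the import paths and any constructor arguments not shown are assumptions.

.. code:: python

    # Hedged sketch: recompute metrics from search results already on disk.
    from FlagEmbedding.evaluation.beir import BEIREvalDataLoader, BEIREvaluator

    data_loader = BEIREvalDataLoader(eval_name="beir", dataset_dir="./beir/data")
    evaluator = BEIREvaluator(eval_name="beir", data_loader=data_loader)

    eval_results = evaluator.evaluate_results(
        search_results_save_dir="./beir/search_results",
        k_values=[10, 100],
    )
    print(eval_results)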
.. toctree::
:hidden:
beir/arguments
beir/data_loader
beir/evaluator
beir/runner

View File

@ -0,0 +1,4 @@
arguments
=========
.. autoclass:: FlagEmbedding.evaluation.beir.BEIREvalArgs

View File

@ -0,0 +1,4 @@
data loader
===========
.. autoclass:: FlagEmbedding.evaluation.beir.BEIREvalDataLoader

View File

@ -0,0 +1,4 @@
evaluator
=========
.. autoclass:: FlagEmbedding.evaluation.beir.BEIREvaluator

View File

@ -0,0 +1,4 @@
runner
======
.. autoclass:: FlagEmbedding.evaluation.beir.BEIREvalRunner

View File

@ -2,6 +2,7 @@ MKQA
====
`MKQA <https://github.com/apple/ml-mkqa>`_ is an open-domain question answering evaluation set comprising 10k question-answer pairs aligned across 26 typologically diverse languages.
Each example in the dataset has the following structure:
.. code:: bash

View File

@ -0,0 +1,95 @@
MLDR
====
`MLDR <https://huggingface.co/datasets/Shitao/MLDR>`_ is a Multilingual Long-Document Retrieval dataset built on Wikipedia, Wudao and mC4, covering 13 typologically diverse languages.
Specifically, we sample lengthy articles from the Wikipedia, Wudao and mC4 datasets and randomly choose paragraphs from them.
Then we use GPT-3.5 to generate questions based on these paragraphs.
The generated question and the sampled article constitute a new text pair in the dataset.
An example from the ``train`` set looks like:
.. code:: python
{
'query_id': 'q-zh-<...>',
'query': '...',
'positive_passages': [
{
'docid': 'doc-zh-<...>',
'text': '...'
}
],
'negative_passages': [
{
'docid': 'doc-zh-<...>',
'text': '...'
},
...
]
}
An example from the ``dev`` and ``test`` sets looks like:
.. code:: python
{
'query_id': 'q-zh-<...>',
'query': '...',
'positive_passages': [
{
'docid': 'doc-zh-<...>',
'text': '...'
}
],
'negative_passages': []
}
An example from the ``corpus`` looks like:
.. code:: python
{
'docid': 'doc-zh-<...>',
'text': '...'
}
You can evaluate a model's performance on MLDR simply by running our provided shell script:
.. code:: bash
chmod +x ./examples/evaluation/mldr/eval_mldr.sh
./examples/evaluation/mldr/eval_mldr.sh
Or by running:
.. code:: bash
python -m FlagEmbedding.evaluation.mldr \
--eval_name mldr \
--dataset_dir ./mldr/data \
--dataset_names hi \
--splits test \
--corpus_embd_save_dir ./mldr/corpus_embd \
--output_dir ./mldr/search_results \
--search_top_k 1000 \
--rerank_top_k 100 \
--cache_path /root/.cache/huggingface/hub \
--overwrite False \
--k_values 10 100 \
--eval_output_method markdown \
--eval_output_path ./mldr/mldr_eval_results.md \
--eval_metrics ndcg_at_10 \
--embedder_name_or_path BAAI/bge-m3 \
--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
--devices cuda:0 cuda:1 \
--cache_dir /root/.cache/huggingface/hub \
--embedder_passage_max_length 8192 \
--reranker_max_length 8192
Change the embedder, reranker, devices, and cache directory arguments to your preference.
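
If you only need the raw data, it can be pulled directly from the Hugging Face Hub, which is what the MLDR data loader does internally. A minimal sketch (config names follow the loader; field names follow the examples above):

.. code:: python

    # Minimal sketch of inspecting the raw MLDR data on the HF Hub.
    import datasets

    corpus = datasets.load_dataset("Shitao/MLDR", "corpus-hi")               # per-language corpus config
    test_queries = datasets.load_dataset("Shitao/MLDR", "hi", split="test")  # test split of the 'hi' queries

    print(corpus)                                                  # inspect available splits and sizes
    print(test_queries[0]["query_id"], test_queries[0]["query"])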
.. toctree::
:hidden:
mldr/data_loader
mldr/runner

View File

@ -0,0 +1,13 @@
data_loader
===========
.. autoclass:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader
Methods
-------
.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader.available_dataset_names
.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader.available_splits
.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_corpus
.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_qrels
.. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_queries

View File

@ -0,0 +1,5 @@
runner
======
.. autoclass:: FlagEmbedding.evaluation.mldr.MLDREvalRunner
:members:

View File

@ -0,0 +1,46 @@
MSMARCO
=======
`MS MARCO <https://microsoft.github.io/msmarco/>`_ (Microsoft MAchine Reading COmprehension) is a large-scale, real-world reading comprehension dataset.
It is widely used in information retrieval, question answering, and natural language processing research.
You can evaluate a model's performance on MS MARCO simply by running our provided shell script:
.. code:: bash
chmod +x ./examples/evaluation/msmarco/eval_msmarco.sh
./examples/evaluation/msmarco/eval_msmarco.sh
Or by running:
.. code:: bash
python -m FlagEmbedding.evaluation.msmarco \
--eval_name msmarco \
--dataset_dir ./msmarco/data \
--dataset_names passage \
--splits dev \
--corpus_embd_save_dir ./msmarco/corpus_embd \
--output_dir ./msmarco/search_results \
--search_top_k 1000 \
--rerank_top_k 100 \
--cache_path /root/.cache/huggingface/hub \
--overwrite True \
--k_values 10 100 \
--eval_output_method markdown \
--eval_output_path ./msmarco/msmarco_eval_results.md \
--eval_metrics ndcg_at_10 recall_at_100 \
--embedder_name_or_path BAAI/bge-large-en-v1.5 \
--reranker_name_or_path BAAI/bge-reranker-v2-m3 \
--devices cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \
--cache_dir /root/.cache/huggingface/hub \
--reranker_max_length 1024
Change the embedder, reranker, devices, and cache directory to your preference.
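
The data loader behind this pipeline can also be used on its own. A hedged sketch, assuming constructor arguments beyond ``eval_name`` and ``dataset_dir`` take their defaults and that the generic ``load_*`` methods are inherited unchanged from the base loader:

.. code:: python

    # Illustrative sketch only; import path and defaults are assumed.
    from FlagEmbedding.evaluation.msmarco import MSMARCOEvalDataLoader

    loader = MSMARCOEvalDataLoader(eval_name="msmarco", dataset_dir="./msmarco/data")

    print(loader.available_dataset_names())  # ['passage', 'document']
    print(loader.available_splits())         # ['dev', 'dl19', 'dl20']

    # Inherited loaders (assumed): corpus, queries, and qrels for the passage task.
    corpus = loader.load_corpus(dataset_name="passage")
    queries = loader.load_queries(dataset_name="passage", split="dev")
    qrels = loader.load_qrels(dataset_name="passage", split="dev")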
.. toctree::
:hidden:
msmarco/data_loader
msmarco/runner

View File

@ -0,0 +1,13 @@
data_loader
===========
.. autoclass:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader
Methods
-------
.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader.available_dataset_names
.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader.available_splits
.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_corpus
.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_qrels
.. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_queries

View File

@ -0,0 +1,5 @@
runner
======
.. autoclass:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalRunner
:members:

View File

@ -0,0 +1,37 @@
MTEB
====
`MTEB <https://github.com/embeddings-benchmark/mteb>`_ (The Massive Text Embedding Benchmark) is a large-scale evaluation framework designed to assess the performance of text embedding models across a wide variety of NLP tasks.
Introduced to standardize and improve the evaluation of text embeddings, MTEB is crucial for assessing how well these models generalize across various real-world applications.
It contains a wide range of datasets covering eight main NLP task types and many languages, and provides an easy evaluation pipeline.
It also hosts the well-known MTEB `leaderboard <https://huggingface.co/spaces/mteb/leaderboard>`_, which ranks the latest leading embedding models.
You can evaluate a model's performance on the whole MTEB benchmark by running our provided shell script:
.. code:: bash
chmod +x ./examples/evaluation/mteb/eval_mteb.sh
./examples/evaluation/mteb/eval_mteb.sh
Or by running:
.. code:: bash
python -m FlagEmbedding.evaluation.mteb \
--eval_name mteb \
--output_dir ./mteb/search_results \
--languages eng \
--tasks NFCorpus BiorxivClusteringS2S SciDocsRR \
--eval_output_path ./mteb/mteb_eval_results.json \
--embedder_name_or_path BAAI/bge-large-en-v1.5 \
--devices cuda:7 \
--cache_dir /root/.cache/huggingface/hub
Change the embedder, devices, and cache directory to your preference.
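
The evaluation can also be launched from Python. This is a sketch under stated assumptions: ``MTEBEvalModelArgs`` is the assumed name of the model-argument dataclass, and its field names are assumed to mirror the CLI flags above.

.. code:: python

    # Hedged sketch: MTEBEvalModelArgs and the field names are assumptions.
    from FlagEmbedding.evaluation.mteb import (
        MTEBEvalArgs,
        MTEBEvalModelArgs,   # assumed name
        MTEBEvalRunner,
    )

    eval_args = MTEBEvalArgs(
        eval_name="mteb",
        output_dir="./mteb/search_results",
        languages=["eng"],
        tasks=["NFCorpus", "BiorxivClusteringS2S", "SciDocsRR"],
        eval_output_path="./mteb/mteb_eval_results.json",
    )
    model_args = MTEBEvalModelArgs(embedder_name_or_path="BAAI/bge-large-en-v1.5")

    runner = MTEBEvalRunner(eval_args=eval_args, model_args=model_args)
    runner.run()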
.. toctree::
:hidden:
mteb/arguments
mteb/searcher
mteb/runner

View File

@ -0,0 +1,4 @@
arguments
=========
.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalArgs

View File

@ -0,0 +1,4 @@
runner
======
.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalRunner

View File

@ -0,0 +1,6 @@
searcher
========
.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalDenseRetriever
.. autoclass:: FlagEmbedding.evaluation.mteb.MTEBEvalReranker