From b8ff1052d409508736802e9845a6e7c8a74cd8d0 Mon Sep 17 00:00:00 2001 From: bogdankostic Date: Fri, 26 May 2023 18:48:11 +0200 Subject: [PATCH] refactor: Adapt running benchmarks (#5007) * Generate eval result in separate method * Adapt benchmarking utils * Adapt running retriever benchmarks * Adapt error message * Adapt running reader benchmarks * Adapt retriever reader benchmark script * Adapt running benchmarks script * Adapt README.md * Raise error if file doesn't exist * Raise error if path doesn't exist or is a directory * minor readme update * Create separate methods for checking if pipeline contains reader or retriever * Fix reader pipeline case --------- Co-authored-by: Darja Fokina --- test/benchmarks/README.md | 106 +++++++++++++++++++++++++--------- test/benchmarks/run.py | 116 +++++++++++++++++++++++--------------- test/benchmarks/utils.py | 17 ++++++ 3 files changed, 167 insertions(+), 72 deletions(-) diff --git a/test/benchmarks/README.md b/test/benchmarks/README.md index e846d9598..361d40505 100644 --- a/test/benchmarks/README.md +++ b/test/benchmarks/README.md @@ -1,45 +1,97 @@ # Benchmarks +The tooling provided in this directory allows running benchmarks on reader pipelines, retriever pipelines, +and retriever-reader pipelines. +## Defining configuration -To start all benchmarks (e.g. for a new Haystack release), run: +To run a benchmark, you need to create a configuration file first. This file should be a Pipeline YAML file that +contains both the querying and, optionally, the indexing pipeline, in case the querying pipeline includes a retriever. 
-```` -python run.py --reader --retriever_index --retriever_query --update_json --save_markdown -```` +The configuration file should also have a **`benchmark_config`** section that includes the following information: -For custom runs, you can specify which components and processes to benchmark with the following flags: -``` -python run.py [--reader] [--retriever_index] [--retriever_query] [--ci] [--update_json] [--save_markdown] +- **`labels_file`**: The path to a SQuAD-formatted JSON or CSV file that contains the labels to be benchmarked on. +- **`documents_directory`**: The path to a directory containing files intended to be indexed into the document store. + This is only necessary for retriever and retriever-reader pipelines. +- **`data_url`**: This is optional. If provided, the benchmarking script will download data from this URL and + save it in the **`data/`** directory. -where +Here is an example of what a configuration file for a retriever-reader pipeline might look like: -**--reader** will trigger the speed and accuracy benchmarks for the reader. Here we simply use the SQuAD dev set. 
+```yaml +components: + - name: DocumentStore + type: ElasticsearchDocumentStore + - name: TextConverter + type: TextConverter + - name: Reader + type: FARMReader + params: + model_name_or_path: deepset/roberta-base-squad2-distilled + - name: Retriever + type: BM25Retriever + params: + document_store: DocumentStore + top_k: 10 -**--retriever_index** will trigger indexing benchmarks +pipelines: + - name: indexing + nodes: + - name: TextConverter + inputs: [File] + - name: Retriever + inputs: [TextConverter] + - name: DocumentStore + inputs: [Retriever] + - name: querying + nodes: + - name: Retriever + inputs: [Query] + - name: Reader + inputs: [Retriever] -**--retriever_query** will trigger querying benchmarks (embeddings will be loaded from file instead of being computed on the fly) - -**--ci** will cause the the benchmarks to run on a smaller slice of each dataset and a smaller subset of Retriever / Reader / DocStores. - -**--update-json** will cause the script to update the json files in docs/_src/benchmarks so that the website benchmarks will be updated. - -**--save_markdown** save results additionally to the default csv also as a markdown file +benchmark_config: + data_url: http://example.com/data.tar.gz + documents_directory: /path/to/documents + labels_file: /path/to/labels.csv ``` -Results will be stored in this directory as -- retriever_index_results.csv and retriever_index_results.md -- retriever_query_results.csv and retriever_query_results.md -- reader_results.csv and reader_results.md +## Running benchmarks +Once you have your configuration file, you can run benchmarks by using the **`run.py`** script. -# Temp. Quickfix for bigger runs +```bash +python run.py [--output OUTPUT] config +``` -For bigger indexing runs (500k docs) the standard elastic / opensearch container that we spawn via haystack might run OOM. 
-Therefore, start them manually before you trigger the benchmark script and assign more memory to them: +The script takes the following arguments: -`docker start opensearch > /dev/null 2>&1 || docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" -e "OPENSEARCH_JAVA_OPTS=-Xms4096m -Xmx4096m" --name opensearch opensearchproject/opensearch:2.2.1` +- `config`: This is the path to your configuration file. +- `--output`: This is an optional path where benchmark results should be saved. If not provided, the results are written to a JSON file named after the config file with a `_results` suffix (for example, `my_config_results.json` for `my_config.yml`). -and +## Metrics -`docker start elasticsearch > /dev/null 2>&1 || docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms4096m -Xmx4096m" --name elasticsearch elasticsearch:7.9.2` +The benchmarks yield the following metrics: + +- Reader pipelines: + - Exact match score + - F1 score + - Total querying time + - Seconds/query +- Retriever pipelines: + - Recall + - Mean average precision + - Total querying time + - Seconds/query + - Queries/second + - Total indexing time + - Number of indexed Documents/second +- Retriever-Reader pipelines: + - Exact match score + - F1 score + - Total querying time + - Seconds/query + - Total indexing time + - Number of indexed Documents/second + +You can find more details about the performance metrics in our [evaluation guide](https://docs.haystack.deepset.ai/docs/evaluation). diff --git a/test/benchmarks/run.py b/test/benchmarks/run.py index f5259ce8d..5de92bede 100644 --- a/test/benchmarks/run.py +++ b/test/benchmarks/run.py @@ -1,51 +1,77 @@ -# The benchmarks use -# - a variant of the Natural Questions Dataset (https://ai.google.com/research/NaturalQuestions) from Google Research -# licensed under CC BY-SA 3.0 (https://creativecommons.org/licenses/by-sa/3.0/) -# - the SQuAD 2.0 Dataset (https://rajpurkar.github.io/SQuAD-explorer/) from Rajpurkar et al. 
-# licensed under CC BY-SA 4.0 (https://creativecommons.org/licenses/by-sa/4.0/legalcode) - -from retriever import benchmark_indexing, benchmark_querying -from reader import benchmark_reader -from utils import load_config +from pathlib import Path +from typing import Dict import argparse +import json + +from haystack import Pipeline +from haystack.nodes import BaseRetriever, BaseReader +from haystack.pipelines.config import read_pipeline_config_from_yaml + +from utils import prepare_environment, contains_reader, contains_retriever +from reader import benchmark_reader +from retriever import benchmark_retriever +from retriever_reader import benchmark_retriever_reader -parser = argparse.ArgumentParser() +def run_benchmark(pipeline_yaml: Path) -> Dict: + """ + Run benchmarking on a given pipeline. Pipeline can be a retriever, reader, or retriever-reader pipeline. + In case of retriever or retriever-reader pipelines, indexing is also benchmarked, so the config file must + contain an indexing pipeline as well. -parser.add_argument("--reader", default=False, action="store_true", help="Perform Reader benchmarks") -parser.add_argument( - "--retriever_index", default=False, action="store_true", help="Perform Retriever indexing benchmarks" -) -parser.add_argument( - "--retriever_query", default=False, action="store_true", help="Perform Retriever querying benchmarks" -) -parser.add_argument( - "--ci", default=False, action="store_true", help="Perform a smaller subset of benchmarks that are quicker to run" -) -parser.add_argument( - "--update_json", - default=False, - action="store_true", - help="Update the json file with the results of this run so that the website can be updated", -) -parser.add_argument( - "--save_markdown", - default=False, - action="store_true", - help="Update the json file with the results of this run so that the website can be updated", -) -args = parser.parse_args() + :param pipeline_yaml: Path to pipeline YAML config. 
The config file should contain a benchmark_config section where + the following parameters are specified: + - documents_directory: Directory containing files to index. + - labels_file: Path to evaluation set. + - data_url (optional): URL to download the data from. Downloaded data will be stored in + the directory `data/`. + """ + pipeline_config = read_pipeline_config_from_yaml(pipeline_yaml) + benchmark_config = pipeline_config.pop("benchmark_config", {}) -# load config -params, filenames = load_config(config_filename="config.json", ci=args.ci) + # Prepare environment + prepare_environment(pipeline_config, benchmark_config) + labels_file = Path(benchmark_config["labels_file"]) -if args.retriever_index: - benchmark_indexing( - **params, **filenames, ci=args.ci, update_json=args.update_json, save_markdown=args.save_markdown - ) -if args.retriever_query: - benchmark_querying( - **params, **filenames, ci=args.ci, update_json=args.update_json, save_markdown=args.save_markdown - ) -if args.reader: - benchmark_reader(**params, **filenames, ci=args.ci, update_json=args.update_json, save_markdown=args.save_markdown) + querying_pipeline = Pipeline.load_from_config(pipeline_config, pipeline_name="querying") + pipeline_contains_reader = contains_reader(querying_pipeline) + pipeline_contains_retriever = contains_retriever(querying_pipeline) + + # Retriever-Reader pipeline + if pipeline_contains_retriever and pipeline_contains_reader: + documents_dir = Path(benchmark_config["documents_directory"]) + indexing_pipeline = Pipeline.load_from_config(pipeline_config, pipeline_name="indexing") + + results = benchmark_retriever_reader(indexing_pipeline, querying_pipeline, documents_dir, labels_file) + + # Retriever pipeline + elif pipeline_contains_retriever: + documents_dir = Path(benchmark_config["documents_directory"]) + indexing_pipeline = Pipeline.load_from_config(pipeline_config, pipeline_name="indexing") + + results = benchmark_retriever(indexing_pipeline, querying_pipeline, 
documents_dir, labels_file) + + # Reader pipeline + elif pipeline_contains_reader: + results = benchmark_reader(querying_pipeline, labels_file) + + # Unsupported pipeline type + else: + raise ValueError("Pipeline must be a retriever, reader, or retriever-reader pipeline.") + + results["config_file"] = pipeline_config + return results + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("config", type=str, help="Path to pipeline YAML config.") + parser.add_argument("--output", type=str, help="Path to output file.") + args = parser.parse_args() + + config_file = Path(args.config) + output_file = f"{config_file.stem}_results.json" if args.output is None else args.output + + results = run_benchmark(config_file) + with open(output_file, "w") as f: + json.dump(results, f, indent=2) diff --git a/test/benchmarks/utils.py b/test/benchmarks/utils.py index f39716491..4fc667437 100644 --- a/test/benchmarks/utils.py +++ b/test/benchmarks/utils.py @@ -158,3 +158,20 @@ def get_retriever_config(pipeline: Pipeline) -> Tuple[str, Union[int, str]]: retriever_top_k = retriever.top_k return retriever_type, retriever_top_k + + +def contains_reader(pipeline: Pipeline) -> bool: + """ + Check if a pipeline contains a Reader component. + :param pipeline: Pipeline + """ + components = [comp for comp in pipeline.components.values()] + return any(isinstance(comp, BaseReader) for comp in components) + + +def contains_retriever(pipeline: Pipeline) -> bool: + """ + Check if a pipeline contains a Retriever component. + """ + components = [comp for comp in pipeline.components.values()] + return any(isinstance(comp, BaseRetriever) for comp in components)