haystack/test/benchmarks/run.py

from pathlib import Path
from typing import Dict
import argparse
import json

import posthog

from haystack import Pipeline
from haystack.pipelines.config import read_pipeline_config_from_yaml

from utils import prepare_environment, contains_reader, contains_retriever
from reader import benchmark_reader
from retriever import benchmark_retriever
from retriever_reader import benchmark_retriever_reader


# Disable telemetry reports when running benchmarks.
# This is a module-level statement, so the flag is set as soon as this file is
# executed or imported, i.e. before run_benchmark() loads any pipeline.
posthog.disabled = True


def run_benchmark(pipeline_yaml: Path) -> Dict:
    """
    Benchmark the pipeline described by a YAML config file.

    The "querying" pipeline of the config decides which benchmark is run: retriever-reader, retriever, or reader.
    Whenever the querying pipeline contains a retriever, the "indexing" pipeline of the same config is loaded too
    and passed to the benchmark together with the documents directory, so the config file must define it.

    :param pipeline_yaml: Path to pipeline YAML config. Its benchmark_config section should specify:
                            - documents_directory: Directory containing files to index. Only read when the
                                                   querying pipeline contains a retriever.
                            - labels_file: Path to evaluation set.
                            - data_url (optional): URL to download the data from. Downloaded data will be stored in
                                                   the directory `data/`.
    :return: The dictionary returned by the selected benchmark function, with an additional "config" entry holding
             the pipeline config (including its benchmark_config section).
    :raises ValueError: If the querying pipeline contains neither a retriever nor a reader.
    """
    config = read_pipeline_config_from_yaml(pipeline_yaml)
    # benchmark_config is taken out of the pipeline config here and put back just before returning.
    bench_settings = config.pop("benchmark_config", {})

    # Prepare environment
    prepare_environment(config, bench_settings)
    labels_file = Path(bench_settings["labels_file"])

    query_pipe = Pipeline.load_from_config(config, pipeline_name="querying")
    has_reader = contains_reader(query_pipe)
    has_retriever = contains_retriever(query_pipe)

    # Unsupported pipeline type
    if not (has_retriever or has_reader):
        raise ValueError("Pipeline must be a retriever, reader, or retriever-reader pipeline.")

    if has_retriever:
        # Retriever and retriever-reader benchmarks share the same setup and call signature;
        # only the benchmark function differs.
        documents_dir = Path(bench_settings["documents_directory"])
        index_pipe = Pipeline.load_from_config(config, pipeline_name="indexing")

        if has_reader:
            run_fn = benchmark_retriever_reader
        else:
            run_fn = benchmark_retriever
        results = run_fn(index_pipe, query_pipe, documents_dir, labels_file)
    else:
        # Reader pipeline: no indexing pipeline or documents directory is needed.
        results = benchmark_reader(query_pipe, labels_file)

    config["benchmark_config"] = bench_settings
    results["config"] = config
    return results


if __name__ == "__main__":
    # Command line: positional path to the pipeline YAML config, optional --output path.
    cli = argparse.ArgumentParser()
    cli.add_argument("config", type=str, help="Path to pipeline YAML config.")
    cli.add_argument("--output", type=str, help="Path to output file.")
    cli_args = cli.parse_args()

    config_file = Path(cli_args.config)
    if cli_args.output is not None:
        output_file = cli_args.output
    else:
        # Default output name is derived from the config file name, relative to the current directory.
        output_file = f"{config_file.stem}_results.json"

    results = run_benchmark(config_file)

    # Write the benchmark results as indented JSON.
    with open(output_file, "w") as out:
        json.dump(results, out, indent=2)
refactor: Adapt running benchmarks (#5007) * Generate eval result in separate method * Adapt benchmarking utils * Adapt running retriever benchmarks * Adapt error message * Adapt running reader benchmarks * Adapt retriever reader benchmark script * Adapt running benchmarks script * Adapt README.md * Raise error if file doesn't exist * Raise error if path doesn't exist or is a directory * minor readme update * Create separate methods for checking if pipeline contains reader or retriever * Fix reader pipeline case --------- Co-authored-by: Darja Fokina <daria.f93@gmail.com> 2023-05-26 18:48:11 +02:00			`from pathlib import Path`
			`from typing import Dict`
			`import argparse`
			`import json`

ci: Add Github workflow to automate benchmark runs (#5399) * Add config files * log benchmarks to stdout * Add top-k and batch size to configs * Add batch size to configs * fix: don't download files if they already exist * Add batch size to configs * refine script * Remove configs using 1m docs * update run script * update run script * update run script * datadog integration * remove out folder * gitignore benchmarks output * test: send benchmarks to datadog * remove uncommented lines in script * feat: take branch/tag argument for benchmark setup script * fix: run.sh should ignore errors * Add GH workflow to run benchmarks periodically * Remove unused script * Adapt cml.yml * Adapt cml.yml * Rename cml.yml to benchmarks.yml * Revert "Rename cml.yml to benchmarks.yml" This reverts commit 897299433a71a55827124728adff5de918d46d21. * remove benchmarks.yml * Use same file extension for all config files * Use checkout@v3 * Run benchmarks sequentially * Add timeout-minutes parameter * Remove changes unrelated to datadog * Apply black * use haystack-oss aws account * Update test/benchmarks/utils.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * PR feedback * fix aws credentials step * Fix path * check docker * Allow spinning up containers from within container * Allow spinning up containers from within container * Separate launching doc stores from benchmarks * Remove docker related commands * run only retrievers * change port * Revert "change port" This reverts commit 6e5bcebb1d16e03ba7672be7e8a089084c7fc3a7. * Run opensearch benchmark only * Run weaviate benchmark only * Run bm25 benchmarks only * Changes host of doc stores * add step to get docker logs * Revert "add step to get docker logs" This reverts commit c10e6faa76bde5df406a027203bd775d18c93c90. 
* Install docker * Launch doc store containers from wtihin runner container * Remove kill command * Change host * dump docker logs * change port * Add cloud startup script * dump docker logs * add network param * add network to startup.sh * check cluster health * move steps * change port * try using services * check cluster health * use services * run only weaviate * change host * Upload benchmark results as artifacts * Update configs * Delete index after benchmark run * Use correct index name * Run only failing config * Use smaller batch size * Increase memory for opensearch * Reduce batch size further * Provide more storage * Reduce batch size * dump docker logs * add java opts * Spin up only opensearch container * Create separate job for each doc store * Run benchmarks sequentially * Set working directory * Account for reader benchmarks not doing indexing * Change key of reader metrics * Apply PR feedback * Remove whitespace * Adapt workflow to changes in datadog scripts * Adapt workflow to changes in datadog scripts * Increase memory for opensearch * Reduce batch size * Add preprocessing_batch_size to Readers * Remove unrelated change * Move order * Fix path * Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance * Manually terminate EC2 instance * Manually terminate EC2 instance * Always terminate runner * Always terminate runner * Remove unnecessary terminate-runner job * Add cron schedule * Disable telemetry * Rename cml.yml to benchmarks.yml --------- Co-authored-by: rjanjua <rohan.janjua@gmail.com> Co-authored-by: Paul Steppacher <p.steppacher91@gmail.com> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Co-authored-by: Silvano Cerza <silvanocerza@gmail.com> 2023-08-17 12:56:45 +02:00			`import posthog`

refactor: Adapt running benchmarks (#5007) * Generate eval result in separate method * Adapt benchmarking utils * Adapt running retriever benchmarks * Adapt error message * Adapt running reader benchmarks * Adapt retriever reader benchmark script * Adapt running benchmarks script * Adapt README.md * Raise error if file doesn't exist * Raise error if path doesn't exist or is a directory * minor readme update * Create separate methods for checking if pipeline contains reader or retriever * Fix reader pipeline case --------- Co-authored-by: Darja Fokina <daria.f93@gmail.com> 2023-05-26 18:48:11 +02:00			`from haystack import Pipeline`
			`from haystack.pipelines.config import read_pipeline_config_from_yaml`
Integrate sentence transformers into benchmarks (#843) * Integrate sentence transformers into benchmarks * Add doc store asserts * switch data downloads from s3 client to https. add license info * Fix mypy, revert config Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> 2021-04-09 17:24:16 +02:00
refactor: Adapt running benchmarks (#5007) * Generate eval result in separate method * Adapt benchmarking utils * Adapt running retriever benchmarks * Adapt error message * Adapt running reader benchmarks * Adapt retriever reader benchmark script * Adapt running benchmarks script * Adapt README.md * Raise error if file doesn't exist * Raise error if path doesn't exist or is a directory * minor readme update * Create separate methods for checking if pipeline contains reader or retriever * Fix reader pipeline case --------- Co-authored-by: Darja Fokina <daria.f93@gmail.com> 2023-05-26 18:48:11 +02:00			`from utils import prepare_environment, contains_reader, contains_retriever`
Create time and performance benchmarks for all readers and retrievers (#339) * add time and perf benchmark for es * Add retriever benchmarking * Add Reader benchmarking * add nq to squad conversion * add conversion stats * clean benchmarks * Add link to dataset * Update imports * add first support for neg psgs * Refactor test * set max_seq_len * cleanup benchmark * begin retriever speed benchmarking * Add support for retriever query index benchmarking * improve reader eval, retriever speed benchmarking * improve retriever speed benchmarking * Add retriever accuracy benchmark * Add neg doc shuffling * Add top_n * 3x speedup of SQL. add postgres docker run. make shuffle neg a param. add more logging * Add models to sweep * add option for faiss index type * remove unneeded line * change faiss to faiss_flat * begin automatic benchmark script * remove existing postgres docker for benchmarking * Add data processing scripts * Remove shuffle in script bc data already shuffled * switch hnsw setup from 256 to 128 * change es similarity to dot product by default * Error includes stack trace * Change ES default timeout * remove delete_docs() from timing for indexing * Add support for website export * update website on push to benchmarks * add complete benchmarks results * new json format * removed NaN as is not a valid json token * fix benchmarking for faiss hnsw queries. do sql calls in update_embeddings() as batches * update benchmarks for hnsw 128,20,80 * don't delete full index in delete_all_documents() * update texts for charts * update recall column for retriever * change scale and add units to desc * add units to legend * add axis titles. update desc * add html tags Co-authored-by: deepset <deepset@Crenolape.localdomain> Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai> Co-authored-by: PiffPaffM <markuspaff.mp@gmail.com> 2020-10-12 13:34:42 +02:00			`from reader import benchmark_reader`
refactor: Adapt running benchmarks (#5007) * Generate eval result in separate method * Adapt benchmarking utils * Adapt running retriever benchmarks * Adapt error message * Adapt running reader benchmarks * Adapt retriever reader benchmark script * Adapt running benchmarks script * Adapt README.md * Raise error if file doesn't exist * Raise error if path doesn't exist or is a directory * minor readme update * Create separate methods for checking if pipeline contains reader or retriever * Fix reader pipeline case --------- Co-authored-by: Darja Fokina <daria.f93@gmail.com> 2023-05-26 18:48:11 +02:00			`from retriever import benchmark_retriever`
			`from retriever_reader import benchmark_retriever_reader`


ci: Add Github workflow to automate benchmark runs (#5399) * Add config files * log benchmarks to stdout * Add top-k and batch size to configs * Add batch size to configs * fix: don't download files if they already exist * Add batch size to configs * refine script * Remove configs using 1m docs * update run script * update run script * update run script * datadog integration * remove out folder * gitignore benchmarks output * test: send benchmarks to datadog * remove uncommented lines in script * feat: take branch/tag argument for benchmark setup script * fix: run.sh should ignore errors * Add GH workflow to run benchmarks periodically * Remove unused script * Adapt cml.yml * Adapt cml.yml * Rename cml.yml to benchmarks.yml * Revert "Rename cml.yml to benchmarks.yml" This reverts commit 897299433a71a55827124728adff5de918d46d21. * remove benchmarks.yml * Use same file extension for all config files * Use checkout@v3 * Run benchmarks sequentially * Add timeout-minutes parameter * Remove changes unrelated to datadog * Apply black * use haystack-oss aws account * Update test/benchmarks/utils.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * PR feedback * fix aws credentials step * Fix path * check docker * Allow spinning up containers from within container * Allow spinning up containers from within container * Separate launching doc stores from benchmarks * Remove docker related commands * run only retrievers * change port * Revert "change port" This reverts commit 6e5bcebb1d16e03ba7672be7e8a089084c7fc3a7. * Run opensearch benchmark only * Run weaviate benchmark only * Run bm25 benchmarks only * Changes host of doc stores * add step to get docker logs * Revert "add step to get docker logs" This reverts commit c10e6faa76bde5df406a027203bd775d18c93c90. 
* Install docker * Launch doc store containers from wtihin runner container * Remove kill command * Change host * dump docker logs * change port * Add cloud startup script * dump docker logs * add network param * add network to startup.sh * check cluster health * move steps * change port * try using services * check cluster health * use services * run only weaviate * change host * Upload benchmark results as artifacts * Update configs * Delete index after benchmark run * Use correct index name * Run only failing config * Use smaller batch size * Increase memory for opensearch * Reduce batch size further * Provide more storage * Reduce batch size * dump docker logs * add java opts * Spin up only opensearch container * Create separate job for each doc store * Run benchmarks sequentially * Set working directory * Account for reader benchmarks not doing indexing * Change key of reader metrics * Apply PR feedback * Remove whitespace * Adapt workflow to changes in datadog scripts * Adapt workflow to changes in datadog scripts * Increase memory for opensearch * Reduce batch size * Add preprocessing_batch_size to Readers * Remove unrelated change * Move order * Fix path * Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance * Manually terminate EC2 instance * Manually terminate EC2 instance * Always terminate runner * Always terminate runner * Remove unnecessary terminate-runner job * Add cron schedule * Disable telemetry * Rename cml.yml to benchmarks.yml --------- Co-authored-by: rjanjua <rohan.janjua@gmail.com> Co-authored-by: Paul Steppacher <p.steppacher91@gmail.com> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Co-authored-by: Silvano Cerza <silvanocerza@gmail.com> 2023-08-17 12:56:45 +02:00			`# Disable telemetry reports when running 
benchmarks`
			`posthog.disabled = True`


refactor: Adapt running benchmarks (#5007) * Generate eval result in separate method * Adapt benchmarking utils * Adapt running retriever benchmarks * Adapt error message * Adapt running reader benchmarks * Adapt retriever reader benchmark script * Adapt running benchmarks script * Adapt README.md * Raise error if file doesn't exist * Raise error if path doesn't exist or is a directory * minor readme update * Create separate methods for checking if pipeline contains reader or retriever * Fix reader pipeline case --------- Co-authored-by: Darja Fokina <daria.f93@gmail.com> 2023-05-26 18:48:11 +02:00			`def run_benchmark(pipeline_yaml: Path) -> Dict:`
			`"""`
			`Run benchmarking on a given pipeline. Pipeline can be a retriever, reader, or retriever-reader pipeline.`
			`In case of retriever or retriever-reader pipelines, indexing is also benchmarked, so the config file must`
			`contain an indexing pipeline as well.`

			`:param pipeline_yaml: Path to pipeline YAML config. The config file should contain a benchmark_config section where`
			`the following parameters are specified:`
			`- documents_directory: Directory containing files to index.`
			`- labels_file: Path to evaluation set.`
			`- data_url (optional): URL to download the data from. Downloaded data will be stored in`
			the directory `data/`.
			`"""`
			`pipeline_config = read_pipeline_config_from_yaml(pipeline_yaml)`
			`benchmark_config = pipeline_config.pop("benchmark_config", {})`

			`# Prepare environment`
			`prepare_environment(pipeline_config, benchmark_config)`
			`labels_file = Path(benchmark_config["labels_file"])`

			`querying_pipeline = Pipeline.load_from_config(pipeline_config, pipeline_name="querying")`
			`pipeline_contains_reader = contains_reader(querying_pipeline)`
			`pipeline_contains_retriever = contains_retriever(querying_pipeline)`

			`# Retriever-Reader pipeline`
			`if pipeline_contains_retriever and pipeline_contains_reader:`
			`documents_dir = Path(benchmark_config["documents_directory"])`
			`indexing_pipeline = Pipeline.load_from_config(pipeline_config, pipeline_name="indexing")`

			`results = benchmark_retriever_reader(indexing_pipeline, querying_pipeline, documents_dir, labels_file)`

			`# Retriever pipeline`
			`elif pipeline_contains_retriever:`
			`documents_dir = Path(benchmark_config["documents_directory"])`
			`indexing_pipeline = Pipeline.load_from_config(pipeline_config, pipeline_name="indexing")`

			`results = benchmark_retriever(indexing_pipeline, querying_pipeline, documents_dir, labels_file)`

			`# Reader pipeline`
			`elif pipeline_contains_reader:`
			`results = benchmark_reader(querying_pipeline, labels_file)`

			`# Unsupported pipeline type`
			`else:`
			`raise ValueError("Pipeline must be a retriever, reader, or retriever-reader pipeline.")`

fix: Use queries from aggregated labels in benchmarks (#5054) * Include benchmark config in output * Use queries from aggregated labels 2023-06-01 10:49:54 +02:00			`pipeline_config["benchmark_config"] = benchmark_config`
			`results["config"] = pipeline_config`
refactor: Adapt running benchmarks (#5007) * Generate eval result in separate method * Adapt benchmarking utils * Adapt running retriever benchmarks * Adapt error message * Adapt running reader benchmarks * Adapt retriever reader benchmark script * Adapt running benchmarks script * Adapt README.md * Raise error if file doesn't exist * Raise error if path doesn't exist or is a directory * minor readme update * Create separate methods for checking if pipeline contains reader or retriever * Fix reader pipeline case --------- Co-authored-by: Darja Fokina <daria.f93@gmail.com> 2023-05-26 18:48:11 +02:00			`return results`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("config", type=str, help="Path to pipeline YAML config.")`
			`parser.add_argument("--output", type=str, help="Path to output file.")`
			`args = parser.parse_args()`
Create time and performance benchmarks for all readers and retrievers (#339) * add time and perf benchmark for es * Add retriever benchmarking * Add Reader benchmarking * add nq to squad conversion * add conversion stats * clean benchmarks * Add link to dataset * Update imports * add first support for neg psgs * Refactor test * set max_seq_len * cleanup benchmark * begin retriever speed benchmarking * Add support for retriever query index benchmarking * improve reader eval, retriever speed benchmarking * improve retriever speed benchmarking * Add retriever accuracy benchmark * Add neg doc shuffling * Add top_n * 3x speedup of SQL. add postgres docker run. make shuffle neg a param. add more logging * Add models to sweep * add option for faiss index type * remove unneeded line * change faiss to faiss_flat * begin automatic benchmark script * remove existing postgres docker for benchmarking * Add data processing scripts * Remove shuffle in script bc data already shuffled * switch hnsw setup from 256 to 128 * change es similarity to dot product by default * Error includes stack trace * Change ES default timeout * remove delete_docs() from timing for indexing * Add support for website export * update website on push to benchmarks * add complete benchmarks results * new json format * removed NaN as is not a valid json token * fix benchmarking for faiss hnsw queries. do sql calls in update_embeddings() as batches * update benchmarks for hnsw 128,20,80 * don't delete full index in delete_all_documents() * update texts for charts * update recall column for retriever * change scale and add units to desc * add units to legend * add axis titles. update desc * add html tags Co-authored-by: deepset <deepset@Crenolape.localdomain> Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai> Co-authored-by: PiffPaffM <markuspaff.mp@gmail.com> 2020-10-12 13:34:42 +02:00
refactor: Adapt running benchmarks (#5007) * Generate eval result in separate method * Adapt benchmarking utils * Adapt running retriever benchmarks * Adapt error message * Adapt running reader benchmarks * Adapt retriever reader benchmark script * Adapt running benchmarks script * Adapt README.md * Raise error if file doesn't exist * Raise error if path doesn't exist or is a directory * minor readme update * Create separate methods for checking if pipeline contains reader or retriever * Fix reader pipeline case --------- Co-authored-by: Darja Fokina <daria.f93@gmail.com> 2023-05-26 18:48:11 +02:00			`config_file = Path(args.config)`
			`output_file = f"{config_file.stem}_results.json" if args.output is None else args.output`
			`results = run_benchmark(config_file)`
			`with open(output_file, "w") as f:`
			`json.dump(results, f, indent=2)`