from time import perf_counter
from typing import Dict
from pathlib import Path
import traceback
import datetime
import logging

from haystack import Pipeline
from haystack.nodes import BaseReader
from haystack.utils import aggregate_labels

from utils import load_eval_data, get_reader_config
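
# load_eval_data and get_reader_config come from the benchmark suite's local
# utils module. As used below, load_eval_data parses the labels file into
# Label objects plus the raw query strings, and get_reader_config extracts the
# reader type, model name, and top_k from the pipeline; their exact return
# shapes are inferred from this file.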
def benchmark_reader(pipeline: Pipeline, labels_file: Path) -> Dict:
    """Benchmark the reader of a pipeline: run the queries from labels_file and report timing and metrics."""
    try:
        labels, queries = load_eval_data(labels_file)
        eval_labels = aggregate_labels(labels)
        eval_queries = []
        eval_docs = []
        for multi_label in eval_labels:
            eval_queries.append(multi_label.query)
            eval_docs.append([multi_label.labels[0].document])
        # Run querying
        start_time = perf_counter()
        # We use run_batch instead of eval_batch because we want to get pure inference time
        predictions = pipeline.run_batch(queries=eval_queries, documents=eval_docs, labels=eval_labels, debug=True)
        end_time = perf_counter()
        querying_time = end_time - start_time
        # Evaluate predictions
        eval_result = pipeline._generate_eval_result_from_batch_preds(predictions_batches=predictions)
        metrics = eval_result.calculate_metrics()["Reader"]

        reader_type, reader_model, reader_top_k = get_reader_config(pipeline)
        results = {
            "querying": {
                "exact_match": metrics["exact_match"],
                "f1": metrics["f1"],
                "n_queries": len(eval_labels),
                "querying_time": querying_time,
                "seconds_per_query": querying_time / len(eval_labels),
                "reader": reader_type,
                "reader_model": reader_model,
                "top_k": reader_top_k,
                "date_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "error": None,
            }
        }
    except Exception:
        tb = traceback.format_exc()
        logging.error("##### The following error was raised during the querying run:")
        logging.error(tb)
        reader_type, reader_model, reader_top_k = get_reader_config(pipeline)
        results = {
            "reader": {
                "exact_match": 0.0,
                "f1": 0.0,
                "n_queries": 0,
                "querying_time": 0.0,
                "seconds_per_query": 0.0,
                "reader": reader_type,
                "reader_model": reader_model,
                "date_time": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                "error": str(tb),
            }
        }

    return results
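

# Minimal usage sketch (not part of the upstream file): one way benchmark_reader
# might be invoked. The YAML config and labels file paths below are hypothetical
# placeholders, not files shipped with the benchmark suite.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Assumed: a pipeline definition containing a reader node, plus an
    # annotated labels file in the format load_eval_data expects.
    reader_pipeline = Pipeline.load_from_yaml(Path("configs/reader.yml"))
    benchmark_results = benchmark_reader(reader_pipeline, Path("data/labels.json"))
    print(benchmark_results)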