haystack/test/benchmarks/datadog/send_metrics.py

import argparse
import os
import json
from typing import Dict

from metric_handler import (
    ReaderModelTags,
    NoneTag,
    RetrieverModelTags,
    DocumentStoreModelTags,
    BenchmarkType,
    LOGGER,
    DatasetSizeTags,
    IndexingDocsPerSecond,
    QueryingExactMatchMetric,
    QueryingF1Metric,
    QueryingRecallMetric,
    QueryingSecondsPerQueryMetric,
    QueryingMapMetric,
    MetricsAPI,
    Tag,
)


def parse_benchmark_files(folder_path: str) -> Dict:
    metrics = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r") as file:
                data = json.load(file)
                indexing_metrics = data.get("indexing", {})
                querying_metrics = data.get("querying")
                config = data.get("config")
                if indexing_metrics.get("error") is None and querying_metrics.get("error") is None:
                    metrics[filename.split(".json")[0]] = {
                        "indexing": indexing_metrics,
                        "querying": querying_metrics,
                        "config": config,
                    }
    return metrics


def get_reader_tag(config: Dict) -> Tag:
    for comp in config["components"]:
        if comp["name"] == "Reader":
            model = comp["params"]["model_name_or_path"]

            if model == "deepset/tinyroberta-squad2":
                return ReaderModelTags.tinyroberta

            if model == "deepset/deberta-v3-base-squad2":
                return ReaderModelTags.debertabase

            if model == "deepset/deberta-v3-large-squad2":
                return ReaderModelTags.debertalarge

    return NoneTag.none


def get_retriever_tag(config: Dict) -> Tag:
    for comp in config["components"]:
        if comp["name"] == "Retriever":
            if comp["type"] == "BM25Retriever":
                return RetrieverModelTags.bm25

            model = comp["params"]["embedding_model"]
            if "minilm" in model.lower():
                return RetrieverModelTags.minilm

            if "mpnet-base" in model.lower():
                return RetrieverModelTags.mpnetbase

    return NoneTag.none


def get_documentstore_tag(config: Dict) -> Tag:
    for comp in config["components"]:
        if comp["name"] == "DocumentStore":
            if comp["type"] == "ElasticsearchDocumentStore":
                return DocumentStoreModelTags.elasticsearch

            if comp["type"] == "WeaviateDocumentStore":
                return DocumentStoreModelTags.weaviate

            if comp["type"] == "OpenSearchDocumentStore":
                return DocumentStoreModelTags.opensearch

    return NoneTag.none


def get_benchmark_type_tag(reader_tag, retriever_tag, document_store_tag):
    if reader_tag != NoneTag.none and retriever_tag != NoneTag.none and document_store_tag != NoneTag.none:
        return BenchmarkType.retriever_reader
    elif retriever_tag != NoneTag.none and document_store_tag != NoneTag.none:
        return BenchmarkType.retriever
    elif reader_tag != NoneTag.none and retriever_tag == NoneTag.none:
        return BenchmarkType.reader

    LOGGER.warn(
        f"Did not find benchmark_type for the combination of tags, retriever={retriever_tag}, reader={reader_tag}, "
        f"document_store={document_store_tag}"
    )
    return NoneTag.none


def collect_metrics_from_json_files(folder_path):
    benchmark_metrics = parse_benchmark_files(folder_path)
    metrics_to_send_to_dd = []
    for metrics in benchmark_metrics.values():
        indexing_metrics = metrics["indexing"]
        querying_metrics = metrics["querying"]
        config = metrics["config"]

        docs_per_second = indexing_metrics.get("docs_per_second")

        exact_match = querying_metrics.get("exact_match")
        f1_score = querying_metrics.get("f1")
        recall = querying_metrics.get("recall")
        seconds_per_query = querying_metrics.get("seconds_per_query")
        map_query = querying_metrics.get("map")

        size_tag = DatasetSizeTags.size_100k
        reader_tag = get_reader_tag(config)
        retriever_tag = get_retriever_tag(config)
        document_store_tag = get_documentstore_tag(config)
        benchmark_type_tag = get_benchmark_type_tag(reader_tag, retriever_tag, document_store_tag)

        tags = [size_tag, reader_tag, retriever_tag, document_store_tag, benchmark_type_tag]

        if docs_per_second:
            metrics_to_send_to_dd.append(IndexingDocsPerSecond(docs_per_second, tags))

        if exact_match or exact_match == 0:
            metrics_to_send_to_dd.append(QueryingExactMatchMetric(exact_match, tags))

        if f1_score or f1_score == 0:
            metrics_to_send_to_dd.append(QueryingF1Metric(f1_score, tags))

        if recall or recall == 0:
            metrics_to_send_to_dd.append(QueryingRecallMetric(recall, tags))

        if seconds_per_query:
            metrics_to_send_to_dd.append(QueryingSecondsPerQueryMetric(seconds_per_query, tags))

        if map_query or map_query == 0:
            metrics_to_send_to_dd.append(QueryingMapMetric(map_query, tags))

    return metrics_to_send_to_dd


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("folder_path", type=str, help="Path to the folder with benchmark results")
    parser.add_argument("datadog_api_key", type=str, help="Datadog API key")
    parser.add_argument("datadog_api_host", type=str, help="Datadog API host")
    args = parser.parse_args()

    folder_path = args.folder_path
    datadog_api_key = args.datadog_api_key
    datadog_api_host = args.datadog_api_host

    metrics_to_send_to_dd = collect_metrics_from_json_files(folder_path)
    api = MetricsAPI(datadog_api_key=datadog_api_key, datadog_host=datadog_api_host)
    api.send_custom_dd_metrics(metrics_to_send_to_dd)
test: Add scripts to send benchmark results to datadog (#5432) * Add config files * log benchmarks to stdout * Add top-k and batch size to configs * Add batch size to configs * fix: don't download files if they already exist * Add batch size to configs * refine script * Remove configs using 1m docs * update run script * update run script * update run script * datadog integration * remove out folder * gitignore benchmarks output * test: send benchmarks to datadog * remove uncommented lines in script * feat: take branch/tag argument for benchmark setup script * fix: run.sh should ignore errors * Remove changes unrelated to datadog * Apply black * Update test/benchmarks/utils.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * PR feedback * Account for reader benchmarks not doing indexing * Change key of reader metrics * Apply PR feedback * Remove whitespace --------- Co-authored-by: rjanjua <rohan.janjua@gmail.com> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> 2023-08-03 10:09:00 +02:00			`import argparse`
			`import os`
			`import json`
			`from typing import Dict`

			`from metric_handler import (`
			`ReaderModelTags,`
			`NoneTag,`
			`RetrieverModelTags,`
			`DocumentStoreModelTags,`
			`BenchmarkType,`
			`LOGGER,`
			`DatasetSizeTags,`
			`IndexingDocsPerSecond,`
			`QueryingExactMatchMetric,`
			`QueryingF1Metric,`
			`QueryingRecallMetric,`
			`QueryingSecondsPerQueryMetric,`
			`QueryingMapMetric,`
			`MetricsAPI,`
			`Tag,`
			`)`


			`def parse_benchmark_files(folder_path: str) -> Dict:`
			`metrics = {}`
			`for filename in os.listdir(folder_path):`
			`if filename.endswith(".json"):`
			`file_path = os.path.join(folder_path, filename)`
			`with open(file_path, "r") as file:`
			`data = json.load(file)`
			`indexing_metrics = data.get("indexing", {})`
			`querying_metrics = data.get("querying")`
			`config = data.get("config")`
			`if indexing_metrics.get("error") is None and querying_metrics.get("error") is None:`
			`metrics[filename.split(".json")[0]] = {`
			`"indexing": indexing_metrics,`
			`"querying": querying_metrics,`
			`"config": config,`
			`}`
			`return metrics`


			`def get_reader_tag(config: Dict) -> Tag:`
			`for comp in config["components"]:`
			`if comp["name"] == "Reader":`
			`model = comp["params"]["model_name_or_path"]`

			`if model == "deepset/tinyroberta-squad2":`
			`return ReaderModelTags.tinyroberta`

			`if model == "deepset/deberta-v3-base-squad2":`
			`return ReaderModelTags.debertabase`

			`if model == "deepset/deberta-v3-large-squad2":`
			`return ReaderModelTags.debertalarge`

			`return NoneTag.none`


			`def get_retriever_tag(config: Dict) -> Tag:`
			`for comp in config["components"]:`
			`if comp["name"] == "Retriever":`
			`if comp["type"] == "BM25Retriever":`
			`return RetrieverModelTags.bm25`

			`model = comp["params"]["embedding_model"]`
ci: Add Github workflow to automate benchmark runs (#5399) * Add config files * log benchmarks to stdout * Add top-k and batch size to configs * Add batch size to configs * fix: don't download files if they already exist * Add batch size to configs * refine script * Remove configs using 1m docs * update run script * update run script * update run script * datadog integration * remove out folder * gitignore benchmarks output * test: send benchmarks to datadog * remove uncommented lines in script * feat: take branch/tag argument for benchmark setup script * fix: run.sh should ignore errors * Add GH workflow to run benchmarks periodically * Remove unused script * Adapt cml.yml * Adapt cml.yml * Rename cml.yml to benchmarks.yml * Revert "Rename cml.yml to benchmarks.yml" This reverts commit 897299433a71a55827124728adff5de918d46d21. * remove benchmarks.yml * Use same file extension for all config files * Use checkout@v3 * Run benchmarks sequentially * Add timeout-minutes parameter * Remove changes unrelated to datadog * Apply black * use haystack-oss aws account * Update test/benchmarks/utils.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * PR feedback * fix aws credentials step * Fix path * check docker * Allow spinning up containers from within container * Allow spinning up containers from within container * Separate launching doc stores from benchmarks * Remove docker related commands * run only retrievers * change port * Revert "change port" This reverts commit 6e5bcebb1d16e03ba7672be7e8a089084c7fc3a7. * Run opensearch benchmark only * Run weaviate benchmark only * Run bm25 benchmarks only * Changes host of doc stores * add step to get docker logs * Revert "add step to get docker logs" This reverts commit c10e6faa76bde5df406a027203bd775d18c93c90. * Install docker * Launch doc store containers from wtihin runner container * Remove kill command * Change host * dump docker logs * change port * Add cloud startup script * dump docker logs * add network param * add network to startup.sh * check cluster health * move steps * change port * try using services * check cluster health * use services * run only weaviate * change host * Upload benchmark results as artifacts * Update configs * Delete index after benchmark run * Use correct index name * Run only failing config * Use smaller batch size * Increase memory for opensearch * Reduce batch size further * Provide more storage * Reduce batch size * dump docker logs * add java opts * Spin up only opensearch container * Create separate job for each doc store * Run benchmarks sequentially * Set working directory * Account for reader benchmarks not doing indexing * Change key of reader metrics * Apply PR feedback * Remove whitespace * Adapt workflow to changes in datadog scripts * Adapt workflow to changes in datadog scripts * Increase memory for opensearch * Reduce batch size * Add preprocessing_batch_size to Readers * Remove unrelated change * Move order * Fix path * Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance * Manually terminate EC2 instance * Manually terminate EC2 instance * Always terminate runner * Always terminate runner * Remove unnecessary terminate-runner job * Add cron schedule * Disable telemetry * Rename cml.yml to benchmarks.yml --------- Co-authored-by: rjanjua <rohan.janjua@gmail.com> Co-authored-by: Paul Steppacher <p.steppacher91@gmail.com> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Co-authored-by: Silvano Cerza <silvanocerza@gmail.com> 2023-08-17 12:56:45 +02:00			`if "minilm" in model.lower():`
test: Add scripts to send benchmark results to datadog (#5432) * Add config files * log benchmarks to stdout * Add top-k and batch size to configs * Add batch size to configs * fix: don't download files if they already exist * Add batch size to configs * refine script * Remove configs using 1m docs * update run script * update run script * update run script * datadog integration * remove out folder * gitignore benchmarks output * test: send benchmarks to datadog * remove uncommented lines in script * feat: take branch/tag argument for benchmark setup script * fix: run.sh should ignore errors * Remove changes unrelated to datadog * Apply black * Update test/benchmarks/utils.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * PR feedback * Account for reader benchmarks not doing indexing * Change key of reader metrics * Apply PR feedback * Remove whitespace --------- Co-authored-by: rjanjua <rohan.janjua@gmail.com> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> 2023-08-03 10:09:00 +02:00			`return RetrieverModelTags.minilm`

ci: Add Github workflow to automate benchmark runs (#5399) * Add config files * log benchmarks to stdout * Add top-k and batch size to configs * Add batch size to configs * fix: don't download files if they already exist * Add batch size to configs * refine script * Remove configs using 1m docs * update run script * update run script * update run script * datadog integration * remove out folder * gitignore benchmarks output * test: send benchmarks to datadog * remove uncommented lines in script * feat: take branch/tag argument for benchmark setup script * fix: run.sh should ignore errors * Add GH workflow to run benchmarks periodically * Remove unused script * Adapt cml.yml * Adapt cml.yml * Rename cml.yml to benchmarks.yml * Revert "Rename cml.yml to benchmarks.yml" This reverts commit 897299433a71a55827124728adff5de918d46d21. * remove benchmarks.yml * Use same file extension for all config files * Use checkout@v3 * Run benchmarks sequentially * Add timeout-minutes parameter * Remove changes unrelated to datadog * Apply black * use haystack-oss aws account * Update test/benchmarks/utils.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * PR feedback * fix aws credentials step * Fix path * check docker * Allow spinning up containers from within container * Allow spinning up containers from within container * Separate launching doc stores from benchmarks * Remove docker related commands * run only retrievers * change port * Revert "change port" This reverts commit 6e5bcebb1d16e03ba7672be7e8a089084c7fc3a7. * Run opensearch benchmark only * Run weaviate benchmark only * Run bm25 benchmarks only * Changes host of doc stores * add step to get docker logs * Revert "add step to get docker logs" This reverts commit c10e6faa76bde5df406a027203bd775d18c93c90. * Install docker * Launch doc store containers from wtihin runner container * Remove kill command * Change host * dump docker logs * change port * Add cloud startup script * dump docker logs * add network param * add network to startup.sh * check cluster health * move steps * change port * try using services * check cluster health * use services * run only weaviate * change host * Upload benchmark results as artifacts * Update configs * Delete index after benchmark run * Use correct index name * Run only failing config * Use smaller batch size * Increase memory for opensearch * Reduce batch size further * Provide more storage * Reduce batch size * dump docker logs * add java opts * Spin up only opensearch container * Create separate job for each doc store * Run benchmarks sequentially * Set working directory * Account for reader benchmarks not doing indexing * Change key of reader metrics * Apply PR feedback * Remove whitespace * Adapt workflow to changes in datadog scripts * Adapt workflow to changes in datadog scripts * Increase memory for opensearch * Reduce batch size * Add preprocessing_batch_size to Readers * Remove unrelated change * Move order * Fix path * Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance Manually terminate EC2 instance * Manually terminate EC2 instance * Manually terminate EC2 instance * Always terminate runner * Always terminate runner * Remove unnecessary terminate-runner job * Add cron schedule * Disable telemetry * Rename cml.yml to benchmarks.yml --------- Co-authored-by: rjanjua <rohan.janjua@gmail.com> Co-authored-by: Paul Steppacher <p.steppacher91@gmail.com> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> Co-authored-by: Silvano Cerza <silvanocerza@gmail.com> 2023-08-17 12:56:45 +02:00			`if "mpnet-base" in model.lower():`
test: Add scripts to send benchmark results to datadog (#5432) * Add config files * log benchmarks to stdout * Add top-k and batch size to configs * Add batch size to configs * fix: don't download files if they already exist * Add batch size to configs * refine script * Remove configs using 1m docs * update run script * update run script * update run script * datadog integration * remove out folder * gitignore benchmarks output * test: send benchmarks to datadog * remove uncommented lines in script * feat: take branch/tag argument for benchmark setup script * fix: run.sh should ignore errors * Remove changes unrelated to datadog * Apply black * Update test/benchmarks/utils.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * PR feedback * Account for reader benchmarks not doing indexing * Change key of reader metrics * Apply PR feedback * Remove whitespace --------- Co-authored-by: rjanjua <rohan.janjua@gmail.com> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> 2023-08-03 10:09:00 +02:00			`return RetrieverModelTags.mpnetbase`

			`return NoneTag.none`


			`def get_documentstore_tag(config: Dict) -> Tag:`
			`for comp in config["components"]:`
			`if comp["name"] == "DocumentStore":`
			`if comp["type"] == "ElasticsearchDocumentStore":`
			`return DocumentStoreModelTags.elasticsearch`

			`if comp["type"] == "WeaviateDocumentStore":`
			`return DocumentStoreModelTags.weaviate`

			`if comp["type"] == "OpenSearchDocumentStore":`
			`return DocumentStoreModelTags.opensearch`

			`return NoneTag.none`


			`def get_benchmark_type_tag(reader_tag, retriever_tag, document_store_tag):`
			`if reader_tag != NoneTag.none and retriever_tag != NoneTag.none and document_store_tag != NoneTag.none:`
			`return BenchmarkType.retriever_reader`
			`elif retriever_tag != NoneTag.none and document_store_tag != NoneTag.none:`
			`return BenchmarkType.retriever`
			`elif reader_tag != NoneTag.none and retriever_tag == NoneTag.none:`
			`return BenchmarkType.reader`

			`LOGGER.warn(`
			`f"Did not find benchmark_type for the combination of tags, retriever={retriever_tag}, reader={reader_tag}, "`
			`f"document_store={document_store_tag}"`
			`)`
			`return NoneTag.none`


			`def collect_metrics_from_json_files(folder_path):`
			`benchmark_metrics = parse_benchmark_files(folder_path)`
			`metrics_to_send_to_dd = []`
perf: Python performance improvements with ruff C4 and PERF fixes (#5803) * Python performance improvements with ruff C4 and PERF * pre-commit fixes * Revert changes to examples/basic_qa_pipeline.py * Revert changes to haystack/preview/testing/document_store.py * revert releasenotes * Upgrade to ruff v0.0.290 2023-09-16 16:26:07 +02:00			`for metrics in benchmark_metrics.values():`
test: Add scripts to send benchmark results to datadog (#5432) * Add config files * log benchmarks to stdout * Add top-k and batch size to configs * Add batch size to configs * fix: don't download files if they already exist * Add batch size to configs * refine script * Remove configs using 1m docs * update run script * update run script * update run script * datadog integration * remove out folder * gitignore benchmarks output * test: send benchmarks to datadog * remove uncommented lines in script * feat: take branch/tag argument for benchmark setup script * fix: run.sh should ignore errors * Remove changes unrelated to datadog * Apply black * Update test/benchmarks/utils.py Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> * PR feedback * Account for reader benchmarks not doing indexing * Change key of reader metrics * Apply PR feedback * Remove whitespace --------- Co-authored-by: rjanjua <rohan.janjua@gmail.com> Co-authored-by: Silvano Cerza <3314350+silvanocerza@users.noreply.github.com> 2023-08-03 10:09:00 +02:00			`indexing_metrics = metrics["indexing"]`
			`querying_metrics = metrics["querying"]`
			`config = metrics["config"]`

			`docs_per_second = indexing_metrics.get("docs_per_second")`

			`exact_match = querying_metrics.get("exact_match")`
			`f1_score = querying_metrics.get("f1")`
			`recall = querying_metrics.get("recall")`
			`seconds_per_query = querying_metrics.get("seconds_per_query")`
			`map_query = querying_metrics.get("map")`

			`size_tag = DatasetSizeTags.size_100k`
			`reader_tag = get_reader_tag(config)`
			`retriever_tag = get_retriever_tag(config)`
			`document_store_tag = get_documentstore_tag(config)`
			`benchmark_type_tag = get_benchmark_type_tag(reader_tag, retriever_tag, document_store_tag)`

			`tags = [size_tag, reader_tag, retriever_tag, document_store_tag, benchmark_type_tag]`

			`if docs_per_second:`
			`metrics_to_send_to_dd.append(IndexingDocsPerSecond(docs_per_second, tags))`

			`if exact_match or exact_match == 0:`
			`metrics_to_send_to_dd.append(QueryingExactMatchMetric(exact_match, tags))`

			`if f1_score or f1_score == 0:`
			`metrics_to_send_to_dd.append(QueryingF1Metric(f1_score, tags))`

			`if recall or recall == 0:`
			`metrics_to_send_to_dd.append(QueryingRecallMetric(recall, tags))`

			`if seconds_per_query:`
			`metrics_to_send_to_dd.append(QueryingSecondsPerQueryMetric(seconds_per_query, tags))`

			`if map_query or map_query == 0:`
			`metrics_to_send_to_dd.append(QueryingMapMetric(map_query, tags))`

			`return metrics_to_send_to_dd`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("folder_path", type=str, help="Path to the folder with benchmark results")`
			`parser.add_argument("datadog_api_key", type=str, help="Datadog API key")`
			`parser.add_argument("datadog_api_host", type=str, help="Datadog API host")`
			`args = parser.parse_args()`

			`folder_path = args.folder_path`
			`datadog_api_key = args.datadog_api_key`
			`datadog_api_host = args.datadog_api_host`

			`metrics_to_send_to_dd = collect_metrics_from_json_files(folder_path)`
			`api = MetricsAPI(datadog_api_key=datadog_api_key, datadog_host=datadog_api_host)`
			`api.send_custom_dd_metrics(metrics_to_send_to_dd)`