haystack/test/benchmarks/datadog/send_metrics.py
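
"""Collect Haystack benchmark results from JSON files and send them to Datadog as custom metrics."""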

import argparse
import os
import json
from typing import Dict
from metric_handler import (
    ReaderModelTags,
    NoneTag,
    RetrieverModelTags,
    DocumentStoreModelTags,
    BenchmarkType,
    LOGGER,
    DatasetSizeTags,
    IndexingDocsPerSecond,
    QueryingExactMatchMetric,
    QueryingF1Metric,
    QueryingRecallMetric,
    QueryingSecondsPerQueryMetric,
    QueryingMapMetric,
    MetricsAPI,
    Tag,
)


def parse_benchmark_files(folder_path: str) -> Dict:
    """Read every benchmark JSON file in folder_path and return the metrics of each run that finished without errors."""
    metrics = {}
    for filename in os.listdir(folder_path):
        if filename.endswith(".json"):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, "r") as file:
                data = json.load(file)
                indexing_metrics = data.get("indexing", {})
                querying_metrics = data.get("querying", {})
                config = data.get("config")
                if indexing_metrics.get("error") is None and querying_metrics.get("error") is None:
                    metrics[filename.split(".json")[0]] = {
                        "indexing": indexing_metrics,
                        "querying": querying_metrics,
                        "config": config,
                    }

    return metrics
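
# A sketch of the benchmark JSON layout this script expects, inferred from the keys read
# above and in collect_metrics_from_json_files (the real result files may carry additional fields):
#
#   {
#       "indexing": {"error": null, "docs_per_second": 123.4},
#       "querying": {"error": null, "exact_match": 0.79, "f1": 0.84, "recall": 0.92,
#                    "seconds_per_query": 0.05, "map": 0.71},
#       "config": {"components": [{"name": "Reader", "type": "...", "params": {...}}]}
#   }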


def get_reader_tag(config: Dict) -> Tag:
    for comp in config["components"]:
        if comp["name"] == "Reader":
            model = comp["params"]["model_name_or_path"]
            if model == "deepset/tinyroberta-squad2":
                return ReaderModelTags.tinyroberta
            if model == "deepset/deberta-v3-base-squad2":
                return ReaderModelTags.debertabase
            if model == "deepset/deberta-v3-large-squad2":
                return ReaderModelTags.debertalarge

    return NoneTag.none


def get_retriever_tag(config: Dict) -> Tag:
    for comp in config["components"]:
        if comp["name"] == "Retriever":
            if comp["type"] == "BM25Retriever":
                return RetrieverModelTags.bm25
            model = comp["params"]["embedding_model"]
            if "minilm" in model.lower():
                return RetrieverModelTags.minilm
            if "mpnet-base" in model.lower():
                return RetrieverModelTags.mpnetbase

    return NoneTag.none


def get_documentstore_tag(config: Dict) -> Tag:
    for comp in config["components"]:
        if comp["name"] == "DocumentStore":
            if comp["type"] == "ElasticsearchDocumentStore":
                return DocumentStoreModelTags.elasticsearch
            if comp["type"] == "WeaviateDocumentStore":
                return DocumentStoreModelTags.weaviate
            if comp["type"] == "OpenSearchDocumentStore":
                return DocumentStoreModelTags.opensearch

    return NoneTag.none


def get_benchmark_type_tag(reader_tag, retriever_tag, document_store_tag):
    if reader_tag != NoneTag.none and retriever_tag != NoneTag.none and document_store_tag != NoneTag.none:
        return BenchmarkType.retriever_reader
    elif retriever_tag != NoneTag.none and document_store_tag != NoneTag.none:
        return BenchmarkType.retriever
    elif reader_tag != NoneTag.none and retriever_tag == NoneTag.none:
        return BenchmarkType.reader

    LOGGER.warning(
        f"Did not find benchmark_type for the combination of tags, retriever={retriever_tag}, reader={reader_tag}, "
        f"document_store={document_store_tag}"
    )
    return NoneTag.none


def collect_metrics_from_json_files(folder_path):
    benchmark_metrics = parse_benchmark_files(folder_path)

    metrics_to_send_to_dd = []
    for metrics in benchmark_metrics.values():
        indexing_metrics = metrics["indexing"]
        querying_metrics = metrics["querying"]
        config = metrics["config"]

        docs_per_second = indexing_metrics.get("docs_per_second")
        exact_match = querying_metrics.get("exact_match")
        f1_score = querying_metrics.get("f1")
        recall = querying_metrics.get("recall")
        seconds_per_query = querying_metrics.get("seconds_per_query")
        map_query = querying_metrics.get("map")

        size_tag = DatasetSizeTags.size_100k
        reader_tag = get_reader_tag(config)
        retriever_tag = get_retriever_tag(config)
        document_store_tag = get_documentstore_tag(config)
        benchmark_type_tag = get_benchmark_type_tag(reader_tag, retriever_tag, document_store_tag)
        tags = [size_tag, reader_tag, retriever_tag, document_store_tag, benchmark_type_tag]

        if docs_per_second:
            metrics_to_send_to_dd.append(IndexingDocsPerSecond(docs_per_second, tags))

        # The "or ... == 0" checks keep 0.0 as a valid metric value while skipping missing (None) values.
        if exact_match or exact_match == 0:
            metrics_to_send_to_dd.append(QueryingExactMatchMetric(exact_match, tags))

        if f1_score or f1_score == 0:
            metrics_to_send_to_dd.append(QueryingF1Metric(f1_score, tags))

        if recall or recall == 0:
            metrics_to_send_to_dd.append(QueryingRecallMetric(recall, tags))

        if seconds_per_query:
            metrics_to_send_to_dd.append(QueryingSecondsPerQueryMetric(seconds_per_query, tags))

        if map_query or map_query == 0:
            metrics_to_send_to_dd.append(QueryingMapMetric(map_query, tags))

    return metrics_to_send_to_dd


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("folder_path", type=str, help="Path to the folder with benchmark results")
    parser.add_argument("datadog_api_key", type=str, help="Datadog API key")
    parser.add_argument("datadog_api_host", type=str, help="Datadog API host")

    args = parser.parse_args()
    folder_path = args.folder_path
    datadog_api_key = args.datadog_api_key
    datadog_api_host = args.datadog_api_host

    metrics_to_send_to_dd = collect_metrics_from_json_files(folder_path)

    api = MetricsAPI(datadog_api_key=datadog_api_key, datadog_host=datadog_api_host)
    api.send_custom_dd_metrics(metrics_to_send_to_dd)
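
# Example invocation (the folder path, API key, and host below are placeholders for illustration):
#   python send_metrics.py ./benchmark_results "$DATADOG_API_KEY" https://api.datadoghq.eu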