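"""Tabulate AssistantBench benchmark results.

Each task instance is scored by extracting the "FINAL ANSWER:" line from its
console log and comparing it against the expected answer with the
AssistantBench question scorer. When an Excel path is given, a per-trial
workbook is also written with cost, latency, token usage, and model data
pulled from each run's telemetry.db.
"""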
import glob
import json
import os
import re
import sqlite3
import sys

import pandas as pd

from agbench.tabulate_cmd import default_tabulate

# Make the sibling assistantbench_evaluator module importable when this file
# is run directly as a script.
sys.path.append(os.path.dirname(__file__))

from assistantbench_evaluator import question_scorer

EXCLUDE_DIR_NAMES = ["__pycache__"]


def normalize_answer(a):
    # Lower case
    # Trim (left and right)
    # Standardize comma-separated values
    # Replace multiple spaces with one space
    # Remove trailing punctuation
    norm_answer = ", ".join(a.strip().lower().split(","))
    norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer))
    return norm_answer
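

# Illustrative example of the normalization (hypothetical input):
#   normalize_answer("  Foo,Bar!  ") -> "foo, bar"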


def scorer(instance_dir):
    # Read the expected answer
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    if not os.path.isfile(expected_answer_file):
        return None

    expected_answer = None
    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    # Read the console log
    console_log_file = os.path.join(instance_dir, "console_log.txt")
    if not os.path.isfile(console_log_file):
        return None

    console_log = ""
    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    final_answer = None
    m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
    if m:
        final_answer = m.group(1).strip()

    # The console log is missing the final answer line
    if final_answer is None:
        return None

    # Get accuracy from the AssistantBench utility; no normalization is
    # applied for the accuracy computation
    accuracy = question_scorer(final_answer, expected_answer)
    n_ex = normalize_answer(expected_answer)
    n_final = normalize_answer(final_answer)
    return (accuracy, n_ex, n_final)
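

# scorer() returns None when an instance is missing its inputs or its
# "FINAL ANSWER:" line; otherwise it returns a tuple of
# (accuracy, normalized_expected_answer, normalized_final_answer).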


def get_number_of_chat_messages(chat_messages_dir):
    # Sum message counts across every agent's *_messages.json transcript.
    result = 0
    for file in glob.glob(f"{chat_messages_dir}/*_messages.json"):
        with open(file, "r") as f:
            content = json.load(f)
            for agent, messages in content.items():
                result += len(messages)
    return result
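

# Each *_messages.json file is assumed to map agent names to lists of
# messages, e.g. {"agent_a": [...], "agent_b": [...]} (keys are
# illustrative); the helper above sums the lengths of those lists.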


def main(args):
    parsed_args, all_results = default_tabulate(args, scorer=scorer)
    excel_path = parsed_args.excel

    if excel_path:
        excel_dir = os.path.dirname(excel_path) or "."
        if not os.path.exists(excel_dir):
            os.makedirs(excel_dir, exist_ok=True)

        if not excel_path.endswith((".xlsx", ".xls")):
            excel_path += ".xlsx"

        # Normalize the runlogs path to always end with a trailing slash.
        runlogs = (
            parsed_args.runlogs
            if parsed_args.runlogs.endswith("/")
            else parsed_args.runlogs + "/"
        )

        if os.path.isdir(runlogs):
            # Task directories, ordered by modification time.
            task_ids = sorted(
                [
                    task_id
                    for task_id in os.listdir(runlogs)
                    if task_id not in EXCLUDE_DIR_NAMES
                ],
                key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)),
            )
        else:
            raise ValueError("Please provide a valid runlogs directory to tabulate results.")

        # Trial subdirectories are named by integer index; use the first
        # task's directory to enumerate them.
        trials = (
            sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x))
            if len(task_ids) > 0
            else []
        )
        dbnames = [
            [f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids]
            for trial in trials
        ]

        query = """
            SELECT cost, session_id, response, start_time, end_time
            FROM (
                SELECT invocation_id, cost, session_id, response, start_time, end_time,
                    ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) AS rn
                FROM chat_completions
            )
            WHERE rn = 1;
        """

        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            # Write one sheet per trial, with one row per task.
            for trial_index, each_trial in enumerate(dbnames):
                result_df = pd.DataFrame(
                    columns=[
                        "id",
                        "status",
                        "expected_answer",
                        "final_answer",
                        "cost",
                        "latency",
                        "num_of_llm_requests",
                        "num_of_chat_messages",
                        "prompt_tokens",
                        "completion_tokens",
                        "total_tokens",
                        "model",
                    ]
                )

                result_df_type_mapping = {
                    "id": str,
                    "status": bool,
                    "expected_answer": str,
                    "final_answer": str,
                    "cost": float,
                    "latency": float,
                    "num_of_llm_requests": int,
                    "num_of_chat_messages": int,
                    "prompt_tokens": int,
                    "completion_tokens": int,
                    "total_tokens": int,
                }

                for dbname, scorer_results in zip(each_trial, all_results):
                    # scorer_results holds the task id followed by one
                    # scorer result per trial.
                    task_id = scorer_results[0]
                    scorer_result = scorer_results[trial_index + 1]

                    status, expected_answer, final_answer = (
                        scorer_result if scorer_result else (False, "", "")
                    )

                    con = sqlite3.connect(dbname)

                    # TODO: if there is a large amount of data, add chunksize
                    telemetry_df = pd.read_sql_query(query, con)
                    con.close()

                    earliest_starttime = pd.to_datetime(
                        telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).min()
                    latest_endtime = pd.to_datetime(
                        telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).max()

                    num_of_chat_messages = get_number_of_chat_messages(
                        chat_messages_dir=os.path.dirname(dbname)
                    )

                    # Token usage and model name are parsed from the
                    # JSON-serialized LLM response stored in telemetry.
                    result = {
                        "id": task_id,
                        "status": status,
                        "expected_answer": expected_answer,
                        "final_answer": final_answer,
                        "cost": telemetry_df["cost"].sum(),
                        "latency": (
                            latest_endtime - earliest_starttime
                        ).total_seconds(),
                        "num_of_llm_requests": len(telemetry_df),
                        "num_of_chat_messages": num_of_chat_messages,
                        "prompt_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["usage"]["prompt_tokens"]
                            if "usage" in json.loads(x)
                            and "prompt_tokens" in json.loads(x)["usage"]
                            else 0
                        )
                        .sum(),
                        "completion_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["usage"]["completion_tokens"]
                            if "usage" in json.loads(x)
                            and "completion_tokens" in json.loads(x)["usage"]
                            else 0
                        )
                        .sum(),
                        "total_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["usage"]["total_tokens"]
                            if "usage" in json.loads(x)
                            and "total_tokens" in json.loads(x)["usage"]
                            else 0
                        )
                        .sum(),
                        "model": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["model"]
                            if "model" in json.loads(x)
                            else ""
                        )
                        .unique(),
                    }

                    # Cast accumulated columns to their declared dtypes, then
                    # append this task's row.
                    result_df = result_df.astype(result_df_type_mapping)
                    result_df = pd.concat(
                        [result_df, pd.DataFrame([result])], ignore_index=True
                    )
                result_df.to_excel(
                    writer, sheet_name=f"trial_{trial_index}", index=False
                )


if __name__ == "__main__" and __package__ is None:
    main(sys.argv)
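
# Usage sketch (assumed, not verified): argument parsing is delegated to
# agbench's default_tabulate, which reads the run logs directory and the
# optional --excel output path seen above, e.g. something like:
#
#   python <this_script>.py ./Results/<run_id> --excel ./results.xlsx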