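"""Tabulate AssistantBench benchmark results.

Scores each task instance by comparing the "FINAL ANSWER:" line in its console
log against the expected answer and, when an Excel path is supplied, exports
per-trial cost, latency, and token-usage telemetry (read from each task's
telemetry.db) to a spreadsheet.
"""
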
import glob
import json
import os
import re
import sqlite3
import sys

import pandas as pd
from agbench.tabulate_cmd import default_tabulate

# Make the sibling evaluator module importable when this script is run directly.
sys.path.append(os.path.dirname(__file__))
from assistantbench_evaluator import question_scorer
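
# Directory entries to skip when scanning the runlogs directory for task folders.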
EXCLUDE_DIR_NAMES = ["__pycache__"]


def normalize_answer(a):
    """Normalize an answer for comparison: lower-case, trim, standardize
    comma-separated values, collapse repeated whitespace, and strip trailing
    punctuation, e.g. "The Beatles,  Pink Floyd!" -> "the beatles, pink floyd".
    """
    norm_answer = ", ".join(a.strip().lower().split(","))
    norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer))
    return norm_answer


def scorer(instance_dir):
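    """Score one task instance directory against its expected answer.

    Returns an (accuracy, normalized_expected, normalized_final) tuple, or None
    when expected_answer.txt, console_log.txt, or the "FINAL ANSWER:" line in
    the console log is missing.
    """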
    # Read the expected answer
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    if not os.path.isfile(expected_answer_file):
        return None
    expected_answer = None
    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    # Read the console log
    console_log_file = os.path.join(instance_dir, "console_log.txt")
    if not os.path.isfile(console_log_file):
        return None
    console_log = ""
    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    # Extract the final answer line
    final_answer = None
    m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
    if m:
        final_answer = m.group(1).strip()

    # The final answer line is missing
    if final_answer is None:
        return None

    # Get accuracy from the assistantbench utility; no normalization is applied
    # for the accuracy computation itself.
    accuracy = question_scorer(final_answer, expected_answer)

    n_ex = normalize_answer(expected_answer)
    n_final = normalize_answer(final_answer)
    return (accuracy, n_ex, n_final)


def get_number_of_chat_messages(chat_messages_dir):
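    """Count chat messages across every *_messages.json file in a directory.

    Each file is assumed to map an agent name to that agent's list of messages,
    e.g. {"agent_a": [...], "agent_b": [...]} (the key names here are purely
    illustrative).
    """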
    result = 0
    for file in glob.glob(f"{chat_messages_dir}/*_messages.json"):
        with open(file, "r") as f:
            content = json.load(f)
            for agent, messages in content.items():
                result += len(messages)
    return result


def main(args):
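    """Tabulate results via agbench's default_tabulate and, when an Excel path
    is supplied, write one sheet per trial with cost, latency, and token-usage
    telemetry pulled from each task's telemetry.db.
    """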
    parsed_args, all_results = default_tabulate(args, scorer=scorer)
    excel_path = parsed_args.excel
    if excel_path:
        excel_dir = os.path.dirname(excel_path) or "."
        if not os.path.exists(excel_dir):
            os.makedirs(excel_dir, exist_ok=True)

        if not excel_path.endswith((".xlsx", ".xls")):
            excel_path += ".xlsx"

        runlogs = (
            parsed_args.runlogs
            if parsed_args.runlogs.endswith("/")
            else parsed_args.runlogs + "/"
        )

        if os.path.isdir(runlogs):
            task_ids = sorted(
                [
                    task_id
                    for task_id in os.listdir(runlogs)
                    if task_id not in EXCLUDE_DIR_NAMES
                ],
                key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)),
            )
        else:
            raise ValueError("please provide a valid runlogs directory to tabulate results")

        trials = (
            sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x))
            if len(task_ids) > 0
            else []
        )
        dbnames = [
            [f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids]
            for trial in trials
        ]
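
        # Deduplicate retried LLM calls: keep only the earliest row (by
        # start_time) for each invocation_id so every invocation is counted once.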
        query = """
            SELECT cost, session_id, response, start_time, end_time
            FROM (
                SELECT invocation_id, cost, session_id, response, start_time, end_time,
                       ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) AS rn
                FROM chat_completions
            )
            WHERE rn = 1;
        """
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            for trial_index, each_trial in enumerate(dbnames):
                result_df = pd.DataFrame(
                    columns=[
                        "id",
                        "status",
                        "expected_answer",
                        "final_answer",
                        "cost",
                        "latency",
                        "num_of_llm_requests",
                        "num_of_chat_messages",
                        "prompt_tokens",
                        "completion_tokens",
                        "total_tokens",
                        "model",
                    ]
                )
                result_df_type_mapping = {
                    "id": str,
                    "status": bool,
                    "expected_answer": str,
                    "final_answer": str,
                    "cost": float,
                    "latency": float,
                    "num_of_llm_requests": int,
                    "num_of_chat_messages": int,
                    "prompt_tokens": int,
                    "completion_tokens": int,
                    "total_tokens": int,
                }
                for dbname, scorer_results in zip(each_trial, all_results):
                    task_id = scorer_results[0]
                    scorer_result = scorer_results[trial_index + 1]
                    status, expected_answer, final_answer = (
                        scorer_result if scorer_result else (False, "", "")
                    )

                    con = sqlite3.connect(dbname)
                    # TODO: add a chunksize if the telemetry grows large
                    telemetry_df = pd.read_sql_query(query, con)
                    con.close()

                    earliest_starttime = pd.to_datetime(
                        telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).min()
                    latest_endtime = pd.to_datetime(
                        telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).max()

                    num_of_chat_messages = get_number_of_chat_messages(
                        chat_messages_dir=os.path.dirname(dbname)
                    )
                    result = {
                        "id": task_id,
                        "status": status,
                        "expected_answer": expected_answer,
                        "final_answer": final_answer,
                        "cost": telemetry_df["cost"].sum(),
                        "latency": (latest_endtime - earliest_starttime).total_seconds(),
                        "num_of_llm_requests": len(telemetry_df),
                        "num_of_chat_messages": num_of_chat_messages,
                        # Token counts come from the JSON-serialized response;
                        # rows without a usage block count as 0.
                        "prompt_tokens": telemetry_df["response"]
                        .apply(lambda x: json.loads(x).get("usage", {}).get("prompt_tokens", 0))
                        .sum(),
                        "completion_tokens": telemetry_df["response"]
                        .apply(lambda x: json.loads(x).get("usage", {}).get("completion_tokens", 0))
                        .sum(),
                        "total_tokens": telemetry_df["response"]
                        .apply(lambda x: json.loads(x).get("usage", {}).get("total_tokens", 0))
                        .sum(),
                        "model": telemetry_df["response"]
                        .apply(lambda x: json.loads(x).get("model", ""))
                        .unique(),
                    }
                    result_df = result_df.astype(result_df_type_mapping)
                    result_df = pd.concat(
                        [result_df, pd.DataFrame([result])], ignore_index=True
                    )

                result_df.to_excel(
                    writer, sheet_name=f"trial_{trial_index}", index=False
                )


if __name__ == "__main__" and __package__ is None:
    main(sys.argv)
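
# A typical invocation, assuming the agbench tabulate CLI conventions (the
# exact flags are defined by agbench.tabulate_cmd.default_tabulate, so treat
# this as an illustrative sketch rather than the authoritative interface):
#
#   python custom_tabulate.py --runlogs ./Results --excel ./results.xlsx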