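"""Tabulate AssistantBench benchmark results.

Each task instance is scored by extracting the "FINAL ANSWER:" line from its
console log and comparing it against the expected answer with the
AssistantBench question scorer. When an Excel path is given, a per-trial
workbook is also written with cost, latency, token usage, and model data
pulled from each run's telemetry.db.
"""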
import glob
import json
import os
import re
import sqlite3
import sys

import pandas as pd

from agbench.tabulate_cmd import default_tabulate

# Make the sibling assistantbench_evaluator module importable when this file
# is run directly as a script.
sys.path.append(os.path.dirname(__file__))

from assistantbench_evaluator import question_scorer

EXCLUDE_DIR_NAMES = ["__pycache__"]


def normalize_answer(a):
    # Lower case
    # Trim (left and right)
    # Standardize comma-separated values
    # Replace multiple spaces with one space
    # Remove trailing punctuation
    norm_answer = ", ".join(a.strip().lower().split(","))
    norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer))
    return norm_answer
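

# Illustrative example of the normalization (hypothetical input):
#   normalize_answer("  Foo,Bar!  ") -> "foo, bar"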


def scorer(instance_dir):
    # Read the expected answer
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    if not os.path.isfile(expected_answer_file):
        return None

    expected_answer = None
    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    # Read the console log
    console_log_file = os.path.join(instance_dir, "console_log.txt")
    if not os.path.isfile(console_log_file):
        return None

    console_log = ""
    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    final_answer = None
    m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
    if m:
        final_answer = m.group(1).strip()

    # The console log is missing the final answer line
    if final_answer is None:
        return None

    # Get accuracy from the AssistantBench utility; no normalization is
    # applied for the accuracy computation
    accuracy = question_scorer(final_answer, expected_answer)
    n_ex = normalize_answer(expected_answer)
    n_final = normalize_answer(final_answer)
    return (accuracy, n_ex, n_final)
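

# scorer() returns None when an instance is missing its inputs or its
# "FINAL ANSWER:" line; otherwise it returns a tuple of
# (accuracy, normalized_expected_answer, normalized_final_answer).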


def get_number_of_chat_messages(chat_messages_dir):
    # Sum message counts across every agent's *_messages.json transcript.
    result = 0
    for file in glob.glob(f"{chat_messages_dir}/*_messages.json"):
        with open(file, "r") as f:
            content = json.load(f)
            for agent, messages in content.items():
                result += len(messages)
    return result
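

# Each *_messages.json file is assumed to map agent names to lists of
# messages, e.g. {"agent_a": [...], "agent_b": [...]} (keys are
# illustrative); the helper above sums the lengths of those lists.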


def main(args):
    parsed_args, all_results = default_tabulate(args, scorer=scorer)
    excel_path = parsed_args.excel

    if excel_path:
        excel_dir = os.path.dirname(excel_path) or "."
        if not os.path.exists(excel_dir):
            os.makedirs(excel_dir, exist_ok=True)

        if not excel_path.endswith((".xlsx", ".xls")):
            excel_path += ".xlsx"

        # Normalize the runlogs path to always end with a trailing slash.
        runlogs = (
            parsed_args.runlogs
            if parsed_args.runlogs.endswith("/")
            else parsed_args.runlogs + "/"
        )

        if os.path.isdir(runlogs):
            # Task directories, ordered by modification time.
            task_ids = sorted(
                [
                    task_id
                    for task_id in os.listdir(runlogs)
                    if task_id not in EXCLUDE_DIR_NAMES
                ],
                key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)),
            )
        else:
            raise ValueError("Please provide a valid runlogs directory to tabulate results.")

        # Trial subdirectories are named by integer index; use the first
        # task's directory to enumerate them.
        trials = (
            sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x))
            if len(task_ids) > 0
            else []
        )
        dbnames = [
            [f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids]
            for trial in trials
        ]

        query = """
            SELECT cost, session_id, response, start_time, end_time
            FROM (
                SELECT invocation_id, cost, session_id, response, start_time, end_time,
                    ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) AS rn
                FROM chat_completions
            )
            WHERE rn = 1;
        """

        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            # Write one sheet per trial, with one row per task.
            for trial_index, each_trial in enumerate(dbnames):
                result_df = pd.DataFrame(
                    columns=[
                        "id",
                        "status",
                        "expected_answer",
                        "final_answer",
                        "cost",
                        "latency",
                        "num_of_llm_requests",
                        "num_of_chat_messages",
                        "prompt_tokens",
                        "completion_tokens",
                        "total_tokens",
                        "model",
                    ]
                )

                result_df_type_mapping = {
                    "id": str,
                    "status": bool,
                    "expected_answer": str,
                    "final_answer": str,
                    "cost": float,
                    "latency": float,
                    "num_of_llm_requests": int,
                    "num_of_chat_messages": int,
                    "prompt_tokens": int,
                    "completion_tokens": int,
                    "total_tokens": int,
                }

                for dbname, scorer_results in zip(each_trial, all_results):
                    # scorer_results holds the task id followed by one
                    # scorer result per trial.
                    task_id = scorer_results[0]
                    scorer_result = scorer_results[trial_index + 1]

                    status, expected_answer, final_answer = (
                        scorer_result if scorer_result else (False, "", "")
                    )

                    con = sqlite3.connect(dbname)

                    # TODO: if there is a large amount of data, add chunksize
                    telemetry_df = pd.read_sql_query(query, con)
                    con.close()

                    earliest_starttime = pd.to_datetime(
                        telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).min()
                    latest_endtime = pd.to_datetime(
                        telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).max()

                    num_of_chat_messages = get_number_of_chat_messages(
                        chat_messages_dir=os.path.dirname(dbname)
                    )

                    # Token usage and model name are parsed from the
                    # JSON-serialized LLM response stored in telemetry.
                    result = {
                        "id": task_id,
                        "status": status,
                        "expected_answer": expected_answer,
                        "final_answer": final_answer,
                        "cost": telemetry_df["cost"].sum(),
                        "latency": (
                            latest_endtime - earliest_starttime
                        ).total_seconds(),
                        "num_of_llm_requests": len(telemetry_df),
                        "num_of_chat_messages": num_of_chat_messages,
                        "prompt_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["usage"]["prompt_tokens"]
                            if "usage" in json.loads(x)
                            and "prompt_tokens" in json.loads(x)["usage"]
                            else 0
                        )
                        .sum(),
                        "completion_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["usage"]["completion_tokens"]
                            if "usage" in json.loads(x)
                            and "completion_tokens" in json.loads(x)["usage"]
                            else 0
                        )
                        .sum(),
                        "total_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["usage"]["total_tokens"]
                            if "usage" in json.loads(x)
                            and "total_tokens" in json.loads(x)["usage"]
                            else 0
                        )
                        .sum(),
                        "model": telemetry_df["response"]
                        .apply(
                            lambda x: json.loads(x)["model"]
                            if "model" in json.loads(x)
                            else ""
                        )
                        .unique(),
                    }

                    # Cast accumulated columns to their declared dtypes, then
                    # append this task's row.
                    result_df = result_df.astype(result_df_type_mapping)
                    result_df = pd.concat(
                        [result_df, pd.DataFrame([result])], ignore_index=True
                    )
                result_df.to_excel(
                    writer, sheet_name=f"trial_{trial_index}", index=False
                )


if __name__ == "__main__" and __package__ is None:
    main(sys.argv)
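
# Usage sketch (assumed, not verified): argument parsing is delegated to
# agbench's default_tabulate, which reads the run logs directory and the
# optional --excel output path seen above, e.g. something like:
#
#   python <this_script>.py ./Results/<run_id> --excel ./results.xlsx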