import glob
import json
import os
import re
import sqlite3
import sys

import pandas as pd

from agbench.tabulate_cmd import default_tabulate

# Make sibling modules importable when this script is run directly.
sys.path.append(os.path.dirname(__file__))

from assistantbench_evaluator import question_scorer

# Run-log subdirectories that are not task instances.
EXCLUDE_DIR_NAMES = ["__pycache__"]


def normalize_answer(a):
    # Normalize an answer string: lowercase, trim, standardize
    # comma-separated values, collapse runs of whitespace, and strip
    # trailing punctuation.
    norm_answer = ", ".join(a.strip().lower().split(","))
    norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer))
    return norm_answer
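
# A quick illustration of the normalization above (hypothetical input, not
# part of the original script):
#
#   normalize_answer("The Answer,  42!")  ->  "the answer, 42"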


def scorer(instance_dir):
    # Read the expected answer.
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    if not os.path.isfile(expected_answer_file):
        return None

    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    # Read the console log.
    console_log_file = os.path.join(instance_dir, "console_log.txt")
    if not os.path.isfile(console_log_file):
        return None

    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    # Extract the final answer line; give up if it is missing.
    final_answer = None
    m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
    if m:
        final_answer = m.group(1).strip()
    if final_answer is None:
        return None

    # Accuracy comes from the AssistantBench utility, which applies no
    # normalization of its own; the normalized strings are kept for display.
    accuracy = question_scorer(final_answer, expected_answer)
    n_ex = normalize_answer(expected_answer)
    n_final = normalize_answer(final_answer)
    return (accuracy, n_ex, n_final)
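
# Layout each instance directory is assumed to have, based on the files read
# above:
#
#   <instance_dir>/
#       expected_answer.txt   # ground-truth answer
#       console_log.txt       # run transcript containing "FINAL ANSWER: ..."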


def get_number_of_chat_messages(chat_messages_dir):
    # Sum message counts across every agent in every *_messages.json file.
    result = 0
    for file in glob.glob(f"{chat_messages_dir}/*_messages.json"):
        with open(file, "r") as f:
            content = json.load(f)
            for messages in content.values():
                result += len(messages)
    return result
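
# Each *_messages.json file is assumed to map agent names to lists of
# messages, e.g. (hypothetical): {"orchestrator": [...], "web_surfer": [...]}.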


def main(args):
    parsed_args, all_results = default_tabulate(args, scorer=scorer)
    excel_path = parsed_args.excel

    if excel_path:
        # Ensure the output directory exists and the file has an Excel suffix.
        excel_dir = os.path.dirname(excel_path) or "."
        os.makedirs(excel_dir, exist_ok=True)

        if not excel_path.endswith((".xlsx", ".xls")):
            excel_path += ".xlsx"

        # Normalize the run-log path to end with a single trailing slash.
        runlogs = (
            parsed_args.runlogs
            if parsed_args.runlogs.endswith("/")
            else parsed_args.runlogs + "/"
        )

        if os.path.isdir(runlogs):
            # One subdirectory per task, ordered by modification time.
            task_ids = sorted(
                [
                    task_id
                    for task_id in os.listdir(runlogs)
                    if task_id not in EXCLUDE_DIR_NAMES
                ],
                key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)),
            )
        else:
            raise ValueError("please input a valid directory to tabulate results")

        # Trial subdirectories are integer-named; collect one telemetry.db
        # path per task for each trial.
        trials = (
            sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x))
            if len(task_ids) > 0
            else []
        )
        dbnames = [
            [f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids]
            for trial in trials
        ]
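
        # Run-log layout implied by the paths above (an assumption drawn from
        # this script, not a documented agbench contract):
        #
        #   <runlogs>/
        #       <task_id>/
        #           0/telemetry.db
        #           1/telemetry.db
        #           ...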

        # Deduplicate retried LLM calls: keep only the earliest row (by
        # start_time) for each invocation_id.
        query = """
            SELECT cost, session_id, response, start_time, end_time
            FROM (
                SELECT invocation_id, cost, session_id, response, start_time, end_time,
                    ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) as rn
                FROM chat_completions
            )
            WHERE rn = 1;
        """
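
        # The response column is assumed to hold a JSON-encoded chat
        # completion, roughly (hypothetical example):
        #
        #   {"model": "gpt-4o",
        #    "usage": {"prompt_tokens": 123,
        #              "completion_tokens": 45,
        #              "total_tokens": 168}, ...}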

        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            # One sheet per trial, one row per task.
            for trial_index, each_trial in enumerate(dbnames):
                result_df = pd.DataFrame(
                    columns=[
                        "id",
                        "status",
                        "expected_answer",
                        "final_answer",
                        "cost",
                        "latency",
                        "num_of_llm_requests",
                        "num_of_chat_messages",
                        "prompt_tokens",
                        "completion_tokens",
                        "total_tokens",
                        "model",
                    ]
                )

                # "model" is omitted here because it holds an array of the
                # distinct model names seen in the trial.
                result_df_type_mapping = {
                    "id": str,
                    "status": bool,
                    "expected_answer": str,
                    "final_answer": str,
                    "cost": float,
                    "latency": float,
                    "num_of_llm_requests": int,
                    "num_of_chat_messages": int,
                    "prompt_tokens": int,
                    "completion_tokens": int,
                    "total_tokens": int,
                }

                for dbname, scorer_results in zip(each_trial, all_results):
                    # Each all_results row is (task_id, trial_0_result,
                    # trial_1_result, ...), where a result is the
                    # (accuracy, expected, final) tuple from scorer(), or None.
                    task_id = scorer_results[0]
                    scorer_result = scorer_results[trial_index + 1]

                    status, expected_answer, final_answer = (
                        scorer_result if scorer_result else (False, "", "")
                    )

                    con = sqlite3.connect(dbname)
                    # TODO: if large amount of data, add chunksize
                    telemetry_df = pd.read_sql_query(query, con)
                    con.close()

                    # Latency spans the earliest request start to the latest
                    # response end.
                    earliest_starttime = pd.to_datetime(
                        telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).min()
                    latest_endtime = pd.to_datetime(
                        telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).max()

                    num_of_chat_messages = get_number_of_chat_messages(
                        chat_messages_dir=os.path.dirname(dbname)
                    )

                    # Parse each JSON response once instead of re-parsing it
                    # inside every aggregation below.
                    parsed = telemetry_df["response"].apply(json.loads)

                    def usage_sum(key):
                        # Sum a usage counter across responses; 0 when absent.
                        return parsed.apply(
                            lambda r: r.get("usage", {}).get(key, 0)
                        ).sum()

                    result = {
                        "id": task_id,
                        "status": status,
                        "expected_answer": expected_answer,
                        "final_answer": final_answer,
                        "cost": telemetry_df["cost"].sum(),
                        "latency": (
                            latest_endtime - earliest_starttime
                        ).total_seconds(),
                        "num_of_llm_requests": len(telemetry_df),
                        "num_of_chat_messages": num_of_chat_messages,
                        "prompt_tokens": usage_sum("prompt_tokens"),
                        "completion_tokens": usage_sum("completion_tokens"),
                        "total_tokens": usage_sum("total_tokens"),
                        "model": parsed.apply(
                            lambda r: r.get("model", "")
                        ).unique(),
                    }

                    result_df = pd.concat(
                        [result_df, pd.DataFrame([result])], ignore_index=True
                    )

                # Coerce column dtypes once per trial (doing this inside the
                # loop left the final appended row uncoerced), then write the
                # trial's sheet.
                result_df = result_df.astype(result_df_type_mapping)
                result_df.to_excel(
                    writer, sheet_name=f"trial_{trial_index}", index=False
                )
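

# Invocation sketch (an assumption: the actual CLI, including the flag that
# feeds parsed_args.excel, is defined by agbench's default_tabulate, and the
# file name below is hypothetical):
#
#   python custom_tabulate.py ./Results/<run_dir> --excel ./out/results.xlsx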


# Run only when executed as a plain script (not imported as a package module).
if __name__ == "__main__" and __package__ is None:
    main(sys.argv)