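"""Tabulate results for AssistantBench runs.

Scores each task instance against its expected answer and, when an Excel
output path is supplied, also writes per-trial telemetry (cost, latency,
token usage) to a spreadsheet.
"""
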
import glob
import json
import os
import re
import sqlite3
import sys

import pandas as pd

from agbench.tabulate_cmd import default_tabulate

# Make the sibling evaluator module importable when this file runs as a script.
sys.path.append(os.path.dirname(__file__))

from assistantbench_evaluator import question_scorer

EXCLUDE_DIR_NAMES = ["__pycache__"]


def normalize_answer(a):
    # Lower-case, trim, standardize comma-separated values,
    # collapse repeated whitespace, and strip trailing punctuation.
    norm_answer = ", ".join(a.strip().lower().split(","))
    norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer))
    return norm_answer
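

# A worked example of the normalization above (illustrative values):
#   normalize_answer(" Foo,Bar! ")  ->  "foo, bar"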


def scorer(instance_dir):
    # Read the expected answer.
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    if not os.path.isfile(expected_answer_file):
        return None

    expected_answer = None
    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    # Read the console log.
    console_log_file = os.path.join(instance_dir, "console_log.txt")
    if not os.path.isfile(console_log_file):
        return None

    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    # Extract the agent's final answer line from the console log.
    final_answer = None
    m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
    if m:
        final_answer = m.group(1).strip()

    # The final answer line is missing.
    if final_answer is None:
        return None

    # Accuracy comes from the AssistantBench evaluator, which receives the raw
    # (unnormalized) answers; normalize_answer is applied only to the returned
    # strings, not to the accuracy computation.
    accuracy = question_scorer(final_answer, expected_answer)
    n_ex = normalize_answer(expected_answer)
    n_final = normalize_answer(final_answer)
    return (accuracy, n_ex, n_final)
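

# scorer() returns None for incomplete instances; otherwise a tuple of
# (accuracy, normalized_expected_answer, normalized_final_answer),
# e.g. (1.0, "42", "42") for an exact match (illustrative values).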


def get_number_of_chat_messages(chat_messages_dir):
    # Sum the number of messages across every agent's *_messages.json file.
    result = 0
    for file in glob.glob(f"{chat_messages_dir}/*_messages.json"):
        with open(file, "r") as f:
            content = json.load(f)
            for messages in content.values():
                result += len(messages)
    return result
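

# Each *_messages.json file is assumed to map an agent name to its message
# list (illustrative): {"agent_a": [...], "agent_b": [...]}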


def main(args):
    parsed_args, all_results = default_tabulate(args, scorer=scorer)
    excel_path = parsed_args.excel

    if excel_path:
        excel_dir = os.path.dirname(excel_path) or "."
        os.makedirs(excel_dir, exist_ok=True)

        if not excel_path.endswith((".xlsx", ".xls")):
            excel_path += ".xlsx"

        runlogs = (
            parsed_args.runlogs
            if parsed_args.runlogs.endswith("/")
            else parsed_args.runlogs + "/"
        )

        if os.path.isdir(runlogs):
            # Task directories, ordered by modification time.
            task_ids = sorted(
                [
                    task_id
                    for task_id in os.listdir(runlogs)
                    if task_id not in EXCLUDE_DIR_NAMES
                ],
                key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)),
            )
        else:
            raise ValueError("please provide a valid run-log directory to tabulate results")

        # Trial subdirectories are numeric; build one telemetry.db path per
        # task for every trial.
        trials = (
            sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x))
            if len(task_ids) > 0
            else []
        )
        dbnames = [
            [f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids]
            for trial in trials
        ]
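
        # Run-log layout inferred from the paths built above:
        #   <runlogs>/<task_id>/<trial>/telemetry.db
        # with expected_answer.txt, console_log.txt, and *_messages.json
        # sitting in the same <trial> directory.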

        query = """
            SELECT cost, session_id, response, start_time, end_time
            FROM (
                SELECT invocation_id, cost, session_id, response, start_time, end_time,
                    ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) AS rn
                FROM chat_completions
            )
            WHERE rn = 1;
        """

        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            # One worksheet per trial, named "trial_0", "trial_1", ...
            for trial_index, each_trial in enumerate(dbnames):
                result_df = pd.DataFrame(
                    columns=[
                        "id",
                        "status",
                        "expected_answer",
                        "final_answer",
                        "cost",
                        "latency",
                        "num_of_llm_requests",
                        "num_of_chat_messages",
                        "prompt_tokens",
                        "completion_tokens",
                        "total_tokens",
                        "model",
                    ]
                )

                result_df_type_mapping = {
                    "id": str,
                    "status": bool,
                    "expected_answer": str,
                    "final_answer": str,
                    "cost": float,
                    "latency": float,
                    "num_of_llm_requests": int,
                    "num_of_chat_messages": int,
                    "prompt_tokens": int,
                    "completion_tokens": int,
                    "total_tokens": int,
                }
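                # Note: "model" is absent from the mapping above; it ends up
                # holding the array of unique model names produced by
                # .unique() below, which has no scalar dtype.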

                for dbname, scorer_results in zip(each_trial, all_results):
                    task_id = scorer_results[0]
                    scorer_result = scorer_results[trial_index + 1]

                    status, expected_answer, final_answer = (
                        scorer_result if scorer_result else (False, "", "")
                    )

                    con = sqlite3.connect(dbname)
                    # TODO: if there is a large amount of data, add chunksize.
                    telemetry_df = pd.read_sql_query(query, con)
                    con.close()

                    earliest_starttime = pd.to_datetime(
                        telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).min()
                    latest_endtime = pd.to_datetime(
                        telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f"
                    ).max()
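                    # Latency is wall-clock time from the first request's start
                    # to the last request's end for this task instance.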

                    num_of_chat_messages = get_number_of_chat_messages(
                        chat_messages_dir=os.path.dirname(dbname)
                    )
                    result = {
                        "id": task_id,
                        "status": status,
                        "expected_answer": expected_answer,
                        "final_answer": final_answer,
                        "cost": telemetry_df["cost"].sum(),
                        "latency": (
                            latest_endtime - earliest_starttime
                        ).total_seconds(),
                        "num_of_llm_requests": len(telemetry_df),
                        "num_of_chat_messages": num_of_chat_messages,
                        "prompt_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: (json.loads(x).get("usage") or {}).get(
                                "prompt_tokens", 0
                            )
                        )
                        .sum(),
                        "completion_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: (json.loads(x).get("usage") or {}).get(
                                "completion_tokens", 0
                            )
                        )
                        .sum(),
                        "total_tokens": telemetry_df["response"]
                        .apply(
                            lambda x: (json.loads(x).get("usage") or {}).get(
                                "total_tokens", 0
                            )
                        )
                        .sum(),
                        "model": telemetry_df["response"]
                        .apply(lambda x: json.loads(x).get("model", ""))
                        .unique(),
                    }
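                    # The lambdas above assume OpenAI-style response payloads
                    # (illustrative): {"model": "...", "usage": {"prompt_tokens":
                    # 1, "completion_tokens": 2, "total_tokens": 3}}; missing
                    # usage data falls back to 0.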

                    # Cast after appending so every accumulated row, including
                    # the newest, carries the declared column types.
                    result_df = pd.concat(
                        [result_df, pd.DataFrame([result])], ignore_index=True
                    ).astype(result_df_type_mapping)
                result_df.to_excel(
                    writer, sheet_name=f"trial_{trial_index}", index=False
                )


# Run only when executed directly as a script (not as part of a package).
if __name__ == "__main__" and __package__ is None:
    main(sys.argv)
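
# Typical invocation (illustrative): agbench tabulate ./Results/<run_dir> --excel results.xlsx
# The exact CLI arguments come from agbench.tabulate_cmd; the --excel flag name
# here is an assumption inferred from parsed_args.excel.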