# This script is slightly modified from the one published by the creators of the
# AssistantBench dataset:
# https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/evaluation/evaluator.py
import json

import numpy as np

from evaluate_utils.evaluate_factory import get_evaluator


def find_isnan(samp):
    """Return True if ``np.isnan(samp)`` is truthy, False otherwise.

    Any input that ``np.isnan`` cannot handle (strings, dicts, ...) or whose
    result has no single truth value is reported as "not NaN" instead of
    raising.
    """
    try:
        return bool(np.isnan(samp))
    except Exception:
        return False


def fix_ans(answer):
    """Rewrite single-quoted dict text so that ``json.loads`` can parse it.

    Only the quote patterns around braces, commas and colons are replaced
    (``{'`` -> ``{"``, ``', '`` -> ``", "``, ``': '`` -> ``": "``,
    ``'}`` -> ``"}``, ``': `` -> ``": ``).  Inputs without a usable
    ``replace`` method are returned unchanged.
    """
    try:
        answer = answer.replace("{'", '{"').replace("', '", '", "').replace("': '", '": "').replace("'}", '"}')
        # Handles keys whose value is not itself a quoted string, e.g. {'a': 1}.
        return answer.replace("': ", '": ')
    except (AttributeError, TypeError):
        return answer


def parse_answer(answer):
    """Classify a gold answer, given as a list of lines, and parse it.

    Returns a ``(parsed_answer, evaluator_name)`` pair where the name is one
    of ``"number"``, ``"json"``, ``"string"`` or ``"string list"``:

    * one entry that ``fix_number`` accepts          -> ``(float, "number")``
    * one entry that parses as JSON                  -> ``([obj], "json")``
    * one entry that is neither                      -> ``(entry, "string")``
    * several entries (or none) that all parse as JSON -> ``([obj, ...], "json")``
    * several entries, at least one not JSON         -> ``(answer, "string list")``
    """
    if len(answer) != 1:
        try:
            return [json.loads(fix_ans(ex)) for ex in answer], "json"
        except Exception:
            return answer, "string list"

    only = answer[0]
    value, is_num = fix_number(only)
    if is_num:
        return value, "number"

    try:
        return [json.loads(fix_ans(only))], "json"
    except Exception:
        # fix_number already rejected this entry above, so it is plain text.
        return only, "string"


def fix_number(number):
    """Try to turn *number* into a float.

    Returns ``(value, is_num)``:

    * ``str``: ``$``, ``%`` and ``sqft`` are replaced by spaces, the result is
      stripped, commas become dots and the suffix `` square kilometers`` is
      removed before calling ``float``.  On success ``(float, True)``,
      otherwise the *original* string and ``False``.
    * ``int`` (exactly; ``bool`` is excluded): ``(float(number), True)``.
    * anything else is passed through unchanged with ``is_num=True``.

    NOTE: a comma is always read as a decimal separator, so ``"1,5"`` gives
    ``1.5`` and ``"1,000"`` gives ``1.0``.
    """
    if type(number) is str:
        cleaned = number.replace("$", " ").replace("%", " ").replace("sqft", " ").strip()
        cleaned = cleaned.replace(",", ".").replace(" square kilometers", "")
        try:
            return float(cleaned), True
        except ValueError:
            return number, False

    # Exact type check on purpose: isinstance() would also convert booleans.
    if type(number) is int:
        return float(number), True

    return number, True


def fix_prediction(prediction, gold_answer, evaluator):
    """Normalise a prediction so it can be compared with the gold answer.

    Args:
        prediction: the (possibly already JSON-decoded) model prediction.
        gold_answer: the parsed gold answer returned by ``parse_answer``.
        evaluator: the evaluator name returned by ``parse_answer``.

    Returns:
        ``(prediction, run_eval)``.  ``run_eval`` is False when the prediction
        is empty, or when it is a list with more than one element while the
        gold answer is a single float; in those cases the caller should score
        0 without invoking a metric.
    """
    # Unwrap a single-element list holding an int or a numeric string.
    if type(prediction) is list and len(prediction) == 1:
        only = prediction[0]
        if type(only) is int or (type(only) is str and only.isnumeric()):
            # fix_number returns (value, is_num); keep only the value.  The
            # previous code stored the whole tuple, which then reached the
            # metric as ``(5.0, True)`` instead of ``5.0``.
            prediction, _ = fix_number(only)

    if type(prediction) is not list:
        prediction, _ = fix_number(prediction)
        if evaluator == "json":
            # JSON gold answers are lists of objects: read the prediction as
            # one JSON document per line, or wrap it as-is if that fails
            # (not a string, or a line is not valid JSON).
            try:
                prediction = [json.loads(pred) for pred in prediction.split("\n")]
            except Exception:
                prediction = [prediction]

    if hasattr(type(prediction), "__len__") and len(prediction) == 0:
        return prediction, False

    if type(prediction) is list and len(prediction) > 1 and type(gold_answer) is float:
        return prediction, False

    return prediction, True


def question_scorer(prediction, gold_answer):
    """Score a prediction against a gold answer.

    Args:
        prediction: str or list of str.  It is JSON-decoded when possible and
            used unchanged otherwise.
        gold_answer: str (one answer per non-blank line) or list of str.

    Returns:
        A float between 0 and 1.  Any error raised while evaluating is
        printed and scored as 0.0, as is a metric result outside [0, 1].
    """
    try:
        try:
            prediction = json.loads(prediction)
        except Exception:
            pass  # not JSON (or not a string): keep the raw prediction

        if type(gold_answer) is list:
            answer_list = gold_answer
        else:
            answer_list = [x for x in gold_answer.split("\n") if x.strip()]

        gold_answer, evaluator = parse_answer(answer_list)
        prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)

        # Predictions that are neither a float nor sized (None, bool, ...)
        # have always been rejected at this point, because a len() call on
        # them raised TypeError; keep that outcome, but make it explicit.
        if type(prediction) is not float and not hasattr(type(prediction), "__len__"):
            raise TypeError(f"object of type '{type(prediction).__name__}' has no len()")

        if not run_eval:
            return 0.0

        metric_eval = get_evaluator(evaluator)
        accuracy = metric_eval(prediction, gold_answer)

        # Double check that the accuracy is a number between 0 and 1
        # (a NaN result fails this comparison as well).
        if 0 <= accuracy <= 1:
            return accuracy
        raise ValueError(f"Accuracy should be a float between 0 and 1, but got {accuracy}")
    except Exception as e:
        print(
            f"Something went wrong while evaluating prediction {prediction} vs gold answer {gold_answer} with error {e}"
        )
        return 0.0