# This script is slightly modified from the evaluator written by the creators of the AssistantBench
# dataset: https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/evaluation/evaluator.py
import json

import numpy as np

from evaluate_utils.evaluate_factory import get_evaluator


def find_isnan(samp):
    """Return True if samp is a float NaN; False for anything else."""
    try:
        return bool(np.isnan(samp))
    except Exception:
        return False


def fix_ans(answer):
    """Best-effort rewrite of a single-quoted dict string into valid JSON."""
    try:
        answer = answer.replace("{'", '{"').replace("', '", '", "').replace("': '", '": "').replace("'}", '"}')
        answer = answer.replace("': ", '": ')
        return answer
    except Exception:
        return answer


def parse_answer(answer):
    """Parse the gold-answer list into (parsed_answer, evaluator_type)."""
    if len(answer) == 1:
        ans, is_num = fix_number(answer[0])
        if is_num:
            return ans, "number"
        try:
            ans = json.loads(fix_ans(answer[0]))
            return [ans], "json"
        except Exception:
            # fix_number already failed above, so this is a plain string answer.
            return answer[0], "string"
    else:
        try:
            ans = [json.loads(fix_ans(ex)) for ex in answer]
            return ans, "json"
        except Exception:
            return answer, "string list"


def fix_number(number):
    """Try to coerce number (possibly a formatted string) to float.

    Returns a (value, is_number) tuple.
    """
    if isinstance(number, str):
        copy_ans = number
        # Strip currency/unit markers such as "$", "%", and "sqft", then treat
        # "," as a decimal separator.
        copy_ans = " ".join(" ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft")).strip()
        copy_ans = copy_ans.replace(",", ".").replace(" square kilometers", "")
        try:
            return float(copy_ans), True
        except ValueError:
            return number, False
    elif isinstance(number, int):
        return float(number), True
    else:
        return number, True


def fix_prediction(prediction, gold_answer, evaluator):
    """Normalize a prediction; returns a (prediction, run_eval) tuple."""
    if (
        isinstance(prediction, list)
        and len(prediction) == 1
        and (isinstance(prediction[0], int) or (isinstance(prediction[0], str) and prediction[0].isnumeric()))
    ):
        # fix_number returns a (value, is_number) tuple; keep only the value.
        prediction, _ = fix_number(prediction[0])

    if not isinstance(prediction, list):
        prediction, is_num = fix_number(prediction)
        if evaluator == "json":
            try:
                prediction = [json.loads(pred) for pred in prediction.split("\n")]
            except Exception:
                prediction = [prediction]

    if hasattr(type(prediction), "__len__") and len(prediction) == 0:
        return prediction, False

    if isinstance(prediction, list) and len(prediction) > 1 and isinstance(gold_answer, float):
        return prediction, False

    return prediction, True


def question_scorer(prediction, gold_answer):
    """Score a prediction against the gold answer.

    prediction: str or list of str
    gold_answer: str or list of str

    Returns a float between 0 and 1.
    """
    try:
        try:
            prediction = json.loads(prediction)
        except Exception:
            pass  # keep the raw prediction if it is not valid JSON

        answer_list = (
            [x for x in gold_answer.split("\n") if len(x.strip()) > 0]
            if not isinstance(gold_answer, list)
            else gold_answer
        )
        gold_answer, evaluator = parse_answer(answer_list)
        prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)

        # Retained from the upstream evaluator; not used in the final score here.
        has_ans = 1.0
        if (not isinstance(prediction, float) and len(prediction) == 0) or find_isnan(prediction):
            has_ans = 0.0

        if not run_eval:
            return 0.0

        metric_eval = get_evaluator(evaluator)
        accuracy = metric_eval(prediction, gold_answer)
        # Double-check that the accuracy is a number between 0 and 1.
        if 0 <= accuracy <= 1:
            return accuracy
        raise ValueError(f"Accuracy should be a float between 0 and 1, but got {accuracy}")
    except Exception as e:
        print(
            f"Something went wrong while evaluating prediction {prediction} vs gold answer {gold_answer} with error {e}"
        )
        return 0.0
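

if __name__ == "__main__":
    # Minimal usage sketch; the example answers below are hypothetical and not
    # taken from AssistantBench. question_scorer always returns a float in
    # [0, 1], and an empty prediction scores 0.0.
    print(question_scorer("42", "42"))        # numeric comparison
    print(question_scorer("Paris", "Paris"))  # string comparison
    print(question_scorer("", "42"))          # empty prediction -> 0.0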