# autogen/python/packages/agbench/benchmarks/GAIA/Scripts/custom_tabulate.py

import os
import sys
import re
from agbench.tabulate_cmd import default_tabulate
import json
import pandas as pd
import sqlite3
import glob
import string
import warnings
import numpy as np

# Directory names to exclude. NOTE(review): not referenced by any code visible
# in this file -- presumably meant for skipping such folders while walking
# result directories; confirm whether it is still needed.
EXCLUDE_DIR_NAMES = ["__pycache__"]


def in_house_normalize_answer(a):
    """Return a canonical form of an answer string for exact comparison.

    Normalization steps:

    * lower-case the text;
    * split on commas, collapse runs of whitespace inside each item to a
      single space and trim each item, then re-join with ``", "`` so that
      ``"a,b"``, ``"a ,b"`` and ``"a ,  b"`` all become ``"a, b"``;
    * remove trailing ``.``, ``!`` and ``?`` characters, plus any whitespace
      left in front of them.

    Args:
        a: The raw answer string.

    Returns:
        The normalized answer string.
    """
    # Trim every comma-separated item individually; trimming only the whole
    # string would leave whitespace that precedes a comma untouched.
    items = (
        re.sub(r"\s+", " ", item).strip()
        for item in a.lower().split(",")
    )
    norm_answer = ", ".join(items)

    # Drop trailing punctuation, then any whitespace it was separated by
    # (e.g. "hello ." must normalize to "hello", not "hello ").
    return re.sub(r"[.!?]+$", "", norm_answer).rstrip()


def in_house_question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    """Check whether two answers are identical after in-house normalization.

    Both strings are passed through ``in_house_normalize_answer``. An empty
    normalized ground truth never counts as a match.

    Args:
        model_answer: The answer produced by the model.
        ground_truth: The expected answer.

    Returns:
        True when the normalized ground truth is non-empty and equals the
        normalized model answer, False otherwise.
    """
    candidate = in_house_normalize_answer(model_answer)
    reference = in_house_normalize_answer(ground_truth)

    # An empty reference cannot be matched by anything.
    if reference == "":
        return False
    return reference == candidate
 

def gaia_question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    """Score a model answer against the ground truth using GAIA-style rules.

    The comparison strategy is chosen from the shape of ``ground_truth``:

    * it parses as a float: the model answer is stripped of ``$``, ``%`` and
      ``,`` and compared numerically;
    * it contains ``,`` or ``;``: both answers are split on those separators
      and compared element by element (numerically where the ground-truth
      element is a number, otherwise as whitespace-free lower-cased strings
      with punctuation kept);
    * otherwise: both answers are compared as lower-cased strings with all
      whitespace and punctuation removed.

    Args:
        model_answer: The answer produced by the model. ``None`` is treated
            as the literal string ``"None"``.
        ground_truth: The expected answer.

    Returns:
        True if the model answer matches the ground truth, False otherwise.
    """
    #FROM: https://huggingface.co/spaces/gaia-benchmark/leaderboard/blob/main/scorer.py

    # A missing answer is scored like any other wrong answer instead of
    # crashing inside the string helpers below.
    if model_answer is None:
        model_answer = "None"

    def normalize_number_str(number_str: str) -> float:
        """Parse a number after removing common unit symbols and commas."""
        # we replace these common units and commas to allow
        # conversion to float
        for char in ("$", "%", ","):
            number_str = number_str.replace(char, "")
        try:
            return float(number_str)
        except ValueError:
            print(f"String {number_str} cannot be normalized to number str.")
            # NaN never compares equal to anything, so an unparseable answer
            # cannot accidentally match an infinite ground truth (which an
            # "inf" sentinel would).
            return float("nan")

    def split_string(s: str, char_list: tuple[str, ...] = (",", ";")) -> list[str]:
        """Split ``s`` on any of the separator characters in ``char_list``."""
        # Escape each separator so characters such as "]" or "^" cannot
        # change the meaning of the character class.
        pattern = "[" + "".join(re.escape(char) for char in char_list) + "]"
        return re.split(pattern, s)

    def normalize_str(input_str, remove_punct=True) -> str:
        """
        Normalize a string by:
        - Removing all white spaces
        - Optionally removing punctuation (if remove_punct is True)
        - Converting to lowercase
        Parameters:
        - input_str: str, the string to normalize
        - remove_punct: bool, whether to remove punctuation (default: True)
        Returns:
        - str, the normalized string
        """
        # Remove all white spaces. Required e.g for seagull vs. sea gull
        no_spaces = re.sub(r"\s", "", input_str)

        # Remove punctuation, if specified.
        if remove_punct:
            translator = str.maketrans("", "", string.punctuation)
            return no_spaces.lower().translate(translator)
        return no_spaces.lower()

    def is_float(element: str) -> bool:
        """Return True if ``element`` can be converted with ``float()``."""
        try:
            float(element)
            return True
        except ValueError:
            return False

    def elements_match(ma_elem: str, gt_elem: str) -> bool:
        """Compare one list element as a number or as a string."""
        if is_float(gt_elem):
            return normalize_number_str(ma_elem) == float(gt_elem)
        # we do not remove punct since comparisons can include punct
        return normalize_str(ma_elem, remove_punct=False) == normalize_str(
            gt_elem, remove_punct=False
        )

    # if gt is a number
    if is_float(ground_truth):
        return normalize_number_str(model_answer) == float(ground_truth)

    # if gt is a list
    if any(char in ground_truth for char in (",", ";")):
        # question with the fish: normalization removes punct
        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # Lists of different lengths can never match.
        if len(gt_elems) != len(ma_elems):
            return False

        # compare each element as float or str
        return all(
            elements_match(ma_elem, gt_elem)
            for ma_elem, gt_elem in zip(ma_elems, gt_elems)
        )

    # if gt is a str
    return normalize_str(model_answer) == normalize_str(ground_truth)


##############

def scorer(instance_dir):
    """Score one benchmark instance directory.

    Reads the expected answer from ``expected_answer.txt`` and looks for the
    first ``FINAL ANSWER:`` line in ``console_log.txt``; the text after the
    marker (up to the end of that line) is compared to the expected answer
    with ``gaia_question_scorer``.

    Args:
        instance_dir: Path of the directory holding ``expected_answer.txt``
            and ``console_log.txt``.

    Returns:
        None if either file is missing or the console log has no
        ``FINAL ANSWER:`` marker; otherwise the boolean result of
        ``gaia_question_scorer``.
    """
    # Read the expected answer
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    if not os.path.isfile(expected_answer_file):
        return None

    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    # Read the console
    console_log_file = os.path.join(instance_dir, "console_log.txt")
    if not os.path.isfile(console_log_file):
        return None

    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    # Capture the rest of the marker's line. Matching "[^\n]*" rather than
    # requiring a terminating "\n" also picks up an answer that sits on the
    # very last line of a log with no trailing newline.
    m = re.search(r"FINAL ANSWER:([^\n]*)", console_log)

    # Missing the final answer line
    if m is None:
        return None
    final_answer = m.group(1).strip()

    # Return true if they are equal after normalization.
    # in_house_question_scorer(final_answer, expected_answer) is the
    # alternative scorer defined in this file.
    return gaia_question_scorer(final_answer, expected_answer)


def main(args):
    """Run ``default_tabulate`` (from ``agbench.tabulate_cmd``) with this file's ``scorer``.

    Args:
        args: Argument list forwarded unchanged to ``default_tabulate``; the
            script entry point below passes ``sys.argv``.
    """
    default_tabulate(args, scorer=scorer)

# Only tabulate when this file is the program entry point and has no package
# context (``__package__ is None``); merely importing the module does nothing.
if __name__ == "__main__" and __package__ is None:
    main(sys.argv)
Adding Benchmarks to agbench (#3803) * Move from tomllib to tomli * added example code for magentic-one + code comments * adding benchmarks temporarily * add license for datasets * revert changes to magentic-one * change license location --------- Co-authored-by: Ryan Sweet <rysweet@microsoft.com> 2024-10-17 21:33:33 -07:00			`import os`
			`import sys`
			`import re`
			`from agbench.tabulate_cmd import default_tabulate`
			`import json`
			`import pandas as pd`
			`import sqlite3`
			`import glob`
Significant updates to agbench. (#5313) - Updated HumanEval template to use AgentChat - Update templates to use config.yaml for model and other configuration - Read environment from ENV.yaml (ENV.json still supported but deprecated) - Temporarily removed WebArena and AssistantBench. Neither had viable Templates after `autogen_magentic_one` was removed. Templates need to be update to AgentChat (in a future PR, but this PR is getting big enough already) 2025-02-07 10:01:44 -08:00			`import string`
			`import warnings`
Adding Benchmarks to agbench (#3803) * Move from tomllib to tomli * added example code for magentic-one + code comments * adding benchmarks temporarily * add license for datasets * revert changes to magentic-one * change license location --------- Co-authored-by: Ryan Sweet <rysweet@microsoft.com> 2024-10-17 21:33:33 -07:00			`import numpy as np`

			`EXCLUDE_DIR_NAMES = ["__pycache__"]`


Significant updates to agbench. (#5313) - Updated HumanEval template to use AgentChat - Update templates to use config.yaml for model and other configuration - Read environment from ENV.yaml (ENV.json still supported but deprecated) - Temporarily removed WebArena and AssistantBench. Neither had viable Templates after `autogen_magentic_one` was removed. Templates need to be update to AgentChat (in a future PR, but this PR is getting big enough already) 2025-02-07 10:01:44 -08:00			`def in_house_normalize_answer(a):`
Adding Benchmarks to agbench (#3803) * Move from tomllib to tomli * added example code for magentic-one + code comments * adding benchmarks temporarily * add license for datasets * revert changes to magentic-one * change license location --------- Co-authored-by: Ryan Sweet <rysweet@microsoft.com> 2024-10-17 21:33:33 -07:00			`# Lower case`
			`# Trim (left and right)`
			`# standardize comma separated values`
			`# Replace multiple spaces with one space`
			`# Remove trailing punctuation`
			`norm_answer = ", ".join(a.strip().lower().split(","))`
			`norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer))`
			`return norm_answer`


Significant updates to agbench. (#5313) - Updated HumanEval template to use AgentChat - Update templates to use config.yaml for model and other configuration - Read environment from ENV.yaml (ENV.json still supported but deprecated) - Temporarily removed WebArena and AssistantBench. Neither had viable Templates after `autogen_magentic_one` was removed. Templates need to be update to AgentChat (in a future PR, but this PR is getting big enough already) 2025-02-07 10:01:44 -08:00			`def in_house_question_scorer(`
			`model_answer: str,`
			`ground_truth: str,`
			`) -> bool:`
			`n_ma = in_house_normalize_answer(model_answer)`
			`n_gt = in_house_normalize_answer(ground_truth)`
			`return (n_gt != "" and n_gt == n_ma)`


			`def gaia_question_scorer(`
			`model_answer: str,`
			`ground_truth: str,`
			`) -> bool:`
			`#FROM: https://huggingface.co/spaces/gaia-benchmark/leaderboard/blob/main/scorer.py`

			`def normalize_number_str(number_str: str) -> float:`
			`# we replace these common units and commas to allow`
			`# conversion to float`
			`for char in ["$", "%", ","]:`
			`number_str = number_str.replace(char, "")`
			`try:`
			`return float(number_str)`
			`except ValueError:`
			`print(f"String {number_str} cannot be normalized to number str.")`
			`return float("inf")`

			`def split_string(s: str, char_list: list[str] = [",", ";"],) -> list[str]:`
			`pattern = f"[{''.join(char_list)}]"`
			`return re.split(pattern, s)`

			`def normalize_str(input_str, remove_punct=True) -> str:`
			`"""`
			`Normalize a string by:`
			`- Removing all white spaces`
			`- Optionally removing punctuation (if remove_punct is True)`
			`- Converting to lowercase`
			`Parameters:`
			`- input_str: str, the string to normalize`
			`- remove_punct: bool, whether to remove punctuation (default: True)`
			`Returns:`
			`- str, the normalized string`
			`"""`
			`# Remove all white spaces. Required e.g for seagull vs. sea gull`
			`no_spaces = re.sub(r"\s", "", input_str)`

			`# Remove punctuation, if specified.`
			`if remove_punct:`
			`translator = str.maketrans("", "", string.punctuation)`
			`return no_spaces.lower().translate(translator)`
			`else:`
			`return no_spaces.lower()`


			`def is_float(element: any) -> bool:`
			`try:`
			`float(element)`
			`return True`
			`except ValueError:`
			`return False`

			`# if gt is a number`
			`if is_float(ground_truth):`
			`normalized_answer = normalize_number_str(model_answer)`
			`return normalized_answer == float(ground_truth)`

			`# if gt is a list`
			`elif any(char in ground_truth for char in [",", ";"]):`
			`# question with the fish: normalization removes punct`

			`gt_elems = split_string(ground_truth)`
			`ma_elems = split_string(model_answer)`

			`# check length is the same`
			`if len(gt_elems) != len(ma_elems):`
			`#warnings.warn(`
			`# "Answer lists have different lengths, returning False.", UserWarning`
			`#)`
			`return False`

			`# compare each element as float or str`
			`comparisons = []`
			`for ma_elem, gt_elem in zip(ma_elems, gt_elems):`
			`if is_float(gt_elem):`
			`normalized_ma_elem = normalize_number_str(ma_elem)`
			`comparisons.append(normalized_ma_elem == float(gt_elem))`
			`else:`
			`# we do not remove punct since comparisons can include punct`
			`comparisons.append(`
			`normalize_str(ma_elem, remove_punct=False)`
			`== normalize_str(gt_elem, remove_punct=False)`
			`)`
			`return all(comparisons)`

			`# if gt is a str`
			`else:`
			`return normalize_str(model_answer) == normalize_str(ground_truth)`


			`##############`

Adding Benchmarks to agbench (#3803) * Move from tomllib to tomli * added example code for magentic-one + code comments * adding benchmarks temporarily * add license for datasets * revert changes to magentic-one * change license location --------- Co-authored-by: Ryan Sweet <rysweet@microsoft.com> 2024-10-17 21:33:33 -07:00			`def scorer(instance_dir):`
			`# Read the expected answer`
			`expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")`
			`if not os.path.isfile(expected_answer_file):`
			`return None`

			`expected_answer = None`
			`with open(expected_answer_file, "rt") as fh:`
			`expected_answer = fh.read().strip()`

			`# Read the console`
			`console_log_file = os.path.join(instance_dir, "console_log.txt")`
			`if not os.path.isfile(console_log_file):`
			`return None`

			`console_log = ""`
			`with open(console_log_file, "rt") as fh:`
			`console_log = fh.read()`

			`final_answer = None`
			`m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)`
			`if m:`
			`final_answer = m.group(1).strip()`

			`# Missing the final answer line`
			`if final_answer is None:`
			`return None`

			`# Return true if they are equal after normalization`
Significant updates to agbench. (#5313) - Updated HumanEval template to use AgentChat - Update templates to use config.yaml for model and other configuration - Read environment from ENV.yaml (ENV.json still supported but deprecated) - Temporarily removed WebArena and AssistantBench. Neither had viable Templates after `autogen_magentic_one` was removed. Templates need to be update to AgentChat (in a future PR, but this PR is getting big enough already) 2025-02-07 10:01:44 -08:00			`# return in_house_question_scorer(final_answer, expected_answer)`
			`return gaia_question_scorer(final_answer, expected_answer)`
Adding Benchmarks to agbench (#3803) * Move from tomllib to tomli * added example code for magentic-one + code comments * adding benchmarks temporarily * add license for datasets * revert changes to magentic-one * change license location --------- Co-authored-by: Ryan Sweet <rysweet@microsoft.com> 2024-10-17 21:33:33 -07:00

			`def main(args):`
Significant updates to agbench. (#5313) - Updated HumanEval template to use AgentChat - Update templates to use config.yaml for model and other configuration - Read environment from ENV.yaml (ENV.json still supported but deprecated) - Temporarily removed WebArena and AssistantBench. Neither had viable Templates after `autogen_magentic_one` was removed. Templates need to be update to AgentChat (in a future PR, but this PR is getting big enough already) 2025-02-07 10:01:44 -08:00			`default_tabulate(args, scorer=scorer)`
Adding Benchmarks to agbench (#3803) * Move from tomllib to tomli * added example code for magentic-one + code comments * adding benchmarks temporarily * add license for datasets * revert changes to magentic-one * change license location --------- Co-authored-by: Ryan Sweet <rysweet@microsoft.com> 2024-10-17 21:33:33 -07:00
			`if __name__ == "__main__" and __package__ is None:`
			`main(sys.argv)`