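"""Custom tabulation script for AutoGenBench results.

For each task instance directory, the expected answer read from
expected_answer.txt is compared to the "FINAL ANSWER:" line captured in
console_log.txt, after normalizing case, whitespace, and trailing punctuation.
"""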
import os
import sys
import json
import re
from autogenbench.tabulate_cmd import default_tabulate


def normalize_answer(a):
    # Lower case
    # Trim (left and right)
    # Replace multiple spaces with one space
    # Remove trailing punctuation
    return re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", a.strip().lower()))
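
# For illustration, the normalization above maps:
#   normalize_answer("  The  QUICK  Brown fox!!  ") -> "the quick brown fox"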


def scorer(instance_dir):
    # Read the expected answer
    expected_answer_file = os.path.join(instance_dir, "expected_answer.txt")
    if not os.path.isfile(expected_answer_file):
        return None

    expected_answer = None
    with open(expected_answer_file, "rt") as fh:
        expected_answer = fh.read().strip()

    # Read the console
    console_log_file = os.path.join(instance_dir, "console_log.txt")
    if not os.path.isfile(console_log_file):
        return None

    console_log = ""
    with open(console_log_file, "rt") as fh:
        console_log = fh.read()

    final_answer = ""
    m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL)
    if m:
        final_answer = m.group(1).strip()

    # Return true if they are equal after normalization
    return normalize_answer(expected_answer) == normalize_answer(final_answer)


def main(args):
    default_tabulate(args, scorer=scorer)


if __name__ == "__main__" and __package__ is None:
    main(sys.argv)
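

# Usage sketch: `default_tabulate` presumably invokes `scorer` on each task instance
# directory and tabulates the returned values (True = correct, False = incorrect,
# None = missing files). This script is normally driven by the `autogenbench tabulate`
# command rather than run directly; a hypothetical standalone call:
#
#     result = scorer("Results/some_scenario/0")  # hypothetical instance directory
#     print({True: "pass", False: "fail", None: "missing"}[result])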