autogen/python/packages/agbench/benchmarks/AssistantBench/Scripts/init_tasks.py

import json
import os
import re
import sys

from huggingface_hub import snapshot_download

# Resolved location of this script, split into its directory and file name.
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_DIR, SCRIPT_NAME = os.path.split(SCRIPT_PATH)

# The scenario root is the parent of the directory that holds this script.
SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))

# Download area for the dataset snapshot.
DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads")
REPO_DIR = os.path.join(DOWNLOADS_DIR, "AssistantBench")

# Input templates and generated task files, both directly under the scenario root.
TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")
TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")


def download_assistantbench():
    """Download the AssistantBench dataset from the Hugging Face Hub.

    Ensures ``DOWNLOADS_DIR`` exists, then fetches a snapshot of the
    ``AssistantBench/AssistantBench`` dataset repository into ``REPO_DIR``.
    """
    # makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it cannot
    # race with a concurrent creator and also creates missing parents.
    os.makedirs(DOWNLOADS_DIR, exist_ok=True)

    snapshot_download(
        repo_id="AssistantBench/AssistantBench",
        repo_type="dataset",
        local_dir=REPO_DIR,
        # NOTE(review): newer huggingface_hub releases appear to deprecate
        # this flag -- confirm against the pinned version before removing it.
        local_dir_use_symlinks=True,
    )


def create_jsonl(data_file_path, file_name, template):
    """Create a JSONL scenario file in ``TASKS_DIR`` from a JSONL data file.

    Args:
        data_file_path: Path to the source JSONL file (one JSON task object
            per line). Blank lines are ignored.
        file_name: Name of the output file. Only its base name is used; the
            file is always written inside ``TASKS_DIR``.
        template: Template path stored, as a single-element list, in the
            ``"template"`` field of every output record.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a task lacks one of the required fields (``id``,
            ``task``, ``difficulty``, ``explanation``, ``metadata``,
            ``gold_url``, ``set``).
    """
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding; skip blank lines, which json.loads would reject.
    with open(data_file_path, encoding="utf-8") as fh:
        tasks = [json.loads(line) for line in fh if line.strip()]

    file_name = os.path.basename(file_name)
    os.makedirs(TASKS_DIR, exist_ok=True)

    with open(os.path.join(TASKS_DIR, file_name), "wt", encoding="utf-8") as fh:
        for task in tasks:
            # A missing or null answer is written as an empty string. Use a
            # local instead of mutating the loaded task.
            answer = task.get("answer")
            if answer is None:
                answer = ""

            print(f"Converting: [{file_name}] {task['id']}")
            record = {
                "id": task["id"],
                "template": [template],
                "substitutions": {
                    "scenario.py": {
                        "__FILE_NAME__": "",
                    },
                    "expected_answer.txt": {"__EXPECTED_ANSWER__": answer},
                    "prompt.txt": {"__PROMPT__": task["task"]},
                },
                "difficulty": task["difficulty"],
                "explanation": task["explanation"],
                "metadata": task["metadata"],
                "gold_url": task["gold_url"],
                "set": task["set"],
            }
            fh.write(json.dumps(record) + "\n")


###############################################################################
def main():
    """Generate per-template task files from the AssistantBench data.

    Downloads the dataset if either the dev or test JSONL file is missing
    from ``REPO_DIR``, exits with an error if they are still missing, and
    then writes one dev and one test task file for every sub-directory of
    ``TEMPLATES_DIR``.
    """
    dev_path = os.path.join(REPO_DIR, "assistant_bench_v1.0_dev.jsonl")
    test_path = os.path.join(REPO_DIR, "assistant_bench_v1.0_test.jsonl")

    def data_files_present():
        # True only when both the dev and the test file exist.
        return os.path.isfile(dev_path) and os.path.isfile(test_path)

    if not data_files_present():
        download_assistantbench()

    # Still missing after the download attempt: give up.
    if not data_files_present():
        sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the AssistantBench repository.")

    # Map each template directory's name (whitespace removed) to its path.
    templates = {
        re.sub(r"\s", "", entry.name): entry.path
        for entry in os.scandir(TEMPLATES_DIR)
        if entry.is_dir()
    }
    print(templates)

    # Emit a dev and a test task file into the Tasks directory per template.
    for template_name, template_path in templates.items():
        create_jsonl(dev_path, f"assistant_bench_v1.0_dev__{template_name}.jsonl", template_path)
        create_jsonl(test_path, f"assistant_bench_v1.0_test__{template_name}.jsonl", template_path)


# Run main() only when this file is executed directly as a script; the extra
# `__package__ is None` condition also skips it whenever a package context
# has been set for the module.
if __name__ == "__main__" and __package__ is None:
    main()
Adding Benchmarks to agbench (#3803) * Move from tomllib to tomli * added example code for magentic-one + code comments * adding benchmarks temporarily * add license for datasets * revert changes to magentic-one * change license location --------- Co-authored-by: Ryan Sweet <rysweet@microsoft.com> 2024-10-17 21:33:33 -07:00			`import json`
			`import os`
			`import re`
			`import sys`

			`from huggingface_hub import snapshot_download`

			`SCRIPT_PATH = os.path.realpath(__file__)`
			`SCRIPT_NAME = os.path.basename(SCRIPT_PATH)`
			`SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)`

			`SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))`
			`TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")`
			`TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")`
			`DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads")`
			`REPO_DIR = os.path.join(DOWNLOADS_DIR, "AssistantBench")`


			`def download_assistantbench():`
			`"""Download the AssistantBench benchmark from Hugging Face."""`

			`if not os.path.isdir(DOWNLOADS_DIR):`
			`os.mkdir(DOWNLOADS_DIR)`

			`"""Download the AssistantBench dataset from Hugging Face Hub"""`
			`snapshot_download(`
			`repo_id="AssistantBench/AssistantBench",`
			`repo_type="dataset",`
			`local_dir=REPO_DIR,`
			`local_dir_use_symlinks=True,`
			`)`


			`def create_jsonl(data_file_path, file_name, template):`
			`"""Creates a JSONL scenario file with a given name, and template path."""`
			`tasks = []`
			`with open(data_file_path) as fh:`
			`for line in fh:`
			`data = json.loads(line)`
			`tasks.append(data)`
			`file_name = os.path.basename(file_name)`
			`if not os.path.isdir(TASKS_DIR):`
			`os.mkdir(TASKS_DIR)`

			`with open(os.path.join(TASKS_DIR, file_name), "wt") as fh:`
			`for task in tasks:`
			`if "answer" not in task or task["answer"] is None:`
			`task["answer"] = ""`
			`print(f"Converting: [{file_name}] {task['id']}")`
			`template_cp_list = [template]`
			`record = {`
			`"id": task["id"],`
			`"template": template_cp_list,`
			`"substitutions": {`
			`"scenario.py": {`
			`"__FILE_NAME__": "",`
			`},`
			`"expected_answer.txt": {"__EXPECTED_ANSWER__": task["answer"]},`
			`"prompt.txt": {"__PROMPT__": task["task"]},`
			`},`
			`"difficulty": task["difficulty"],`
			`"explanation": task["explanation"],`
			`"metadata": task["metadata"],`
			`"gold_url": task["gold_url"],`
			`"set": task["set"],`
			`}`
			`fh.write(json.dumps(record).strip() + "\n")`


			`###############################################################################`
			`def main():`
			`ab_validation_files = os.path.join(REPO_DIR, "assistant_bench_v1.0_dev.jsonl")`
			`ab_test_files = os.path.join(REPO_DIR, "assistant_bench_v1.0_test.jsonl")`

			`if not os.path.isfile(ab_validation_files) or not os.path.isfile(ab_test_files):`
			`download_assistantbench()`

			`if not os.path.isfile(ab_validation_files) or not os.path.isfile(ab_test_files):`
			`sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the AssistantBench repository.")`

			`templates = {}`
			`for entry in os.scandir(TEMPLATES_DIR):`
			`if entry.is_dir():`
			`templates[re.sub(r"\s", "", entry.name)] = entry.path`
			`print(templates)`
			`# make a copy of the data in the Tasks directory`
			`for t in templates.items():`
			`create_jsonl(ab_validation_files, f"assistant_bench_v1.0_dev__{t[0]}.jsonl", t[1])`
			`create_jsonl(ab_test_files, f"assistant_bench_v1.0_test__{t[0]}.jsonl", t[1])`


			`if __name__ == "__main__" and __package__ is None:`
			`main()`