94 lines
3.3 KiB
Python
Raw Normal View History

import json
import os
import re
import sys
from huggingface_hub import snapshot_download
# Resolved location of this script; every other path below is derived from it.
SCRIPT_PATH = os.path.realpath(__file__)
# os.path.split yields the (directory, file name) pair in a single call.
SCRIPT_DIR, SCRIPT_NAME = os.path.split(SCRIPT_PATH)

# The scenario root is the parent of the directory that holds this script.
SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.pardir))

# Working directories located directly under the scenario root.
TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates")
TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks")
DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads")

# Local directory that receives the downloaded AssistantBench dataset.
REPO_DIR = os.path.join(DOWNLOADS_DIR, "AssistantBench")
def download_assistantbench():
    """Download the AssistantBench dataset from the Hugging Face Hub.

    Ensures ``DOWNLOADS_DIR`` exists, then fetches a snapshot of the
    ``AssistantBench/AssistantBench`` dataset repository into ``REPO_DIR``.
    """
    # makedirs(exist_ok=True) replaces the check-then-mkdir pair: it also
    # creates missing parent directories and does not fail if the directory
    # appears between the check and the creation.
    os.makedirs(DOWNLOADS_DIR, exist_ok=True)

    snapshot_download(
        repo_id="AssistantBench/AssistantBench",
        repo_type="dataset",
        local_dir=REPO_DIR,
        # NOTE(review): newer huggingface_hub releases appear to deprecate
        # `local_dir_use_symlinks`; kept so behavior on older versions is
        # unchanged -- confirm against the pinned library version.
        local_dir_use_symlinks=True,
    )
def create_jsonl(data_file_path, file_name, template):
    """Creates a JSONL scenario file with a given name, and template path.

    Reads every task from ``data_file_path`` (one JSON object per line; blank
    lines are skipped) and writes one scenario record per task to a file
    inside ``TASKS_DIR``.

    Args:
        data_file_path: Path of the source JSONL file to read.
        file_name: Name of the output file. Only its base name is used; the
            file is always created inside ``TASKS_DIR``.
        template: Template path, stored as a single-element list in each
            record's ``"template"`` field.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a task lacks one of the required keys (``id``, ``task``,
            ``difficulty``, ``explanation``, ``metadata``, ``gold_url``,
            ``set``).
    """
    # Read the whole input first so that a malformed source file does not
    # leave a truncated output file behind. Decode explicitly as UTF-8 rather
    # than relying on the platform's default encoding.
    with open(data_file_path, encoding="utf-8") as fh:
        tasks = [json.loads(line) for line in fh if line.strip()]

    file_name = os.path.basename(file_name)
    os.makedirs(TASKS_DIR, exist_ok=True)

    with open(os.path.join(TASKS_DIR, file_name), "wt", encoding="utf-8") as fh:
        for task in tasks:
            # A missing or null answer is replaced by an empty string so the
            # substitution below always has a value.
            if task.get("answer") is None:
                task["answer"] = ""

            print(f"Converting: [{file_name}] {task['id']}")

            record = {
                "id": task["id"],
                "template": [template],
                "substitutions": {
                    "scenario.py": {
                        "__FILE_NAME__": "",
                    },
                    "expected_answer.txt": {"__EXPECTED_ANSWER__": task["answer"]},
                    "prompt.txt": {"__PROMPT__": task["task"]},
                },
                "difficulty": task["difficulty"],
                "explanation": task["explanation"],
                "metadata": task["metadata"],
                "gold_url": task["gold_url"],
                "set": task["set"],
            }

            fh.write(json.dumps(record) + "\n")
###############################################################################
def main():
    """Ensure the dataset is present, then emit task files for every template.

    Downloads AssistantBench when either the dev or test JSONL file is missing
    from ``REPO_DIR`` and exits with an error if they are still missing
    afterwards. Then, for each sub-directory of ``TEMPLATES_DIR``, writes one
    dev and one test JSONL task file via ``create_jsonl``.
    """
    dev_path = os.path.join(REPO_DIR, "assistant_bench_v1.0_dev.jsonl")
    test_path = os.path.join(REPO_DIR, "assistant_bench_v1.0_test.jsonl")

    def dataset_present():
        # Both split files must exist for the local copy to be usable.
        return os.path.isfile(dev_path) and os.path.isfile(test_path)

    if not dataset_present():
        download_assistantbench()

    if not dataset_present():
        sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the AssistantBench repository.")

    # Map each template directory name (with all whitespace removed) to its path.
    templates = {
        re.sub(r"\s", "", entry.name): entry.path
        for entry in os.scandir(TEMPLATES_DIR)
        if entry.is_dir()
    }
    print(templates)

    # Write a dev and a test task file into the Tasks directory per template.
    for template_name, template_path in templates.items():
        create_jsonl(dev_path, f"assistant_bench_v1.0_dev__{template_name}.jsonl", template_path)
        create_jsonl(test_path, f"assistant_bench_v1.0_test__{template_name}.jsonl", template_path)
# Run only when this file is the main module and has no package context.
# NOTE(review): presumably this skips auto-run when launched via
# `python -m ...` -- confirm the intent.
if __package__ is None and __name__ == "__main__":
    main()