afourney cd199c7ab7
Introduces AutoGenBench (#1048)
* Initial commit of AutoGenBench

* wording

* typo

* pre-commit reformulation

* Updated README to point to contributor's guide earlier.

* Simplified the description of the JSON format.

* Added print statements to indicate when run.sh and scenario.py are starting.

* Added SocietyOfMind scenario to GAIA.

* Pointing autogenbench clone command to the latest branch.

* Temporarily disable subsample option.

* Updated the GAIA readme to specify how to define a BING API key.

* Fixed and re-enabled the subsample option.

* Added a draft of a blog post.

* Updated authors.

* Incorporating Gagan's feedback.

* Fixed code formatting.

* Updated the help string in the docs.

* Light editing of the AutoGenBench blogpost.

* Support filtering on model tags.

* Added websurfer dependencies to Dockerfile.

* Renamed testbed -> autogenbench

* Attempting to fix formatting.

* Added more graceful handling of task timeouts (the script is allowed to terminate before Docker is stopped).

* Updated the blogpost based on Saleema's and Julia's feedback.

* Fixed formatting... again.

* Added a main MANIFEST to list available scenarios.

* Limit main manifest to directories.

* Manifests now use relative paths.

* All manifests are now relative.

* Updated the contributing guide, and addressed Windows path issues.

* Updated the version. Fixed formatting.

* Fixed formatting.

* De-listing Examples, since it has no clear tabulation criteria.

* Updated email in pyproject

* typo in blogpost

* wording

---------

Co-authored-by: Qingyun Wu <qingyun.wu@psu.edu>
Co-authored-by: Qingyun Wu <qingyun0327@gmail.com>
2024-01-26 00:46:58 +00:00

#
# Run this file to download the HumanEval dataset and create the corresponding
# AutoGenBench task files (by default, ../Tasks/human_eval_two_agents.jsonl and
# ../Tasks/r_human_eval_two_agents.jsonl, relative to this script).
#
import gzip
import io
import json
import os

import requests

URL = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"
SCRIPT_PATH = os.path.realpath(__file__)
SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
SCRIPT_DIR = os.path.dirname(SCRIPT_PATH)
# A selected subset of HumanEval problems to work with during development
REDUCED_SET = [
    "HumanEval/2",
    "HumanEval/26",
    "HumanEval/32",
    "HumanEval/33",
    "HumanEval/36",
    "HumanEval/38",
    "HumanEval/41",
    "HumanEval/50",
    "HumanEval/56",
    "HumanEval/65",
    "HumanEval/67",
    "HumanEval/84",
    "HumanEval/85",
    "HumanEval/86",
    "HumanEval/89",
    "HumanEval/99",
    "HumanEval/104",
    "HumanEval/113",
    "HumanEval/115",
    "HumanEval/120",
    "HumanEval/124",
    "HumanEval/126",
    "HumanEval/132",
    "HumanEval/135",
    "HumanEval/140",
    "HumanEval/146",
]

def download_human_eval():
    """Download the HumanEval dataset, un-gzip it, and return a list of its parsed JSON records."""

    # Send an HTTP request to the URL of the file
    response = requests.get(URL)

    # Ensure we raise an error if the download failed
    response.raise_for_status()

    # Create a BytesIO object from the response content
    buffer = io.BytesIO(response.content)

    # Read the file line by line, parsing each line as a JSON record
    results = []
    with gzip.GzipFile(fileobj=buffer) as f_in:
        for line in f_in:
            results.append(json.loads(line))

    return results
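
# Each parsed record is a plain dict carrying the standard HumanEval fields
# (a brief sketch of the schema this script relies on):
#
#   task_id             e.g., "HumanEval/2"
#   prompt              the function signature and docstring to be completed
#   entry_point         the name of the function under test
#   canonical_solution  a reference implementation (unused here)
#   test                a check() function that exercises the entry point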

def create_jsonl(name, tasks, template):
    """Creates a JSONL scenario file with a given name, list of HumanEval tasks, and template path."""

    # Create a task directory if it doesn't exist
    scenario_dir = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir))
    task_dir = os.path.join(scenario_dir, "Tasks")
    if not os.path.isdir(task_dir):
        os.mkdir(task_dir)

    # Create the jsonl file, one task record per line
    with open(os.path.join(task_dir, name + ".jsonl"), "wt") as fh:
        for task in tasks:
            print(f"Converting: [{name}] {task['task_id']}")

            record = {
                "id": task["task_id"].replace("/", "_"),
                "template": os.path.join(os.path.pardir, template),
                "substitutions": {
                    "scenario.py": {
                        "__ENTRY_POINT__": task["entry_point"],
                        "__SELECTION_METHOD__": "auto",
                    },
                    "prompt.txt": {"__PROMPT__": task["prompt"]},
                    "coding/my_tests.py": {"__TEST__": task["test"]},
                },
            }

            fh.write(json.dumps(record).strip() + "\n")
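
# For reference, each line written above is a standalone JSON record roughly like
# the following (an illustrative sketch, not verbatim output):
#
#   {"id": "HumanEval_2",
#    "template": "../Templates/TwoAgents",
#    "substitutions": {
#        "scenario.py": {"__ENTRY_POINT__": "truncate_number", "__SELECTION_METHOD__": "auto"},
#        "prompt.txt": {"__PROMPT__": "..."},
#        "coding/my_tests.py": {"__TEST__": "..."}}}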
###############################################################################

def main():
    human_eval = download_human_eval()
    reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET]

    templates = {
        "two_agents": "Templates/TwoAgents",
        # "gc3_distractor": "Templates/GroupChatThreeAgents_Distractor",
        # "gc3_guardrails": "Templates/GroupChatThreeAgents_Guardrails",
        # "gc4": "Templates/GroupChatFourAgents",
    }

    # For each template, create two scenario files: the full HumanEval set,
    # and the reduced development set
    for name, template in templates.items():
        create_jsonl(f"human_eval_{name}", human_eval, template)
        create_jsonl(f"r_human_eval_{name}", reduced_human_eval, template)


if __name__ == "__main__" and __package__ is None:
    main()
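
# A typical invocation is to run this script from the scenario's root folder
# (a sketch; the exact directory layout is an assumption):
#
#   python Scripts/init_tasks.py
#
# after which the generated .jsonl task files appear in the scenario's Tasks/ directory.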