Mirror of https://github.com/microsoft/autogen.git, synced 2025-08-08 00:32:03 +00:00

* Initial commit of the autogen testbed environment.
* Fixed some typos in the Testbed README.md.
* Added some stricter termination logic to the two_agent scenario, and switched the logo task from finding Autogen's logo to finding Microsoft's (it's easier).
* Added documentation to testbed code in preparation for PR.
* Added a variation of HumanEval to the Testbed. It is also a reasonable example of how to integrate other benchmarks.
* Removed ChatCompletion.start_logging and related features. Added an explicit TERMINATE output to HumanEval to save 1 turn in each conversation.
* Added a metrics utils script for HumanEval.
* Updated the requirements in the README.
* Added documentation for the HumanEval csv schemas.
* Standardized on how the OAI_CONFIG_LIST is handled.
* Removed dot-slash from 'includes' path for cross-platform compatibility.
* Missed a file.
* Updated readme to include known-working versions.
68 lines · 2.3 KiB · Python
#
# Run this file to download the human_eval dataset, and create a corresponding testbed scenario:
# (default: ../scenarios/human_eval_two_agents_gpt4.jsonl and ../scenarios/human_eval_two_agents_gpt35.jsonl)
#

import requests
import gzip
import io
import json
import os
import base64


script_path = os.path.realpath(__file__)
script_name = os.path.basename(script_path)
script_dir = os.path.dirname(script_path)

# Directory where scenarios are stored
scenarios_dir = os.path.realpath(os.path.join(script_dir, os.path.pardir, "scenarios"))
print("Saving HumanEval scenarios to: " + scenarios_dir)

# URL of the file to download
url = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"

# Send an HTTP request to the URL of the file
response = requests.get(url)

# Ensure we raise an error if the download failed
response.raise_for_status()

# Create a BytesIO object from the response content
buffer = io.BytesIO(response.content)

# Create a scenario file for each model
fh_gpt4 = open(os.path.join(scenarios_dir, "human_eval_two_agents_gpt4.jsonl"), "wt")
fh_gpt35 = open(os.path.join(scenarios_dir, "human_eval_two_agents_gpt35.jsonl"), "wt")

# Open the buffer as a .gz file and read it line by line
with gzip.GzipFile(fileobj=buffer) as f_in:
    for line in f_in:
        # Parse each line as JSON
        data = json.loads(line)
        print("Converting: " + data["task_id"])

        # Write the GPT-4 scenario.
        # Prompts and tests are saved in base 64 to greatly simplify escaping them as they
        # move through the various formats and scripts. I welcome a better, more readable,
        # alternative. (A decoding sketch follows the script below.)
        record = {
            "id": data["task_id"].replace("/", "_"),
            "template": "human_eval_two_agents.py",
            "values": {
                "__MODEL__": "gpt-4",
                "__PROMPT_BASE64__": base64.b64encode(data["prompt"].encode("utf-8")).decode("utf-8"),
                "__ENTRY_POINT__": data["entry_point"],
                "__TEST_BASE64__": base64.b64encode(data["test"].encode("utf-8")).decode("utf-8"),
            },
        }
        fh_gpt4.write(json.dumps(record).strip() + "\n")

        # Write the GPT-3.5 version of the same scenario, changing only the model
        record["values"]["__MODEL__"] = "gpt-3.5-turbo-16k"
        fh_gpt35.write(json.dumps(record).strip() + "\n")

fh_gpt4.close()
fh_gpt35.close()
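
Each line the script writes is a self-contained JSON record naming a template file and the placeholder values to fill in. An illustrative record (my reconstruction, with the base64 payloads elided as angle-bracket placeholders) for the first HumanEval task would look like:

{"id": "HumanEval_0", "template": "human_eval_two_agents.py", "values": {"__MODEL__": "gpt-4", "__PROMPT_BASE64__": "<base64-encoded prompt>", "__ENTRY_POINT__": "has_close_elements", "__TEST_BASE64__": "<base64-encoded tests>"}}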
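Because the prompts and tests travel as base64, they survive JSON encoding and shell quoting untouched, and recovering the original text is symmetric to the encoding above. The following is a minimal sketch (not part of the repository) that reads the first generated record back and decodes it; the relative path assumes it runs from the same directory as the script above.

import base64
import json
import os

# Path mirrors the default output location used by the script above
scenario_file = os.path.join("..", "scenarios", "human_eval_two_agents_gpt4.jsonl")
with open(scenario_file, "rt") as fh:
    record = json.loads(fh.readline())

# Decode the base64 payloads back to plain text
prompt = base64.b64decode(record["values"]["__PROMPT_BASE64__"]).decode("utf-8")
test = base64.b64decode(record["values"]["__TEST_BASE64__"]).decode("utf-8")

print("Task: " + record["id"])
print("Entry point: " + record["values"]["__ENTRY_POINT__"])
print(prompt)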
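The record format implies simple textual substitution: each key in "values" is a literal placeholder string appearing in the named template file (human_eval_two_agents.py). A possible expansion step, assuming plain string replacement (the testbed's actual runner may work differently), could look like:

def instantiate_template(template_text, values):
    # Replace each literal placeholder (e.g. "__MODEL__") with its value
    for placeholder, value in values.items():
        template_text = template_text.replace(placeholder, value)
    return template_text

# Usage sketch: read the template source, substitute one record's values,
# and the result is a runnable per-task scenario script.
# instantiate_template(open("human_eval_two_agents.py").read(), record["values"])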