#
# Run this file to download the HumanEval dataset and create the corresponding
# testbed scenarios (default: ../scenarios/human_eval_two_agents_gpt4.jsonl
# and ../scenarios/human_eval_two_agents_gpt35.jsonl)
#
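# Each output line is a JSON record of roughly this shape (abbreviated here;
# real task_id values come from the dataset itself):
#
#   {"id": "HumanEval_0", "template": "human_eval_two_agents.py",
#    "values": {"__MODEL__": "gpt-4", "__PROMPT_BASE64__": "...",
#               "__ENTRY_POINT__": "...", "__TEST_BASE64__": "..."}}
#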

import base64
import gzip
import io
import json
import os

import requests

script_path = os.path.realpath(__file__)
script_name = os.path.basename(script_path)
script_dir = os.path.dirname(script_path)

# Directory where scenarios are stored
scenarios_dir = os.path.realpath(os.path.join(script_dir, os.path.pardir, "scenarios"))
print("Saving HumanEval scenarios to: " + scenarios_dir)
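
# Note: the scenarios directory is assumed to exist already; the open() calls
# below will raise FileNotFoundError if it does not.
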
# URL of the file to download
url = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz"

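# At the time of writing, HumanEval contains 164 problems, so each scenario
# file should end up with 164 lines.
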
# Send an HTTP request to the URL of the file
response = requests.get(url)

# Raise an error if the download failed
response.raise_for_status()
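# (raise_for_status() raises requests.HTTPError on any 4xx or 5xx status.)
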
# Create a BytesIO object from the response content, so the archive can be
# decompressed in memory without writing it to disk
buffer = io.BytesIO(response.content)

# Create the scenario files (one per model)
fh_gpt4 = open(os.path.join(scenarios_dir, "human_eval_two_agents_gpt4.jsonl"), "wt")
fh_gpt35 = open(os.path.join(scenarios_dir, "human_eval_two_agents_gpt35.jsonl"), "wt")

# Open the buffer as a .gz file and read it line by line
with gzip.GzipFile(fileobj=buffer) as f_in:
    for line in f_in:
        # Parse each line as JSON
        data = json.loads(line)
        print("Converting: " + data["task_id"])

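        # Each HumanEval record carries (among other fields) task_id, prompt,
        # entry_point, and test; those four are all this script uses.
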
        # Write the GPT-4 scenario.
        # Prompts and tests are saved in base64 to greatly simplify escaping them as they
        # move through the various formats and scripts. I welcome a better, more readable alternative.
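        # (For reference, a consumer can recover the original text with the
        # inverse operation, e.g. base64.b64decode(value).decode("utf-8");
        # that is a sketch of downstream usage, not code this script runs.)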
        record = {
            "id": data["task_id"].replace("/", "_"),
            "template": "human_eval_two_agents.py",
            "values": {
                "__MODEL__": "gpt-4",
                "__PROMPT_BASE64__": base64.b64encode(data["prompt"].encode("utf-8")).decode("utf-8"),
                "__ENTRY_POINT__": data["entry_point"],
                "__TEST_BASE64__": base64.b64encode(data["test"].encode("utf-8")).decode("utf-8"),
            },
        }
        fh_gpt4.write(json.dumps(record).strip() + "\n")

        # Write the GPT-3.5 version of the same record
        record["values"]["__MODEL__"] = "gpt-3.5-turbo-16k"
        fh_gpt35.write(json.dumps(record).strip() + "\n")

fh_gpt4.close()
fh_gpt35.close()