autogen/test/openai/test_completion.py

398 lines
12 KiB
Python
Raw Normal View History

import datasets
import sys
import numpy as np
import pytest
from functools import partial
import os
from flaml import oai
from flaml.autogen.code_utils import (
eval_function_completions,
generate_assertions,
implement,
generate_code,
extract_code,
improve_function,
improve_code,
execute_code,
)
from flaml.autogen.math_utils import eval_math_responses, solve_problem
def test_multi_model():
    """Send the prompt "Hi" through ``oai.Completion.create`` with a four-entry ``config_list``.

    The list holds, in order: ``gpt-4`` on the OpenAI endpoint, ``gpt-4`` on
    the Azure endpoint, ``gpt-3.5-turbo`` on the OpenAI endpoint and
    ``gpt-3.5-turbo`` on the Azure endpoint. Keys and the Azure base URL are
    read from environment variables. The response is printed.

    If ``openai`` cannot be imported, the error is printed and the test
    returns without doing anything else.
    """
    try:
        import openai  # noqa: F401  (availability probe only)
    except ImportError as exc:
        print(exc)
        return

    def endpoints_for(model_name):
        # One OpenAI entry followed by one Azure entry for the same model.
        return [
            {
                "model": model_name,
                "api_key": os.environ.get("OPENAI_API_KEY"),
                "api_type": "open_ai",
                "api_base": "https://api.openai.com/v1",
                "api_version": None,
            },
            {
                "model": model_name,
                "api_key": os.environ.get("AZURE_OPENAI_API_KEY"),
                "api_type": "azure",
                "api_base": os.environ.get("AZURE_OPENAI_API_BASE"),
                "api_version": "2023-03-15-preview",
            },
        ]

    candidates = endpoints_for("gpt-4") + endpoints_for("gpt-3.5-turbo")
    reply = oai.Completion.create(config_list=candidates, prompt="Hi")
    print(reply)
@pytest.mark.skipif(
    sys.platform in ("darwin", "win32"),
    reason="do not run on MacOS or windows",
)
def test_execute_code():
    """Exercise ``execute_code`` with inline code, file-based code, failures and timeouts.

    Skipped on macOS and Windows. If ``docker`` cannot be imported, the error
    is printed and the test returns early.
    """
    try:
        import docker  # noqa: F401  (availability probe only)
    except ImportError as exc:
        print(exc)
        return

    # Inline snippet saved under an explicit filename: must succeed and echo its output.
    status, output = execute_code("print('hello world')", filename="tmp/codetest.py")
    assert status == 0, output
    assert output == b"hello world\n", output

    # read a file
    read_snippet = "with open('tmp/codetest.py', 'r') as f: a=f.read()"
    print(execute_code(read_snippet))

    # create a file
    write_snippet = "with open('tmp/codetest.py', 'w') as f: f.write('b=1')"
    print(execute_code(write_snippet, work_dir="test/openai/my_tmp"))

    # execute code in a file
    print(execute_code(filename="tmp/codetest.py"))

    # execute code for assertion error
    status, output = execute_code("assert 1==2")
    assert status, output

    # execute code which takes a long time: once with the default settings,
    # once with use_docker=False
    for extra_kwargs in ({}, {"use_docker": False}):
        status, failure = execute_code("import time; time.sleep(2)", timeout=1, **extra_kwargs)
        assert status
        assert failure == "Timeout"
def test_improve():
    """Run ``improve_function`` and ``improve_code`` and save their textual output.

    Steps:
      1. Ask ``improve_function`` for an improved ``solve_problem`` from
         ``flaml/autogen/math_utils.py`` and write it to
         ``test/openai/math_utils.py.improved``.
      2. Ask ``improve_code`` for a suggestion over two source files and print it.
      3. Call ``improve_code`` again with ``suggest_only=False``, print the
         reported cost and write the result to
         ``test/openai/suggested_improvement.txt``.

    If ``openai`` or ``diskcache`` cannot be imported, the error is printed
    and the test returns early.
    """
    try:
        import openai  # noqa: F401  (availability probe only)
        import diskcache  # noqa: F401  (availability probe only)
    except ImportError as exc:
        print(exc)
        return

    source_files = ["flaml/autogen/code_utils.py", "flaml/autogen/math_utils.py"]
    objective = "leverage generative AI smartly and cost-effectively"

    improved, _ = improve_function(
        "flaml/autogen/math_utils.py",
        "solve_problem",
        "Solve math problems accurately, by avoiding calculation errors and reduce reasoning errors.",
    )
    # The text comes from a language model and may contain non-ASCII characters;
    # write it as UTF-8 explicitly so the platform default encoding cannot make
    # the write fail.
    with open("test/openai/math_utils.py.improved", "w", encoding="utf-8") as f:
        f.write(improved)

    suggestion, _ = improve_code(source_files, objective)
    print(suggestion)

    improvement, cost = improve_code(source_files, objective, suggest_only=False)
    print(cost)
    with open("test/openai/suggested_improvement.txt", "w", encoding="utf-8") as f:
        f.write(improvement)
def test_nocontext():
    """Call the completion and code helpers without a tuning context.

    Covers, in order: a one-token uncached ``oai.Completion.create`` request,
    ``generate_code`` with a system and a user message, ``extract_code`` on two
    markdown snippets, and ``solve_problem`` on ``"1+1="``. Every result is
    printed; nothing is asserted.

    If ``openai`` or ``diskcache`` cannot be imported, the error is printed
    and the test returns early.
    """
    try:
        import openai  # noqa: F401  (availability probe only)
        import diskcache  # noqa: F401  (availability probe only)
    except ImportError as exc:
        print(exc)
        return

    completion = oai.Completion.create(
        model="text-ada-001",
        prompt="1+1=",
        max_tokens=1,
        use_cache=False,
        request_timeout=10,
    )
    print(completion)

    chat_messages = [
        {
            "role": "system",
            "content": "You want to become a better assistant by learning new skills and improving your existing ones.",
        },
        {
            "role": "user",
            "content": "Write reusable code to use web scraping to get information from websites.",
        },
    ]
    generated, _ = generate_code(model="gpt-3.5-turbo", messages=chat_messages)
    print(generated)

    # test extract_code from markdown
    # Snippet with a single fence that carries no language tag.
    untagged_fence_markdown = """
Example:
```
print("hello extract code")
```
"""
    extracted, _ = extract_code(untagged_fence_markdown)
    print(extracted)

    # Snippet with two python-tagged fences; the second fence is never closed
    # in this input (NOTE(review): presumably deliberate - confirm).
    two_fence_markdown = """
Example:
```python
def scrape(url):
import requests
from bs4 import BeautifulSoup
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
title = soup.find("title").text
text = soup.find("div", {"id": "bodyContent"}).text
return title, text
```
Test:
```python
url = "https://en.wikipedia.org/wiki/Web_scraping"
title, text = scrape(url)
print(f"Title: {title}")
print(f"Text: {text}")
"""
    extracted, _ = extract_code(two_fence_markdown)
    print(extracted)

    answer, spend = solve_problem("1+1=")
    print(answer, spend)
@pytest.mark.skipif(
    sys.platform == "win32",
    reason="do not run on windows",
)
def test_humaneval(num_samples=1):
    """Tune and evaluate completion configs on the ``openai_humaneval`` dataset.

    The shuffled ``test`` split is divided into the first 20 rows (tuning data)
    and the remaining rows (held-out data). Three minimal tuning runs and one
    more comprehensive run are performed, each followed by a ``create`` call on
    the first held-out example. ``implement`` and ``eval_function_completions``
    are then exercised with the tuned configs. Results are printed; nothing is
    asserted.

    Skipped on Windows. If ``openai`` or ``diskcache`` cannot be imported, the
    error is printed and the test returns before any dataset is loaded.

    Args:
        num_samples: Forwarded as ``num_samples`` to the comprehensive
            ``oai.Completion.tune`` call, and used as the slice length for the
            held-out examples given to ``_eval`` and ``test``.
    """
    # Probe the optional dependencies first, as the other tests in this file
    # do, so a missing package does not cost a dataset download.
    try:
        import openai  # noqa: F401  (availability probe only)
        import diskcache  # noqa: F401  (availability probe only)
    except ImportError as exc:
        print(exc)
        return

    eval_with_generated_assertions = partial(eval_function_completions, assertions=generate_assertions)
    seed = 41
    data = datasets.load_dataset("openai_humaneval")["test"].shuffle(seed=seed)
    n_tune_data = 20

    def to_example(row):
        # Rename the dataset's "prompt" column to the "definition" key used by
        # the prompt templates below; fetch each row only once.
        return {
            "definition": row["prompt"],
            "test": row["test"],
            "entry_point": row["entry_point"],
        }

    tune_data = [to_example(data[x]) for x in range(n_tune_data)]
    test_data = [to_example(data[x]) for x in range(n_tune_data, len(data))]
    oai.Completion.set_cache(seed)

    # a minimal tuning example
    config, _ = oai.Completion.tune(
        data=tune_data,
        metric="success",
        mode="max",
        eval_func=eval_function_completions,
        n=1,
        prompt="{definition}",
    )
    responses = oai.Completion.create(context=test_data[0], **config)

    # a minimal tuning example for tuning chat completion models using the Completion class
    config, _ = oai.Completion.tune(
        data=tune_data,
        metric="succeed_assertions",
        mode="max",
        eval_func=eval_with_generated_assertions,
        n=1,
        model="gpt-3.5-turbo",
        prompt="{definition}",
    )
    responses = oai.Completion.create(context=test_data[0], **config)

    # a minimal tuning example for tuning chat completion models using the ChatCompletion class
    config, _ = oai.ChatCompletion.tune(
        data=tune_data,
        metric="expected_success",
        mode="max",
        eval_func=eval_function_completions,
        n=1,
        messages=[{"role": "user", "content": "{definition}"}],
    )
    responses = oai.ChatCompletion.create(context=test_data[0], **config)
    print(responses)
    code, cost, _ = implement(tune_data[1], [config])
    print(code)
    print(cost)
    print(eval_function_completions([code], **tune_data[1]))

    # a more comprehensive tuning example
    config2, analysis = oai.Completion.tune(
        data=tune_data,
        metric="success",
        mode="max",
        eval_func=eval_with_generated_assertions,
        log_file_name="logs/humaneval.log",
        inference_budget=0.002,
        optimization_budget=2,
        num_samples=num_samples,
        # logging_level=logging.INFO,
        prompt=[
            "{definition}",
            "# Python 3{definition}",
            "Complete the following Python function:{definition}",
        ],
        stop=[["\nclass", "\ndef", "\nif", "\nprint"], None],  # the stop sequences
    )
    print(config2)
    print(analysis.best_result)
    print(test_data[0])
    responses = oai.Completion.create(context=test_data[0], **config2)
    print(responses)

    # Evaluate the best config on held-out data, first through the private
    # _eval hook (no pruning), then through the public test() entry point.
    oai.Completion.data = test_data[:num_samples]
    result = oai.Completion._eval(analysis.best_config, prune=False, eval_only=True)
    print("result without pruning", result)
    result = oai.Completion.test(test_data[:num_samples], config=config2)
    print(result)

    code, cost, selected = implement(tune_data[1], [config2, config])
    print(selected)
    print(eval_function_completions([code], **tune_data[1]))
def test_math(num_samples=-1):
    """Evaluate and tune chat-completion configs on ``competition_math`` Level 1 problems.

    Level 1 rows of the shuffled train split (first 20) are used for tuning;
    the first three Level 1 rows of the shuffled test split are used for
    ``oai.ChatCompletion.test``. That method is called with the default
    aggregation, the ``"median"`` string, a callable, and a per-metric dict of
    callables. A tuning run follows, and the tuned config is tested on the same
    three-problem sample.

    If ``openai`` or ``diskcache`` cannot be imported, the error is printed
    and the test returns early.

    Args:
        num_samples: Forwarded as ``num_samples`` to ``oai.ChatCompletion.tune``.
    """
    try:
        import openai  # noqa: F401  (availability probe only)
        import diskcache  # noqa: F401  (availability probe only)
    except ImportError as exc:
        print(exc)
        return

    seed = 41
    data = datasets.load_dataset("competition_math")
    train_split = data["train"].shuffle(seed=seed)
    test_split = data["test"].shuffle(seed=seed)
    n_tune_data = 20

    def level_one_problems(split):
        # Keep only the problem/solution pair of every "Level 1" row, in order.
        picked = []
        for idx in range(len(split)):
            row = split[idx]
            if row["level"] == "Level 1":
                picked.append({"problem": row["problem"], "solution": row["solution"]})
        return picked

    tune_data = level_one_problems(train_split)[:n_tune_data]
    test_data = level_one_problems(test_split)
    print(
        "max tokens in tuning data's canonical solutions",
        max(len(item["solution"].split()) for item in tune_data),
    )
    print(len(tune_data), len(test_data))

    # prompt template
    template = "%s Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\boxed{}."
    prompts = [lambda data: template % data["problem"]]

    oai.ChatCompletion.set_cache(seed)
    vanilla_config = {
        "model": "gpt-3.5-turbo",
        "temperature": 1,
        "max_tokens": 2048,
        "n": 1,
        "prompt": prompts[0],
        "stop": "###",
    }
    test_data_sample = test_data[0:3]

    def my_median(results):
        return np.median(results)

    def my_average(results):
        return np.mean(results)

    # Same evaluation four times; only the aggregation setting differs:
    # default, built-in name, single callable, per-metric callables.
    aggregation_variants = [
        {},
        {"agg_method": "median"},
        {"agg_method": my_median},
        {
            "agg_method": {
                "expected_success": my_median,
                "success": my_average,
                "success_vote": my_average,
                "votes": np.mean,
            }
        },
    ]
    for variant in aggregation_variants:
        result = oai.ChatCompletion.test(
            test_data_sample,
            vanilla_config,
            eval_math_responses,
            **variant,
        )
    print(result)

    config, _ = oai.ChatCompletion.tune(
        data=tune_data,  # the data for tuning
        metric="expected_success",  # the metric to optimize
        mode="max",  # the optimization mode
        eval_func=eval_math_responses,  # the evaluation function to return the success metrics
        # log_file_name="logs/math.log",  # the log file name
        inference_budget=0.002,  # the inference budget (dollar)
        optimization_budget=0.01,  # the optimization budget (dollar)
        num_samples=num_samples,
        prompt=prompts,  # the prompt templates to choose from
        stop="###",  # the stop sequence
    )
    print("tuned config", config)
    result = oai.ChatCompletion.test(test_data_sample, config)
    print("result from tuned config:", result)
    print("empty responses", eval_math_responses([], None))
if __name__ == "__main__":
    # Manual entry point: load credentials from local text files, export them
    # through the environment variables the tests read, then run the selected test.
    import openai

    def _read_stripped(path):
        """Return the contents of ``path`` without surrounding whitespace.

        Uses a context manager so the file handle is closed immediately
        rather than being left for the garbage collector.
        """
        with open(path) as handle:
            return handle.read().strip()

    openai.api_key = os.environ["OPENAI_API_KEY"] = _read_stripped("test/openai/key.txt")
    os.environ["AZURE_OPENAI_API_KEY"] = _read_stripped("test/openai/key_azure.txt")
    os.environ["AZURE_OPENAI_API_BASE"] = _read_stripped("test/openai/base_azure.txt")
    # test_multi_model()
    # test_execute_code()
    test_improve()
    # test_nocontext()
    # test_humaneval(1)
    # test_math(1)