# autogen/test/autogen/oai/test_completion.py

import json
import os
import sys
from functools import partial

import datasets
import numpy as np
import pytest

from flaml import oai
from flaml.autogen.code_utils import (
    eval_function_completions,
    generate_assertions,
    implement,
    generate_code,
    improve_function,
    improve_code,
)
from flaml.autogen.math_utils import eval_math_responses, solve_problem

KEY_LOC = "test/autogen"
OAI_CONFIG_LIST = "OAI_CONFIG_LIST"
here = os.path.abspath(os.path.dirname(__file__))
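

# Filter functions passed as `filter_func` to oai.Completion.create.
# yes_or_no_filter only enforces a literal "Yes."/"No." answer when the
# context sets "yes_or_no_choice"; valid_json_filter accepts a response
# if any of its extracted texts parses as JSON.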
def yes_or_no_filter(context, response, **_):
    return context.get("yes_or_no_choice", False) is False or any(
        text in ["Yes.", "No."] for text in oai.Completion.extract_text(response)
    )


def valid_json_filter(response, **_):
    for text in oai.Completion.extract_text(response):
        try:
            json.loads(text)
            return True
        except ValueError:
            pass
    return False
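

# filter_func tries each config in config_list in order and stops at the
# first response that passes the filter; if none passes, the last config's
# response is returned. response["config_id"] and response["pass_filter"]
# record the outcome -- the assertions below encode this contract.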
def test_filter():
    try:
        import openai
    except ImportError as exc:
        print(exc)
        return
    response = oai.Completion.create(
        context={"yes_or_no_choice": True},
        config_list=[{"model": "text-ada-001"}, {"model": "gpt-3.5-turbo"}, {"model": "text-davinci-003"}],
        prompt="Is 37 a prime number? Please answer 'Yes.' or 'No.'",
        filter_func=yes_or_no_filter,
    )
    assert oai.Completion.extract_text(response)[0] in ["Yes.", "No."] or (
        not response["pass_filter"] and response["config_id"] == 2
    )
    response = oai.Completion.create(
        context={"yes_or_no_choice": False},
        config_list=[{"model": "text-ada-001"}, {"model": "gpt-3.5-turbo"}, {"model": "text-davinci-003"}],
        prompt="Is 37 a prime number?",
        filter_func=yes_or_no_filter,
    )
    assert response["model"] == "text-ada-001"
    response = oai.Completion.create(
        config_list=[{"model": "text-ada-001"}, {"model": "gpt-3.5-turbo"}, {"model": "text-davinci-003"}],
        prompt="How to construct a json request to Bing API to search for 'latest AI news'? Return the JSON request.",
        filter_func=valid_json_filter,
    )
    assert response["config_id"] == 2 or response["pass_filter"], "the response must pass the filter unless all configs fail"
    assert not response["pass_filter"] or json.loads(oai.Completion.extract_text(response)[0])
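

# _construct_params decides between the two payload formats: chat models
# (and ChatCompletion) get a "messages" field, while text completion models
# keep the raw "prompt" field.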
def test_chatcompletion():
    params = oai.ChatCompletion._construct_params(
        context=None,
        config={"model": "unknown"},
        prompt="hi",
    )
    assert "messages" in params
    params = oai.Completion._construct_params(
        context=None,
        config={"model": "unknown"},
        prompt="hi",
    )
    assert "messages" not in params
    params = oai.Completion._construct_params(
        context=None,
        config={"model": "gpt-4"},
        prompt="hi",
    )
    assert "messages" in params
def test_multi_model():
    try:
        import openai
    except ImportError as exc:
        print(exc)
        return
    response = oai.Completion.create(
        config_list=oai.config_list_gpt4_gpt35(KEY_LOC),
        prompt="Hi",
    )
    print(response)
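

# improve_function / improve_code ask a model to rewrite source files toward
# a stated objective; the outputs are written next to this test for
# manual inspection.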
def test_improve():
    try:
        import openai
        import diskcache
    except ImportError as exc:
        print(exc)
        return
    config_list = oai.config_list_openai_aoai(KEY_LOC)
    improved, _ = improve_function(
        "flaml/autogen/math_utils.py",
        "solve_problem",
        "Solve math problems accurately, by avoiding calculation errors and reducing reasoning errors.",
        config_list=config_list,
    )
    with open(f"{here}/math_utils.py.improved", "w") as f:
        f.write(improved)
    suggestion, _ = improve_code(
        ["flaml/autogen/code_utils.py", "flaml/autogen/math_utils.py"],
        "leverage generative AI smartly and cost-effectively",
        config_list=config_list,
    )
    print(suggestion)
    improvement, cost = improve_code(
        ["flaml/autogen/code_utils.py", "flaml/autogen/math_utils.py"],
        "leverage generative AI smartly and cost-effectively",
        suggest_only=False,
        config_list=config_list,
    )
    print(cost)
    with open(f"{here}/suggested_improvement.txt", "w") as f:
        f.write(improvement)
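

# Completion calls without a context template, plus code generation from a
# filtered config list and a one-shot math problem.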
def test_nocontext():
    try:
        import openai
        import diskcache
    except ImportError as exc:
        print(exc)
        return
    response = oai.Completion.create(
        model="text-ada-001", prompt="1+1=", max_tokens=1, use_cache=False, request_timeout=10
    )
    print(response)
    code, _ = generate_code(
        config_list=oai.config_list_from_json(
            OAI_CONFIG_LIST,
            file_location=KEY_LOC,
            filter_dict={
                "model": {
                    "gpt-3.5-turbo",
                    "gpt-3.5-turbo-16k",
                    "gpt-3.5-turbo-0301",
                    "chatgpt-35-turbo-0301",
                    "gpt-35-turbo-v0301",
                },
            },
        ),
        messages=[
            {
                "role": "system",
                "content": "You want to become a better assistant by learning new skills and improving your existing ones.",
            },
            {
                "role": "user",
                "content": "Write reusable code to use web scraping to get information from websites.",
            },
        ],
    )
    print(code)
    solution, cost = solve_problem("1+1=", config_list=oai.config_list_gpt4_gpt35(KEY_LOC))
    print(solution, cost)
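

# End-to-end tuning on HumanEval: tune prompt/config choices on 20 problems,
# then evaluate and implement functions with the tuned configs.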
@pytest.mark.skipif(
    sys.platform == "win32",
    reason="do not run on windows",
)
def test_humaneval(num_samples=1):
    gpt35_config_list = oai.config_list_from_json(
        env_or_file="OAI_CONFIG_LIST",
        filter_dict={
            "model": {
                "gpt-3.5-turbo",
                "gpt-3.5-turbo-16k",
                "gpt-3.5-turbo-0301",
                "chatgpt-35-turbo-0301",
                "gpt-35-turbo-v0301",
            },
        },
    )
    assertions = partial(generate_assertions, config_list=gpt35_config_list)
    eval_with_generated_assertions = partial(
        eval_function_completions,
        assertions=assertions,
    )
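
    # Use the first 20 shuffled HumanEval problems for tuning and hold out
    # the rest for testing.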
    seed = 41
    data = datasets.load_dataset("openai_humaneval")["test"].shuffle(seed=seed)
    n_tune_data = 20
    tune_data = [
        {
            "definition": data[x]["prompt"],
            "test": data[x]["test"],
            "entry_point": data[x]["entry_point"],
        }
        for x in range(n_tune_data)
    ]
    test_data = [
        {
            "definition": data[x]["prompt"],
            "test": data[x]["test"],
            "entry_point": data[x]["entry_point"],
        }
        for x in range(n_tune_data, len(data))
    ]
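
    # Start from a clean cache and seed it so repeated runs hit the cache
    # deterministically.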
    oai.Completion.clear_cache(cache_path_root=f"{here}/cache")
    oai.Completion.set_cache(seed)
    try:
        import openai
        import diskcache
    except ImportError as exc:
        print(exc)
        return
    oai.Completion.clear_cache(400)
    # no error should be raised
    response = oai.Completion.create(
        context=test_data[0],
        config_list=[{"model": "gpt-3.5-turbo"}],
        prompt="",
        max_tokens=1,
        retry_timeout=0,
        raise_on_ratelimit_or_timeout=False,
    )
    # assert response == -1
    # a minimal tuning example
    config, _ = oai.Completion.tune(
        data=tune_data,
        metric="success",
        mode="max",
        eval_func=eval_function_completions,
        n=1,
        prompt="{definition}",
    )
    response = oai.Completion.create(context=test_data[0], **config)
    # a minimal tuning example with a text completion model, evaluated with
    # auto-generated assertions
    config, _ = oai.Completion.tune(
        data=tune_data,
        metric="succeed_assertions",
        mode="max",
        eval_func=eval_with_generated_assertions,
        n=1,
        model="text-davinci-003",
        prompt="{definition}",
    )
    response = oai.Completion.create(context=test_data[0], **config)
    # a minimal tuning example for tuning chat completion models using the ChatCompletion class
    config_list = oai.config_list_openai_aoai(KEY_LOC)
    config, _ = oai.ChatCompletion.tune(
        data=tune_data,
        metric="expected_success",
        mode="max",
        eval_func=eval_function_completions,
        n=1,
        messages=[{"role": "user", "content": "{definition}"}],
        config_list=config_list,
    )
    response = oai.ChatCompletion.create(context=test_data[0], config_list=config_list, **config)
    print(response)
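
    # Implement tune_data[1] with the chat-tuned config; on a rate limit,
    # fall back to a cheap text-ada-001 config that reuses the tuned prompt.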
    from openai.error import RateLimitError

    try:
        code, cost, selected = implement(tune_data[1], [{**config_list[-1], **config}])
    except RateLimitError:
        code, cost, selected = implement(
            tune_data[1],
            [{**config_list[0], "model": "text-ada-001", "prompt": config["messages"][0]["content"]}],
            assertions=assertions,
        )
    print(code)
    print(cost)
    assert selected == 0
    print(eval_function_completions([code], **tune_data[1]))
    # a more comprehensive tuning example
    config2, analysis = oai.Completion.tune(
        data=tune_data,
        metric="success",
        mode="max",
        eval_func=eval_with_generated_assertions,
        log_file_name="logs/humaneval.log",
        inference_budget=0.002,
        optimization_budget=2,
        num_samples=num_samples,
        # logging_level=logging.INFO,
        prompt=[
            "{definition}",
            "# Python 3{definition}",
            "Complete the following Python function:{definition}",
        ],
        stop=[["\nclass", "\ndef", "\nif", "\nprint"], None],  # the stop sequences
        config_list=config_list,
    )
    print(config2)
    print(analysis.best_result)
    print(test_data[0])
    response = oai.Completion.create(context=test_data[0], **config2)
    print(response)
    oai.Completion.data = test_data[:num_samples]
    result = oai.Completion._eval(analysis.best_config, prune=False, eval_only=True)
    print("result without pruning", result)
    result = oai.Completion.test(test_data[:num_samples], **config2)
    print(result)
    try:
        code, cost, selected = implement(
            tune_data[1], [{**config_list[-2], **config2}, {**config_list[-1], **config}], assertions=assertions
        )
    except RateLimitError:
        code, cost, selected = implement(
            tune_data[1],
            [
                {**config_list[-3], **config2},
                {**config_list[0], "model": "text-ada-001", "prompt": config["messages"][0]["content"]},
            ],
            assertions=assertions,
        )
    print(code)
    print(cost)
    print(selected)
    print(eval_function_completions([code], **tune_data[1]))
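

# Tuning on the competition_math dataset: Level 1 problems only, with an
# untuned baseline, custom aggregation methods, and a small tuning budget.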
def test_math(num_samples=-1):
    try:
        import openai
        import diskcache
    except ImportError as exc:
        print(exc)
        return
    seed = 41
    data = datasets.load_dataset("competition_math")
    train_data = data["train"].shuffle(seed=seed)
    test_data = data["test"].shuffle(seed=seed)
    n_tune_data = 20
    tune_data = [
        {
            "problem": train_data[x]["problem"],
            "solution": train_data[x]["solution"],
        }
        for x in range(len(train_data))
        if train_data[x]["level"] == "Level 1"
    ][:n_tune_data]
    test_data = [
        {
            "problem": test_data[x]["problem"],
            "solution": test_data[x]["solution"],
        }
        for x in range(len(test_data))
        if test_data[x]["level"] == "Level 1"
    ]
    print(
        "max tokens in tuning data's canonical solutions",
        max([len(x["solution"].split()) for x in tune_data]),
    )
    print(len(tune_data), len(test_data))
    # prompt template
    prompts = [
        lambda data: "%s Solve the problem carefully. Simplify your answer as much as possible. Put the final answer in \\boxed{}."
        % data["problem"]
    ]
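
    # Baseline: an untuned text-davinci-003 config, evaluated with several
    # aggregation methods (the built-in "median", custom callables, and a
    # dict mapping each metric to its own aggregator).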
    oai.Completion.set_cache(seed)
    vanilla_config = {
        "model": "text-davinci-003",
        "temperature": 1,
        "max_tokens": 2048,
        "n": 1,
        "prompt": prompts[0],
        "stop": "###",
    }
    test_data_sample = test_data[0:3]
    result = oai.Completion.test(test_data_sample, eval_math_responses, **vanilla_config)
    result = oai.Completion.test(
        test_data_sample,
        eval_math_responses,
        agg_method="median",
        **vanilla_config,
    )

    def my_median(results):
        return np.median(results)

    def my_average(results):
        return np.mean(results)

    result = oai.Completion.test(
        test_data_sample,
        eval_math_responses,
        agg_method=my_median,
        **vanilla_config,
    )
    result = oai.Completion.test(
        test_data_sample,
        eval_math_responses,
        agg_method={
            "expected_success": my_median,
            "success": my_average,
            "success_vote": my_average,
            "votes": np.mean,
        },
        **vanilla_config,
    )
    print(result)
    config, _ = oai.Completion.tune(
        data=tune_data,  # the data for tuning
        metric="expected_success",  # the metric to optimize
        mode="max",  # the optimization mode
        eval_func=eval_math_responses,  # the evaluation function to return the success metrics
        # log_file_name="logs/math.log",  # the log file name
        inference_budget=0.002,  # the inference budget (dollar)
        optimization_budget=0.01,  # the optimization budget (dollar)
        num_samples=num_samples,
        prompt=prompts,  # the prompt templates to choose from
        stop="###",  # the stop sequence
    )
    print("tuned config", config)
    result = oai.Completion.test(test_data_sample, config_list=oai.config_list_openai_aoai(KEY_LOC), **config)
    print("result from tuned config:", result)
    print("empty responses", eval_math_responses([], None))
if __name__ == "__main__":
    import openai

    config_list = oai.config_list_openai_aoai(KEY_LOC)
    assert len(config_list) >= 3, config_list
    openai.api_key = os.environ["OPENAI_API_KEY"]
    # test_filter()
    # test_chatcompletion()
    # test_multi_model()
    # test_improve()
    # test_nocontext()
    test_humaneval(1)
    # test_math(1)