import datasets
import signal
import subprocess
import sys
import numpy as np
import pytest
from flaml import oai


@pytest.mark.skipif(
    sys.platform == "win32",
    reason="do not run on windows",
)
def test_humaneval(num_samples=1):
    def timeout_handler(signum, frame):
        raise TimeoutError("Timed out!")

    # SIGALRM is Unix-only, which is why this test is skipped on Windows.
    signal.signal(signal.SIGALRM, timeout_handler)
    max_exec_time = 3  # seconds

    def execute_code(code):
        """Write the code to a temp file, run it, and return 1 on success, 0 on failure/timeout."""
        code = code.strip()
        with open("codetest.py", "w") as fout:
            fout.write(code)
        try:
            signal.alarm(max_exec_time)
            result = subprocess.run(
                [sys.executable, "codetest.py"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
            )
            signal.alarm(0)
        except TimeoutError:
            return 0
        return int(result.returncode == 0)

    def success_metrics(responses, prompt, test, entry_point):
        """Check if the response is correct.

        Args:
            responses (list): The list of responses.
            prompt (str): The input prompt.
            test (str): The test code.
            entry_point (str): The name of the function.

        Returns:
            dict: The success metrics.
        """
        success_list = []
        n = len(responses)
        for i in range(n):
            response = responses[i]
            code = f"{prompt}{response}\n{test}\ncheck({entry_point})"
            succeed = execute_code(code)
            success_list.append(succeed)
        return {
            # probability that at least one of the n sampled responses succeeds
            "expected_success": 1 - pow(1 - np.mean(success_list), n),
            "success": any(success_list),
        }

    seed = 41
    data = datasets.load_dataset("openai_humaneval")["test"].shuffle(seed=seed)
    n_tune_data = 20
    tune_data = [
        {
            "prompt": data[x]["prompt"],
            "test": data[x]["test"],
            "entry_point": data[x]["entry_point"],
        }
        for x in range(n_tune_data)
    ]
    test_data = [
        {
            "prompt": data[x]["prompt"],
            "test": data[x]["test"],
            "entry_point": data[x]["entry_point"],
        }
        for x in range(n_tune_data, len(data))
    ]
    oai.Completion.set_cache(seed)
    try:
        # a minimal tuning example
        oai.Completion.tune(
            data=tune_data,
            metric="success",
            mode="max",
            eval_func=success_metrics,
            n=1,
        )
        # a more comprehensive tuning example
        config, analysis = oai.Completion.tune(
            data=tune_data,
            metric="expected_success",
            mode="max",
            eval_func=success_metrics,
            log_file_name="logs/humaneval.log",
            inference_budget=0.02,
            optimization_budget=5,
            num_samples=num_samples,
            prompt=[
                "{prompt}",
                "# Python 3{prompt}",
                "Complete the following Python function:{prompt}",
                "Complete the following Python function while including necessary import statements inside the function:{prompt}",
            ],
            stop=["\nclass", "\ndef", "\nif", "\nprint"],
        )
        print(config)
        print(analysis.best_result)
        print(test_data[0])
        responses = oai.Completion.create(context=test_data[0], **config)
        print(responses)
        # evaluate the best config on the (held-out) test data
        oai.Completion.data = test_data[:num_samples]
        result = oai.Completion.eval(analysis.best_config, prune=False, eval_only=True)
        print(result)
    except ImportError as exc:
        print(exc)


if __name__ == "__main__":
    import openai

    openai.api_key_path = "test/openai/key.txt"
    test_humaneval(-1)