# autogen/test/automl/test_xgboost2d_sample_size.py

import unittest

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from flaml.automl import AutoML
from flaml.model import XGBoostSklearnEstimator
from flaml import tune


# Dataset name handed to fetch_openml in _test_simple; it is also embedded in
# the log file name built there.
dataset = "credit-g"


class XGBoost2D(XGBoostSklearnEstimator):
    """XGBoostSklearnEstimator variant whose search space has two dimensions.

    Only ``n_estimators`` and ``max_leaves`` appear in the search space
    returned by :meth:`search_space`.
    """

    @classmethod
    def search_space(cls, data_size, task):
        """Return the two-dimensional search space for this learner.

        Args:
            data_size: Value convertible with ``int()``; it caps the upper
                bound of both dimensions (never above 32768).
                NOTE(review): presumably the number of training rows --
                confirm against the caller.
            task: Not used by this implementation.

        Returns:
            A dict mapping ``"n_estimators"`` and ``"max_leaves"`` to a spec
            with a ``"domain"`` (``tune.lograndint`` from 4 up to the cap)
            and an ``"init_value"`` of 4.
        """
        cap = min(32768, int(data_size))

        def _dimension():
            # Each hyperparameter gets its own freshly created domain object.
            return {
                "domain": tune.lograndint(lower=4, upper=cap),
                "init_value": 4,
            }

        return {name: _dimension() for name in ("n_estimators", "max_leaves")}


def _test_simple(method=None, size_ratio=1.0):
    """Fit AutoML with only the XGBoost2D learner on a prefix of the train split.

    Args:
        method: Forwarded to ``AutoML.fit`` as ``hpo_method``; also embedded
            in the log file name.
        size_ratio: Fraction of the training split to keep. The first
            ``int(len(y_train) * size_ratio)`` rows are used.
    """
    automl = AutoML()
    automl.add_learner(learner_name="XGBoost2D", learner_class=XGBoost2D)

    features, labels = fetch_openml(name=dataset, return_X_y=True)
    # The held-out third of the split is never used below; only the training
    # part is fitted.
    X_train, _X_test, y_train, _y_test = train_test_split(
        features, labels, test_size=0.33, random_state=42
    )

    # Truncate the training data to the requested fraction.
    final_size = int(len(y_train) * size_ratio)
    X_subset = X_train[:final_size]
    y_subset = y_train[:final_size]

    log_path = f"test/xgboost2d_{dataset}_{method}_{final_size}.log"
    # Settings left disabled in this experiment:
    #   "metric": 'accuracy', "log_training_metric": True, "split_type": split_type
    automl_settings = {
        "estimator_list": ["XGBoost2D"],
        "task": "classification",
        "log_file_name": log_path,
        "n_jobs": 1,
        "hpo_method": method,
        "log_type": "all",
        "time_budget": 3600,
    }
    automl.fit(X_train=X_subset, y_train=y_subset, **automl_settings)


def _test_grid_1():
    """Run _test_simple with method "grid" on one third of the training split."""
    one_third = 1.0 / 3.0
    _test_simple(method="grid", size_ratio=one_third)


def _test_grid_2():
    """Run _test_simple with method "grid" on two thirds of the training split."""
    two_thirds = 2.0 / 3.0
    _test_simple(method="grid", size_ratio=two_thirds)


def _test_grid_4():
    """Run _test_simple with method "grid" on half of the training split."""
    half = 0.5
    _test_simple(method="grid", size_ratio=half)


def _test_grid_3():
    """Run _test_simple with method "grid" on the whole training split."""
    whole = 1.0
    _test_simple(method="grid", size_ratio=whole)


# Script entry point: hand control to unittest's command-line runner.
# NOTE(review): the visible part of this file defines no unittest.TestCase
# subclass, and every helper is named with a leading underscore, so this
# presumably runs nothing unless tests are defined elsewhere -- confirm.
if __name__ == "__main__":
    unittest.main()
V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00			`import unittest`

			`from sklearn.datasets import fetch_openml`
			`from sklearn.model_selection import train_test_split`
			`from flaml.automl import AutoML`
			`from flaml.model import XGBoostSklearnEstimator`
			`from flaml import tune`


			`dataset = "credit-g"`


			`class XGBoost2D(XGBoostSklearnEstimator):`
			`@classmethod`
			`def search_space(cls, data_size, task):`
Issue58 (#59) * iter per learner * code cleanup 2021-04-08 09:29:55 -07:00			`upper = min(32768, int(data_size))`
V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00			`return {`
Make NLP tasks available from AutoML.fit() (#210) Sequence classification and regression: "seq-classification" and "seq-regression" Co-authored-by: Chi Wang <wang.chi@microsoft.com> 2021-11-16 14:06:20 -05:00			`"n_estimators": {`
			`"domain": tune.lograndint(lower=4, upper=upper),`
			`"init_value": 4,`
V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00			`},`
Make NLP tasks available from AutoML.fit() (#210) Sequence classification and regression: "seq-classification" and "seq-regression" Co-authored-by: Chi Wang <wang.chi@microsoft.com> 2021-11-16 14:06:20 -05:00			`"max_leaves": {`
			`"domain": tune.lograndint(lower=4, upper=upper),`
			`"init_value": 4,`
V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00			`},`
			`}`


			`def _test_simple(method=None, size_ratio=1.0):`
			`automl = AutoML()`
Make NLP tasks available from AutoML.fit() (#210) Sequence classification and regression: "seq-classification" and "seq-regression" Co-authored-by: Chi Wang <wang.chi@microsoft.com> 2021-11-16 14:06:20 -05:00			`automl.add_learner(learner_name="XGBoost2D", learner_class=XGBoost2D)`
Issue58 (#59) * iter per learner * code cleanup 2021-04-08 09:29:55 -07:00
			`X, y = fetch_openml(name=dataset, return_X_y=True)`
Make NLP tasks available from AutoML.fit() (#210) Sequence classification and regression: "seq-classification" and "seq-regression" Co-authored-by: Chi Wang <wang.chi@microsoft.com> 2021-11-16 14:06:20 -05:00			`X_train, X_test, y_train, y_test = train_test_split(`
			`X, y, test_size=0.33, random_state=42`
			`)`
V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00
Issue58 (#59) * iter per learner * code cleanup 2021-04-08 09:29:55 -07:00			`final_size = int(len(y_train) * size_ratio)`
V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00			`X_train = X_train[:final_size]`
			`y_train = y_train[:final_size]`
			`automl_settings = {`
Make NLP tasks available from AutoML.fit() (#210) Sequence classification and regression: "seq-classification" and "seq-regression" Co-authored-by: Chi Wang <wang.chi@microsoft.com> 2021-11-16 14:06:20 -05:00			`"estimator_list": ["XGBoost2D"],`
V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00			`# "metric": 'accuracy',`
Make NLP tasks available from AutoML.fit() (#210) Sequence classification and regression: "seq-classification" and "seq-regression" Co-authored-by: Chi Wang <wang.chi@microsoft.com> 2021-11-16 14:06:20 -05:00			`"task": "classification",`
V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00			`"log_file_name": f"test/xgboost2d_{dataset}_{method}_{final_size}.log",`
			`# "log_training_metric": True,`
			`# "split_type": split_type,`
			`"n_jobs": 1,`
			`"hpo_method": method,`
			`"log_type": "all",`
			`"time_budget": 3600,`
			`}`
			`automl.fit(X_train=X_train, y_train=y_train, **automl_settings)`


			`def _test_grid_1():`
Issue58 (#59) * iter per learner * code cleanup 2021-04-08 09:29:55 -07:00			`_test_simple(method="grid", size_ratio=1.0 / 3.0)`

V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00
			`def _test_grid_2():`
Issue58 (#59) * iter per learner * code cleanup 2021-04-08 09:29:55 -07:00			`_test_simple(method="grid", size_ratio=2.0 / 3.0)`

V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00
			`def _test_grid_4():`
			`_test_simple(method="grid", size_ratio=0.5)`

Issue58 (#59) * iter per learner * code cleanup 2021-04-08 09:29:55 -07:00
V0.3.0 (#55) * flaml v0.3 * low cost partial config 2021-04-06 11:37:52 -07:00			`def _test_grid_3():`
			`_test_simple(method="grid", size_ratio=1.0)`


			`if __name__ == "__main__":`
			`unittest.main()`