autogen/test/tune_example.py
Chi Wang 8a44dd4318
data in csv (#430)
* data in csv

* support ray ObjectRef #365

* use object store to store data when using ray

* make lgbm tuning example a test

* homepage title
2022-01-30 19:36:41 -08:00

75 lines
2.6 KiB
Python

from flaml import tune
from flaml.model import LGBMEstimator
import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error
data = fetch_california_housing(return_X_y=False, as_frame=True)
df, X, y = data.frame, data.data, data.target
df_train, _, X_train, X_test, _, y_test = train_test_split(
df, X, y, test_size=0.33, random_state=42
)
csv_file_name = "test/housing.csv"
df_train.to_csv(csv_file_name, index=False)
# X, y = fetch_california_housing(return_X_y=True, as_frame=True)
# X_train, X_test, y_train, y_test = train_test_split(
# X, y, test_size=0.33, random_state=42
# )
def train_lgbm(config: dict) -> dict:
# convert config dict to lgbm params
params = LGBMEstimator(**config).params
# train the model
# train_set = lightgbm.Dataset(X_train, y_train)
# LightGBM only accepts the csv with valid number format, if even these string columns are set to ignore.
train_set = lightgbm.Dataset(
csv_file_name, params={"label_column": "name:MedHouseVal", "header": True}
)
model = lightgbm.train(params, train_set)
# evaluate the model
pred = model.predict(X_test)
mse = mean_squared_error(y_test, pred)
# return eval results as a dictionary
return {"mse": mse}
def test_tune_lgbm_csv():
# load a built-in search space from flaml
flaml_lgbm_search_space = LGBMEstimator.search_space(X_train.shape)
# specify the search space as a dict from hp name to domain; you can define your own search space same way
config_search_space = {
hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()
}
# give guidance about hp values corresponding to low training cost, i.e., {"n_estimators": 4, "num_leaves": 4}
low_cost_partial_config = {
hp: space["low_cost_init_value"]
for hp, space in flaml_lgbm_search_space.items()
if "low_cost_init_value" in space
}
# initial points to evaluate
points_to_evaluate = [
{
hp: space["init_value"]
for hp, space in flaml_lgbm_search_space.items()
if "init_value" in space
}
]
# run the tuning, minimizing mse, with total time budget 3 seconds
analysis = tune.run(
train_lgbm,
metric="mse",
mode="min",
config=config_search_space,
low_cost_partial_config=low_cost_partial_config,
points_to_evaluate=points_to_evaluate,
time_budget_s=3,
num_samples=-1,
)
print(analysis.best_result)
if __name__ == "__main__":
test_tune_lgbm_csv()