* remove extra comma

* exclusive bound

* log file name

* add cost to space

* dataset_format

* add load_openml_dataset test

* docstr

* revise test format

* simplify restore

* order categories

* openml server exception in test

* process space

* add warning

* log format

* reduce n_cpu

* nested space

* hierarchical search space for CFO

* non hierarchical for bs

* unflatten hierarchical config

* connection error

* random sample

* config signature

* check ray version

* preprocess numpy array

* catboost preprocess

* time budget

* seed, verbose, hpo_method

* test cfocat

* shallow copy in flatten_dict
prevent lgbm model duplication

* match estimator name

* quantize and log

* test qloguniform and qrandint

* test qlograndint

* thread.running

Co-authored-by: Chi Wang <wang.chi@microsoft.com>
Co-authored-by: Qingyun Wu <qingyunwu@Qingyuns-MacBook-Pro-2.local>
This commit is contained in:
Qingyun Wu 2021-08-12 02:02:22 -04:00 committed by GitHub
parent 2fb888e64e
commit 10082b9262
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 938 additions and 436 deletions

View File

@ -133,7 +133,7 @@ Please find demo and tutorials of FLAML [here](https://www.youtube.com/channel/U
For more technical details, please check our papers.
* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys, 2021.
* [FLAML: A Fast and Lightweight AutoML Library](https://www.microsoft.com/en-us/research/publication/flaml-a-fast-and-lightweight-automl-library/). Chi Wang, Qingyun Wu, Markus Weimer, Erkang Zhu. MLSys 2021.
```bibtex
@inproceedings{wang2021flaml,

View File

@ -13,8 +13,6 @@ from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, \
RepeatedKFold, GroupKFold
from sklearn.utils import shuffle
import pandas as pd
import os
import contextlib
from .ml import compute_estimator, train_estimator, get_estimator_class, \
get_classification_objective
@ -56,6 +54,7 @@ class SearchState:
self.low_cost_partial_config = {}
self.cat_hp_cost = {}
self.data_size = data_size
self.ls_ever_converged = False
search_space = learner_class.search_space(
data_size=data_size, task=task)
for name, space in search_space.items():
@ -215,7 +214,6 @@ class AutoMLState:
}
if sampled_weight is not None:
self.fit_kwargs['sample_weight'] = weight
# with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
# tune.report(**result)
return result
@ -820,10 +818,12 @@ class AutoML:
Returns:
A dict.
(a) if there is only one estimator in estimator_list, each key is a
hyperparameter name
hyperparameter name.
(b) otherwise, it is a nested dict with 'ml' as the key, and
a list of the low_cost_partial_configs as the value, corresponding
to each learner's low_cost_partial_config
to each learner's low_cost_partial_config; the index (an integer) of
the cheapest learner in estimator_list is appended to the end of the
list.
'''
if len(self.estimator_list) == 1:
@ -835,6 +835,9 @@ class AutoML:
for estimator in self.estimator_list:
c = self._search_states[estimator].low_cost_partial_config
configs.append(c)
configs.append(np.argmin([
self._state.learner_classes.get(estimator).cost_relative2lgbm()
for estimator in self.estimator_list]))
config = {'ml': configs}
return config
@ -845,10 +848,11 @@ class AutoML:
Returns:
A dict.
(a) if there is only one estimator in estimator_list, each key is a
hyperparameter name
hyperparameter name.
(b) otherwise, it is a nested dict with 'ml' as the key, and
a list of the cat_hp_cost's as the value, corresponding
to each learner's cat_hp_cost
to each learner's cat_hp_cost; the cost relative to lgbm for each
learner (as a list itself) is appended to the list at the end.
'''
if len(self.estimator_list) == 1:
@ -860,6 +864,9 @@ class AutoML:
for estimator in self.estimator_list:
c = self._search_states[estimator].cat_hp_cost
configs.append(c)
configs.append([
self._state.learner_classes.get(estimator).cost_relative2lgbm()
for estimator in self.estimator_list])
config = {'ml': configs}
return config
@ -930,7 +937,8 @@ class AutoML:
config['FLAML_sample_size'] = sample_size
estimator = config['learner']
del config['learner']
states[estimator].training_function(config)
result = states[estimator].training_function(config)
return result
return train
@ -943,7 +951,7 @@ class AutoML:
'''
def size_func(config: dict) -> float:
config = config.get('ml', config).copy
config = config.get('ml', config)
estimator = config['learner']
learner_class = self._state.learner_classes.get(estimator)
return learner_class.size(config)
@ -971,7 +979,7 @@ class AutoML:
metric='auto',
task='classification',
n_jobs=-1,
log_file_name='default.log',
log_file_name='flaml.log',
estimator_list='auto',
time_budget=60,
max_iter=1000000,
@ -996,6 +1004,7 @@ class AutoML:
learner_selector='sample',
hpo_method=None,
starting_points={},
seed=None,
**fit_kwargs):
'''Find a model for a given task
@ -1063,12 +1072,20 @@ class AutoML:
samples used while splitting the dataset into train/valid set
verbose: int, default=1 | Controls the verbosity, higher means more
messages.
hpo_method: str or None, default=None | The hyperparameter
optimization method. When it is None, CFO is used.
No need to set when using flaml's default search space or using
a simple customized search space. When set to 'bs', BlendSearch
is used. BlendSearch can be tried when the search space is
complex, for example, containing multiple disjoint, discontinuous
subspaces.
starting_points: A dictionary to specify the starting hyperparameter
config for the estimators.
Keys are the name of the estimators, and values are the starting
hyperparameter configurations for the corresponding estimators.
seed: int or None, default=None | The random seed for np.random.
**fit_kwargs: Other key word arguments to pass to fit() function of
the searched learners, such sample_weight
the searched learners, such as sample_weight.
'''
self._start_time_flag = time.time()
self._state.task = task
@ -1079,6 +1096,8 @@ class AutoML:
self._validate_data(X_train, y_train, dataframe, label, X_val, y_val)
self._search_states = {} # key: estimator name; value: SearchState
self._random = np.random.RandomState(RANDOM_SEED)
if seed is not None:
np.random.seed(seed)
self._learner_selector = learner_selector
old_level = logger.getEffectiveLevel()
self.verbose = verbose
@ -1144,21 +1163,33 @@ class AutoML:
estimator_list))
self.estimator_list = estimator_list
self._hpo_method = hpo_method or 'cfo'
with training_log_writer(log_file_name) as save_helper:
self._training_log = save_helper
self._state.time_budget = time_budget
self._active_estimators = estimator_list.copy()
self._ensemble = ensemble
self._max_iter = max_iter
self._mem_thres = mem_thres
self._pred_time_limit = pred_time_limit
self._state.train_time_limit = train_time_limit
self._log_type = log_type
self.split_ratio = split_ratio
self._save_model_history = model_history
self._state.n_jobs = n_jobs
self._state.time_budget = time_budget
self._active_estimators = estimator_list.copy()
self._ensemble = ensemble
self._max_iter = max_iter
self._mem_thres = mem_thres
self._pred_time_limit = pred_time_limit
self._state.train_time_limit = train_time_limit
self._log_type = log_type
self.split_ratio = split_ratio
self._save_model_history = model_history
self._state.n_jobs = n_jobs
if log_file_name:
with training_log_writer(log_file_name) as save_helper:
self._training_log = save_helper
self._search()
else:
self._training_log = None
self._search()
logger.info("fit succeeded")
logger.info("fit succeeded")
logger.info(f"Time taken to find the best model: {self._time_taken_best_iter}")
if self._time_taken_best_iter >= time_budget * 0.7 and not \
all(self._ever_converged_per_learner.values()):
logger.warn("Time taken to find the best model is {0:.0f}% of the "
"provided time budget and not all estimators' hyperparameter "
"search converged. Consider increasing the time budget.".format(
self._time_taken_best_iter / time_budget * 100))
if verbose == 0:
logger.setLevel(old_level)
@ -1169,14 +1200,18 @@ class AutoML:
self._state.time_from_start = 0
self._estimator_index = None
self._best_iteration = 0
self._time_taken_best_iter = 0
self._model_history = {}
self._config_history = {}
self._max_iter_per_learner = 1000000 # TODO
self._iter_per_learner = dict([(e, 0) for e in self.estimator_list])
self._ever_converged_per_learner = dict([(e, False) for e in self.estimator_list])
self._fullsize_reached = False
self._trained_estimator = None
self._best_estimator = None
self._retrained_config = {}
self._warn_threshold = 10
est_retrain_time = next_trial_time = 0
best_config_sig = None
# use ConcurrencyLimiter to limit the amount of concurrency when
@ -1185,20 +1220,27 @@ class AutoML:
if self._ensemble:
self.best_model = {}
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune.suggest import ConcurrencyLimiter
except ImportError:
except (ImportError, AssertionError):
from .searcher.suggestion import ConcurrencyLimiter
if self._hpo_method in ('cfo', 'grid'):
from flaml import CFO as SearchAlgo
elif 'optuna' == self._hpo_method:
try:
assert ray_version >= '1.0.0'
from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo
except ImportError:
except (ImportError, AssertionError):
from .searcher.suggestion import OptunaSearch as SearchAlgo
elif 'bs' == self._hpo_method:
from flaml import BlendSearch as SearchAlgo
elif 'cfocat' == self._hpo_method:
from flaml import CFOCat as SearchAlgo
else:
raise NotImplementedError
raise NotImplementedError(
f"hpo_method={self._hpo_method} is not recognized. "
"'cfo' and 'bs' are supported.")
for self._track_iter in range(self._max_iter):
if self._estimator_index is None:
@ -1244,7 +1286,7 @@ class AutoML:
else:
points_to_evaluate = [search_state.init_config]
low_cost_partial_config = search_state.low_cost_partial_config
if self._hpo_method in ('bs', 'cfo', 'grid'):
if self._hpo_method in ('bs', 'cfo', 'grid', 'cfocat'):
algo = SearchAlgo(
metric='val_loss', mode='min', space=search_space,
points_to_evaluate=points_to_evaluate,
@ -1267,7 +1309,7 @@ class AutoML:
max_concurrent=1)
else:
search_space = None
if self._hpo_method in ('bs', 'cfo'):
if self._hpo_method in ('bs', 'cfo', 'cfocat'):
search_state.search_alg.set_search_properties(
metric=None, mode=None,
config={
@ -1320,20 +1362,22 @@ class AutoML:
self._trained_estimator = None
self._trained_estimator = search_state.trained_estimator
self._best_iteration = self._track_iter
self._time_taken_best_iter = self._state.time_from_start
better = True
next_trial_time = search_state.time2eval_best
if better or self._log_type == 'all':
self._training_log.append(
self._iter_per_learner[estimator],
search_state.train_loss,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
search_state.config,
search_state.best_loss,
search_state.best_config,
estimator,
search_state.sample_size)
if self._training_log:
self._training_log.append(
self._iter_per_learner[estimator],
search_state.train_loss,
search_state.trial_time,
self._state.time_from_start,
search_state.val_loss,
search_state.config,
search_state.best_loss,
search_state.best_config,
estimator,
search_state.sample_size)
if mlflow is not None and mlflow.active_run():
with mlflow.start_run(nested=True):
mlflow.log_metric('iter_counter',
@ -1365,6 +1409,15 @@ class AutoML:
search_state.best_loss,
self._best_estimator,
self._state.best_loss))
searcher = search_state.search_alg.searcher
if searcher.is_ls_ever_converged and not self._ever_converged_per_learner[estimator]:
self._ever_converged_per_learner[estimator] = searcher.is_ls_ever_converged
if all(self._ever_converged_per_learner.values()) and \
self._state.time_from_start > self._warn_threshold * self._time_taken_best_iter:
logger.warn("All estimator hyperparameters local search has converged at least once, "
f"and the total search time exceeds {self._warn_threshold} times the time taken "
"to find the best model.")
self._warn_threshold *= 10
else:
logger.info(f"no enough budget for learner {estimator}")
if self._estimator_index is not None:
@ -1396,7 +1449,8 @@ class AutoML:
if time_left < time_ensemble < 2 * time_left:
break
# Add a checkpoint for the current best config to the log.
self._training_log.checkpoint()
if self._training_log:
self._training_log.checkpoint()
if self._best_estimator:
self._selected = self._search_states[self._best_estimator]
self._trained_estimator = self._selected.trained_estimator

View File

@ -11,7 +11,8 @@ from .training_log import training_log_reader
from datetime import datetime
def load_openml_dataset(dataset_id, data_dir=None, random_state=0):
def load_openml_dataset(dataset_id, data_dir=None, random_state=0,
dataset_format='dataframe'):
'''Load dataset from OpenML.
If the file is not cached locally, download it from OpenML.
@ -20,12 +21,15 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0):
dataset_id: An integer of the dataset id in openml
data_dir: A string of the path to store and load the data
random_state: An integer of the random seed for splitting data
dataset_format: A string specifying the format of returned dataset. Default is 'dataframe'.
Can choose from ['dataframe', 'array'].
If 'dataframe', the returned dataset will be a Pandas DataFrame.
If 'array', the returned dataset will be a NumPy array or a SciPy sparse matrix.
Returns:
X_train: A dataframe of training data
X_test: A dataframe of test data
y_train: A series of labels for training data
y_test: A series of labels for test data
X_train: Training data
X_test: Test data
y_train: A series or array of labels for training data
y_test: A series or array of labels for test data
'''
import os
import openml
@ -48,7 +52,7 @@ def load_openml_dataset(dataset_id, data_dir=None, random_state=0):
print('Dataset name:', dataset.name)
X, y, * \
__ = dataset.get_data(
target=dataset.default_target_attribute)
target=dataset.default_target_attribute, dataset_format=dataset_format)
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=random_state)
print(

View File

@ -23,24 +23,24 @@ logger = logging.getLogger(__name__)
def get_estimator_class(task, estimator_name):
''' when adding a new learner, need to add an elif branch '''
if 'xgboost' in estimator_name:
if 'xgboost' == estimator_name:
if 'regression' in task:
estimator_class = XGBoostEstimator
else:
estimator_class = XGBoostSklearnEstimator
elif 'rf' in estimator_name:
elif 'rf' == estimator_name:
estimator_class = RandomForestEstimator
elif 'lgbm' in estimator_name:
elif 'lgbm' == estimator_name:
estimator_class = LGBMEstimator
elif 'lrl1' in estimator_name:
elif 'lrl1' == estimator_name:
estimator_class = LRL1Classifier
elif 'lrl2' in estimator_name:
elif 'lrl2' == estimator_name:
estimator_class = LRL2Classifier
elif 'catboost' in estimator_name:
elif 'catboost' == estimator_name:
estimator_class = CatBoostEstimator
elif 'extra_tree' in estimator_name:
elif 'extra_tree' == estimator_name:
estimator_class = ExtraTreeEstimator
elif 'kneighbor' in estimator_name:
elif 'kneighbor' == estimator_name:
estimator_class = KNeighborsEstimator
else:
raise ValueError(

View File

@ -183,9 +183,10 @@ class SKLearnEstimator(BaseEstimator):
def _preprocess(self, X):
if isinstance(X, pd.DataFrame):
X = X.copy()
cat_columns = X.select_dtypes(include=['category']).columns
X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
if not cat_columns.empty:
X = X.copy()
X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
elif isinstance(X, np.ndarray) and X.dtype.kind not in 'buif':
# numpy array is not of numeric dtype
X = pd.DataFrame(X)
@ -213,7 +214,7 @@ class LGBMEstimator(BaseEstimator):
'low_cost_init_value': 4,
},
'min_child_samples': {
'domain': tune.lograndint(lower=2, upper=2**7),
'domain': tune.lograndint(lower=2, upper=2**7 + 1),
'init_value': 20,
},
'learning_rate': {
@ -225,7 +226,7 @@ class LGBMEstimator(BaseEstimator):
'init_value': 1.0,
},
'log_max_bin': {
'domain': tune.lograndint(lower=3, upper=10),
'domain': tune.lograndint(lower=3, upper=11),
'init_value': 8,
},
'colsample_bytree': {
@ -270,6 +271,8 @@ class LGBMEstimator(BaseEstimator):
self.params["objective"] = objective
if "max_bin" not in self.params:
self.params['max_bin'] = 1 << int(round(log_max_bin)) - 1
if "verbose" not in self.params:
self.params['verbose'] = -1
if 'regression' in task:
self.estimator_class = LGBMRegressor
else:
@ -281,6 +284,13 @@ class LGBMEstimator(BaseEstimator):
if not isinstance(X, pd.DataFrame) and issparse(X) and np.issubdtype(
X.dtype, np.integer):
X = X.astype(float)
elif isinstance(X, np.ndarray) and X.dtype.kind not in 'buif':
# numpy array is not of numeric dtype
X = pd.DataFrame(X)
for col in X.columns:
if isinstance(X[col][0], str):
X[col] = X[col].astype('category').cat.codes
X = X.to_numpy()
return X
def fit(self, X_train, y_train, budget=None, **kwargs):
@ -455,14 +465,15 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
super().__init__(task, **params)
del self.params['objective']
del self.params['max_bin']
del self.params['verbose']
self.params.update({
"n_estimators": int(round(n_estimators)),
'max_leaves': int(round(max_leaves)),
'max_depth': 0,
'grow_policy': params.get("grow_policy", 'lossguide'),
'tree_method': tree_method,
'verbosity': 0,
'n_jobs': n_jobs,
'verbosity': 0,
'learning_rate': float(learning_rate),
'subsample': float(subsample),
'reg_alpha': float(reg_alpha),
@ -531,6 +542,7 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
self.params.update({
"n_estimators": int(round(n_estimators)),
"n_jobs": n_jobs,
"verbose": 0,
'max_features': float(max_features),
"max_leaf_nodes": params.get('max_leaf_nodes', int(round(max_leaves))),
})
@ -629,7 +641,7 @@ class CatBoostEstimator(BaseEstimator):
@classmethod
def search_space(cls, data_size, **params):
upper = max(min(round(1500000 / data_size), 150), 11)
upper = max(min(round(1500000 / data_size), 150), 12)
return {
'early_stopping_rounds': {
'domain': tune.lograndint(lower=10, upper=upper),
@ -657,6 +669,25 @@ class CatBoostEstimator(BaseEstimator):
CatBoostEstimator._time_per_iter = None
CatBoostEstimator._train_size = 0
def _preprocess(self, X):
if isinstance(X, pd.DataFrame):
cat_columns = X.select_dtypes(include=['category']).columns
if not cat_columns.empty:
X = X.copy()
X[cat_columns] = X[cat_columns].apply(
lambda x:
x.cat.rename_categories(
[str(c) if isinstance(c, float) else c
for c in x.cat.categories]))
elif isinstance(X, np.ndarray) and X.dtype.kind not in 'buif':
# numpy array is not of numeric dtype
X = pd.DataFrame(X)
for col in X.columns:
if isinstance(X[col][0], str):
X[col] = X[col].astype('category').cat.codes
X = X.to_numpy()
return X
def __init__(
self, task='binary:logistic', n_jobs=1,
n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params
@ -685,68 +716,69 @@ class CatBoostEstimator(BaseEstimator):
def fit(self, X_train, y_train, budget=None, **kwargs):
start_time = time.time()
n_iter = self.params["n_estimators"]
X_train = self._preprocess(X_train)
if isinstance(X_train, pd.DataFrame):
cat_features = list(X_train.select_dtypes(
include='category').columns)
else:
cat_features = []
from catboost import CatBoostError
try:
if (not CatBoostEstimator._time_per_iter or abs(
CatBoostEstimator._train_size - len(y_train)) > 4) and budget:
# measure the time per iteration
self.params["n_estimators"] = 1
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._t1 = time.time() - start_time
if CatBoostEstimator._t1 >= budget:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
return CatBoostEstimator._t1
self.params["n_estimators"] = 4
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._time_per_iter = (
time.time() - start_time - CatBoostEstimator._t1) / (
self.params["n_estimators"] - 1)
if CatBoostEstimator._time_per_iter <= 0:
CatBoostEstimator._time_per_iter = CatBoostEstimator._t1
CatBoostEstimator._train_size = len(y_train)
if time.time() - start_time >= budget or n_iter == self.params[
"n_estimators"]:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
return time.time() - start_time
if budget:
train_times = 1
self.params["n_estimators"] = min(n_iter, int(
(budget - time.time() + start_time - CatBoostEstimator._t1)
/ train_times / CatBoostEstimator._time_per_iter + 1))
# from catboost import CatBoostError
# try:
if (not CatBoostEstimator._time_per_iter or abs(
CatBoostEstimator._train_size - len(y_train)) > 4) and budget:
# measure the time per iteration
self.params["n_estimators"] = 1
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._t1 = time.time() - start_time
if CatBoostEstimator._t1 >= budget:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
if self.params["n_estimators"] > 0:
n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
X_tr, y_tr = X_train[:n], y_train[:n]
if 'sample_weight' in kwargs:
weight = kwargs['sample_weight']
if weight is not None:
kwargs['sample_weight'] = weight[:n]
else:
weight = None
from catboost import Pool
model = self.estimator_class(**self.params)
model.fit(
X_tr, y_tr, cat_features=cat_features,
eval_set=Pool(
data=X_train[n:], label=y_train[n:],
cat_features=cat_features),
**kwargs) # model.get_best_iteration()
return CatBoostEstimator._t1
self.params["n_estimators"] = 4
CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
CatBoostEstimator._smallmodel.fit(
X_train, y_train, cat_features=cat_features, **kwargs)
CatBoostEstimator._time_per_iter = (
time.time() - start_time - CatBoostEstimator._t1) / (
self.params["n_estimators"] - 1)
if CatBoostEstimator._time_per_iter <= 0:
CatBoostEstimator._time_per_iter = CatBoostEstimator._t1
CatBoostEstimator._train_size = len(y_train)
if time.time() - start_time >= budget or n_iter == self.params[
"n_estimators"]:
self.params["n_estimators"] = n_iter
self._model = CatBoostEstimator._smallmodel
return time.time() - start_time
if budget:
train_times = 1
self.params["n_estimators"] = min(n_iter, int(
(budget - time.time() + start_time - CatBoostEstimator._t1)
/ train_times / CatBoostEstimator._time_per_iter + 1))
self._model = CatBoostEstimator._smallmodel
if self.params["n_estimators"] > 0:
n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
X_tr, y_tr = X_train[:n], y_train[:n]
if 'sample_weight' in kwargs:
weight = kwargs['sample_weight']
if weight is not None:
kwargs['sample_weight'] = weight
self._model = model
except CatBoostError:
self._model = None
kwargs['sample_weight'] = weight[:n]
else:
weight = None
from catboost import Pool
model = self.estimator_class(**self.params)
model.fit(
X_tr, y_tr, cat_features=cat_features,
eval_set=Pool(
data=X_train[n:], label=y_train[n:],
cat_features=cat_features),
**kwargs) # model.get_best_iteration()
if weight is not None:
kwargs['sample_weight'] = weight
self._model = model
# except CatBoostError:
# self._model = None
self.params["n_estimators"] = n_iter
train_time = time.time() - start_time
return train_time

View File

@ -9,16 +9,18 @@ import time
import pickle
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune.suggest import Searcher
from ray.tune.suggest.optuna import OptunaSearch as GlobalSearch
from ray.tune.suggest.variant_generator import generate_variants
from ray.tune.utils.util import flatten_dict
except ImportError:
except (ImportError, AssertionError):
from .suggestion import Searcher
from .suggestion import OptunaSearch as GlobalSearch
from .variant_generator import generate_variants, flatten_dict
from .variant_generator import flatten_dict
from .search_thread import SearchThread
from .flow2 import FLOW2
from ..tune.space import add_cost_to_space, normalize # TODO: , define_by_run_func
import logging
logger = logging.getLogger(__name__)
@ -125,9 +127,15 @@ class BlendSearch(Searcher):
if self._metric_constraints:
# metric modified by lagrange
metric += self.lagrange
self._cat_hp_cost = cat_hp_cost or {}
if space:
add_cost_to_space(space, init_config, self._cat_hp_cost)
if global_search_alg is not None:
self._gs = global_search_alg
elif getattr(self, '__name__', None) != 'CFO':
gs_space = space
# TODO: when define_by_run is supported
# gs_space = define_by_run_func(space)
try:
gs_seed = seed - 10 if (seed - 10) >= 0 else seed - 11 + (1 << 32)
if experimental:
@ -137,10 +145,11 @@ class BlendSearch(Searcher):
else:
sampler = None
self._gs = GlobalSearch(
space=space, metric=metric, mode=mode, seed=gs_seed,
space=gs_space, metric=metric, mode=mode, seed=gs_seed,
sampler=sampler)
except TypeError:
self._gs = GlobalSearch(space=space, metric=metric, mode=mode)
self._gs = GlobalSearch(space=gs_space, metric=metric, mode=mode)
self._gs.space = space
else:
self._gs = None
self._experimental = experimental
@ -152,8 +161,10 @@ class BlendSearch(Searcher):
else:
self._candidate_start_points = None
self._ls = self.LocalSearch(
init_config, metric, mode, cat_hp_cost, space, prune_attr,
init_config, metric, mode, space, prune_attr,
min_resource, max_resource, reduction_factor, self.cost_attr, seed)
self._is_ls_ever_converged = False
self._subspace = {} # the subspace for each trial id
self._init_search()
def set_search_properties(self,
@ -174,9 +185,13 @@ class BlendSearch(Searcher):
self._mode = mode
if not self._ls.space:
# the search space can be set only once
self._ls.set_search_properties(metric, mode, config)
if self._gs is not None:
self._gs.set_search_properties(metric, mode, config)
self._gs.space = config
if config:
add_cost_to_space(
config, self._ls.init_config, self._cat_hp_cost)
self._ls.set_search_properties(metric, mode, config)
self._init_search()
elif metric_changed or mode_changed:
# reset search when metric or mode changed
@ -205,7 +220,9 @@ class BlendSearch(Searcher):
self._thread_count = 1 # total # threads created
self._init_used = self._ls.init_config is None
self._trial_proposed_by = {} # trial_id: str -> thread_id: int
self._ls_bound_min = self._ls.normalize(self._ls.init_config)
self._ls_bound_min = normalize(
self._ls.init_config.copy(), self._ls.space, self._ls.init_config,
{}, recursive=True)
self._ls_bound_max = self._ls_bound_min.copy()
self._gs_admissible_min = self._ls_bound_min.copy()
self._gs_admissible_max = self._ls_bound_max.copy()
@ -231,34 +248,16 @@ class BlendSearch(Searcher):
'''
with open(checkpoint_path, "rb") as inputFile:
state = pickle.load(inputFile)
self._metric_target = state._metric_target
self._search_thread_pool = state._search_thread_pool
self._thread_count = state._thread_count
self._init_used = state._init_used
self._trial_proposed_by = state._trial_proposed_by
self._ls_bound_min = state._ls_bound_min
self._ls_bound_max = state._ls_bound_max
self._gs_admissible_min = state._gs_admissible_min
self._gs_admissible_max = state._gs_admissible_max
self._result = state._result
self._deadline = state._deadline
self._metric, self._mode = state._metric, state._mode
self._points_to_evaluate = state._points_to_evaluate
self._gs = state._gs
self._ls = state._ls
self._config_constraints = state._config_constraints
self._metric_constraints = state._metric_constraints
self._metric_constraint_satisfied = state._metric_constraint_satisfied
self._metric_constraint_penalty = state._metric_constraint_penalty
self._candidate_start_points = state._candidate_start_points
if self._candidate_start_points:
self._started_from_given = state._started_from_given
self._started_from_low_cost = state._started_from_low_cost
self.__dict__ = state.__dict__
@property
def metric_target(self):
return self._metric_target
@property
def is_ls_ever_converged(self):
return self._is_ls_ever_converged
def on_trial_complete(self, trial_id: str, result: Optional[Dict] = None,
error: bool = False):
''' search thread updater and cleaner
@ -296,10 +295,12 @@ class BlendSearch(Searcher):
for key, value in result.items():
if key.startswith('config/'):
config[key[7:]] = value
signature = self._ls.config_signature(
config, self._subspace.get(trial_id, {}))
if error: # remove from result cache
del self._result[self._ls.config_signature(config)]
del self._result[signature]
else: # add to result cache
self._result[self._ls.config_signature(config)] = result
self._result[signature] = result
# update target metric if improved
objective = result[self._ls.metric]
if (objective - self._metric_target) * self._ls.metric_op < 0:
@ -307,8 +308,11 @@ class BlendSearch(Searcher):
if thread_id:
if not self._metric_constraint_satisfied:
# no point has been found to satisfy metric constraint
self._expand_admissible_region()
self._expand_admissible_region(
self._ls_bound_min, self._ls_bound_max,
self._subspace.get(trial_id, self._ls.space))
if self._gs is not None and self._experimental:
# TODO: key match for hierarchical space
self._gs.add_evaluated_point(flatten_dict(config), objective)
elif metric_constraint_satisfied and self._create_condition(
result):
@ -320,7 +324,8 @@ class BlendSearch(Searcher):
del self._candidate_start_points[trial_id]
else:
self._started_from_low_cost = True
self._create_thread(config, result)
self._create_thread(config, result, self._subspace.get(
trial_id, self._ls.space))
# reset admissible region to ls bounding box
self._gs_admissible_min.update(self._ls_bound_min)
self._gs_admissible_max.update(self._ls_bound_max)
@ -328,29 +333,46 @@ class BlendSearch(Searcher):
if thread_id and thread_id in self._search_thread_pool:
# local search thread
self._clean(thread_id)
if trial_id in self._subspace and not (self._candidate_start_points
and trial_id in self._candidate_start_points):
del self._subspace[trial_id]
def _create_thread(self, config, result):
def _create_thread(self, config, result, space):
# logger.info(f"create local search thread from {config}")
self._search_thread_pool[self._thread_count] = SearchThread(
self._ls.mode,
self._ls.create(
config, result[self._ls.metric],
cost=result.get(self.cost_attr, 1)),
cost=result.get(self.cost_attr, 1), space=space),
self.cost_attr
)
self._thread_count += 1
self._update_admissible_region(
config, self._ls_bound_min, self._ls_bound_max)
config, self._ls_bound_min, self._ls_bound_max, space)
def _update_admissible_region(self, config, admissible_min, admissible_max):
def _update_admissible_region(
self, config, admissible_min, admissible_max, space: Dict = {}
):
# update admissible region
normalized_config = self._ls.normalize(config)
normalized_config = normalize(config, space, config, {})
for key in admissible_min:
value = normalized_config[key]
if value > admissible_max[key]:
admissible_max[key] = value
elif value < admissible_min[key]:
admissible_min[key] = value
if isinstance(admissible_max[key], list):
choice = space[key]['_choice_']
self._update_admissible_region(
value,
admissible_min[key][choice], admissible_max[key][choice],
space[key]
)
elif isinstance(value, dict):
self._update_admissible_region(
value,
admissible_min[key], admissible_max[key], space[key])
else:
if value > admissible_max[key]:
admissible_max[key] = value
elif value < admissible_min[key]:
admissible_min[key] = value
def _create_condition(self, result: Dict) -> bool:
''' create thread condition
@ -379,8 +401,11 @@ class BlendSearch(Searcher):
break
create_new = False
if self._search_thread_pool[thread_id].converged:
self._is_ls_ever_converged = True
todelete.add(thread_id)
self._expand_admissible_region()
self._expand_admissible_region(
self._ls_bound_min, self._ls_bound_max,
self._search_thread_pool[thread_id].space)
if self._candidate_start_points:
if not self._started_from_given:
# remove start points whose perf is worse than the converged
@ -417,12 +442,21 @@ class BlendSearch(Searcher):
config[key[7:]] = value
self._started_from_given = True
del self._candidate_start_points[best_trial_id]
self._create_thread(config, result)
self._create_thread(config, result, self._subspace.get(
best_trial_id, self._ls.space))
def _expand_admissible_region(self):
for key in self._ls_bound_max:
self._ls_bound_max[key] += self._ls.STEPSIZE
self._ls_bound_min[key] -= self._ls.STEPSIZE
def _expand_admissible_region(self, lower, upper, space):
for key in upper:
ub = upper[key]
if isinstance(ub, list):
choice = space[key]['_choice_']
self._expand_admissible_region(
lower[key][choice], upper[key][choice], space[key])
elif isinstance(ub, dict):
self._expand_admissible_region(lower[key], ub, space[key])
else:
upper[key] += self._ls.STEPSIZE
lower[key] -= self._ls.STEPSIZE
def _inferior(self, id1: int, id2: int) -> bool:
''' whether thread id1 is inferior to id2
@ -460,34 +494,42 @@ class BlendSearch(Searcher):
if choice and config is None:
# local search thread finishes
if self._search_thread_pool[choice].converged:
self._expand_admissible_region()
self._expand_admissible_region(
self._ls_bound_min, self._ls_bound_max,
self._search_thread_pool[choice].space)
del self._search_thread_pool[choice]
return None
# preliminary check; not checking config validation
skip = self._should_skip(choice, trial_id, config)
space = self._search_thread_pool[choice].space
skip = self._should_skip(choice, trial_id, config, space)
use_rs = 0
if skip:
if choice:
return None
# use rs when BO fails to suggest a config
for _, generated in generate_variants({'config': self._ls.space}):
config = generated['config']
break # get one random config
skip = self._should_skip(-1, trial_id, config)
config, space = self._ls.complete_config({})
skip = self._should_skip(-1, trial_id, config, space)
if skip:
return None
if choice or self._valid(config):
use_rs = 1
if choice or self._valid(
config, space, self._gs_admissible_min, self._gs_admissible_max):
# LS or valid or no backup choice
self._trial_proposed_by[trial_id] = choice
self._search_thread_pool[choice].running += use_rs
else: # invalid config proposed by GS
if choice == backup:
# use CFO's init point
init_config = self._ls.init_config
config = self._ls.complete_config(
config, space = self._ls.complete_config(
init_config, self._ls_bound_min, self._ls_bound_max)
self._trial_proposed_by[trial_id] = choice
self._search_thread_pool[choice].running += 1
else:
config = self._search_thread_pool[backup].suggest(trial_id)
skip = self._should_skip(backup, trial_id, config)
thread = self._search_thread_pool[backup]
config = thread.suggest(trial_id)
space = thread.space
skip = self._should_skip(backup, trial_id, config, space)
if skip:
return None
self._trial_proposed_by[trial_id] = backup
@ -498,21 +540,24 @@ class BlendSearch(Searcher):
config[self._ls.prune_attr] = self._ls.min_resource
# temporarily relax admissible region for parallel proposals
self._update_admissible_region(
config, self._gs_admissible_min, self._gs_admissible_max)
config, self._gs_admissible_min, self._gs_admissible_max,
space)
else:
self._update_admissible_region(
config, self._ls_bound_min, self._ls_bound_max)
config, self._ls_bound_min, self._ls_bound_max, space)
self._gs_admissible_min.update(self._ls_bound_min)
self._gs_admissible_max.update(self._ls_bound_max)
self._result[self._ls.config_signature(config)] = {}
signature = self._ls.config_signature(config, space)
self._result[signature] = {}
self._subspace[trial_id] = space
else: # use init config
if self._candidate_start_points is not None and self._points_to_evaluate:
self._candidate_start_points[trial_id] = None
init_config = self._points_to_evaluate.pop(
0) if self._points_to_evaluate else self._ls.init_config
config = self._ls.complete_config(
config, space = self._ls.complete_config(
init_config, self._ls_bound_min, self._ls_bound_max)
config_signature = self._ls.config_signature(config)
config_signature = self._ls.config_signature(config, space)
result = self._result.get(config_signature)
if result: # tried before
return None
@ -523,15 +568,16 @@ class BlendSearch(Searcher):
self._init_used = True
self._trial_proposed_by[trial_id] = 0
self._search_thread_pool[0].running += 1
self._subspace[trial_id] = space
return config
def _should_skip(self, choice, trial_id, config) -> bool:
def _should_skip(self, choice, trial_id, config, space) -> bool:
''' if config is None or config's result is known or constraints are violated
return True; o.w. return False
'''
if config is None:
return True
config_signature = self._ls.config_signature(config)
config_signature = self._ls.config_signature(config, space)
exists = config_signature in self._result
# check constraints
if not exists and self._config_constraints:
@ -600,23 +646,35 @@ class BlendSearch(Searcher):
backup_thread_id = thread_id
return top_thread_id, backup_thread_id
def _valid(self, config: Dict) -> bool:
def _valid(self, config: Dict, space: Dict, lower: Dict, upper: Dict) -> bool:
''' config validator
'''
normalized_config = self._ls.normalize(config)
for key in self._gs_admissible_min:
normalized_config = normalize(config, space, config, {})
for key, lb in lower.items():
if key in config:
value = normalized_config[key]
if value + self._ls.STEPSIZE < self._gs_admissible_min[key] \
or value > self._gs_admissible_max[key] + self._ls.STEPSIZE:
if isinstance(lb, list):
subspace = space[key]['_choice_']
elif isinstance(lb, dict):
subspace = space[key]
else:
subspace = None
if subspace:
valid = self._valid(value, subspace, lb, upper[key])
if not valid:
return False
elif (value + self._ls.STEPSIZE < lower[key]
or value > upper[key] + self._ls.STEPSIZE):
return False
return True
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)
except ImportError:
except (ImportError, AssertionError):
from ..tune.sample import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform)

View File

@ -3,17 +3,20 @@
* Licensed under the MIT License. See LICENSE file in the
* project root for license information.
'''
from typing import Dict, Optional
from typing import Dict, Optional, Tuple
import numpy as np
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune.suggest import Searcher
from ray.tune.suggest.variant_generator import generate_variants
from ray.tune import sample
from ray.tune.utils.util import flatten_dict, unflatten_dict
except ImportError:
except (ImportError, AssertionError):
from .suggestion import Searcher
from .variant_generator import generate_variants, flatten_dict, unflatten_dict
from ..tune import sample
from ..tune.space import complete_config, denormalize, normalize
import logging
@ -31,7 +34,6 @@ class FLOW2(Searcher):
init_config: dict,
metric: Optional[str] = None,
mode: Optional[str] = None,
cat_hp_cost: Optional[dict] = None,
space: Optional[dict] = None,
prune_attr: Optional[str] = None,
min_resource: Optional[float] = None,
@ -90,12 +92,11 @@ class FLOW2(Searcher):
elif mode == "min":
self.metric_op = 1.
self.space = space or {}
self.space = flatten_dict(self.space, prevent_delimiter=True)
self._space = flatten_dict(self.space, prevent_delimiter=True)
self._random = np.random.RandomState(seed)
self._seed = seed
self.init_config = init_config
self.best_config = flatten_dict(init_config)
self.cat_hp_cost = cat_hp_cost
self.prune_attr = prune_attr
self.min_resource = min_resource
self.resource_multiple_factor = resource_multiple_factor or 4
@ -109,18 +110,9 @@ class FLOW2(Searcher):
def _init_search(self):
self._tunable_keys = []
self._bounded_keys = []
# choices of numeric values. integer encoding.
# value: (ordered list of choices,
# dict from choice to index in the ordered list)
self._ordered_choice_hp = {}
# choices with given cost. integer encoding.
# value: (array of choices ordered by cost,
# dict from choice to index in the ordered array)
self._ordered_cat_hp = {}
# unordered choices. value: cardinality
self._unordered_cat_hp = {}
self._cat_hp_cost = {}
for key, domain in self.space.items():
hier = False
for key, domain in self._space.items():
assert not (isinstance(domain, dict) and 'grid_search' in domain), \
f"{key}'s domain is grid search, not supported in FLOW^2."
if callable(getattr(domain, 'get_sampler', None)):
@ -129,41 +121,33 @@ class FLOW2(Searcher):
# the step size lower bound for uniform variables doesn't depend
# on the current config
if isinstance(sampler, sample.Quantized):
sampler_inner = sampler.get_sampler()
if str(sampler_inner) == 'Uniform':
q = sampler.q
sampler = sampler.get_sampler()
if str(sampler) == 'Uniform':
self._step_lb = min(
self._step_lb, sampler.q / (domain.upper - domain.lower))
self._step_lb, q / (domain.upper - domain.lower))
elif isinstance(domain, sample.Integer) and str(sampler) == 'Uniform':
self._step_lb = min(
self._step_lb, 1.0 / (domain.upper - domain.lower))
self._step_lb, 1.0 / (domain.upper - 1 - domain.lower))
if isinstance(domain, sample.Categorical):
cat_hp_cost = self.cat_hp_cost
if cat_hp_cost and key in cat_hp_cost:
cost = np.array(cat_hp_cost[key])
ind = np.argsort(cost)
ordered = np.array(domain.categories)[ind]
cost = self._cat_hp_cost[key] = cost[ind]
d = {}
for i, choice in enumerate(ordered):
d[choice] = i
self._ordered_cat_hp[key] = (ordered, d)
elif all(isinstance(x, int) or isinstance(x, float)
for x in domain.categories):
ordered = sorted(domain.categories)
d = {}
for i, choice in enumerate(ordered):
d[choice] = i
self._ordered_choice_hp[key] = (ordered, d)
else:
if not domain.ordered:
self._unordered_cat_hp[key] = len(domain.categories)
if not hier:
for cat in domain.categories:
if isinstance(cat, dict):
hier = True
break
if str(sampler) != 'Normal':
self._bounded_keys.append(key)
self._space_keys = list(self.space.keys())
if (self.prune_attr and self.prune_attr not in self.space
if not hier:
self._space_keys = sorted(self._space.keys())
self._hierarchical = hier
if (self.prune_attr and self.prune_attr not in self._space
and self.max_resource):
self._space_keys.append(self.prune_attr)
self.min_resource = self.min_resource or self._min_resource()
self._resource = self._round(self.min_resource)
if not hier:
self._space_keys.append(self.prune_attr)
else:
self._resource = None
self.incumbent = {}
@ -203,20 +187,21 @@ class FLOW2(Searcher):
for key in self._tunable_keys:
if key not in self.best_config:
continue
domain = self.space[key]
domain = self._space[key]
sampler = domain.get_sampler()
# the stepsize lower bound for log uniform variables depends on the
# current config
if isinstance(sampler, sample.Quantized):
q = sampler.q
sampler_inner = sampler.get_sampler()
if str(sampler_inner) == 'LogUniform':
step_lb = min(
step_lb, np.log(1.0 + sampler.q / self.best_config[key])
step_lb, np.log(1.0 + q / self.best_config[key])
/ np.log(domain.upper / domain.lower))
elif isinstance(domain, sample.Integer) and str(sampler) == 'LogUniform':
step_lb = min(
step_lb, np.log(1.0 + 1.0 / self.best_config[key])
/ np.log(domain.upper / domain.lower))
/ np.log((domain.upper - 1) / domain.lower))
if np.isinf(step_lb):
step_lb = self.STEP_LOWER_BOUND
else:
@ -246,56 +231,26 @@ class FLOW2(Searcher):
def complete_config(
self, partial_config: Dict,
lower: Optional[Dict] = None, upper: Optional[Dict] = None
) -> Dict:
) -> Tuple[Dict, Dict]:
''' generate a complete config from the partial config input
add minimal resource to config if available
'''
if self._reset_times and partial_config == self.init_config:
# not the first time to complete init_config, use random gaussian
normalized = self.normalize(partial_config)
for key in normalized:
# don't change unordered cat choice
if key not in self._unordered_cat_hp:
if upper and lower:
up, low = upper[key], lower[key]
gauss_std = up - low or self.STEPSIZE
# allowed bound
up += self.STEPSIZE
low -= self.STEPSIZE
elif key in self._bounded_keys:
up, low, gauss_std = 1, 0, 1.0
else:
up, low, gauss_std = np.Inf, -np.Inf, 1.0
if key in self._bounded_keys:
up = min(up, 1)
low = max(low, 0)
delta = self.rand_vector_gaussian(1, gauss_std)[0]
normalized[key] = max(low, min(up, normalized[key] + delta))
# use best config for unordered cat choice
config = self.denormalize(normalized)
else:
# first time init_config, or other configs, take as is
config = partial_config.copy()
disturb = self._reset_times and partial_config == self.init_config
# if not the first time to complete init_config, use random gaussian
config, space = complete_config(
partial_config, self.space, self, disturb, lower, upper)
if partial_config == self.init_config:
self._reset_times += 1
config = flatten_dict(config)
for key, value in self.space.items():
if key not in config:
config[key] = value
for _, generated in generate_variants({'config': config}):
config = generated['config']
break
if self._resource:
config[self.prune_attr] = self.min_resource
return unflatten_dict(config)
return config, space
def create(self, init_config: Dict, obj: float, cost: float) -> Searcher:
flatten_config = flatten_dict(init_config)
# use the subspace where the init_config is located
space = {k: self.space[k] for k in flatten_config if k in self.space}
def create(self, init_config: Dict, obj: float, cost: float, space: Dict
) -> Searcher:
# space is the subspace where the init_config is located
flow2 = self.__class__(
init_config, self.metric, self.mode, self._cat_hp_cost,
unflatten_dict(space), self.prune_attr,
init_config, self.metric, self.mode,
space, self.prune_attr,
self.min_resource, self.max_resource,
self.resource_multiple_factor, self.cost_attr, self._seed + 1)
flow2.best_obj = obj * self.metric_op # minimize internally
@ -303,115 +258,17 @@ class FLOW2(Searcher):
self._seed += 1
return flow2
def normalize(self, config) -> Dict:
def normalize(self, config, recursive=False) -> Dict:
''' normalize each dimension in config to [0,1]
'''
config_norm = {}
for key, value in flatten_dict(config).items():
if key in self.space:
# domain: sample.Categorical/Integer/Float/Function
domain = self.space[key]
if not callable(getattr(domain, 'get_sampler', None)):
config_norm[key] = value
else:
if isinstance(domain, sample.Categorical):
# normalize categorical
if key in self._ordered_cat_hp:
l, d = self._ordered_cat_hp[key]
config_norm[key] = (d[value] + 0.5) / len(l)
elif key in self._ordered_choice_hp:
l, d = self._ordered_choice_hp[key]
config_norm[key] = (d[value] + 0.5) / len(l)
elif key in self.incumbent:
config_norm[key] = self.incumbent[
key] if value == self.best_config[
key] else (
self.incumbent[key]
+ 1.0 / self._unordered_cat_hp[key]) % 1
else:
config_norm[key] = 0.5
continue
# Uniform/LogUniform/Normal/Base
sampler = domain.get_sampler()
if isinstance(sampler, sample.Quantized):
# sampler is sample.Quantized
sampler = sampler.get_sampler()
if str(sampler) == 'LogUniform':
config_norm[key] = np.log(value / domain.lower) / np.log(
domain.upper / domain.lower)
elif str(sampler) == 'Uniform':
config_norm[key] = (
value - domain.lower) / (domain.upper - domain.lower)
elif str(sampler) == 'Normal':
# N(mean, sd) -> N(0,1)
config_norm[key] = (value - sampler.mean) / sampler.sd
else:
# TODO? elif str(sampler) == 'Base': # sample.Function._CallSampler
# e.g., {test: sample_from(lambda spec: randn(10, 2).sample() * 0.01)}
config_norm[key] = value
else: # prune_attr
config_norm[key] = value
return config_norm
return normalize(
config, self._space, self.best_config, self.incumbent, recursive)
def denormalize(self, config):
''' denormalize each dimension in config from [0,1]
'''
config_denorm = {}
for key, value in config.items():
if key in self.space:
# domain: sample.Categorical/Integer/Float/Function
domain = self.space[key]
if not callable(getattr(domain, 'get_sampler', None)):
config_denorm[key] = value
else:
if isinstance(domain, sample.Categorical):
# denormalize categorical
if key in self._ordered_cat_hp:
l, _ = self._ordered_cat_hp[key]
n = len(l)
config_denorm[key] = l[min(n - 1, int(np.floor(value * n)))]
elif key in self._ordered_choice_hp:
l, _ = self._ordered_choice_hp[key]
n = len(l)
config_denorm[key] = l[min(n - 1, int(np.floor(value * n)))]
else:
assert key in self.incumbent
n = self._unordered_cat_hp[key]
if np.floor(value * n) == np.floor(self.incumbent[key] * n):
config_denorm[key] = self.best_config[key]
else: # ****random value each time!****
config_denorm[key] = self._random.choice(
[x for x in domain.categories
if x != self.best_config[key]])
continue
# Uniform/LogUniform/Normal/Base
sampler = domain.get_sampler()
if isinstance(sampler, sample.Quantized):
# sampler is sample.Quantized
sampler = sampler.get_sampler()
# Handle Log/Uniform
if str(sampler) == 'LogUniform':
config_denorm[key] = (
domain.upper / domain.lower) ** value * domain.lower
elif str(sampler) == 'Uniform':
config_denorm[key] = value * (
domain.upper - domain.lower) + domain.lower
elif str(sampler) == 'Normal':
# denormalization for 'Normal'
config_denorm[key] = value * sampler.sd + sampler.mean
else:
config_denorm[key] = value
# Handle quantized
sampler = domain.get_sampler()
if isinstance(sampler, sample.Quantized):
config_denorm[key] = np.round(
np.divide(config_denorm[key], sampler.q)) * sampler.q
# Handle int (4.6 -> 5)
if isinstance(domain, sample.Integer):
config_denorm[key] = int(round(config_denorm[key]))
else: # prune_attr
config_denorm[key] = value
return config_denorm
return denormalize(
config, self._space, self.best_config, self.incumbent, self._random)
def set_search_properties(self,
metric: Optional[str] = None,
@ -428,6 +285,7 @@ class FLOW2(Searcher):
self.metric_op = 1.
if config:
self.space = config
self._space = flatten_dict(self.space)
self._init_search()
return True
@ -600,7 +458,7 @@ class FLOW2(Searcher):
for i, key in enumerate(self._tunable_keys):
if self._direction_tried[i] != 0:
for _, generated in generate_variants({'config': {
key: self.space[key]
key: self._space[key]
}}):
if generated['config'][key] != best_config[key]:
config[key] = generated['config'][key]
@ -632,26 +490,27 @@ class FLOW2(Searcher):
'''
return self._num_allowed4incumbent > 0
def config_signature(self, config) -> tuple:
def config_signature(self, config, space: Dict = None) -> tuple:
''' return the signature tuple of a config
'''
config = flatten_dict(config)
if space:
space = flatten_dict(space)
else:
space = self._space
value_list = []
for key in self._space_keys:
if key in config:
value = config[key]
if key == self.prune_attr:
value_list.append(value)
# else key must be in self.space
# get rid of list type or constant,
# e.g., "eval_metric": ["logloss", "error"]
elif callable(getattr(self.space[key], 'sample', None)):
if isinstance(self.space[key], sample.Integer):
value_list.append(int(round(value)))
else:
value_list.append(value)
keys = sorted(config.keys()) if self._hierarchical else self._space_keys
for key in keys:
value = config[key]
if key == self.prune_attr:
value_list.append(value)
# else key must be in self.space
# get rid of list type or constant,
# e.g., "eval_metric": ["logloss", "error"]
elif isinstance(space[key], sample.Integer):
value_list.append(int(round(value)))
else:
value_list.append(None)
value_list.append(value)
return tuple(value_list)
@property

View File

@ -6,10 +6,13 @@
from typing import Dict, Optional
import numpy as np
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune.suggest import Searcher
except ImportError:
except (ImportError, AssertionError):
from .suggestion import Searcher
from .flow2 import FLOW2
from ..tune.space import (add_cost_to_space, unflatten_hierarchical)
import logging
logger = logging.getLogger(__name__)
@ -41,6 +44,12 @@ class SearchThread:
self._init_config = True
self.running = 0 # the number of running trials from the thread
self.cost_attr = cost_attr
if search_alg:
self.space = self._space = search_alg.space # unflattened space
# TODO: remove when define_by_run is supported
if not isinstance(self._search_alg, FLOW2):
# remember const config
self._const = add_cost_to_space(self.space, {}, {})
@classmethod
def set_eps(cls, time_budget_s):
@ -54,6 +63,9 @@ class SearchThread:
else:
try:
config = self._search_alg.suggest(trial_id)
# TODO: remove when define_by_run is supported
config.update(self._const)
config, self.space = unflatten_hierarchical(config, self._space)
except FloatingPointError:
logger.warning(
'The global search method raises FloatingPointError. '

View File

@ -1,9 +1,11 @@
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform, lograndint)
except ImportError:
qrandn, loguniform, qloguniform, lograndint, qlograndint)
except (ImportError, AssertionError):
from .sample import (uniform, quniform, choice, randint, qrandint, randn,
qrandn, loguniform, qloguniform, lograndint)
qrandn, loguniform, qloguniform, lograndint, qlograndint)
from .tune import run, report
from .sample import polynomial_expansion_set
from .sample import PolynomialExpansionSet, Categorical, Float

View File

@ -6,6 +6,6 @@
"pip": {"Name": "ray[tune]", "Version": "1.5.1" }
},
"DevelopmentDependency": false
},
}
]
}

View File

@ -1,9 +1,14 @@
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune import sample
except ImportError:
from ray.tune.suggest.variant_generator import generate_variants
except (ImportError, AssertionError):
from . import sample
from typing import Dict, Optional, Any
from ..searcher.variant_generator import generate_variants
from typing import Dict, Optional, Any, Tuple
import numpy as np
import logging
logger = logging.getLogger(__name__)
@ -15,7 +20,7 @@ def define_by_run_func(
"""Define-by-run function to create the search space.
Returns:
None or a dict with constant values.
A dict with constant values.
"""
config = {}
for key, domain in space.items():
@ -50,12 +55,16 @@ def define_by_run_func(
elif isinstance(domain, sample.Integer):
if isinstance(sampler, sample.LogUniform):
trial.suggest_int(
key, domain.lower, domain.upper, step=quantize or 1, log=True)
key, domain.lower,
domain.upper - int(bool(not quantize)),
step=quantize or 1, log=True)
elif isinstance(sampler, sample.Uniform):
# Upper bound should be inclusive for quantization and
# exclusive otherwise
trial.suggest_int(
key, domain.lower, domain.upper, step=quantize or 1)
key, domain.lower,
domain.upper - int(bool(not quantize)),
step=quantize or 1)
elif isinstance(domain, sample.Categorical):
if isinstance(sampler, sample.Uniform):
if not hasattr(domain, 'choices'):
@ -76,3 +85,352 @@ def define_by_run_func(
type(domain.sampler).__name__))
# Return all constants in a dictionary.
return config
def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
    '''Rebuild a hierarchical config, and its matching subspace, from config.

    Each key of `config` is reduced to the part after its last '/'.
    A key of the form "name:index" denotes a nested choice: its value is
    unflattened recursively against `space[name][index]`.
    Any other key is copied over; when `space` has a domain for it, the
    domain is recorded in the returned subspace, and a value drawn from a
    quantized log-uniform domain is rounded to a multiple of the
    quantization step and cast with `domain.cast`.

    Returns:
        (config, subspace): the unflattened config and the part of
        `space` that the config refers to.
    '''
    nested_config = {}
    nested_space = {}
    for flat_key, value in config.items():
        # drop any path prefix; without a '/' the key is kept as is
        name = flat_key.rsplit('/', 1)[-1]
        head, colon, tail = name.rpartition(':')
        if colon:
            picked = int(tail)
            nested_config[head], nested_space[head] = unflatten_hierarchical(
                value, space[head][picked])
            continue
        domain = space.get(name)
        if domain is not None:
            nested_space[name] = domain
            if isinstance(domain, sample.Domain):
                sampler = domain.sampler
                if isinstance(sampler, sample.Quantized):
                    step = sampler.q
                    inner = sampler.sampler
                    if isinstance(inner, sample.LogUniform):
                        value = domain.cast(np.round(value / step) * step)
        nested_config[name] = value
    return nested_config, nested_space
def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
    """Update the space in place by adding low_cost_point and choice_cost

    For every key of `space`:
    - a nested dict is processed recursively and its constants, if any,
      are collected under the same key;
    - any other non-Domain value is a constant and is collected as is;
    - a Domain with a `get_sampler` method gets a `bounded` attribute
      (False only for a 'Normal' sampler);
    - a Categorical domain additionally gets `const` (per-category
      constants, None for non-dict categories) and `ordered`. When a
      choice cost is given, `categories`, `const` and `choice_cost` are
      reordered by increasing cost; otherwise all-numeric categories are
      sorted by value;
    - `low_cost_point` is attached to the domain when one is given.

    Note that a low-cost list given for a Categorical domain is itself
    reordered/modified in place to follow the sorted categories.

    Args:
        space: The search space; its domains are annotated in place.
        low_cost_point: Low-cost values with the same nesting as space.
            For a Categorical of nested dicts this is a list with one
            entry per category, optionally followed by the index of the
            low-cost category.
        choice_cost: Costs of the categorical choices with the same
            nesting as space. For a Categorical this is either one cost
            per category, or a list whose last element is that cost list.

    Returns:
        A dict with constant values.
    """
    config = {}
    for key in space:
        domain = space[key]
        if not isinstance(domain, sample.Domain):
            if isinstance(domain, dict):
                low_cost = low_cost_point.get(key, {})
                choice_cost_list = choice_cost.get(key, {})
                const = add_cost_to_space(
                    domain, low_cost, choice_cost_list)
                if const:
                    config[key] = const
            else:
                config[key] = domain
            continue
        low_cost = low_cost_point.get(key)
        choice_cost_list = choice_cost.get(key)
        if callable(getattr(domain, 'get_sampler', None)):
            sampler = domain.get_sampler()
            if isinstance(sampler, sample.Quantized):
                sampler = sampler.get_sampler()
            domain.bounded = str(sampler) != 'Normal'
        if isinstance(domain, sample.Categorical):
            domain.const = []
            for i, cat in enumerate(domain.categories):
                if isinstance(cat, dict):
                    low_cost_dict = (
                        low_cost[i] if isinstance(low_cost, list) else {})
                    choice_cost_dict = (
                        choice_cost_list[i] if choice_cost_list else {})
                    domain.const.append(add_cost_to_space(
                        cat, low_cost_dict, choice_cost_dict))
                else:
                    domain.const.append(None)
            if choice_cost_list:
                if len(choice_cost_list) == len(domain.categories):
                    domain.choice_cost = choice_cost_list
                else:
                    domain.choice_cost = choice_cost_list[-1]
                # sort the choices by cost
                cost = np.array(domain.choice_cost)
                ind = np.argsort(cost)
                domain.categories = [domain.categories[i] for i in ind]
                domain.choice_cost = cost[ind]
                domain.const = [domain.const[i] for i in ind]
                domain.ordered = True
            elif all(isinstance(x, (int, float)) for x in domain.categories):
                # sort the choices by value
                ind = np.argsort(domain.categories)
                domain.categories = [domain.categories[i] for i in ind]
                domain.ordered = True
            else:
                domain.ordered = False
            if low_cost and low_cost not in domain.categories:
                assert isinstance(low_cost, list), \
                    f"low cost {low_cost} not in domain {domain.categories}"
                if domain.ordered:
                    # keep the per-category points aligned with the
                    # reordered categories
                    sorted_points = [low_cost[i] for i in ind]
                    for i, point in enumerate(sorted_points):
                        low_cost[i] = point
                if len(low_cost) > len(domain.categories):
                    if domain.ordered:
                        # translate the original category index into its
                        # position after sorting; take the scalar
                        # explicitly instead of converting a 1-element
                        # array with int()
                        low_cost[-1] = int(
                            np.where(ind == low_cost[-1])[0][0])
                    domain.low_cost_point = low_cost[-1]
                # This key is done. Move on to the next key instead of
                # returning, so that the remaining keys are processed and
                # the collected constants are returned to the caller.
                continue
        if low_cost:
            domain.low_cost_point = low_cost
    return config
def normalize(
    config: Dict, space: Dict, reference_config: Dict,
    normalized_reference_config: Dict, recursive: bool = False,
):
    '''Normalize config in space according to reference_config.

    Keys without a domain in `space` (e.g., prune_attr) and keys whose
    domain has no `get_sampler` method are copied unchanged, except that
    a nested dict domain is normalized recursively when `recursive` is
    set. For the remaining domains:
    - ordered categorical: (index of the category + 0.5) / #categories;
    - unordered categorical: the normalized reference value when the
      value equals the reference value, otherwise that value shifted by
      1 / #categories (modulo 1); 0.5 when there is no normalized
      reference for the key;
    - Uniform / LogUniform: position between lower and upper on a linear
      / log scale, where the upper bound of a non-quantized integer
      domain is reduced by 1;
    - Normal: (value - mean) / sd;
    - anything else: the value itself.

    A categorical value that is not one of the categories is skipped,
    unless it is a list longer than the categories (a low-cost point
    list whose last element is the chosen index). In that case
    `config[key]` is replaced in place by the chosen element.
    '''
    scaled = {}
    for name in config:
        raw = config[name]
        domain = space.get(name)
        if domain is None:  # e.g., prune_attr
            scaled[name] = raw
            continue
        if not callable(getattr(domain, 'get_sampler', None)):
            descend = recursive and isinstance(domain, dict)
            scaled[name] = normalize(
                raw, domain, reference_config[name], {}) if descend else raw
            continue
        # domain: sample.Categorical/Integer/Float/Function
        if isinstance(domain, sample.Categorical):
            nested = None
            # raw is either one category, or the low_cost_point list
            if raw not in domain.categories:
                if recursive:
                    nested = [
                        normalize(raw[i], cat, reference_config[name][i], {})
                        for i, cat in enumerate(domain.categories)
                    ]
                has_index = isinstance(raw, list) and len(raw) > len(
                    domain.categories)
                if not has_index:
                    continue
                # low_cost_point list: the last element is the chosen index
                picked = raw[-1]
                config[name] = raw[picked]
                raw = domain.categories[picked]
            count = len(domain.categories)
            if domain.ordered:
                position = (domain.categories.index(raw) + 0.5) / count
            elif name in normalized_reference_config:
                if raw == reference_config[name]:
                    position = normalized_reference_config[name]
                else:
                    position = (
                        normalized_reference_config[name] + 1 / count) % 1
            else:
                position = 0.5
            if nested:
                nested.append(position)
                scaled[name] = nested
            else:
                scaled[name] = position
            continue
        # Uniform/LogUniform/Normal/Base
        sampler = domain.get_sampler()
        step = None
        if isinstance(sampler, sample.Quantized):
            step = sampler.q
            sampler = sampler.get_sampler()
        kind = str(sampler)
        if kind == 'LogUniform':
            top = domain.upper - (
                isinstance(domain, sample.Integer) and step is None)
            scaled[name] = np.log(raw / domain.lower) / np.log(
                top / domain.lower)
        elif kind == 'Uniform':
            top = domain.upper - (
                isinstance(domain, sample.Integer) and step is None)
            scaled[name] = (raw - domain.lower) / (top - domain.lower)
        elif kind == 'Normal':
            # N(mean, sd) -> N(0,1)
            scaled[name] = (raw - sampler.mean) / sampler.sd
        else:
            # TODO? elif str(sampler) == 'Base': # sample.Function._CallSampler
            # e.g., {test: sample_from(lambda spec: randn(10, 2).sample() * 0.01)}
            scaled[name] = raw
    return scaled
def denormalize(
    config: Dict, space: Dict, reference_config: Dict,
    normalized_reference_config: Dict, random_state
):
    '''Map a normalized config back to values in space.

    Keys that are not in `space` (e.g., prune_attr) and keys whose domain
    has no `get_sampler` method are copied unchanged. For the others:
    - ordered categorical: the category at floor(value * #categories),
      capped at the last category;
    - unordered categorical: the reference value when the value falls in
      the same bucket as the normalized reference, otherwise a category
      different from the reference value picked with
      `random_state.choice` (so the result can differ between calls);
    - Uniform / LogUniform / Normal: the inverse of the corresponding
      scaling, where the upper bound of a non-quantized integer domain is
      reduced by 1; the result is then rounded to a multiple of the
      quantization step if there is one, and to an int for integer
      domains.
    '''
    restored = {}
    for name, norm in config.items():
        if name not in space:  # prune_attr
            restored[name] = norm
            continue
        # domain: sample.Categorical/Integer/Float/Function
        domain = space[name]
        if not callable(getattr(domain, 'get_sampler', None)):
            restored[name] = norm
            continue
        if isinstance(domain, sample.Categorical):
            count = len(domain.categories)
            if domain.ordered:
                slot = min(count - 1, int(np.floor(norm * count)))
                restored[name] = domain.categories[slot]
                continue
            assert name in normalized_reference_config
            same_bucket = np.floor(norm * count) == np.floor(
                normalized_reference_config[name] * count)
            if same_bucket:
                restored[name] = reference_config[name]
            else:  # ****random value each time!****
                others = [cat for cat in domain.categories
                          if cat != reference_config[name]]
                restored[name] = random_state.choice(others)
            continue
        # Uniform/LogUniform/Normal/Base
        sampler = domain.get_sampler()
        step = None
        if isinstance(sampler, sample.Quantized):
            step = sampler.q
            sampler = sampler.get_sampler()
        kind = str(sampler)
        if kind == 'LogUniform':
            top = domain.upper - (
                isinstance(domain, sample.Integer) and step is None)
            actual = (top / domain.lower) ** norm * domain.lower
        elif kind == 'Uniform':
            top = domain.upper - (
                isinstance(domain, sample.Integer) and step is None)
            actual = norm * (top - domain.lower) + domain.lower
        elif kind == 'Normal':
            actual = norm * sampler.sd + sampler.mean
        else:
            actual = norm
        if step is not None:
            # snap to the quantization grid
            actual = np.round(np.divide(actual, step)) * step
        if isinstance(domain, sample.Integer):
            # 4.6 -> 5
            actual = int(round(actual))
        restored[name] = actual
    return restored
def indexof(domain: Dict, config: Dict) -> int:
    '''Find the index of config in domain.categories.

    The lookup proceeds in three steps:
    1. if config carries a '_choice_' entry, that entry is the index;
    2. if config equals one of the categories, its position is returned;
    3. otherwise the first dict category that has the same number of keys
       as config, whose keys all appear in config, and whose constants
       (`domain.const[i]`) all match the values in config.

    Returns None when nothing matches.
    '''
    picked = config.get('_choice_')
    if picked is not None:
        return picked
    categories = domain.categories
    if config in categories:
        return categories.index(config)
    given_keys = set(config.keys())
    for position, category in enumerate(categories):
        comparable = (
            isinstance(category, dict)
            and len(category) == len(config)
            and set(category.keys()).issubset(given_keys)
        )
        if comparable and all(
            config[name] == constant
            for name, constant in domain.const[position].items()
        ):
            return position
    return None
def complete_config(
    partial_config: Dict, space: Dict, flow2, disturb: bool = False,
    lower: Optional[Dict] = None, upper: Optional[Dict] = None
) -> Tuple[Dict, Dict]:
    '''Complete partial config in space

    Keys of `space` that are missing from the partial config are filled
    in with their domain and resolved with the first variant produced by
    `generate_variants`. Nested values are completed recursively; for a
    nested categorical choice the index of the chosen category is stored
    in the returned subspace under the reserved key '_choice_'.

    Args:
        partial_config: The (possibly incomplete) config to start from.
            It is shallow-copied before being completed.
        space: The search space to complete the config in.
        flow2: The searcher providing STEPSIZE, rand_vector_gaussian()
            and the random state `_random` used when disturbing.
        disturb: Whether to add gaussian noise, in normalized
            coordinates, to the sampled dimensions (unordered categorical
            choices are left untouched).
        lower: Normalized lower bounds used to clip the disturbed values.
        upper: Normalized upper bounds used to clip the disturbed values.
            The bounds are only used when both are given.

    Returns:
        config, space
    '''
    config = partial_config.copy()
    normalized = normalize(config, space, config, {})
    if disturb:
        for key in normalized:
            domain = space.get(key)
            if getattr(domain, 'ordered', True) is False:
                # don't change unordered cat choice
                continue
            if not callable(getattr(domain, 'get_sampler', None)):
                continue
            if upper and lower:
                up, low = upper[key], lower[key]
                # a degenerate (zero-width) region falls back to STEPSIZE
                gauss_std = (up - low) or flow2.STEPSIZE
                # allowed bound
                up += flow2.STEPSIZE
                low -= flow2.STEPSIZE
            elif domain.bounded:
                up, low, gauss_std = 1, 0, 1.0
            else:
                # np.inf rather than the np.Inf alias, which NumPy 2.0
                # removed
                up, low, gauss_std = np.inf, -np.inf, 1.0
            if domain.bounded:
                up = min(up, 1)
                low = max(low, 0)
            delta = flow2.rand_vector_gaussian(1, gauss_std)[0]
            normalized[key] = max(low, min(up, normalized[key] + delta))
        # NOTE(review): assumed to run only when disturbing, as in the
        # FLOW2.complete_config logic this function replaces; please
        # confirm against the original indentation.
        config = denormalize(
            normalized, space, config, normalized, flow2._random)
    for key, value in space.items():
        if key not in config:
            config[key] = value
    # resolve the remaining domains by taking one sampled variant
    for _, generated in generate_variants({'config': config}):
        config = generated['config']
        break
    subspace = {}
    for key, domain in space.items():
        value = config[key]
        if isinstance(value, dict):
            if isinstance(domain, sample.Categorical):
                # nested space
                index = indexof(domain, value)
                config[key], subspace[key] = complete_config(
                    value, domain.categories[index], flow2, disturb,
                    lower and lower[key][index], upper and upper[key][index]
                )
                assert '_choice_' not in subspace[key], \
                    "_choice_ is a reserved key for hierarchical search space"
                subspace[key]['_choice_'] = index
            else:
                config[key], subspace[key] = complete_config(
                    value, space[key], flow2, disturb,
                    lower and lower[key], upper and upper[key])
            continue
        subspace[key] = domain
    return config, subspace

View File

@ -21,11 +21,10 @@ import uuid
import time
from numbers import Number
from collections import deque
import copy
def flatten_dict(dt, delimiter="/", prevent_delimiter=False):
dt = copy.deepcopy(dt)
dt = dt.copy()
if prevent_delimiter and any(delimiter in key for key in dt):
# Raise if delimiter is any of the keys
raise ValueError(

View File

@ -4,10 +4,12 @@
* project root for license information.
'''
from typing import Optional
try:
from ray.tune.trial import Trial
except ImportError:
from .trial import Trial
# try:
# from ray import __version__ as ray_version
# assert ray_version >= '1.0.0'
# from ray.tune.trial import Trial
# except (ImportError, AssertionError):
from .trial import Trial
import logging
logger = logging.getLogger(__name__)

View File

@ -8,8 +8,10 @@ import numpy as np
import datetime
import time
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune.analysis import ExperimentAnalysis as EA
except ImportError:
except (ImportError, AssertionError):
from .analysis import ExperimentAnalysis as EA
import logging
logger = logging.getLogger(__name__)
@ -288,9 +290,11 @@ def run(training_function,
if reduction_factor:
params['reduction_factor'] = reduction_factor
try:
from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
from ray.tune.schedulers import ASHAScheduler
scheduler = ASHAScheduler(**params)
except ImportError:
except (ImportError, AssertionError):
pass
if use_ray:
try:

View File

@ -1 +1 @@
__version__ = "0.5.11"
__version__ = "0.5.12"

View File

@ -52,6 +52,11 @@ try:
except ImportError:
print("pip install torch transformers datasets flaml[blendsearch,ray]")
except ValueError as e:
if 'Connection error' in str(e):
print(e)
else:
raise(e)
import logging
logger = logging.getLogger(__name__)

View File

@ -35,6 +35,11 @@ try:
except ImportError:
print("pip install torch transformers datasets flaml[blendsearch,ray]")
except ValueError as e:
if 'Connection error' in str(e):
print(e)
else:
raise(e)
logger = logging.getLogger(__name__)
os.makedirs('logs', exist_ok=True)

View File

@ -52,6 +52,11 @@ try:
except ImportError:
print("pip install torch transformers datasets flaml[blendsearch,ray]")
except ValueError as e:
if 'Connection error' in str(e):
print(e)
else:
raise(e)
import logging
logger = logging.getLogger(__name__)

View File

@ -156,11 +156,42 @@ class TestAutoML(unittest.TestCase):
X = pd.DataFrame({
'f1': [1, -2, 3, -4, 5, -6, -7, 8, -9, -10, -11, -12, -13, -14],
'f2': [3., 16., 10., 12., 3., 14., 11., 12., 5., 14., 20., 16., 15., 11.],
'f3': ['a', 'b', 'a', 'c', 'c', 'b', 'b', 'b', 'b', 'a', 'b', 'e', 'e', 'a'],
'f3': ['a', 'b', 'a', 'c', 'c', 'b', 'b', 'b', 'b', 'a', 'b', 1.0, 1.0, 'a'],
'f4': [True, True, False, True, True, False, False, False, True, True, False, False, True, True],
})
y = pd.Series([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
automl = AutoML()
automl_settings = {
"time_budget": 6,
"task": 'classification',
"n_jobs": 1,
"estimator_list": ['catboost', 'lrl2'],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
"verbose": 1,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 2,
"task": 'classification',
"n_jobs": 1,
"estimator_list": ['lrl2', 'kneighbor'],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
"verbose": 1,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 3,
"task": 'classification',
@ -175,6 +206,21 @@ class TestAutoML(unittest.TestCase):
}
automl.fit(X, y, **automl_settings)
automl = AutoML()
automl_settings = {
"time_budget": 3,
"task": 'classification',
"n_jobs": 1,
"estimator_list": ['lgbm', 'catboost', 'kneighbor'],
"eval_method": "cv",
"n_splits": 3,
"metric": "accuracy",
"log_training_metric": True,
"verbose": 1,
"ensemble": True,
}
automl.fit(X, y, **automl_settings)
def test_dataframe(self):
    # Re-runs test_classification with its first argument set to True.
    # NOTE(review): presumably that flag switches the test to DataFrame
    # input -- confirm against test_classification's signature.
    self.test_classification(True)

View File

@ -1,6 +1,14 @@
def test_automl(budget=5):
from openml.exceptions import OpenMLServerException
def test_automl(budget=5, dataset_format='dataframe'):
from flaml.data import load_openml_dataset
X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='test/')
try:
X_train, X_test, y_train, y_test = load_openml_dataset(
dataset_id=1169, data_dir='test/', dataset_format=dataset_format)
except OpenMLServerException:
print("OpenMLServerException raised")
return
''' import AutoML class from flaml package '''
from flaml import AutoML
automl = AutoML()
@ -42,13 +50,22 @@ def test_automl(budget=5):
print(automl.min_resource)
def test_automl_array():
    # Run test_automl with budget=5 and dataset_format='array' instead of
    # the default 'dataframe'.
    test_automl(5, 'array')
def test_mlflow():
import subprocess
import sys
subprocess.check_call([sys.executable, "-m", "pip", "install", "mlflow"])
import mlflow
from flaml.data import load_openml_task
X_train, X_test, y_train, y_test = load_openml_task(task_id=7592, data_dir='test/')
try:
X_train, X_test, y_train, y_test = load_openml_task(
task_id=7592, data_dir='test/')
except OpenMLServerException:
print("OpenMLServerException raised")
return
''' import AutoML class from flaml package '''
from flaml import AutoML
automl = AutoML()

View File

@ -1,3 +1,4 @@
from flaml.tune.space import unflatten_hierarchical
from flaml import AutoML
from sklearn.datasets import load_boston
import os
@ -45,13 +46,38 @@ class TestLogging(unittest.TestCase):
logger.info(automl.search_space)
logger.info(automl.low_cost_partial_config)
logger.info(automl.points_to_evalaute)
logger.info(automl.cat_hp_cost)
import optuna as ot
study = ot.create_study()
from flaml.tune.space import define_by_run_func
logger.info(define_by_run_func(study.ask(), automl.search_space))
from flaml.tune.space import define_by_run_func, add_cost_to_space
sample = define_by_run_func(study.ask(), automl.search_space)
logger.info(sample)
logger.info(unflatten_hierarchical(sample, automl.search_space))
add_cost_to_space(
automl.search_space, automl.low_cost_partial_config,
automl.cat_hp_cost
)
logger.info(automl.search_space["ml"].categories)
config = automl.best_config.copy()
config['learner'] = automl.best_estimator
automl.trainable({"ml": config})
from flaml import tune, CFO
search_alg = CFO(
metric='val_loss',
space=automl.search_space,
low_cost_partial_config=automl.low_cost_partial_config,
points_to_evaluate=automl.points_to_evalaute,
cat_hp_cost=automl.cat_hp_cost,
prune_attr=automl.prune_attr,
min_resource=automl.min_resource,
max_resource=automl.max_resource,
config_constraints=[(automl.size, '<=', automl._mem_thres)],
metric_constraints=automl.metric_constraints)
analysis = tune.run(
automl.trainable, search_alg=search_alg, # verbose=2,
time_budget_s=1, num_samples=-1)
print(min((trial.last_result["val_loss"], trial.last_result)
for trial in analysis.trials))
# Check if the log buffer is populated.
self.assertTrue(len(buf.getvalue()) > 0)

View File

@ -10,10 +10,10 @@ from flaml.training_log import training_log_reader
class TestTrainingLog(unittest.TestCase):
def test_training_log(self):
def test_training_log(self, path='test_training_log.log'):
with TemporaryDirectory() as d:
filename = os.path.join(d, 'test_training_log.log')
filename = os.path.join(d, path)
# Run a simple job.
automl_experiment = AutoML()
@ -42,3 +42,15 @@ class TestTrainingLog(unittest.TestCase):
print(record)
count += 1
self.assertGreater(count, 0)
automl_settings["log_file_name"] = None
automl_experiment.fit(X_train=X_train, y_train=y_train,
**automl_settings)
def test_illfilename(self):
    # Run test_training_log with '/' as the log path, which is joined onto
    # the temporary directory to form the log file name. An
    # IsADirectoryError or PermissionError is tolerated and reported via
    # print; any other exception propagates.
    try:
        self.test_training_log('/')
    except IsADirectoryError:
        print("IsADirectoryError happens as expected in linux.")
    except PermissionError:
        print("PermissionError happens as expected in windows.")

View File

@ -59,7 +59,7 @@ def test_simple(method=None):
automl.trainable(config)
from flaml import tune
analysis = tune.run(
automl.trainable, automl.search_space, metric='val_loss',
automl.trainable, automl.search_space, metric='val_loss', mode="min",
low_cost_partial_config=automl.low_cost_partial_config,
points_to_evaluate=automl.points_to_evalaute,
cat_hp_cost=automl.cat_hp_cost,
@ -68,7 +68,7 @@ def test_simple(method=None):
max_resource=automl.max_resource,
time_budget_s=automl._state.time_budget,
config_constraints=[(automl.size, '<=', automl._mem_thres)],
metric_constraints=automl.metric_constraints)
metric_constraints=automl.metric_constraints, num_samples=5)
print(analysis.trials[-1])

View File

@ -52,7 +52,7 @@ def _test_xgboost(method='BlendSearch'):
else:
from ray import tune
search_space = {
"max_depth": tune.randint(1, 8) if method in [
"max_depth": tune.randint(1, 9) if method in [
"BlendSearch", "BOHB", "Optuna"] else tune.randint(1, 9),
"min_child_weight": tune.choice([1, 2, 3]),
"subsample": tune.uniform(0.5, 1.0),
@ -61,7 +61,7 @@ def _test_xgboost(method='BlendSearch'):
max_iter = 10
for num_samples in [128]:
time_budget_s = 60
for n_cpu in [8]:
for n_cpu in [4]:
start_time = time.time()
ray.init(num_cpus=n_cpu, num_gpus=0)
# ray.init(address='auto')
@ -168,7 +168,7 @@ def test_nested():
search_space = {
# test nested search space
"cost_related": {
"a": tune.randint(1, 8),
"a": tune.randint(1, 9),
},
"b": tune.uniform(0.5, 1.0),
}
@ -194,7 +194,7 @@ def test_nested():
metric_constraints=[("ab", "<=", 4)]),
local_dir='logs/',
num_samples=-1,
time_budget_s=.1)
time_budget_s=1)
best_trial = analysis.get_best_trial()
logger.info(f"CFO best config: {best_trial.config}")
@ -216,7 +216,7 @@ def test_nested():
metric_constraints=[("ab", "<=", 4)]),
local_dir='logs/',
num_samples=-1,
time_budget_s=.1)
time_budget_s=1)
best_trial = analysis.get_best_trial()
logger.info(f"BlendSearch exp best config: {best_trial.config}")
@ -233,7 +233,7 @@ def test_nested():
metric_constraints=[("ab", "<=", 4)],
local_dir='logs/',
num_samples=-1,
time_budget_s=.1)
time_budget_s=1)
best_trial = analysis.get_best_trial()
logger.info(f"BlendSearch best config: {best_trial.config}")
@ -251,10 +251,11 @@ def test_run_training_function_return_value():
tune.run(
evaluate_config_dict,
config={
'x': tune.lograndint(lower=1, upper=100000),
'y': tune.randint(lower=1, upper=100000)
'x': tune.qloguniform(lower=1, upper=100000, q=1),
'y': tune.qrandint(lower=2, upper=100000, q=2)
},
metric='metric',
metric='metric', mode='max',
num_samples=100,
)
# Test scalar return value
@ -265,9 +266,10 @@ def test_run_training_function_return_value():
tune.run(
evaluate_config_scalar,
config={
'x': tune.lograndint(lower=1, upper=100000),
'y': tune.randint(lower=1, upper=100000)
'x': tune.qloguniform(lower=1, upper=100000, q=1),
'y': tune.qlograndint(lower=2, upper=100000, q=2)
},
num_samples=100, mode='max',
)