mirror of https://github.com/microsoft/autogen.git (synced 2025-09-26 00:24:26 +00:00)
include default value in rf search space (#317)

* include default value in rf search space
* init _mem_per_iter with -1
* bump version to 0.8.2
* docstr for search space's arguments

This commit is contained in:
parent 1545d5a6d2
commit c57954fbbd
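The diff below threads a single interface change through the code base: `data_size` changes from a scalar row count to the full `X_train.shape` tuple, so every scalar use becomes `data_size[0]` (rows) and the column count `data_size[1]` becomes available for computing defaults. The hunks span the AutoML search loop, the estimator classes, the version file, a notebook, and two test estimators. A minimal sketch of the new `search_space` contract (the estimator class and its hyperparameter bounds here are illustrative, not taken from the commit):

    from flaml import tune

    class MyEstimator:
        @classmethod
        def search_space(cls, data_size, task, **params):
            # data_size is now the X_train.shape tuple: (n_rows, n_cols)
            n_rows = int(data_size[0])
            return {
                "n_estimators": {
                    "domain": tune.lograndint(lower=4, upper=min(32768, n_rows)),
                    "init_value": 4,
                    "low_cost_init_value": 4,
                },
            }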
@@ -119,7 +119,7 @@ class SearchState:
             if config and "FLAML_sample_size" in config:
                 self.sample_size = config["FLAML_sample_size"]
             else:
-                self.sample_size = self.data_size
+                self.sample_size = self.data_size[0]
             obj = result["val_loss"]
             metric_for_logging = result["metric_for_logging"]
             time2eval = result["time_total_s"]
@@ -181,7 +181,7 @@ class SearchState:
 class AutoMLState:
     def _prepare_sample_train_data(self, sample_size):
         sampled_weight = groups = None
-        if sample_size <= self.data_size:
+        if sample_size <= self.data_size[0]:
             if isinstance(self.X_train, pd.DataFrame):
                 sampled_X_train = self.X_train.iloc[:sample_size]
             else:
@@ -205,7 +205,7 @@ class AutoMLState:
         if "FLAML_sample_size" in config_w_resource:
             sample_size = int(config_w_resource["FLAML_sample_size"])
         else:
-            sample_size = self.data_size
+            sample_size = self.data_size[0]
         (
             sampled_X_train,
             sampled_y_train,
@@ -226,11 +226,11 @@ class AutoMLState:
                 None
                 if self.time_budget is None
                 else self.time_budget - self.time_from_start
-                if sample_size == self.data_size
+                if sample_size == self.data_size[0]
                 else (self.time_budget - self.time_from_start)
                 / 2
                 * sample_size
-                / self.data_size
+                / self.data_size[0]
             )
 
         if _is_nlp_task(self.task):
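The ternary chain in the last hunk gives a trial on a partial sample half of the remaining time budget, scaled by the fraction of data it uses. A quick worked instance (the numbers are illustrative):

    time_budget, time_from_start = 60.0, 20.0  # 40.0 seconds remain
    sample_size, n_rows = 10_000, 40_000       # trial uses a quarter of the rows

    budget = (time_budget - time_from_start) / 2 * sample_size / n_rows
    assert budget == 5.0  # (40.0 / 2) * (10_000 / 40_000)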
@@ -1122,7 +1122,7 @@ class AutoML(BaseEstimator):
                 test_size=split_ratio,
                 random_state=RANDOM_SEED,
             )
-        self._state.data_size = X_train.shape[0]
+        self._state.data_size = X_train.shape
         self.data_size_full = len(y_train_all)
         self._state.X_train, self._state.y_train = X_train, y_train
         self._state.X_val, self._state.y_val = X_val, y_val
@@ -1555,7 +1555,7 @@ class AutoML(BaseEstimator):
         Returns:
             A float for the maximal sample size or None.
         """
-        return self._state.data_size if self._sample else None
+        return self._state.data_size[0] if self._sample else None
 
     @property
     def trainable(self) -> Callable[[dict], Optional[float]]:
@@ -1965,7 +1965,10 @@ class AutoML(BaseEstimator):
             sample
             and task != "rank"
             and eval_method != "cv"
-            and (self._min_sample_size * SAMPLE_MULTIPLY_FACTOR < self._state.data_size)
+            and (
+                self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
+                < self._state.data_size[0]
+            )
         )
         if "auto" == metric:
             if "binary" in self._state.task:
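The condition above turns sampling on only when the data set is more than SAMPLE_MULTIPLY_FACTOR times the minimum sample size; otherwise every trial runs on the full data. A sketch of the gate, assuming FLAML's defaults of the time (MIN_SAMPLE_TRAIN = 10000 and SAMPLE_MULTIPLY_FACTOR = 4; treat both values as assumptions):

    MIN_SAMPLE_TRAIN = 10_000      # assumed default minimum sample size
    SAMPLE_MULTIPLY_FACTOR = 4     # assumed default sample growth factor

    def sampling_enabled(n_rows: int) -> bool:
        # Sampling pays off only if the full data is at least one
        # multiply-factor step above the smallest sample.
        return MIN_SAMPLE_TRAIN * SAMPLE_MULTIPLY_FACTOR < n_rows

    assert sampling_enabled(100_000) and not sampling_enabled(30_000)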
@@ -2205,7 +2208,7 @@ class AutoML(BaseEstimator):
             search_state.update(result, 0)
             if result["wall_clock_time"] is not None:
                 self._state.time_from_start = result["wall_clock_time"]
-            if search_state.sample_size == self._state.data_size:
+            if search_state.sample_size == self._state.data_size[0]:
                 self._iter_per_learner[estimator] += 1
                 if not self._fullsize_reached:
                     self._fullsize_reached = True
@@ -2275,7 +2278,7 @@ class AutoML(BaseEstimator):
             self._max_iter = 0
             self._best_estimator = estimator = self.estimator_list[0]
             self._selected = state = self._search_states[estimator]
-            state.best_config_sample_size = self._state.data_size
+            state.best_config_sample_size = self._state.data_size[0]
             state.best_config = (
                 state.init_config
                 if isinstance(state.init_config, dict)
@@ -2298,7 +2301,7 @@ class AutoML(BaseEstimator):
                     or better
                     or (not self.best_estimator)
                     or self._search_states[self.best_estimator].sample_size
-                    < self._state.data_size
+                    < self._state.data_size[0]
                     else time_left - est_retrain_time
                 )
                 if not search_state.search_alg:
@@ -2309,7 +2312,7 @@ class AutoML(BaseEstimator):
                 if self._sample:
                     prune_attr = "FLAML_sample_size"
                     min_resource = self._min_sample_size
-                    max_resource = self._state.data_size
+                    max_resource = self._state.data_size[0]
                 else:
                     prune_attr = min_resource = max_resource = None
                 learner_class = self._state.learner_classes.get(estimator)
@@ -2398,7 +2401,7 @@ class AutoML(BaseEstimator):
                 min_budget = max(10 * self._eci[0], sum(self._eci))
                 max_budget = 10000 * self._eci[0]
                 if search_state.sample_size:
-                    ratio = search_state.data_size / search_state.sample_size
+                    ratio = search_state.data_size[0] / search_state.sample_size
                     min_budget *= ratio
                     max_budget *= ratio
                 logger.info(
@@ -2408,7 +2411,7 @@ class AutoML(BaseEstimator):
             if result["wall_clock_time"] is not None:
                 self._state.time_from_start = result["wall_clock_time"]
             # logger.info(f"{self._search_states[estimator].sample_size}, {data_size}")
-            if search_state.sample_size == self._state.data_size:
+            if search_state.sample_size == self._state.data_size[0]:
                 self._iter_per_learner[estimator] += 1
                 self._fullsize_reached = True
             if search_state.best_loss < self._state.best_loss:
@@ -2519,7 +2522,7 @@ class AutoML(BaseEstimator):
                 and est_retrain_time
                 and not better
                 and self._search_states[self._best_estimator].sample_size
-                == self._state.data_size
+                == self._state.data_size[0]
                 and (
                     est_retrain_time
                     <= self._state.time_budget - self._state.time_from_start
@@ -2560,7 +2563,7 @@ class AutoML(BaseEstimator):
         self._best_iteration = 0
         self._time_taken_best_iter = 0
         self._config_history = {}
-        self._max_iter_per_learner = 1000000  # TODO
+        self._max_iter_per_learner = 10000
         self._iter_per_learner = dict([(e, 0) for e in self.estimator_list])
         self._fullsize_reached = False
         self._trained_estimator = None
@@ -2680,7 +2683,7 @@ class AutoML(BaseEstimator):
                 self._state.time_budget - self._state.time_from_start
                 > self._selected.est_retrain_time(self.data_size_full)
                 and self._selected.best_config_sample_size
-                == self._state.data_size
+                == self._state.data_size[0]
             )
         ):
             state = self._search_states[self._best_estimator]
@@ -2736,13 +2739,13 @@ class AutoML(BaseEstimator):
                 inv.append(0)
                 continue
             estimated_cost = search_state.estimated_cost4improvement
-            if search_state.sample_size < self._state.data_size:
+            if search_state.sample_size < self._state.data_size[0]:
                 estimated_cost = min(
                     estimated_cost,
                     search_state.time2eval_best
                     * min(
                         SAMPLE_MULTIPLY_FACTOR,
-                        self._state.data_size / search_state.sample_size,
+                        self._state.data_size[0] / search_state.sample_size,
                     ),
                 )
             gap = search_state.best_loss - self._state.best_loss

@@ -233,9 +233,13 @@ class BaseEstimator:
         self._model = None
 
     @classmethod
-    def search_space(cls, **params):
+    def search_space(cls, data_size, task, **params):
         """[required method] search space.
 
+        Args:
+            data_size: A tuple of two integers, number of rows and columns.
+            task: A str of the task type, e.g., "binary", "multi", "regression".
+
         Returns:
             A dictionary of the search space.
             Each key is the name of a hyperparameter, and value is a dict with
@@ -674,7 +678,7 @@ class LGBMEstimator(BaseEstimator):
 
     @classmethod
     def search_space(cls, data_size, **params):
-        upper = min(32768, int(data_size))
+        upper = min(32768, int(data_size[0]))
         return {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
@@ -728,7 +732,7 @@ class LGBMEstimator(BaseEstimator):
                 round(
                     config.get("num_leaves")
                     or config.get("max_leaves")
-                    or 1 << config["max_depth"]
+                    or 1 << config.get("max_depth", 16)
                 )
             )
         n_estimators = int(round(config["n_estimators"]))
@@ -752,7 +756,7 @@ class LGBMEstimator(BaseEstimator):
             self.estimator_class = LGBMClassifier
         self._time_per_iter = None
         self._train_size = 0
-        self._mem_per_iter = 1
+        self._mem_per_iter = -1
         self.HAS_CALLBACK = self.HAS_CALLBACK and self._callbacks(0, 0) is not None
 
     def _preprocess(self, X):
@@ -784,7 +788,7 @@ class LGBMEstimator(BaseEstimator):
                 or abs(self._train_size - X_train.shape[0]) > 4
             )
             and budget is not None
-            or self._mem_per_iter <= 1
+            or self._mem_per_iter < 0
             and psutil is not None
         ) and n_iter > 1:
             self.params[self.ITER_HP] = 1
@@ -806,8 +810,8 @@ class LGBMEstimator(BaseEstimator):
                 self._mem_per_iter = min(
                     self._mem1, self._mem2 / self.params[self.ITER_HP]
                 )
-                if self._mem_per_iter <= 1 and psutil is not None:
-                    n_iter = self.params[self.ITER_HP]
+                # if self._mem_per_iter <= 1 and psutil is not None:
+                #     n_iter = self.params[self.ITER_HP]
             self._time_per_iter = (
                 (self._t2 - self._t1) / (self.params[self.ITER_HP] - 1)
                 if self._t2 > self._t1
@@ -837,7 +841,7 @@ class LGBMEstimator(BaseEstimator):
                 if budget is not None
                 else n_iter,
                 int((1 - FREE_MEM_RATIO) * mem0 / self._mem_per_iter)
-                if psutil is not None
+                if psutil is not None and self._mem_per_iter > 0
                 else n_iter,
             )
             if trained and max_iter <= self.params[self.ITER_HP]:
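The `_mem_per_iter` hunks implement the commit message's "init _mem_per_iter with -1": the old initial value of 1 doubled as both "not yet measured" and a plausible measurement, so the `<= 1` checks could misfire after a legitimately small reading. A negative sentinel separates the two states, and the new `> 0` guard keeps the memory-based iteration cap from dividing by a non-positive value. A minimal sketch of the pattern (names and numbers are illustrative):

    FREE_MEM_RATIO = 0.2      # fraction of free memory to keep in reserve
    mem_per_iter = -1.0       # -1 means "not measured yet"; real readings are >= 0

    if mem_per_iter < 0:      # replaces the old `<= 1` check, which a real
        mem_per_iter = 0.5    # measurement of <= 1 would have retriggered
    free_mem = 8e9
    if mem_per_iter > 0:      # guard added before dividing
        max_iter = int((1 - FREE_MEM_RATIO) * free_mem / mem_per_iter)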
@@ -887,7 +891,7 @@ class XGBoostEstimator(SKLearnEstimator):
 
     @classmethod
     def search_space(cls, data_size, **params):
-        upper = min(32768, int(data_size))
+        upper = min(32768, int(data_size[0]))
         return {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
@@ -1086,7 +1090,7 @@ class XGBoostLimitDepthEstimator(XGBoostSklearnEstimator):
     def search_space(cls, data_size, **params):
         space = XGBoostEstimator.search_space(data_size)
         space.pop("max_leaves")
-        upper = max(6, int(np.log2(data_size)))
+        upper = max(6, int(np.log2(data_size[0])))
         space["max_depth"] = {
             "domain": tune.randint(lower=1, upper=min(upper, 16)),
             "init_value": 6,
@@ -1105,11 +1109,14 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
     """The class for tuning Random Forest."""
 
     HAS_CALLBACK = False
+    nrows = 101
 
     @classmethod
     def search_space(cls, data_size, task, **params):
-        data_size = int(data_size)
-        upper = min(2048, data_size)
+        RandomForestEstimator.nrows = int(data_size[0])
+        upper = min(2048, RandomForestEstimator.nrows)
+        init = 1 / np.sqrt(data_size[1]) if task in CLASSIFICATION else 1
+        lower = min(0.1, init)
         space = {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),
@@ -1117,11 +1124,13 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
                 "low_cost_init_value": 4,
             },
             "max_features": {
-                "domain": tune.loguniform(lower=0.1, upper=1.0),
-                "init_value": 1.0,
+                "domain": tune.loguniform(lower=lower, upper=1.0),
+                "init_value": init,
             },
             "max_leaves": {
-                "domain": tune.lograndint(lower=4, upper=min(32768, data_size)),
+                "domain": tune.lograndint(
+                    lower=4, upper=min(32768, RandomForestEstimator.nrows >> 1)
+                ),
                 "init_value": 4,
                 "low_cost_init_value": 4,
             },
@@ -1129,13 +1138,13 @@ class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
         if task in CLASSIFICATION:
             space["criterion"] = {
                 "domain": tune.choice(["gini", "entropy"]),
-                # 'init_value': 'gini',
+                # "init_value": "gini",
             }
         return space
 
     @classmethod
     def cost_relative2lgbm(cls):
-        return 2.0
+        return 2
 
     def config2params(cls, config: dict) -> dict:
         params = config.copy()
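This is the change the commit is named for: `max_features` for random forests now starts at 1/sqrt(n_cols) on classification tasks, matching the common random-forest heuristic (scikit-learn's max_features="sqrt" expressed as a fraction of the columns) instead of the previous fixed 1.0, and `max_leaves` is capped at half the row count via the cached `nrows`. A worked instance (values illustrative):

    import numpy as np

    n_rows, n_cols = 10_000, 100
    init = 1 / np.sqrt(n_cols)               # 0.1 for 100 features
    lower = min(0.1, init)                   # search range becomes [lower, 1.0]
    upper_leaves = min(32768, n_rows >> 1)   # >> 1 halves the rows: 5000 here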
@@ -1234,7 +1243,7 @@ class CatBoostEstimator(BaseEstimator):
 
     @classmethod
     def search_space(cls, data_size, **params):
-        upper = max(min(round(1500000 / data_size), 150), 12)
+        upper = max(min(round(1500000 / data_size[0]), 150), 12)
         return {
             "early_stopping_rounds": {
                 "domain": tune.lograndint(lower=10, upper=upper),
@@ -1380,7 +1389,7 @@ class CatBoostEstimator(BaseEstimator):
 class KNeighborsEstimator(BaseEstimator):
     @classmethod
     def search_space(cls, data_size, **params):
-        upper = min(512, int(data_size / 2))
+        upper = min(512, int(data_size[0] / 2))
         return {
             "n_neighbors": {
                 "domain": tune.lograndint(lower=1, upper=upper),

@@ -1 +1 @@
-__version__ = "0.8.1"
+__version__ = "0.8.2"

@@ -893,8 +893,8 @@
     "    {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}.\n",
     "    '''\n",
     "    space = { \n",
-    "        'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size), 'init_value': 4, 'low_cost_init_value': 4},\n",
-    "        'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size), 'init_value': 1, 'low_cost_init_value': 1},\n",
+    "        'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size[0]), 'init_value': 4, 'low_cost_init_value': 4},\n",
+    "        'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size[0]), 'init_value': 1, 'low_cost_init_value': 1},\n",
     "        'n_tree_search': {'domain': tune.lograndint(lower=1, upper=32768), 'init_value': 1, 'low_cost_init_value': 1},\n",
     "        'opt_interval': {'domain': tune.lograndint(lower=1, upper=10000), 'init_value': 100},\n",
     "        'learning_rate': {'domain': tune.loguniform(lower=0.01, upper=20.0)},\n",
@@ -1278,7 +1278,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.12"
+   "version": "3.9.7"
   }
  },
 "nbformat": 4,

@@ -29,11 +29,11 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
     def search_space(cls, data_size, task):
         space = {
             "max_leaf": {
-                "domain": tune.lograndint(lower=4, upper=data_size),
+                "domain": tune.lograndint(lower=4, upper=data_size[0]),
                 "init_value": 4,
             },
             "n_iter": {
-                "domain": tune.lograndint(lower=1, upper=data_size),
+                "domain": tune.lograndint(lower=1, upper=data_size[0]),
                 "init_value": 1,
             },
             "n_tree_search": {

@@ -13,7 +13,7 @@ dataset = "credit-g"
 class XGBoost2D(XGBoostSklearnEstimator):
     @classmethod
     def search_space(cls, data_size, task):
-        upper = min(32768, int(data_size))
+        upper = min(32768, int(data_size[0]))
         return {
             "n_estimators": {
                 "domain": tune.lograndint(lower=4, upper=upper),