fix bug in hierarchical search space (#248); optional dependency on lgbm and xgb (#250)

* close #249

* admissible region

* best_config can be None

* optional dependency on lgbm and xgb
resolve #252
This commit is contained in:
Chi Wang 2021-10-15 21:36:42 -07:00 committed by GitHub
parent fe65fa143d
commit 524f22bcc5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 365 additions and 262 deletions

View File

@ -395,7 +395,8 @@ class AutoML:
@property @property
def best_config(self): def best_config(self):
"""A dictionary of the best configuration.""" """A dictionary of the best configuration."""
return self._search_states[self._best_estimator].best_config state = self._search_states.get(self._best_estimator)
return state and getattr(state, "best_config", None)
@property @property
def best_config_per_estimator(self): def best_config_per_estimator(self):
@ -1104,7 +1105,7 @@ class AutoML:
(b) otherwise, it is a nested dict with 'ml' as the key, and (b) otherwise, it is a nested dict with 'ml' as the key, and
a list of the low_cost_partial_configs as the value, corresponding a list of the low_cost_partial_configs as the value, corresponding
to each learner's low_cost_partial_config; the estimator index as to each learner's low_cost_partial_config; the estimator index as
an integer corresponding to the cheapest learner is appeneded to the an integer corresponding to the cheapest learner is appended to the
list at the end. list at the end.
""" """

View File

@ -4,12 +4,10 @@
""" """
import numpy as np import numpy as np
import xgboost as xgb
import time import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor, LGBMRanker
from scipy.sparse import issparse from scipy.sparse import issparse
import pandas as pd import pandas as pd
from . import tune from . import tune
@ -286,10 +284,16 @@ class LGBMEstimator(BaseEstimator):
if "verbose" not in self.params: if "verbose" not in self.params:
self.params["verbose"] = -1 self.params["verbose"] = -1
if "regression" == task: if "regression" == task:
from lightgbm import LGBMRegressor
self.estimator_class = LGBMRegressor self.estimator_class = LGBMRegressor
elif "rank" == task: elif "rank" == task:
from lightgbm import LGBMRanker
self.estimator_class = LGBMRanker self.estimator_class = LGBMRanker
else: else:
from lightgbm import LGBMClassifier
self.estimator_class = LGBMClassifier self.estimator_class = LGBMClassifier
self._time_per_iter = None self._time_per_iter = None
self._train_size = 0 self._train_size = 0
@ -432,6 +436,8 @@ class XGBoostEstimator(SKLearnEstimator):
self.params["verbosity"] = 0 self.params["verbosity"] = 0
def fit(self, X_train, y_train, budget=None, **kwargs): def fit(self, X_train, y_train, budget=None, **kwargs):
import xgboost as xgb
start_time = time.time() start_time = time.time()
if issparse(X_train): if issparse(X_train):
self.params["tree_method"] = "auto" self.params["tree_method"] = "auto"
@ -458,6 +464,8 @@ class XGBoostEstimator(SKLearnEstimator):
return train_time return train_time
def predict(self, X_test): def predict(self, X_test):
import xgboost as xgb
if not issparse(X_test): if not issparse(X_test):
X_test = self._preprocess(X_test) X_test = self._preprocess(X_test)
dtest = xgb.DMatrix(X_test) dtest = xgb.DMatrix(X_test)
@ -492,6 +500,7 @@ class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
super().__init__(task, **config) super().__init__(task, **config)
del self.params["verbose"] del self.params["verbose"]
self.params["verbosity"] = 0 self.params["verbosity"] = 0
import xgboost as xgb
self.estimator_class = xgb.XGBRegressor self.estimator_class = xgb.XGBRegressor
if "rank" == task: if "rank" == task:

View File

@ -313,7 +313,13 @@ class BlendSearch(Searcher):
{}, {},
recursive=True, recursive=True,
) )
self._ls_bound_max = self._ls_bound_min.copy() self._ls_bound_max = normalize(
self._ls.init_config.copy(),
self._ls.space,
self._ls.init_config,
{},
recursive=True,
)
self._gs_admissible_min = self._ls_bound_min.copy() self._gs_admissible_min = self._ls_bound_min.copy()
self._gs_admissible_max = self._ls_bound_max.copy() self._gs_admissible_max = self._ls_bound_max.copy()
self._result = {} # config_signature: tuple -> result: Dict self._result = {} # config_signature: tuple -> result: Dict
@ -492,6 +498,11 @@ class BlendSearch(Searcher):
subspace[key], subspace[key],
domain[choice], domain[choice],
) )
if len(admissible_max[key]) > len(domain.categories):
# points + index
normal = (choice + 0.5) / len(domain.categories)
admissible_max[key][-1] = max(normal, admissible_max[key][-1])
admissible_min[key][-1] = min(normal, admissible_min[key][-1])
elif isinstance(value, dict): elif isinstance(value, dict):
self._update_admissible_region( self._update_admissible_region(
value, value,
@ -583,6 +594,7 @@ class BlendSearch(Searcher):
) )
def _expand_admissible_region(self, lower, upper, space): def _expand_admissible_region(self, lower, upper, space):
"""expand the admissible region for the subspace `space`"""
for key in upper: for key in upper:
ub = upper[key] ub = upper[key]
if isinstance(ub, list): if isinstance(ub, list):

View File

@ -6,7 +6,9 @@
import json import json
from typing import IO from typing import IO
from contextlib import contextmanager from contextlib import contextmanager
import warnings import logging
logger = logging.getLogger("flaml.automl")
class TrainingLogRecord(object): class TrainingLogRecord(object):
@ -113,8 +115,8 @@ class TrainingLogWriter(object):
if self.file is None: if self.file is None:
raise IOError("Call open() to open the outpute file first.") raise IOError("Call open() to open the outpute file first.")
if self.current_best_loss_record_id is None: if self.current_best_loss_record_id is None:
warnings.warn( logger.warning(
"checkpoint() called before any record is written, " "skipped." "flaml.training_log: checkpoint() called before any record is written, skipped."
) )
return return
record = TrainingLogCheckPoint(self.current_best_loss_record_id) record = TrainingLogCheckPoint(self.current_best_loss_record_id)

View File

@ -1,7 +1,7 @@
try: try:
from ray import __version__ as ray_version from ray import __version__ as ray_version
assert ray_version >= '1.0.0'
assert ray_version >= "1.0.0"
from ray.tune import sample from ray.tune import sample
from ray.tune.suggest.variant_generator import generate_variants from ray.tune.suggest.variant_generator import generate_variants
except (ImportError, AssertionError): except (ImportError, AssertionError):
@ -14,9 +14,7 @@ import logging
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def define_by_run_func( def define_by_run_func(trial, space: Dict, path: str = "") -> Optional[Dict[str, Any]]:
trial, space: Dict, path: str = ""
) -> Optional[Dict[str, Any]]:
"""Define-by-run function to create the search space. """Define-by-run function to create the search space.
Returns: Returns:
@ -25,7 +23,7 @@ def define_by_run_func(
config = {} config = {}
for key, domain in space.items(): for key, domain in space.items():
if path: if path:
key = path + '/' + key key = path + "/" + key
if isinstance(domain, dict): if isinstance(domain, dict):
config.update(define_by_run_func(trial, domain, key)) config.update(define_by_run_func(trial, domain, key))
continue continue
@ -41,40 +39,41 @@ def define_by_run_func(
logger.warning( logger.warning(
"Optuna does not handle quantization in loguniform " "Optuna does not handle quantization in loguniform "
"sampling. The parameter will be passed but it will " "sampling. The parameter will be passed but it will "
"probably be ignored.") "probably be ignored."
)
if isinstance(domain, sample.Float): if isinstance(domain, sample.Float):
if isinstance(sampler, sample.LogUniform): if isinstance(sampler, sample.LogUniform):
if quantize: if quantize:
logger.warning( logger.warning(
"Optuna does not support both quantization and " "Optuna does not support both quantization and "
"sampling from LogUniform. Dropped quantization.") "sampling from LogUniform. Dropped quantization."
trial.suggest_float( )
key, domain.lower, domain.upper, log=True) trial.suggest_float(key, domain.lower, domain.upper, log=True)
elif isinstance(sampler, sample.Uniform): elif isinstance(sampler, sample.Uniform):
if quantize: if quantize:
trial.suggest_float( trial.suggest_float(key, domain.lower, domain.upper, step=quantize)
key, domain.lower, domain.upper, step=quantize)
trial.suggest_float(key, domain.lower, domain.upper) trial.suggest_float(key, domain.lower, domain.upper)
elif isinstance(domain, sample.Integer): elif isinstance(domain, sample.Integer):
if isinstance(sampler, sample.LogUniform): if isinstance(sampler, sample.LogUniform):
trial.suggest_int( trial.suggest_int(
key, domain.lower, key, domain.lower, domain.upper - int(bool(not quantize)), log=True
domain.upper - int(bool(not quantize)), )
log=True)
elif isinstance(sampler, sample.Uniform): elif isinstance(sampler, sample.Uniform):
# Upper bound should be inclusive for quantization and # Upper bound should be inclusive for quantization and
# exclusive otherwise # exclusive otherwise
trial.suggest_int( trial.suggest_int(
key, domain.lower, key,
domain.lower,
domain.upper - int(bool(not quantize)), domain.upper - int(bool(not quantize)),
step=quantize or 1) step=quantize or 1,
)
elif isinstance(domain, sample.Categorical): elif isinstance(domain, sample.Categorical):
if isinstance(sampler, sample.Uniform): if isinstance(sampler, sample.Uniform):
if not hasattr(domain, 'choices'): if not hasattr(domain, "choices"):
domain.choices = list(range(len(domain.categories))) domain.choices = list(range(len(domain.categories)))
choices = domain.choices choices = domain.choices
# This choice needs to be removed from the final config # This choice needs to be removed from the final config
index = trial.suggest_categorical(key + '_choice_', choices) index = trial.suggest_categorical(key + "_choice_", choices)
choice = domain.categories[index] choice = domain.categories[index]
if isinstance(choice, dict): if isinstance(choice, dict):
key += f":{index}" key += f":{index}"
@ -84,8 +83,9 @@ def define_by_run_func(
raise ValueError( raise ValueError(
"Optuna search does not support parameters of type " "Optuna search does not support parameters of type "
"`{}` with samplers of type `{}`".format( "`{}` with samplers of type `{}`".format(
type(domain).__name__, type(domain).__name__, type(domain.sampler).__name__
type(domain.sampler).__name__)) )
)
# Return all constants in a dictionary. # Return all constants in a dictionary.
return config return config
@ -117,18 +117,19 @@ def define_by_run_func(
def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]: def unflatten_hierarchical(config: Dict, space: Dict) -> Tuple[Dict, Dict]:
'''unflatten hierarchical config''' """unflatten hierarchical config"""
hier = {} hier = {}
subspace = {} subspace = {}
for key, value in config.items(): for key, value in config.items():
if '/' in key: if "/" in key:
key = key[key.rfind('/') + 1:] key = key[key.rfind("/") + 1 :]
if ':' in key: if ":" in key:
pos = key.rfind(':') pos = key.rfind(":")
true_key = key[:pos] true_key = key[:pos]
choice = int(key[pos + 1:]) choice = int(key[pos + 1 :])
hier[true_key], subspace[true_key] = unflatten_hierarchical( hier[true_key], subspace[true_key] = unflatten_hierarchical(
value, space[true_key][choice]) value, space[true_key][choice]
)
else: else:
if key.endswith("_choice_"): if key.endswith("_choice_"):
key = key[:-8] key = key[:-8]
@ -163,8 +164,7 @@ def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
if isinstance(domain, dict): if isinstance(domain, dict):
low_cost = low_cost_point.get(key, {}) low_cost = low_cost_point.get(key, {})
choice_cost_list = choice_cost.get(key, {}) choice_cost_list = choice_cost.get(key, {})
const = add_cost_to_space( const = add_cost_to_space(domain, low_cost, choice_cost_list)
domain, low_cost, choice_cost_list)
if const: if const:
config[key] = const config[key] = const
else: else:
@ -172,11 +172,11 @@ def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
continue continue
low_cost = low_cost_point.get(key) low_cost = low_cost_point.get(key)
choice_cost_list = choice_cost.get(key) choice_cost_list = choice_cost.get(key)
if callable(getattr(domain, 'get_sampler', None)): if callable(getattr(domain, "get_sampler", None)):
sampler = domain.get_sampler() sampler = domain.get_sampler()
if isinstance(sampler, sample.Quantized): if isinstance(sampler, sample.Quantized):
sampler = sampler.get_sampler() sampler = sampler.get_sampler()
domain.bounded = str(sampler) != 'Normal' domain.bounded = str(sampler) != "Normal"
if isinstance(domain, sample.Categorical): if isinstance(domain, sample.Categorical):
domain.const = [] domain.const = []
for i, cat in enumerate(domain.categories): for i, cat in enumerate(domain.categories):
@ -189,8 +189,9 @@ def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
choice_cost_dict = choice_cost_list[i] choice_cost_dict = choice_cost_list[i]
else: else:
choice_cost_dict = {} choice_cost_dict = {}
domain.const.append(add_cost_to_space( domain.const.append(
cat, low_cost_dict, choice_cost_dict)) add_cost_to_space(cat, low_cost_dict, choice_cost_dict)
)
else: else:
domain.const.append(None) domain.const.append(None)
if choice_cost_list: if choice_cost_list:
@ -205,8 +206,9 @@ def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
domain.choice_cost = cost[ind] domain.choice_cost = cost[ind]
domain.const = [domain.const[i] for i in ind] domain.const = [domain.const[i] for i in ind]
domain.ordered = True domain.ordered = True
elif all(isinstance(x, int) or isinstance(x, float) elif all(
for x in domain.categories): isinstance(x, int) or isinstance(x, float) for x in domain.categories
):
# sort the choices by value # sort the choices by value
ind = np.argsort(domain.categories) ind = np.argsort(domain.categories)
domain.categories = [domain.categories[i] for i in ind] domain.categories = [domain.categories[i] for i in ind]
@ -214,8 +216,9 @@ def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
else: else:
domain.ordered = False domain.ordered = False
if low_cost and low_cost not in domain.categories: if low_cost and low_cost not in domain.categories:
assert isinstance(low_cost, list), \ assert isinstance(
f"low cost {low_cost} not in domain {domain.categories}" low_cost, list
), f"low cost {low_cost} not in domain {domain.categories}"
if domain.ordered: if domain.ordered:
sorted_points = [low_cost[i] for i in ind] sorted_points = [low_cost[i] for i in ind]
for i, point in enumerate(sorted_points): for i, point in enumerate(sorted_points):
@ -231,53 +234,63 @@ def add_cost_to_space(space: Dict, low_cost_point: Dict, choice_cost: Dict):
def normalize( def normalize(
config: Dict, space: Dict, reference_config: Dict, config: Dict,
normalized_reference_config: Dict, recursive: bool = False, space: Dict,
reference_config: Dict,
normalized_reference_config: Dict,
recursive: bool = False,
): ):
'''normalize config in space according to reference_config. """normalize config in space according to reference_config.
normalize each dimension in config to [0,1]. normalize each dimension in config to [0,1].
''' """
config_norm = {} config_norm = {}
for key in config: for key, value in config.items():
value = config[key]
domain = space.get(key) domain = space.get(key)
if domain is None: # e.g., prune_attr if domain is None: # e.g., prune_attr
config_norm[key] = value config_norm[key] = value
continue continue
if not callable(getattr(domain, 'get_sampler', None)): if not callable(getattr(domain, "get_sampler", None)):
if recursive and isinstance(domain, dict): if recursive and isinstance(domain, dict):
config_norm[key] = normalize( config_norm[key] = normalize(value, domain, reference_config[key], {})
value, domain, reference_config[key], {})
else: else:
config_norm[key] = value config_norm[key] = value
continue continue
# domain: sample.Categorical/Integer/Float/Function # domain: sample.Categorical/Integer/Float/Function
if isinstance(domain, sample.Categorical): if isinstance(domain, sample.Categorical):
norm = None norm = None
# value is either one category, or the low_cost_point list # value is: a category, a nested dict, or a low_cost_point list
if value not in domain.categories: if value not in domain.categories:
# nested, low_cost_point list # nested
if recursive: if isinstance(value, list):
# low_cost_point list
norm = [] norm = []
for i, cat in enumerate(domain.categories): for i, cat in enumerate(domain.categories):
norm.append(normalize( norm.append(
value[i], cat, reference_config[key][i], {})) normalize(value[i], cat, reference_config[key][i], {})
if isinstance(value, list) and len(value) > len( if recursive
domain.categories): else value[i]
# low_cost_point list )
if len(value) > len(domain.categories):
# the low cost index was appended to low_cost_point list
index = value[-1] index = value[-1]
config[key] = value[index]
value = domain.categories[index] value = domain.categories[index]
elif not recursive:
# no low cost index. randomly pick one as init point
continue
else: else:
# nested dict
config_norm[key] = value
continue continue
# normalize categorical # normalize categorical
n = len(domain.categories) n = len(domain.categories)
if domain.ordered: if domain.ordered:
normalized = (domain.categories.index(value) + 0.5) / n normalized = (domain.categories.index(value) + 0.5) / n
elif key in normalized_reference_config: elif key in normalized_reference_config:
normalized = normalized_reference_config[ normalized = (
key] if value == reference_config[key] else ( normalized_reference_config[key]
normalized_reference_config[key] + 1 / n) % 1 if value == reference_config[key]
else (normalized_reference_config[key] + 1 / n) % 1
)
else: else:
normalized = 0.5 normalized = 0.5
if norm: if norm:
@ -294,16 +307,19 @@ def normalize(
sampler = sampler.get_sampler() sampler = sampler.get_sampler()
else: else:
quantize = None quantize = None
if str(sampler) == 'LogUniform': if str(sampler) == "LogUniform":
upper = domain.upper - ( upper = domain.upper - (
isinstance(domain, sample.Integer) & (quantize is None)) isinstance(domain, sample.Integer) & (quantize is None)
)
config_norm[key] = np.log(value / domain.lower) / np.log( config_norm[key] = np.log(value / domain.lower) / np.log(
upper / domain.lower) upper / domain.lower
elif str(sampler) == 'Uniform': )
elif str(sampler) == "Uniform":
upper = domain.upper - ( upper = domain.upper - (
isinstance(domain, sample.Integer) & (quantize is None)) isinstance(domain, sample.Integer) & (quantize is None)
)
config_norm[key] = (value - domain.lower) / (upper - domain.lower) config_norm[key] = (value - domain.lower) / (upper - domain.lower)
elif str(sampler) == 'Normal': elif str(sampler) == "Normal":
# N(mean, sd) -> N(0,1) # N(mean, sd) -> N(0,1)
config_norm[key] = (value - sampler.mean) / sampler.sd config_norm[key] = (value - sampler.mean) / sampler.sd
# else: # else:
@ -312,32 +328,49 @@ def normalize(
def denormalize( def denormalize(
config: Dict, space: Dict, reference_config: Dict, config: Dict,
normalized_reference_config: Dict, random_state space: Dict,
reference_config: Dict,
normalized_reference_config: Dict,
random_state,
): ):
config_denorm = {} config_denorm = {}
for key, value in config.items(): for key, value in config.items():
if key in space: if key in space:
# domain: sample.Categorical/Integer/Float/Function # domain: sample.Categorical/Integer/Float/Function
domain = space[key] domain = space[key]
if not callable(getattr(domain, 'get_sampler', None)): if isinstance(value, dict) or not callable(
getattr(domain, "get_sampler", None)
):
config_denorm[key] = value config_denorm[key] = value
else: else:
if isinstance(domain, sample.Categorical): if isinstance(domain, sample.Categorical):
# denormalize categorical # denormalize categorical
n = len(domain.categories) n = len(domain.categories)
if isinstance(value, list):
# denormalize list
choice = int(np.floor(value[-1] * n))
config_denorm[key] = point = value[choice]
point["_choice_"] = choice
continue
if domain.ordered: if domain.ordered:
config_denorm[key] = domain.categories[ config_denorm[key] = domain.categories[
min(n - 1, int(np.floor(value * n)))] min(n - 1, int(np.floor(value * n)))
]
else: else:
assert key in normalized_reference_config assert key in normalized_reference_config
if np.floor(value * n) == np.floor( if np.floor(value * n) == np.floor(
normalized_reference_config[key] * n): normalized_reference_config[key] * n
):
config_denorm[key] = reference_config[key] config_denorm[key] = reference_config[key]
else: # ****random value each time!**** else: # ****random value each time!****
config_denorm[key] = random_state.choice( config_denorm[key] = random_state.choice(
[x for x in domain.categories [
if x != reference_config[key]]) x
for x in domain.categories
if x != reference_config[key]
]
)
continue continue
# Uniform/LogUniform/Normal/Base # Uniform/LogUniform/Normal/Base
sampler = domain.get_sampler() sampler = domain.get_sampler()
@ -348,25 +381,26 @@ def denormalize(
else: else:
quantize = None quantize = None
# Handle Log/Uniform # Handle Log/Uniform
if str(sampler) == 'LogUniform': if str(sampler) == "LogUniform":
upper = domain.upper - (isinstance(domain, sample.Integer) upper = domain.upper - (
& (quantize is None)) isinstance(domain, sample.Integer) & (quantize is None)
config_denorm[key] = ( )
upper / domain.lower) ** value * domain.lower config_denorm[key] = (upper / domain.lower) ** value * domain.lower
elif str(sampler) == 'Uniform': elif str(sampler) == "Uniform":
upper = domain.upper - (isinstance(domain, sample.Integer) upper = domain.upper - (
& (quantize is None)) isinstance(domain, sample.Integer) & (quantize is None)
config_denorm[key] = value * ( )
upper - domain.lower) + domain.lower config_denorm[key] = value * (upper - domain.lower) + domain.lower
elif str(sampler) == 'Normal': elif str(sampler) == "Normal":
# denormalization for 'Normal' # denormalization for 'Normal'
config_denorm[key] = value * sampler.sd + sampler.mean config_denorm[key] = value * sampler.sd + sampler.mean
else: else:
config_denorm[key] = value config_denorm[key] = value
# Handle quantized # Handle quantized
if quantize is not None: if quantize is not None:
config_denorm[key] = np.round( config_denorm[key] = (
np.divide(config_denorm[key], quantize)) * quantize np.round(np.divide(config_denorm[key], quantize)) * quantize
)
# Handle int (4.6 -> 5) # Handle int (4.6 -> 5)
if isinstance(domain, sample.Integer): if isinstance(domain, sample.Integer):
config_denorm[key] = int(round(config_denorm[key])) config_denorm[key] = int(round(config_denorm[key]))
@ -376,9 +410,8 @@ def denormalize(
def indexof(domain: Dict, config: Dict) -> int: def indexof(domain: Dict, config: Dict) -> int:
'''find the index of config in domain.categories """find the index of config in domain.categories"""
''' index = config.get("_choice_")
index = config.get('_choice_')
if index is not None: if index is not None:
return index return index
if config in domain.categories: if config in domain.categories:
@ -402,27 +435,37 @@ def indexof(domain: Dict, config: Dict) -> int:
def complete_config( def complete_config(
partial_config: Dict, space: Dict, flow2, disturb: bool = False, partial_config: Dict,
lower: Optional[Dict] = None, upper: Optional[Dict] = None space: Dict,
flow2,
disturb: bool = False,
lower: Optional[Dict] = None,
upper: Optional[Dict] = None,
) -> Tuple[Dict, Dict]: ) -> Tuple[Dict, Dict]:
'''Complete partial config in space """Complete partial config in space
Returns: Returns:
config, space config, space
''' """
config = partial_config.copy() config = partial_config.copy()
normalized = normalize(config, space, config, {}) normalized = normalize(config, space, partial_config, {})
# print("normalized", normalized)
if disturb: if disturb:
for key in normalized: for key, value in normalized.items():
domain = space.get(key) domain = space.get(key)
if getattr(domain, 'ordered', True) is False: if getattr(domain, "ordered", True) is False:
# don't change unordered cat choice # don't change unordered cat choice
continue continue
if not callable(getattr(domain, 'get_sampler', None)): if not callable(getattr(domain, "get_sampler", None)):
continue continue
if upper and lower: if upper and lower:
up, low = upper[key], lower[key] up, low = upper[key], lower[key]
gauss_std = up - low or flow2.STEPSIZE if isinstance(up, list):
gauss_std = (up[-1] - low[-1]) or flow2.STEPSIZE
up[-1] += flow2.STEPSIZE
low[-1] -= flow2.STEPSIZE
else:
gauss_std = (up - low) or flow2.STEPSIZE
# allowed bound # allowed bound
up += flow2.STEPSIZE up += flow2.STEPSIZE
low -= flow2.STEPSIZE low -= flow2.STEPSIZE
@ -431,16 +474,25 @@ def complete_config(
else: else:
up, low, gauss_std = np.Inf, -np.Inf, 1.0 up, low, gauss_std = np.Inf, -np.Inf, 1.0
if domain.bounded: if domain.bounded:
if isinstance(up, list):
up[-1] = min(up[-1], 1)
low[-1] = max(low[-1], 0)
else:
up = min(up, 1) up = min(up, 1)
low = max(low, 0) low = max(low, 0)
delta = flow2.rand_vector_gaussian(1, gauss_std)[0] delta = flow2.rand_vector_gaussian(1, gauss_std)[0]
normalized[key] = max(low, min(up, normalized[key] + delta)) if isinstance(value, list):
# points + normalized index
value[-1] = max(low[-1], min(up[-1], value[-1] + delta))
else:
normalized[key] = max(low, min(up, value + delta))
config = denormalize(normalized, space, config, normalized, flow2._random) config = denormalize(normalized, space, config, normalized, flow2._random)
# print("denormalized", config)
for key, value in space.items(): for key, value in space.items():
if key not in config: if key not in config:
config[key] = value config[key] = value
for _, generated in generate_variants({'config': config}): for _, generated in generate_variants({"config": config}):
config = generated['config'] config = generated["config"]
break break
subspace = {} subspace = {}
for key, domain in space.items(): for key, domain in space.items():
@ -455,16 +507,26 @@ def complete_config(
# else: # else:
# point = {} # point = {}
config[key], subspace[key] = complete_config( config[key], subspace[key] = complete_config(
value, domain.categories[index], flow2, disturb, value,
lower and lower[key][index], upper and upper[key][index] domain.categories[index],
flow2,
disturb,
lower and lower[key][index],
upper and upper[key][index],
) )
assert '_choice_' not in subspace[key], \ assert (
"_choice_ is a reserved key for hierarchical search space" "_choice_" not in subspace[key]
subspace[key]['_choice_'] = index ), "_choice_ is a reserved key for hierarchical search space"
subspace[key]["_choice_"] = index
else: else:
config[key], subspace[key] = complete_config( config[key], subspace[key] = complete_config(
value, space[key], flow2, disturb, value,
lower and lower[key], upper and upper[key]) space[key],
flow2,
disturb,
lower and lower[key],
upper and upper[key],
)
continue continue
subspace[key] = domain subspace[key] = domain
return config, subspace return config, subspace

View File

@ -36,9 +36,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"source": [ "source": [
"!pip install flaml[notebook];\n", "!pip install flaml[notebook];\r\n",
"# from v0.6.6, catboost is made an optional dependency to build conda package.\n", "# from v0.6.6, catboost is made an optional dependency to build conda package.\r\n",
"# to install catboost, you can uncomment and run:\n", "# to install catboost, you can uncomment and run:\r\n",
"# !pip install flaml[catboost]" "# !pip install flaml[catboost]"
], ],
"outputs": [], "outputs": [],
@ -62,7 +62,7 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 1,
"source": [ "source": [
"from flaml.data import load_openml_dataset\n", "from flaml.data import load_openml_dataset\r\n",
"X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')" "X_train, X_test, y_train, y_test = load_openml_dataset(dataset_id=1169, data_dir='./')"
], ],
"outputs": [ "outputs": [
@ -87,8 +87,8 @@
{ {
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "source": [
"### Run FLAML\n", "### Run FLAML\r\n",
"In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default ML learners of FLAML are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. " "In the FLAML automl run configuration, users can specify the task type, time budget, error metric, learner list, whether to subsample, resampling strategy type, and so on. All these arguments have default values which will be used if users do not provide them. For example, the default classifiers are `['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree', 'lrl1']`. "
], ],
"metadata": { "metadata": {
"slideshow": { "slideshow": {
@ -100,8 +100,8 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 2, "execution_count": 2,
"source": [ "source": [
"''' import AutoML class from flaml package '''\n", "''' import AutoML class from flaml package '''\r\n",
"from flaml import AutoML\n", "from flaml import AutoML\r\n",
"automl = AutoML()" "automl = AutoML()"
], ],
"outputs": [], "outputs": [],
@ -115,13 +115,13 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 3,
"source": [ "source": [
"settings = {\n", "settings = {\r\n",
" \"time_budget\": 240, # total running time in seconds\n", " \"time_budget\": 240, # total running time in seconds\r\n",
" \"metric\": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',\n", " \"metric\": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',\r\n",
" # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'\n", " # 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'\r\n",
" \"task\": 'classification', # task type\n", " \"task\": 'classification', # task type\r\n",
" \"log_file_name\": 'airlines_experiment.log', # flaml log file\n", " \"log_file_name\": 'airlines_experiment.log', # flaml log file\r\n",
" \"seed\": 7654321, # random seed\n", " \"seed\": 7654321, # random seed\r\n",
"}" "}"
], ],
"outputs": [], "outputs": [],
@ -135,7 +135,7 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 4, "execution_count": 4,
"source": [ "source": [
"'''The main flaml automl API'''\n", "'''The main flaml automl API'''\r\n",
"automl.fit(X_train=X_train, y_train=y_train, **settings)" "automl.fit(X_train=X_train, y_train=y_train, **settings)"
], ],
"outputs": [ "outputs": [
@ -330,10 +330,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 5, "execution_count": 5,
"source": [ "source": [
"''' retrieve best config and best learner'''\n", "''' retrieve best config and best learner'''\r\n",
"print('Best ML leaner:', automl.best_estimator)\n", "print('Best ML leaner:', automl.best_estimator)\r\n",
"print('Best hyperparmeter config:', automl.best_config)\n", "print('Best hyperparmeter config:', automl.best_config)\r\n",
"print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))\n", "print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))\r\n",
"print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))" "print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))"
], ],
"outputs": [ "outputs": [
@ -387,9 +387,9 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 7, "execution_count": 7,
"source": [ "source": [
"''' pickle and save the automl object '''\n", "''' pickle and save the automl object '''\r\n",
"import pickle\n", "import pickle\r\n",
"with open('automl.pkl', 'wb') as f:\n", "with open('automl.pkl', 'wb') as f:\r\n",
" pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)" " pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)"
], ],
"outputs": [], "outputs": [],
@ -403,10 +403,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": 8,
"source": [ "source": [
"''' compute predictions of testing dataset ''' \n", "''' compute predictions of testing dataset ''' \r\n",
"y_pred = automl.predict(X_test)\n", "y_pred = automl.predict(X_test)\r\n",
"print('Predicted labels', y_pred)\n", "print('Predicted labels', y_pred)\r\n",
"print('True labels', y_test)\n", "print('True labels', y_test)\r\n",
"y_pred_proba = automl.predict_proba(X_test)[:,1]" "y_pred_proba = automl.predict_proba(X_test)[:,1]"
], ],
"outputs": [ "outputs": [
@ -442,10 +442,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 9, "execution_count": 9,
"source": [ "source": [
"''' compute different metric values on testing dataset'''\n", "''' compute different metric values on testing dataset'''\r\n",
"from flaml.ml import sklearn_metric_loss_score\n", "from flaml.ml import sklearn_metric_loss_score\r\n",
"print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))\n", "print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))\r\n",
"print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))\n", "print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))\r\n",
"print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))" "print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))"
], ],
"outputs": [ "outputs": [
@ -483,10 +483,10 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 10,
"source": [ "source": [
"from flaml.data import get_output_from_log\n", "from flaml.data import get_output_from_log\r\n",
"time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\n", "time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \\\r\n",
" get_output_from_log(filename=settings['log_file_name'], time_budget=240)\n", " get_output_from_log(filename=settings['log_file_name'], time_budget=240)\r\n",
"for config in config_history:\n", "for config in config_history:\r\n",
" print(config)" " print(config)"
], ],
"outputs": [ "outputs": [
@ -518,14 +518,14 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 11,
"source": [ "source": [
"import matplotlib.pyplot as plt\n", "import matplotlib.pyplot as plt\r\n",
"import numpy as np\n", "import numpy as np\r\n",
"\n", "\r\n",
"plt.title('Learning Curve')\n", "plt.title('Learning Curve')\r\n",
"plt.xlabel('Wall Clock Time (s)')\n", "plt.xlabel('Wall Clock Time (s)')\r\n",
"plt.ylabel('Validation Accuracy')\n", "plt.ylabel('Validation Accuracy')\r\n",
"plt.scatter(time_history, 1 - np.array(valid_loss_history))\n", "plt.scatter(time_history, 1 - np.array(valid_loss_history))\r\n",
"plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\n", "plt.step(time_history, 1 - np.array(best_valid_loss_history), where='post')\r\n",
"plt.show()" "plt.show()"
], ],
"outputs": [ "outputs": [
@ -566,7 +566,7 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 12,
"source": [ "source": [
"from lightgbm import LGBMClassifier\n", "from lightgbm import LGBMClassifier\r\n",
"lgbm = LGBMClassifier()" "lgbm = LGBMClassifier()"
], ],
"outputs": [], "outputs": [],
@ -612,11 +612,11 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 15,
"source": [ "source": [
"from xgboost import XGBClassifier\n", "from xgboost import XGBClassifier\r\n",
"xgb = XGBClassifier()\n", "xgb = XGBClassifier()\r\n",
"cat_columns = X_train.select_dtypes(include=['category']).columns\n", "cat_columns = X_train.select_dtypes(include=['category']).columns\r\n",
"X = X_train.copy()\n", "X = X_train.copy()\r\n",
"X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n" "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\r\n"
], ],
"outputs": [], "outputs": [],
"metadata": {} "metadata": {}
@ -652,8 +652,8 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 17, "execution_count": 17,
"source": [ "source": [
"X = X_test.copy()\n", "X = X_test.copy()\r\n",
"X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n", "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\r\n",
"y_pred_xgb = xgb.predict(X)" "y_pred_xgb = xgb.predict(X)"
], ],
"outputs": [], "outputs": [],
@ -663,8 +663,8 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 18, "execution_count": 18,
"source": [ "source": [
"print('default xgboost accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_xgb, y_test))\n", "print('default xgboost accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_xgb, y_test))\r\n",
"print('default lgbm accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_lgbm, y_test))\n", "print('default lgbm accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_lgbm, y_test))\r\n",
"print('flaml (4min) accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))" "print('flaml (4min) accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))"
], ],
"outputs": [ "outputs": [
@ -727,73 +727,77 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 19, "execution_count": 19,
"source": [ "source": [
"''' SKLearnEstimator is the super class for a sklearn learner '''\n", "''' SKLearnEstimator is the super class for a sklearn learner '''\r\n",
"from flaml.model import SKLearnEstimator\n", "from flaml.model import SKLearnEstimator\r\n",
"from flaml import tune\n", "from flaml import tune\r\n",
"from rgf.sklearn import RGFClassifier, RGFRegressor\n", "from flaml.data import CLASSIFICATION\r\n",
"\n", "\r\n",
"\n", "\r\n",
"class MyRegularizedGreedyForest(SKLearnEstimator):\n", "class MyRegularizedGreedyForest(SKLearnEstimator):\r\n",
" def __init__(self, task='binary', **config):\n", " def __init__(self, task='binary', **config):\r\n",
" '''Constructor\n", " '''Constructor\r\n",
" \n", " \r\n",
" Args:\n", " Args:\r\n",
" task: A string of the task type, one of\n", " task: A string of the task type, one of\r\n",
" 'binary', 'multi', 'regression'\n", " 'binary', 'multi', 'regression'\r\n",
" config: A dictionary containing the hyperparameter names\n", " config: A dictionary containing the hyperparameter names\r\n",
" and 'n_jobs' as keys. n_jobs is the number of parallel threads.\n", " and 'n_jobs' as keys. n_jobs is the number of parallel threads.\r\n",
" '''\n", " '''\r\n",
"\n", "\r\n",
" super().__init__(task, **config)\n", " super().__init__(task, **config)\r\n",
"\n", "\r\n",
" '''task=binary or multi for classification task'''\n", " '''task=binary or multi for classification task'''\r\n",
" if task in (\"binary\", \"multi\"):\n", " if task in CLASSIFICATION:\r\n",
" self.estimator_class = RGFClassifier\n", " from rgf.sklearn import RGFClassifier\r\n",
" else:\n", "\r\n",
" self.estimator_class = RGFRegressor\n", " self.estimator_class = RGFClassifier\r\n",
"\n", " else:\r\n",
" @classmethod\n", " from rgf.sklearn import RGFRegressor\r\n",
" def search_space(cls, data_size, task):\n", " \r\n",
" '''[required method] search space\n", " self.estimator_class = RGFRegressor\r\n",
"\n", "\r\n",
" Returns:\n", " @classmethod\r\n",
" A dictionary of the search space. \n", " def search_space(cls, data_size, task):\r\n",
" Each key is the name of a hyperparameter, and value is a dict with\n", " '''[required method] search space\r\n",
" its domain (required) and low_cost_init_value, init_value,\n", "\r\n",
" cat_hp_cost (if applicable).\n", " Returns:\r\n",
" e.g.,\n", " A dictionary of the search space. \r\n",
" {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}.\n", " Each key is the name of a hyperparameter, and value is a dict with\r\n",
" '''\n", " its domain (required) and low_cost_init_value, init_value,\r\n",
" space = { \n", " cat_hp_cost (if applicable).\r\n",
" 'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size), 'init_value': 4, 'low_cost_init_value': 4},\n", " e.g.,\r\n",
" 'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size), 'init_value': 1, 'low_cost_init_value': 1},\n", " {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}.\r\n",
" 'n_tree_search': {'domain': tune.lograndint(lower=1, upper=32768), 'init_value': 1, 'low_cost_init_value': 1},\n", " '''\r\n",
" 'opt_interval': {'domain': tune.lograndint(lower=1, upper=10000), 'init_value': 100},\n", " space = { \r\n",
" 'learning_rate': {'domain': tune.loguniform(lower=0.01, upper=20.0)},\n", " 'max_leaf': {'domain': tune.lograndint(lower=4, upper=data_size), 'init_value': 4, 'low_cost_init_value': 4},\r\n",
" 'min_samples_leaf': {'domain': tune.lograndint(lower=1, upper=20), 'init_value': 20},\n", " 'n_iter': {'domain': tune.lograndint(lower=1, upper=data_size), 'init_value': 1, 'low_cost_init_value': 1},\r\n",
" }\n", " 'n_tree_search': {'domain': tune.lograndint(lower=1, upper=32768), 'init_value': 1, 'low_cost_init_value': 1},\r\n",
" return space\n", " 'opt_interval': {'domain': tune.lograndint(lower=1, upper=10000), 'init_value': 100},\r\n",
"\n", " 'learning_rate': {'domain': tune.loguniform(lower=0.01, upper=20.0)},\r\n",
" @classmethod\n", " 'min_samples_leaf': {'domain': tune.lograndint(lower=1, upper=20), 'init_value': 20},\r\n",
" def size(cls, config):\n", " }\r\n",
" '''[optional method] memory size of the estimator in bytes\n", " return space\r\n",
" \n", "\r\n",
" Args:\n", " @classmethod\r\n",
" config - the dict of the hyperparameter config\n", " def size(cls, config):\r\n",
"\n", " '''[optional method] memory size of the estimator in bytes\r\n",
" Returns:\n", " \r\n",
" A float of the memory size required by the estimator to train the\n", " Args:\r\n",
" given config\n", " config - the dict of the hyperparameter config\r\n",
" '''\n", "\r\n",
" max_leaves = int(round(config['max_leaf']))\n", " Returns:\r\n",
" n_estimators = int(round(config['n_iter']))\n", " A float of the memory size required by the estimator to train the\r\n",
" return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8\n", " given config\r\n",
"\n", " '''\r\n",
" @classmethod\n", " max_leaves = int(round(config['max_leaf']))\r\n",
" def cost_relative2lgbm(cls):\n", " n_estimators = int(round(config['n_iter']))\r\n",
" '''[optional method] relative cost compared to lightgbm\n", " return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8\r\n",
" '''\n", "\r\n",
" return 1.0\n" " @classmethod\r\n",
" def cost_relative2lgbm(cls):\r\n",
" '''[optional method] relative cost compared to lightgbm\r\n",
" '''\r\n",
" return 1.0\r\n"
], ],
"outputs": [], "outputs": [],
"metadata": { "metadata": {
@ -819,7 +823,7 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 20, "execution_count": 20,
"source": [ "source": [
"automl = AutoML()\n", "automl = AutoML()\r\n",
"automl.add_learner(learner_name='RGF', learner_class=MyRegularizedGreedyForest)" "automl.add_learner(learner_name='RGF', learner_class=MyRegularizedGreedyForest)"
], ],
"outputs": [], "outputs": [],
@ -833,15 +837,15 @@
"cell_type": "code", "cell_type": "code",
"execution_count": 21, "execution_count": 21,
"source": [ "source": [
"settings = {\n", "settings = {\r\n",
" \"time_budget\": 10, # total running time in seconds\n", " \"time_budget\": 10, # total running time in seconds\r\n",
" \"metric\": 'accuracy', \n", " \"metric\": 'accuracy', \r\n",
" \"estimator_list\": ['RGF', 'lgbm', 'rf', 'xgboost'], # list of ML learners\n", " \"estimator_list\": ['RGF', 'lgbm', 'rf', 'xgboost'], # list of ML learners\r\n",
" \"task\": 'classification', # task type \n", " \"task\": 'classification', # task type \r\n",
" \"log_file_name\": 'airlines_experiment_custom_learner.log', # flaml log file \n", " \"log_file_name\": 'airlines_experiment_custom_learner.log', # flaml log file \r\n",
" \"log_training_metric\": True, # whether to log training metric\n", " \"log_training_metric\": True, # whether to log training metric\r\n",
"}\n", "}\r\n",
"\n", "\r\n",
"automl.fit(X_train = X_train, y_train = y_train, **settings)" "automl.fit(X_train = X_train, y_train = y_train, **settings)"
], ],
"outputs": [ "outputs": [

View File

@ -13,10 +13,9 @@ import pandas as pd
from datetime import datetime from datetime import datetime
from flaml import AutoML from flaml import AutoML
from flaml.data import get_output_from_log from flaml.data import CLASSIFICATION, get_output_from_log
from flaml.model import LGBMEstimator, SKLearnEstimator, XGBoostEstimator from flaml.model import LGBMEstimator, SKLearnEstimator, XGBoostEstimator
from rgf.sklearn import RGFClassifier, RGFRegressor
from flaml import tune from flaml import tune
from flaml.training_log import training_log_reader from flaml.training_log import training_log_reader
@ -26,9 +25,13 @@ class MyRegularizedGreedyForest(SKLearnEstimator):
super().__init__(task, **config) super().__init__(task, **config)
if task in ("binary", "multi"): if task in CLASSIFICATION:
from rgf.sklearn import RGFClassifier
self.estimator_class = RGFClassifier self.estimator_class = RGFClassifier
else: else:
from rgf.sklearn import RGFRegressor
self.estimator_class = RGFRegressor self.estimator_class = RGFRegressor
@classmethod @classmethod
@ -628,7 +631,7 @@ class TestAutoML(unittest.TestCase):
"log_file_name": "test/california.log", "log_file_name": "test/california.log",
"log_type": "all", "log_type": "all",
"n_jobs": 1, "n_jobs": 1,
"n_concurrent_trials": 2, "n_concurrent_trials": 10,
"hpo_method": hpo_method, "hpo_method": hpo_method,
} }
X_train, y_train = fetch_california_housing(return_X_y=True) X_train, y_train = fetch_california_housing(return_X_y=True)

View File

@ -109,4 +109,4 @@ def test_mlflow():
if __name__ == "__main__": if __name__ == "__main__":
test_automl(300) test_automl(120)

View File

@ -64,6 +64,7 @@ class TestLogging(unittest.TestCase):
automl.search_space, automl.low_cost_partial_config, automl.cat_hp_cost automl.search_space, automl.low_cost_partial_config, automl.cat_hp_cost
) )
logger.info(automl.search_space["ml"].categories) logger.info(automl.search_space["ml"].categories)
if automl.best_config:
config = automl.best_config.copy() config = automl.best_config.copy()
config["learner"] = automl.best_estimator config["learner"] = automl.best_estimator
automl.trainable({"ml": config}) automl.trainable({"ml": config})
@ -71,11 +72,12 @@ class TestLogging(unittest.TestCase):
from flaml.automl import size from flaml.automl import size
from functools import partial from functools import partial
low_cost_partial_config = automl.low_cost_partial_config
search_alg = BlendSearch( search_alg = BlendSearch(
metric="val_loss", metric="val_loss",
mode="min", mode="min",
space=automl.search_space, space=automl.search_space,
low_cost_partial_config=automl.low_cost_partial_config, low_cost_partial_config=low_cost_partial_config,
points_to_evaluate=automl.points_to_evaluate, points_to_evaluate=automl.points_to_evaluate,
cat_hp_cost=automl.cat_hp_cost, cat_hp_cost=automl.cat_hp_cost,
prune_attr=automl.prune_attr, prune_attr=automl.prune_attr,
@ -95,6 +97,14 @@ class TestLogging(unittest.TestCase):
print(min(trial.last_result["val_loss"] for trial in analysis.trials)) print(min(trial.last_result["val_loss"] for trial in analysis.trials))
config = analysis.trials[-1].last_result["config"]["ml"] config = analysis.trials[-1].last_result["config"]["ml"]
automl._state._train_with_config(config["learner"], config) automl._state._train_with_config(config["learner"], config)
for _ in range(3):
print(
search_alg._ls.complete_config(
low_cost_partial_config,
search_alg._ls_bound_min,
search_alg._ls_bound_max,
)
)
# Check if the log buffer is populated. # Check if the log buffer is populated.
self.assertTrue(len(buf.getvalue()) > 0) self.assertTrue(len(buf.getvalue()) > 0)