constraints (#88)

* pre-training constraints

* metric constraints after training
Chi Wang, 2021-05-18 15:57:42 -07:00 (committed by GitHub)
parent 3083229e40
commit 0925e2b308

9 changed files with 137 additions and 87 deletions
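For orientation before the per-file diffs, here is a minimal sketch of how the two new constraint types are meant to be used together through `flaml.tune.run`. The `mem_size` estimator, the stand-in `train` function, and the reported `precision` metric are illustrative assumptions, not code from this commit; only the `config_constraints` and `metric_constraints` arguments are what this commit adds.

from flaml import tune

def mem_size(config):
    # Hypothetical estimator of the bytes needed to train with this config.
    return config["n_estimators"] * config["max_leaves"] * 8.0

def train(config):
    # Stand-in trainer: report the objective plus any constrained metric.
    val_loss = (config["n_estimators"] - 80) ** 2
    precision = min(1.0, config["max_leaves"] / 64)
    tune.report(val_loss=val_loss, precision=precision)

analysis = tune.run(
    train,
    config={
        "n_estimators": tune.randint(4, 1000),
        "max_leaves": tune.randint(4, 1000),
    },
    metric="val_loss", mode="min",
    # pre-training: skip configs whose estimated size exceeds 1 GB
    config_constraints=[(mem_size, '<=', 1024**3)],
    # after training: penalize trials reporting precision below 0.9
    metric_constraints=[('precision', '>=', 0.9)],
    num_samples=10)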

@@ -1036,9 +1036,8 @@ class AutoML:
                 prune_attr=prune_attr,
                 min_resource=min_resource,
                 max_resource=max_resource,
-                resources_per_trial={"cpu": self._state.n_jobs,
-                                     "mem": self._mem_thres},
-                mem_size=learner_class.size)
+                config_constraints=[(learner_class.size, '<=', self._mem_thres)]
+            )
         else:
             algo = SearchAlgo(
                 metric='val_loss', mode='min', space=search_space,

@@ -237,8 +237,8 @@ class DataTransformer:
                     SimpleImputer(missing_values=np.nan, strategy='median'),
                     X_num.columns)])
             X[num_columns] = self.transformer.fit_transform(X_num)
-        self._cat_columns, self._num_columns, self._datetime_columns = cat_columns, \
-            num_columns, datetime_columns
+        self._cat_columns, self._num_columns, self._datetime_columns = \
+            cat_columns, num_columns, datetime_columns
         self._drop = drop
         if task == 'regression':

@@ -275,4 +275,3 @@ class DataTransformer:
                 X_num.columns = range(X_num.shape[1])
                 X[num_columns] = self.transformer.transform(X_num)
         return X
-

@@ -39,9 +39,11 @@ class BlendSearch(Searcher):
                  min_resource: Optional[float] = None,
                  max_resource: Optional[float] = None,
                  reduction_factor: Optional[float] = None,
-                 resources_per_trial: Optional[dict] = None,
                  global_search_alg: Optional[Searcher] = None,
-                 mem_size: Callable[[dict], float] = None,
+                 config_constraints: Optional[
+                     List[Tuple[Callable[[dict], float], str, float]]] = None,
+                 metric_constraints: Optional[
+                     List[Tuple[str, str, float]]] = None,
                  seed: Optional[int] = 20):
         '''Constructor
@@ -82,14 +84,23 @@ class BlendSearch(Searcher):
                 prune_attr; only valid if prune_attr is not in space.
             reduction_factor: A float of the reduction factor used for
                 incremental pruning.
-            resources_per_trial: A dictionary of the resources permitted per
-                trial, such as 'mem'.
             global_search_alg: A Searcher instance as the global search
                 instance. If omitted, Optuna is used. The following algos have
                 known issues when used as global_search_alg:
                 - HyperOptSearch raises exception sometimes
                 - TuneBOHB has its own scheduler
-            mem_size: A function to estimate the memory size for a given config.
+            config_constraints: A list of config constraints to be satisfied.
+                e.g.,
+
+                .. code-block:: python
+
+                    config_constraints = [(mem_size, '<=', 1024**3)]
+
+                `mem_size` is a function which returns the estimated number of
+                bytes needed for a config.
+                It is used to skip configs which do not fit in memory.
+            metric_constraints: A list of metric constraints to be satisfied,
+                e.g., `[('precision', '>=', 0.9)]`.
             seed: An integer of the random seed.
         '''
         self._metric, self._mode = metric, mode
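Given the annotations above, a quick sketch of well-formed constraint lists (the `mem_size` estimator and the `model_size` key are illustrative, not part of this commit):

from typing import Callable, List, Tuple

def mem_size(config: dict) -> float:
    # Hypothetical estimator of the bytes needed for a config.
    return config.get("model_size", 0) * 4.0

# (callable over the config, comparison sign, threshold)
config_constraints: List[Tuple[Callable[[dict], float], str, float]] = [
    (mem_size, '<=', 1024**3)]
# (name of a reported metric, comparison sign, threshold)
metric_constraints: List[Tuple[str, str, float]] = [
    ('precision', '>=', 0.9)]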
@@ -104,10 +115,8 @@ class BlendSearch(Searcher):
         self._ls = LocalSearch(
             init_config, metric, mode, cat_hp_cost, space,
             prune_attr, min_resource, max_resource, reduction_factor, seed)
-        self._resources_per_trial = resources_per_trial
-        self._mem_size = mem_size
-        self._mem_threshold = resources_per_trial.get(
-            'mem') if resources_per_trial else None
+        self._config_constraints = config_constraints
+        self._metric_constraints = metric_constraints
         self._init_search()

     def set_search_properties(self,
@@ -171,9 +180,8 @@ class BlendSearch(Searcher):
         self._points_to_evaluate = state._points_to_evaluate
         self._gs = state._gs
         self._ls = state._ls
-        self._resources_per_trial = state._resources_per_trial
-        self._mem_size = state._mem_size
-        self._mem_threshold = state._mem_threshold
+        self._config_constraints = state._config_constraints
+        self._metric_constraints = state._metric_constraints

     def restore_from_dir(self, checkpoint_dir: str):
         super.restore_from_dir(checkpoint_dir)
@@ -182,6 +190,20 @@ class BlendSearch(Searcher):
                           error: bool = False):
         ''' search thread updater and cleaner
         '''
+        if result and not error and self._metric_constraints:
+            # account for metric constraints if any
+            objective = result[self._metric]
+            for constraint in self._metric_constraints:
+                metric_constraint, sign, threshold = constraint
+                value = result.get(metric_constraint)
+                if value:
+                    # sign is <= or >=
+                    sign_op = 1 if sign == '<=' else -1
+                    violation = (value - threshold) * sign_op
+                    if violation > 0:
+                        # add penalty term to the metric
+                        objective += 1e+10 * violation * self._ls.metric_op
+            result[self._metric] = objective
         thread_id = self._trial_proposed_by.get(trial_id)
         if thread_id in self._search_thread_pool:
             self._search_thread_pool[thread_id].on_trial_complete(
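To make the penalty concrete: `metric_op` is +1 when minimizing and -1 when maximizing, so a violated constraint always pushes the objective in the unfavorable direction. A standalone sketch of the same arithmetic, with illustrative numbers:

# Suppose we minimize 'val_loss' (metric_op = 1) under ('precision', '>=', 0.9).
result = {'val_loss': 0.05, 'precision': 0.85}
metric_op = 1  # minimizing

objective = result['val_loss']
value, sign, threshold = result['precision'], '>=', 0.9
sign_op = 1 if sign == '<=' else -1
violation = (value - threshold) * sign_op  # (0.85 - 0.9) * -1 = 0.05 > 0
if violation > 0:
    # the trial violates the constraint, so its objective is inflated
    objective += 1e+10 * violation * metric_op  # ~5e+8 dwarfs val_loss
print(objective)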
@@ -197,14 +219,15 @@ class BlendSearch(Searcher):
             else:  # add to result cache
                 self._result[self._ls.config_signature(config)] = result
             # update target metric if improved
-            if (result[self._metric] - self._metric_target) * self._ls.metric_op < 0:
-                self._metric_target = result[self._metric]
+            objective = result[self._metric]
+            if (objective - self._metric_target) * self._ls.metric_op < 0:
+                self._metric_target = objective
             if not thread_id and self._create_condition(result):
                 # thread creator
                 self._search_thread_pool[self._thread_count] = SearchThread(
                     self._ls.mode,
-                    self._ls.create(config, result[self._metric], cost=result[
-                        self.cost_attr])
+                    self._ls.create(
+                        config, objective, cost=result[self.cost_attr])
                 )
                 thread_id = self._thread_count
                 self._thread_count += 1
@@ -362,20 +385,26 @@ class BlendSearch(Searcher):
         return config

     def _should_skip(self, choice, trial_id, config) -> bool:
-        ''' if config is None or config's result is known or above mem threshold
+        ''' if config is None or config's result is known or constraints are violated
             return True; o.w. return False
         '''
         if config is None:
             return True
         config_signature = self._ls.config_signature(config)
         exists = config_signature in self._result
-        # check mem constraint
-        if not exists and self._mem_threshold and self._mem_size(
-                config) > self._mem_threshold:
-            self._result[config_signature] = {
-                self._metric: np.inf * self._ls.metric_op, 'time_total_s': 1
-            }
-            exists = True
+        # check constraints
+        if not exists and self._config_constraints:
+            for constraint in self._config_constraints:
+                func, sign, threshold = constraint
+                value = func(config)
+                if (sign == '<=' and value > threshold
+                        or sign == '>=' and value < threshold):
+                    self._result[config_signature] = {
+                        self._metric: np.inf * self._ls.metric_op,
+                        'time_total_s': 1,
+                    }
+                    exists = True
+                    break
         if exists:
             if not self._use_rs:
                 result = self._result.get(config_signature)
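The same pre-training check, distilled into a standalone sketch. A constraint is a `(func, sign, threshold)` tuple as documented above; the `mem_size` lambda here is a hypothetical estimator:

def violates(config, config_constraints):
    # Return True if any (func, sign, threshold) constraint is violated.
    for func, sign, threshold in config_constraints:
        value = func(config)
        if (sign == '<=' and value > threshold
                or sign == '>=' and value < threshold):
            return True
    return False

mem_size = lambda config: config["max_leaves"] * 1024.0  # hypothetical estimator
constraints = [(mem_size, '<=', 1024**3)]
print(violates({"max_leaves": 2**30}, constraints))  # True: ~1 TB > 1 GB
print(violates({"max_leaves": 64}, constraints))     # False: fits comfortably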

@@ -3,7 +3,7 @@
  * Licensed under the MIT License. See LICENSE file in the
  * project root for license information.
 '''
-from typing import Optional, Union, List, Callable
+from typing import Optional, Union, List, Callable, Tuple
 import datetime
 import time
 try:

@@ -118,7 +118,10 @@ def run(training_function,
         local_dir: Optional[str] = None,
         num_samples: Optional[int] = 1,
         resources_per_trial: Optional[dict] = None,
-        mem_size: Callable[[dict], float] = None,
+        config_constraints: Optional[
+            List[Tuple[Callable[[dict], float], str, float]]] = None,
+        metric_constraints: Optional[
+            List[Tuple[str, str, float]]] = None,
         use_ray: Optional[bool] = False):
     '''The trigger for HPO.
@@ -210,11 +213,19 @@ def run(training_function,
             used; or a local dir to save the tuning log.
         num_samples: An integer of the number of configs to try. Defaults to 1.
         resources_per_trial: A dictionary of the hardware resources to allocate
-            per trial, e.g., `{'mem': 1024**3}`. When not using ray backend,
-            only 'mem' is used as approximate resource constraints
-            (in conjunction with mem_size).
-        mem_size: A function to estimate the memory size for a given config.
+            per trial, e.g., `{'cpu': 1}`. Only valid when using ray backend.
+        config_constraints: A list of config constraints to be satisfied.
+            e.g.,
+
+            .. code-block:: python
+
+                config_constraints = [(mem_size, '<=', 1024**3)]
+
+            `mem_size` is a function which returns the estimated number of
+            bytes needed for a config.
             It is used to skip configs which do not fit in memory.
+        metric_constraints: A list of metric constraints to be satisfied,
+            e.g., `[('precision', '>=', 0.9)]`.
         use_ray: A boolean of whether to use ray as the backend
     '''
     global _use_ray
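For a concrete picture of the `mem_size` estimator the docstring refers to, a hedged sketch for a tree-model search space; the sizing formula is a made-up heuristic, not something this commit defines:

def mem_size(config):
    # Made-up heuristic: assume memory grows with the number of trees
    # and leaves; return an estimate in bytes.
    n_estimators = config.get("n_estimators", 100)
    max_leaves = config.get("max_leaves", 31)
    return n_estimators * max_leaves * 1e4

# Passed to flaml.tune.run to skip configs estimated to exceed 1 GB:
# config_constraints=[(mem_size, '<=', 1024**3)]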
@@ -252,8 +263,8 @@ def run(training_function,
         prune_attr=prune_attr,
         min_resource=min_resource, max_resource=max_resource,
         reduction_factor=reduction_factor,
-        resources_per_trial=resources_per_trial,
-        mem_size=mem_size)
+        config_constraints=config_constraints,
+        metric_constraints=metric_constraints)
     if time_budget_s:
         search_alg.set_search_properties(metric, mode, config={
             'time_budget_s': time_budget_s})

@@ -1 +1 @@
-__version__ = "0.3.6"
+__version__ = "0.4.0"

@@ -237,9 +237,11 @@ class TestAutoML(unittest.TestCase):
         fake_df = pd.DataFrame({'A': [datetime(1900, 2, 3), datetime(1900, 3, 4)]})
         y = np.array([0, 1])
-        automl_experiment.fit(X_train=fake_df, X_val=fake_df, y_train=y, y_val=y, **automl_settings)
+        automl_experiment.fit(
+            X_train=fake_df, X_val=fake_df, y_train=y, y_val=y, **automl_settings)
         y_pred = automl_experiment.predict(fake_df)
+        print(y_pred)

     def test_micro_macro_f1(self):
         automl_experiment = AutoML()

test/tune/__init__.py (new, empty file)

@@ -1,19 +1,21 @@
 '''Require: pip install flaml[test,ray]
 '''
-import unittest
 import time
+import os
 from sklearn.model_selection import train_test_split
 import sklearn.metrics
 import sklearn.datasets
 try:
     from ray.tune.integration.xgboost import TuneReportCheckpointCallback
 except ImportError:
-    print("skip test_tune because ray tune cannot be imported.")
+    print("skip test_xgboost because ray tune cannot be imported.")
 import xgboost as xgb

 import logging
 logger = logging.getLogger(__name__)
-logger.addHandler(logging.FileHandler('test/tune_xgboost.log'))
+os.makedirs('logs', exist_ok=True)
+logger.addHandler(logging.FileHandler('logs/tune_xgboost.log'))
+logger.setLevel(logging.INFO)

 def train_breast_cancer(config: dict):
@@ -61,6 +63,7 @@ def _test_xgboost(method='BlendSearch'):
     for n_cpu in [8]:
         start_time = time.time()
         ray.init(num_cpus=n_cpu, num_gpus=0)
+        # ray.init(address='auto')
         if method == 'BlendSearch':
             analysis = tune.run(
                 train_breast_cancer,
@@ -163,21 +166,28 @@ def test_nested():
     }

     def simple_func(config):
-        tune.report(metric=(config["cost_related"]["a"] - 4)**2
-                    * (config["b"] - 0.7)**2)
+        obj = (config["cost_related"]["a"] - 4)**2 \
+            + (config["b"] - config["cost_related"]["a"])**2
+        tune.report(obj=obj)
+        tune.report(obj=obj, ab=config["cost_related"]["a"] * config["b"])

-    tune.run(
+    analysis = tune.run(
         simple_func,
         config=search_space,
         low_cost_partial_config={
             "cost_related": {"a": 1}
         },
-        metric="metric",
+        metric="obj",
         mode="min",
+        metric_constraints=[("ab", "<=", 4)],
         local_dir='logs/',
         num_samples=-1,
         time_budget_s=1)

+    best_trial = analysis.get_best_trial()
+    logger.info(f"Best config: {best_trial.config}")
+    logger.info(f"Best result: {best_trial.last_result}")

 def test_xgboost_bs():
     _test_xgboost()
@@ -224,4 +234,4 @@ def _test_xgboost_bohb():

 if __name__ == "__main__":
-    unittest.main()
+    test_xgboost_bs()