# !
#  * Copyright (c) FLAML authors. All rights reserved.
#  * Licensed under the MIT License. See LICENSE file in the
#  * project root for license information.
from __future__ import annotations

import time
import os
import sys
from typing import Callable, List, Union, Optional
from functools import partial
import numpy as np
import logging
import json

from flaml.automl.state import SearchState, AutoMLState
from flaml.automl.ml import (
    train_estimator,
    get_estimator_class,
)
from flaml.config import (
    MIN_SAMPLE_TRAIN,
    MEM_THRES,
    RANDOM_SEED,
    SMALL_LARGE_THRES,
    CV_HOLDOUT_THRESHOLD,
    SPLIT_RATIO,
    N_SPLITS,
    SAMPLE_MULTIPLY_FACTOR,
)

# TODO check to see when we can remove these
from flaml.automl.task.task import CLASSIFICATION, TS_FORECAST, Task
from flaml.automl.task.factory import task_factory
from flaml import tune
from flaml.automl.logger import logger, logger_formatter
from flaml.automl.training_log import training_log_reader, training_log_writer
from flaml.default import suggest_learner
from flaml.version import __version__ as flaml_version
from flaml.automl.spark import psDataFrame, psSeries, DataFrame, Series
from flaml.tune.spark.utils import check_spark, get_broadcast_data

ERROR = (
    DataFrame is None and ImportError("please install flaml[automl] option to use the flaml.automl package.") or None
)
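# Note: `ERROR` defers the missing-dependency failure; instead of raising at
# import time, AutoML.__init__ raises it, so the module stays importable.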

try:
    from sklearn.base import BaseEstimator
except ImportError:
    BaseEstimator = object
    ERROR = ERROR or ImportError("please install flaml[automl] option to use the flaml.automl package.")

try:
    import mlflow
except ImportError:
    mlflow = None

try:
    from ray import __version__ as ray_version

    assert ray_version >= "1.10.0"
    ray_available = True
except (ImportError, AssertionError):
    ray_available = False


def size(learner_classes: dict, config: dict) -> float:
    """Size function.

    Returns:
        The mem size in bytes for a config.
    """
    config = config.get("ml", config)
    estimator = config["learner"]
    learner_class = learner_classes.get(estimator)
    return learner_class.size(config)


class AutoML(BaseEstimator):
    """The AutoML class.

    Example:

    ```python
    automl = AutoML()
    automl_settings = {
        "time_budget": 60,
        "metric": 'accuracy',
        "task": 'classification',
        "log_file_name": 'mylog.log',
    }
    automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
    ```
    """

    __version__ = flaml_version

    def __init__(self, **settings):
        """Constructor.

        Many settings in fit() can be passed to the constructor too.
        If an argument in fit() is provided, it will override the setting passed to the constructor.
        If an argument in fit() is not provided but provided in the constructor, the value passed to the constructor will be used.

        Args:
            metric: A string of the metric name or a function,
                e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_weighted',
                'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', 'f1', 'micro_f1', 'macro_f1',
                'log_loss', 'mae', 'mse', 'r2', 'mape'. Default is 'auto'.
                If passing a customized metric function, the function needs to
                have the following input arguments:

                ```python
                def custom_metric(
                    X_test, y_test, estimator, labels,
                    X_train, y_train, weight_test=None, weight_train=None,
                    config=None, groups_test=None, groups_train=None,
                ):
                    return metric_to_minimize, metrics_to_log
                ```
                which returns a float number as the minimization objective,
                and a dictionary as the metrics to log. E.g.,

                ```python
                def custom_metric(
                    X_val, y_val, estimator, labels,
                    X_train, y_train, weight_val=None, weight_train=None,
                    *args,
                ):
                    from sklearn.metrics import log_loss
                    import time

                    start = time.time()
                    y_pred = estimator.predict_proba(X_val)
                    pred_time = (time.time() - start) / len(X_val)
                    val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
                    y_pred = estimator.predict_proba(X_train)
                    train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
                    alpha = 0.5
                    return val_loss * (1 + alpha) - alpha * train_loss, {
                        "val_loss": val_loss,
                        "train_loss": train_loss,
                        "pred_time": pred_time,
                    }
                ```
            task: A string of the task type, e.g.,
                'classification', 'regression', 'ts_forecast', 'rank',
                'seq-classification', 'seq-regression', 'summarization',
                or an instance of the Task class.
            n_jobs: An integer of the number of threads for training | default=-1.
                Use all available resources when n_jobs == -1.
            log_file_name: A string of the log file name | default="". To disable logging,
                set it to be an empty string "".
            estimator_list: A list of strings for estimator names, or 'auto'.
                e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```.
            time_budget: A float number of the time budget in seconds.
                Use -1 if no time limit.
            max_iter: An integer of the maximal number of iterations.
            sample: A boolean of whether to sample the training data during
                search.
            ensemble: boolean or dict | default=False. Whether to perform
                ensemble after search. Can be a dict with keys 'passthrough'
                and 'final_estimator' to specify the passthrough and
                final_estimator in the stacker. The dict can also contain
                'n_jobs' as the key to specify the number of jobs for the stacker.
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
            log_type: A string of the log type, one of
                ['better', 'all'].
                'better' only logs configs with better loss than previous iters
                'all' logs all the tried configs.
            model_history: A boolean of whether to keep the best
                model per estimator. Make sure memory is large enough if setting to True.
            log_training_metric: A boolean of whether to log the training
                metric for each model.
            mem_thres: A float of the memory size constraint in bytes.
            pred_time_limit: A float of the prediction latency constraint in seconds.
                It refers to the average prediction time per row in validation data.
            train_time_limit: A float of the training time constraint in seconds.
            verbose: int, default=3 | Controls the verbosity, higher means more
                messages.
            retrain_full: bool or str, default=True | whether to retrain the
                selected model on the full training data when using holdout.
                True - retrain only after search finishes; False - no retraining;
                'budget' - do best effort to retrain without violating the time
                budget.
            split_type: str or splitter object, default="auto" | the data split type.
                * A valid splitter object is an instance of a derived class of scikit-learn
                [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
                and have ``split`` and ``get_n_splits`` methods with the same signatures.
                Set eval_method to "cv" to use the splitter object.
                * Valid str options depend on different tasks.
                For classification tasks, valid choices are
                ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
                "auto" -> uniform.
                For time series forecast tasks, must be "auto" or 'time'.
                For ranking task, must be "auto" or 'group'.
            hpo_method: str, default="auto" | The hyperparameter
                optimization method. By default, CFO is used for sequential
                search and BlendSearch is used for parallel search.
                No need to set when using flaml's default search space or using
                a simple customized search space. When set to 'bs', BlendSearch
                is used. BlendSearch can be tried when the search space is
                complex, for example, containing multiple disjoint, discontinuous
                subspaces. When set to 'random', random search is used.
            starting_points: A dictionary or a str to specify the starting hyperparameter
                config for the estimators | default="static".
                If str:
                    - if "data", use data-dependent defaults;
                    - if "data:path" use data-dependent defaults which are stored at path;
                    - if "static", use data-independent defaults.
                If dict, keys are the name of the estimators, and values are the starting
                hyperparameter configurations for the corresponding estimators.
                The value can be a single hyperparameter configuration dict or a list
                of hyperparameter configuration dicts.
                In the following code example, we get starting_points from the
                `automl` object and use them in the `new_automl` object.
                e.g.,

                ```python
                from flaml import AutoML

                automl = AutoML()
                X_train, y_train = load_iris(return_X_y=True)
                automl.fit(X_train, y_train)
                starting_points = automl.best_config_per_estimator

                new_automl = AutoML()
                new_automl.fit(X_train, y_train, starting_points=starting_points)
                ```
            seed: int or None, default=None | The random seed for hpo.
            n_concurrent_trials: [Experimental] int, default=1 | The number of
                concurrent trials. When n_concurrent_trials > 1, flaml performs
                [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)
                and installation of ray or spark is required: `pip install flaml[ray]`
                or `pip install flaml[spark]`. Please check
                [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)
                for more details about installing Spark.
            keep_search_state: boolean, default=False | Whether to keep data needed
                for model search after fit(). By default the state is deleted for
                space saving.
            preserve_checkpoint: boolean, default=True | Whether to preserve the saved checkpoint
                on disk when deleting automl. By default the checkpoint is preserved.
            early_stop: boolean, default=False | Whether to stop early if the
                search is considered to converge.
            force_cancel: boolean, default=False | Whether to forcibly cancel Spark jobs if the
                search time exceeded the time budget.
            append_log: boolean, default=False | Whether to directly append the log
                records to the input log file if it exists.
            auto_augment: boolean, default=True | Whether to automatically
                augment rare classes.
            min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
                size when sample=True.
            use_ray: boolean or dict.
                If boolean: default=False | Whether to use ray to run the training
                in separate processes. This can be used to prevent OOM for large
                datasets, but will incur more overhead in time.
                If dict: the dict contains the keywords arguments to be passed to
                [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).
            use_spark: boolean, default=False | Whether to use spark to run the training
                in parallel spark jobs. This can be used to accelerate training on large models
                and large datasets, but will incur more overhead in time and thus slow down
                training in some cases. GPU training is not supported yet when use_spark is True.
                For Spark clusters, by default, we will launch one trial per executor. However,
                sometimes we want to launch more trials than the number of executors (e.g., local mode).
                In this case, we can set the environment variable `FLAML_MAX_CONCURRENT` to override
                the detected `num_executors`. The final number of concurrent trials will be the minimum
                of `n_concurrent_trials` and `num_executors`.
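
                E.g., a sketch of raising the concurrency cap in local mode by
                setting the environment variable before calling fit():

                ```python
                import os

                os.environ["FLAML_MAX_CONCURRENT"] = "4"
                ```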
            free_mem_ratio: float between 0 and 1, default=0. The free memory ratio to keep during training.
            metric_constraints: list, default=[] | The list of metric constraints.
                Each element in this list is a 3-tuple, which shall be expressed
                in the following format: the first element of the 3-tuple is the name of the
                metric, the second element is the inequality sign chosen from ">=" and "<=",
                and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`.
                Note that all the metric names in metric_constraints need to be reported via
                the metrics_to_log dictionary returned by a customized metric function.
                The customized metric function shall be provided via the `metric` key word
                argument of the fit() function or the automl constructor.
                Find an example in the 4th constraint type in this [doc](../../Use-Cases/Task-Oriented-AutoML#constraint).
                If `pred_time_limit` is provided as one of keyword arguments to fit() function or
                the automl constructor, flaml will automatically (and under the hood)
                add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
                specifies a constraint about the prediction latency constraint in seconds.
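
                E.g., a sketch of constraining a metric logged by the customized
                metric function shown earlier (which logs "pred_time"):

                ```python
                automl.fit(
                    X_train, y_train, metric=custom_metric,
                    metric_constraints=[("pred_time", "<=", 1e-3)],
                )
                ```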
            custom_hp: dict, default=None | The custom search space specified by user.
                It is a nested dict with keys being the estimator names, and values being dicts
                per estimator search space. In the per estimator search space dict,
                the keys are the hyperparameter names, and values are dicts of info ("domain",
                "init_value", and "low_cost_init_value") about the search space associated with
                the hyperparameter (i.e., per hyperparameter search space dict). When custom_hp
                is provided, the built-in search space which is also a nested dict of per estimator
                search space dict, will be updated with custom_hp. Note that during this nested dict update,
                the per hyperparameter search space dicts will be replaced (instead of updated) by the ones
                provided in custom_hp. Note that the value for "domain" can either be a constant
                or a sample.Domain object.
                e.g.,

                ```python
                custom_hp = {
                    "transformer_ms": {
                        "model_path": {
                            "domain": "albert-base-v2",
                        },
                        "learning_rate": {
                            "domain": tune.choice([1e-4, 1e-5]),
                        },
                    }
                }
                ```
            skip_transform: boolean, default=False | Whether to skip the pre-processing of data prior to modeling.
            fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
                e.g.,

                ```python
                fit_kwargs_by_estimator = {
                    "transformer": {
                        "output_dir": "test/data/output/",
                        "fp16": False,
                    }
                }
                ```
            mlflow_logging: boolean, default=True | Whether to log the training results to mlflow.
                This requires mlflow to be installed and to have an active mlflow run.
                FLAML will create nested runs.
        """
        if ERROR:
            raise ERROR
        self._track_iter = 0
        self._state = AutoMLState()
        self._state.learner_classes = {}
        self._settings = settings
        # no budget by default
        settings["time_budget"] = settings.get("time_budget", -1)
        settings["task"] = settings.get("task", "classification")
        settings["n_jobs"] = settings.get("n_jobs", -1)
        settings["eval_method"] = settings.get("eval_method", "auto")
        settings["split_ratio"] = settings.get("split_ratio", SPLIT_RATIO)
        settings["n_splits"] = settings.get("n_splits", N_SPLITS)
        settings["auto_augment"] = settings.get("auto_augment", True)
        settings["metric"] = settings.get("metric", "auto")
        settings["estimator_list"] = settings.get("estimator_list", "auto")
        settings["log_file_name"] = settings.get("log_file_name", "")
        settings["max_iter"] = settings.get("max_iter")  # no budget by default
        settings["sample"] = settings.get("sample", True)
        settings["ensemble"] = settings.get("ensemble", False)
        settings["log_type"] = settings.get("log_type", "better")
        settings["model_history"] = settings.get("model_history", False)
        settings["log_training_metric"] = settings.get("log_training_metric", False)
        settings["mem_thres"] = settings.get("mem_thres", MEM_THRES)
        settings["pred_time_limit"] = settings.get("pred_time_limit", np.inf)
        settings["train_time_limit"] = settings.get("train_time_limit", None)
        settings["verbose"] = settings.get("verbose", 3)
        settings["retrain_full"] = settings.get("retrain_full", True)
        settings["split_type"] = settings.get("split_type", "auto")
        settings["hpo_method"] = settings.get("hpo_method", "auto")
        settings["learner_selector"] = settings.get("learner_selector", "sample")
        settings["starting_points"] = settings.get("starting_points", "static")
        settings["n_concurrent_trials"] = settings.get("n_concurrent_trials", 1)
        settings["keep_search_state"] = settings.get("keep_search_state", False)
        settings["preserve_checkpoint"] = settings.get("preserve_checkpoint", True)
        settings["early_stop"] = settings.get("early_stop", False)
        settings["force_cancel"] = settings.get("force_cancel", False)
        settings["append_log"] = settings.get("append_log", False)
        settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN)
        settings["use_ray"] = settings.get("use_ray", False)
        settings["use_spark"] = settings.get("use_spark", False)
        if settings["use_ray"] is not False and settings["use_spark"] is not False:
            raise ValueError("use_ray and use_spark cannot be both True.")
        settings["free_mem_ratio"] = settings.get("free_mem_ratio", 0)
        settings["metric_constraints"] = settings.get("metric_constraints", [])
        settings["cv_score_agg_func"] = settings.get("cv_score_agg_func", None)
        settings["fit_kwargs_by_estimator"] = settings.get("fit_kwargs_by_estimator", {})
        settings["custom_hp"] = settings.get("custom_hp", {})
        settings["skip_transform"] = settings.get("skip_transform", False)
        settings["mlflow_logging"] = settings.get("mlflow_logging", True)

        self._estimator_type = "classifier" if settings["task"] in CLASSIFICATION else "regressor"

    def get_params(self, deep: bool = False) -> dict:
        return self._settings.copy()

    @property
    def config_history(self) -> dict:
        """A dictionary of iter->(estimator, config, time),
        storing the best estimator, config, and the time when the best
        model is updated each time.
        """
        return self._config_history

    @property
    def model(self):
        """An object with `predict()` and `predict_proba()` method (for
        classification), storing the best trained model.
        """
        return self.__dict__.get("_trained_estimator")

    def best_model_for_estimator(self, estimator_name: str):
        """Return the best model found for a particular estimator.

        Args:
            estimator_name: a str of the estimator's name.

        Returns:
            An object storing the best model for estimator_name.
            If `model_history` was set to False during fit(), then the returned model
            is untrained unless estimator_name is the best estimator.
            If `model_history` was set to True, then the returned model is trained.
        """
        state = self._search_states.get(estimator_name)
        return state and getattr(state, "trained_estimator", None)

    @property
    def best_estimator(self):
        """A string indicating the best estimator found."""
        return self._best_estimator

    @property
    def best_iteration(self):
        """An integer of the iteration number where the best
        config is found."""
        return self._best_iteration

    @property
    def best_config(self):
        """A dictionary of the best configuration."""
        state = self._search_states.get(self._best_estimator)
        config = state and getattr(state, "best_config", None)
        return config and AutoMLState.sanitize(config)

    @property
    def best_config_per_estimator(self):
        """A dictionary of all estimators' best configuration."""
        return {
            e: e_search_state.best_config and AutoMLState.sanitize(e_search_state.best_config)
            for e, e_search_state in self._search_states.items()
        }
2021-08-23 19:36:51 -04:00
2021-11-18 05:03:12 +00:00
@property
def best_loss_per_estimator ( self ) :
""" A dictionary of all estimators ' best loss. """
2023-04-10 21:50:40 +02:00
return { e : e_search_state . best_loss for e , e_search_state in self . _search_states . items ( ) }
2021-11-18 05:03:12 +00:00
2021-08-23 19:36:51 -04:00
@property
def best_loss ( self ) :
2021-11-06 11:37:57 -07:00
""" A float of the best loss found. """
2021-08-23 19:36:51 -04:00
return self . _state . best_loss
2022-03-26 21:11:45 -04:00
@property
def best_result ( self ) :
""" Result dictionary for model trained with the best config. """
state = self . _search_states . get ( self . _best_estimator )
return state and getattr ( state , " best_result " , None )
@property
def metrics_for_best_config ( self ) :
""" Returns a float of the best loss, and a dictionary of the auxiliary metrics to log
associated with the best config . These two objects correspond to the returned
objects by the customized metric function for the config with the best loss . """
state = self . _search_states . get ( self . _best_estimator )
2023-04-10 21:50:40 +02:00
return self . _state . best_loss , state and getattr ( state , " best_result " , { } ) . get ( " metric_for_logging " )
2022-03-26 21:11:45 -04:00
2021-08-23 19:36:51 -04:00
@property
def best_config_train_time ( self ) :
2021-11-03 19:08:23 -07:00
""" A float of the seconds taken by training the best config. """
2023-04-10 21:50:40 +02:00
return getattr ( self . _search_states [ self . _best_estimator ] , " best_config_train_time " , None )
2021-08-23 19:36:51 -04:00

    def save_best_config(self, filename):
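        """Save the best estimator name and its configuration to a JSON file.

        Args:
            filename: A string of the file path to write; missing parent
                directories are created.
        """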
        best = {
            "class": self.best_estimator,
            "hyperparameters": self.best_config,
        }
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "w") as f:
            json.dump(best, f)

    @property
    def feature_transformer(self):
        """Returns AutoML Transformer"""
        return getattr(self, "_transformer", None)

    @property
    def label_transformer(self):
        """Returns AutoML label transformer"""
        return getattr(self, "_label_transformer", None)

    @property
    def classes_(self):
        """A numpy array of shape (n_classes,) for class labels."""
        attr = getattr(self, "_label_transformer", None)
        if attr:
            return attr.classes_
        attr = getattr(self, "_trained_estimator", None)
        if attr:
            return attr.classes_
        return None

    @property
    def n_features_in_(self):
        return self._trained_estimator.n_features_in_

    @property
    def feature_names_in_(self):
        attr = getattr(self, "_trained_estimator", None)
        attr = attr and getattr(attr, "feature_names_in_", None)
        if attr is not None:
            return attr
        return getattr(self, "_feature_names_in_", None)

    @property
    def feature_importances_(self):
        attr = getattr(self, "_trained_estimator", None)
        attr = attr and getattr(attr, "feature_importances_", None)
        return attr

    @property
    def time_to_find_best_model(self) -> float:
        """Time taken to find best model in seconds."""
        return self.__dict__.get("_time_taken_best_iter")

    def score(
        self,
        X: Union[DataFrame, psDataFrame],
        y: Union[Series, psSeries],
        **kwargs,
    ):
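        """Return the score of the best trained estimator on the given data.

        A thin wrapper around the underlying estimator's `score()`: `X` is
        preprocessed with the fitted feature transformer and `y` with the label
        transformer before delegating. Returns None (with a warning) if no
        estimator has been trained.
        """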
        estimator = getattr(self, "_trained_estimator", None)
        if estimator is None:
            logger.warning("No estimator is trained. Please run fit with enough budget.")
            return None
        X = self._state.task.preprocess(X, self._transformer)
        if self._label_transformer:
            y = self._label_transformer.transform(y)
        return estimator.score(X, y, **kwargs)

    def predict(
        self,
        X: Union[np.array, DataFrame, List[str], List[List[str]], psDataFrame],
        **pred_kwargs,
    ):
        """Predict label from features.

        Args:
            X: A numpy array or pandas dataframe or pyspark.pandas dataframe
                of featurized instances, shape n * m,
                or for time series forecast tasks:
                a pandas dataframe with the first column containing
                timestamp values (datetime type) or an integer n for
                the predict steps (only valid when the estimator is
                arima or sarimax). Other columns in the dataframe
                are assumed to be exogenous variables (categorical
                or numeric).
            **pred_kwargs: Other key word arguments to pass to predict() function of
                the searched learners, such as per_device_eval_batch_size.

        ```python
        multivariate_X_test = DataFrame({
            'timeStamp': pd.date_range(start='1/1/2022', end='1/07/2022'),
            'categorical_col': ['yes', 'yes', 'no', 'no', 'yes', 'no', 'yes'],
            'continuous_col': [105, 107, 120, 118, 110, 112, 115]
        })
        model.predict(multivariate_X_test)
        ```

        Returns:
            An array-like of shape n * 1: each element is a predicted
            label for an instance.
        """
        estimator = getattr(self, "_trained_estimator", None)
        if estimator is None:
            logger.warning("No estimator is trained. Please run fit with enough budget.")
            return None
        X = self._state.task.preprocess(X, self._transformer)
        y_pred = estimator.predict(X, **pred_kwargs)

        if isinstance(y_pred, np.ndarray) and y_pred.ndim > 1:
            y_pred = y_pred.flatten()
        if self._label_transformer:
            return self._label_transformer.inverse_transform(Series(y_pred.astype(int)))
        else:
            return y_pred

    def predict_proba(self, X, **pred_kwargs):
        """Predict the probability of each class from features, only works for
        classification problems.

        Args:
            X: A numpy array of featurized instances, shape n * m.
            **pred_kwargs: Other key word arguments to pass to predict_proba() function of
                the searched learners, such as per_device_eval_batch_size.

        Returns:
            A numpy array of shape n * c. c is the # classes. Each element at
            (i, j) is the probability for instance i to be in class j.
        """
        estimator = getattr(self, "_trained_estimator", None)
        if estimator is None:
            logger.warning("No estimator is trained. Please run fit with enough budget.")
            return None
        X = self._state.task.preprocess(X, self._transformer)
        proba = self._trained_estimator.predict_proba(X, **pred_kwargs)
        return proba

    def add_learner(self, learner_name, learner_class):
        """Add a customized learner.

        Args:
            learner_name: A string of the learner's name.
            learner_class: A subclass of flaml.model.BaseEstimator.
        """
        self._state.learner_classes[learner_name] = learner_class

    def get_estimator_from_log(self, log_file_name: str, record_id: int, task: Union[str, Task]):
        """Get the estimator from log file.

        Args:
            log_file_name: A string of the log file name.
            record_id: An integer of the record ID in the file,
                0 corresponds to the first trial.
            task: A string of the task type,
                'binary', 'multiclass', 'regression', 'ts_forecast', 'rank',
                or an instance of the Task class.

        Returns:
            An estimator object for the given configuration.
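
            E.g., a minimal usage sketch (the log file name is illustrative and
            assumes a prior fit() call logged to it):

            ```python
            estimator = automl.get_estimator_from_log(
                "mylog.log", record_id=0, task="regression"
            )
            ```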
        """
        with training_log_reader(log_file_name) as reader:
            record = reader.get_record(record_id)
            estimator = record.learner
            config = AutoMLState.sanitize(record.config)

        if isinstance(task, str):
            task = task_factory(task)
        estimator, _ = train_estimator(
            X_train=None,
            y_train=None,
            config_dic=config,
            task=task,
            estimator_name=estimator,
            estimator_class=self._state.learner_classes.get(estimator),
            eval_metric="train_time",
        )
        return estimator

    def retrain_from_log(
        self,
        log_file_name,
        X_train=None,
        y_train=None,
        dataframe=None,
        label=None,
        time_budget=np.inf,
        task: Optional[Union[str, Task]] = None,
        eval_method=None,
        split_ratio=None,
        n_splits=None,
        split_type=None,
        groups=None,
        n_jobs=-1,
        # gpu_per_trial=0,
        train_best=True,
        train_full=False,
        record_id=-1,
        auto_augment=None,
        custom_hp=None,
        skip_transform=None,
        preserve_checkpoint=True,
        fit_kwargs_by_estimator=None,
        **fit_kwargs,
    ):
        """Retrain from log file.

        This function is intended to retrain the logged configurations.
        NOTE: In some rare cases, the last config is early stopped to meet time_budget and it's the best config.
        But the logged config's ITER_HP (e.g., n_estimators) is not reduced.

        Args:
            log_file_name: A string of the log file name.
            X_train: A numpy array or dataframe of training data in shape n * m.
                For time series forecast tasks, the first column of X_train must be the timestamp column (datetime type). Other columns in the dataframe are assumed to be exogenous variables (categorical or numeric).
            y_train: A numpy array or series of labels in shape n * 1.
            dataframe: A dataframe of training data including label column.
                For time series forecast tasks, dataframe must be specified and should
                have at least two columns: timestamp and label, where the first
                column is the timestamp column (datetime type). Other columns
                in the dataframe are assumed to be exogenous variables
                (categorical or numeric).
            label: A str of the label column name, e.g., 'label';
                Note: If X_train and y_train are provided,
                dataframe and label are ignored;
                If not, dataframe and label must be provided.
            time_budget: A float number of the time budget in seconds.
            task: A string of the task type, e.g.,
                'classification', 'regression', 'ts_forecast', 'rank',
                'seq-classification', 'seq-regression', 'summarization',
                or an instance of Task class.
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
            split_type: str or splitter object, default="auto" | the data split type.
                * A valid splitter object is an instance of a derived class of scikit-learn
                [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
                and have ``split`` and ``get_n_splits`` methods with the same signatures.
                Set eval_method to "cv" to use the splitter object.
                * Valid str options depend on different tasks.
                For classification tasks, valid choices are
                ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
                "auto" -> uniform.
                For time series forecast tasks, must be "auto" or 'time'.
                For ranking task, must be "auto" or 'group'.
            groups: None or array-like | Group labels (with matching length to
                y_train) or groups counts (with sum equal to length of y_train)
                for training data.
            n_jobs: An integer of the number of threads for training | default=-1.
                Use all available resources when n_jobs == -1.
            train_best: A boolean of whether to train the best config in the
                time budget; if false, train the last config in the budget.
            train_full: A boolean of whether to train on the full data. If true,
                eval_method and sample_size in the log file will be ignored.
            record_id: the ID of the training log record from which the model will
                be retrained. By default `record_id = -1` which means this will be
                ignored. `record_id = 0` corresponds to the first trial, and
                when `record_id >= 0`, `time_budget` will be ignored.
            auto_augment: boolean, default=True | Whether to automatically
                augment rare classes.
            custom_hp: dict, default=None | The custom search space specified by user.
                Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
                domain of the custom search space can either be a value or a sample.Domain object.

                ```python
                custom_hp = {
                    "transformer_ms": {
                        "model_path": {
                            "domain": "albert-base-v2",
                        },
                        "learning_rate": {
                            "domain": tune.choice([1e-4, 1e-5]),
                        },
                    }
                }
                ```
            fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
                e.g.,

                ```python
                fit_kwargs_by_estimator = {
                    "transformer": {
                        "output_dir": "test/data/output/",
                        "fp16": False,
                    }
                }
                ```
            **fit_kwargs: Other key word arguments to pass to fit() function of
                the searched learners, such as sample_weight. Below are a few examples of
                estimator-specific parameters:
                period: int | forecast horizon for all time series forecast tasks.
                gpu_per_trial: float, default=0 | A float of the number of gpus per trial,
                    only used by TransformersEstimator, XGBoostSklearnEstimator, and
                    TemporalFusionTransformerEstimator.
                group_ids: list of strings of column names identifying a time series, only
                    used by TemporalFusionTransformerEstimator, required for
                    'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object
                    from PyTorchForecasting.
                    For other parameters to describe your dataset, refer to
                    [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).
                    To specify your variables, use `static_categoricals`, `static_reals`,
                    `time_varying_known_categoricals`, `time_varying_known_reals`,
                    `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,
                    `variable_groups`. To provide more information on your data, use
                    `max_encoder_length`, `min_encoder_length`, `lags`.
                log_dir: str, default="lightning_logs" | Folder into which to log results
                    for tensorboard, only used by TemporalFusionTransformerEstimator.
                max_epochs: int, default=20 | Maximum number of epochs to run training,
                    only used by TemporalFusionTransformerEstimator.
                batch_size: int, default=64 | Batch size for training model, only
                    used by TemporalFusionTransformerEstimator.
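
            E.g., a minimal retraining sketch (the log file name is illustrative):

            ```python
            automl.retrain_from_log(
                log_file_name="mylog.log",
                X_train=X_train,
                y_train=y_train,
                task="classification",
                train_full=True,
            )
            ```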
        """
        task = task or self._settings.get("task")
        if isinstance(task, str):
            task = task_factory(task)
        eval_method = eval_method or self._settings.get("eval_method")
        split_ratio = split_ratio or self._settings.get("split_ratio")
        n_splits = n_splits or self._settings.get("n_splits")
        split_type = split_type or self._settings.get("split_type")
        auto_augment = self._settings.get("auto_augment") if auto_augment is None else auto_augment
        self._state.task = task
        self._estimator_type = "classifier" if task.is_classification() else "regressor"

        self._state.fit_kwargs = fit_kwargs
        self._state.custom_hp = custom_hp or self._settings.get("custom_hp")
        self._skip_transform = self._settings.get("skip_transform") if skip_transform is None else skip_transform
        self._state.fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator")
        self.preserve_checkpoint = (
            self._settings.get("preserve_checkpoint") if preserve_checkpoint is None else preserve_checkpoint
        )
        task.validate_data(self, self._state, X_train, y_train, dataframe, label, groups=groups)

        logger.info("log file name {}".format(log_file_name))

        best_config = None
        best_val_loss = float("+inf")
        best_estimator = None
        sample_size = None
        time_used = 0.0
        training_duration = 0
        best = None
        with training_log_reader(log_file_name) as reader:
            if record_id >= 0:
                best = reader.get_record(record_id)
            else:
                for record in reader.records():
                    time_used = record.wall_clock_time
                    if time_used > time_budget:
                        break
                    training_duration = time_used
                    val_loss = record.validation_loss
                    if val_loss <= best_val_loss or not train_best:
                        if val_loss == best_val_loss and train_best:
                            size = record.sample_size
                            if size > sample_size:
                                best = record
                                best_val_loss = val_loss
                                sample_size = size
                        else:
                            best = record
                            size = record.sample_size
                            best_val_loss = val_loss
                            sample_size = size
                if not training_duration:
                    logger.warning(f"No estimator found within time_budget={time_budget}")
                    from .model import BaseEstimator as Estimator

                    self._trained_estimator = Estimator()
                    return training_duration
        if not best:
            return
        best_estimator = best.learner
        best_config = best.config
        sample_size = len(self._y_train_all) if train_full else best.sample_size

        this_estimator_kwargs = self._state.fit_kwargs_by_estimator.get(best_estimator)
        if this_estimator_kwargs:
            # make another shallow copy of the value (a dict obj), so user's fit_kwargs_by_estimator won't be updated
            this_estimator_kwargs = this_estimator_kwargs.copy()
            this_estimator_kwargs.update(self._state.fit_kwargs)
            self._state.fit_kwargs_by_estimator[best_estimator] = this_estimator_kwargs
        else:
            self._state.fit_kwargs_by_estimator[best_estimator] = self._state.fit_kwargs

        logger.info(
            "estimator = {}, config = {}, #training instances = {}".format(best_estimator, best_config, sample_size)
        )
        # Partially copied from fit() function
        # Initialize some attributes required for retrain_from_log
        self._split_type = task.decide_split_type(
            split_type,
            self._y_train_all,
            self._state.fit_kwargs,
            self._state.groups,
        )
        eval_method = self._decide_eval_method(eval_method, time_budget)
        self.modelcount = 0
        self._auto_augment = auto_augment
        self._prepare_data(eval_method, split_ratio, n_splits)
        self._state.time_budget = -1
        self._state.free_mem_ratio = 0
        self._state.n_jobs = n_jobs
        self._state.resources_per_trial = (
            {
                "cpu": max(1, os.cpu_count() >> 1),
                "gpu": fit_kwargs.get("gpu_per_trial", 0),
            }
            if self._state.n_jobs < 0
            else {"cpu": self._state.n_jobs, "gpu": fit_kwargs.get("gpu_per_trial", 0)}
        )
        self._trained_estimator = self._state._train_with_config(
            best_estimator,
            best_config,
            sample_size=sample_size,
        )[0]
        logger.info("retrain from log succeeded")
        return training_duration

    def _decide_eval_method(self, eval_method, time_budget):
        if not isinstance(self._split_type, str):
            assert eval_method in [
                "auto",
                "cv",
            ], "eval_method must be 'auto' or 'cv' for custom data splitter."
            assert self._state.X_val is None, "custom splitter and custom validation data can't be used together."
            return "cv"
        if self._state.X_val is not None:
            assert eval_method in [
                "auto",
                "holdout",
            ], "eval_method must be 'auto' or 'holdout' for custom validation data."
            return "holdout"
        if eval_method != "auto":
            assert eval_method in [
                "holdout",
                "cv",
            ], "eval_method must be 'holdout', 'cv' or 'auto'."
            return eval_method
2021-08-23 19:36:51 -04:00
nrow , dim = self . _nrow , self . _ndim
2021-09-10 16:39:16 -07:00
if (
2022-12-06 10:13:39 -08:00
time_budget < 0
2021-11-03 19:08:23 -07:00
or nrow * dim / 0.9 < SMALL_LARGE_THRES * ( time_budget / 3600 )
2021-09-10 16:39:16 -07:00
and nrow < CV_HOLDOUT_THRESHOLD
) :
2021-08-23 19:36:51 -04:00
# time allows or sampling can be used and cv is necessary
2021-09-10 16:39:16 -07:00
return " cv "
2021-08-23 19:36:51 -04:00
else :
2021-09-10 16:39:16 -07:00
return " holdout "
    @property
    def search_space(self) -> dict:
        """Search space.

        Must be called after fit(...)
        (use max_iter=0 and retrain_final=False to prevent actual fitting).

        Returns:
            A dict of the search space.
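
        E.g., a sketch of obtaining the space without actual training
        (per the note above, max_iter=0 with retrain_final=False builds the
        search states only):

        ```python
        automl = AutoML()
        automl.fit(X_train, y_train, task="classification", max_iter=0, retrain_final=False)
        space = automl.search_space
        ```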
        """
        estimator_list = self.estimator_list
        if len(estimator_list) == 1:
            estimator = estimator_list[0]
            space = self._search_states[estimator].search_space.copy()
            space["learner"] = estimator
            return space
        choices = []
        for estimator in estimator_list:
            space = self._search_states[estimator].search_space.copy()
            space["learner"] = estimator
            choices.append(space)
        return {"ml": tune.choice(choices)}

    @property
    def low_cost_partial_config(self) -> dict:
        """Low cost partial config.

        Returns:
            A dict.
            (a) if there is only one estimator in estimator_list, each key is a
            hyperparameter name.
            (b) otherwise, it is a nested dict with 'ml' as the key, and
            a list of the low_cost_partial_configs as the value, corresponding
            to each learner's low_cost_partial_config; the estimator index as
            an integer corresponding to the cheapest learner is appended to the
            list at the end.
        """
        if len(self.estimator_list) == 1:
            estimator = self.estimator_list[0]
            c = self._search_states[estimator].low_cost_partial_config
            return c
        else:
            configs = []
            for estimator in self.estimator_list:
                c = self._search_states[estimator].low_cost_partial_config
                configs.append(c)
            configs.append(
                np.argmin(
                    [
                        self._state.learner_classes.get(estimator).cost_relative2lgbm()
                        for estimator in self.estimator_list
                    ]
                )
            )
            config = {"ml": configs}
        return config
    @property
    def cat_hp_cost(self) -> dict:
        """Categorical hyperparameter cost.

        Returns:
            A dict.
            (a) if there is only one estimator in estimator_list, each key is a
            hyperparameter name.
            (b) otherwise, it is a nested dict with 'ml' as the key, and
            a list of the cat_hp_cost's as the value, corresponding
            to each learner's cat_hp_cost; the cost relative to lgbm for each
            learner (as a list itself) is appended to the list at the end.
        """
        if len(self.estimator_list) == 1:
            estimator = self.estimator_list[0]
            c = self._search_states[estimator].cat_hp_cost
            return c
        else:
            configs = []
            for estimator in self.estimator_list:
                c = self._search_states[estimator].cat_hp_cost
                configs.append(c)
            configs.append(
                [self._state.learner_classes.get(estimator).cost_relative2lgbm() for estimator in self.estimator_list]
            )
            config = {"ml": configs}
        return config

    @property
    def points_to_evaluate(self) -> list:
        """Initial points to evaluate.

        Returns:
            A list of dicts. Each dict is the initial point for each learner.
        """
        points = []
        for estimator in self.estimator_list:
            configs = self._search_states[estimator].init_config
            for config in configs:
                config["learner"] = estimator
                if len(self.estimator_list) > 1:
                    points.append({"ml": config})
                else:
                    points.append(config)
        return points
    @property
    def resource_attr(self) -> Optional[str]:
        """Attribute of the resource dimension.

        Returns:
            A string for the sample size attribute
            (the resource attribute in AutoML) or None.
        """
        return "FLAML_sample_size" if self._sample else None

    @property
    def min_resource(self) -> Optional[float]:
        """Attribute for pruning.

        Returns:
            A float for the minimal sample size or None.
        """
        return self._min_sample_size if self._sample else None

    @property
    def max_resource(self) -> Optional[float]:
        """Attribute for pruning.

        Returns:
            A float for the maximal sample size or None.
        """
        return self._state.data_size[0] if self._sample else None
    def pickle(self, output_file_name):
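        """Serialize the AutoML object with pickle.

        Per-estimator training functions are removed from the search states
        before dumping, as they may not be picklable. A minimal usage sketch
        (the file name is illustrative):

        ```python
        automl.pickle("automl.pkl")
        ```
        """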
        import pickle

        estimator_to_training_function = {}
        for estimator in self.estimator_list:
            search_state = self._search_states[estimator]
            if hasattr(search_state, "training_function"):
                # training functions may hold unpicklable references; collect and
                # drop them from the search states before dumping
                estimator_to_training_function[estimator] = search_state.training_function
                del search_state.training_function
        with open(output_file_name, "wb") as f:
            pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
    @property
    def trainable(self) -> Callable[[dict], Optional[float]]:
        """Training function.

        Returns:
            A function that evaluates each config and returns the loss.
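
        The trainable can also be plugged into `flaml.tune` directly. A sketch,
        assuming fit(..., max_iter=0, retrain_final=False) has been called so
        that the search states exist (the budget is illustrative):

        ```python
        from flaml import tune

        analysis = tune.run(
            automl.trainable,
            config=automl.search_space,
            metric="val_loss",
            mode="min",
            low_cost_partial_config=automl.low_cost_partial_config,
            time_budget_s=60,
        )
        ```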
        """
        self._state.time_from_start = 0
        states = self._search_states
        mem_res = self._mem_thres

        def train(config: dict, state, is_report=True):
            # handle spark broadcast variables
            state = get_broadcast_data(state)
            is_report = get_broadcast_data(is_report)
            sample_size = config.get("FLAML_sample_size")
            config = config.get("ml", config).copy()
            if sample_size:
                config["FLAML_sample_size"] = sample_size
            estimator = config["learner"]
            # check memory constraints before training
            if states[estimator].learner_class.size(config) <= mem_res:
                del config["learner"]
                config.pop("_choice_", None)
                result = AutoMLState._compute_with_config_base(
                    config, state=state, estimator=estimator, is_report=is_report
                )
            else:
                # If the search algorithm is not in flaml, it does not handle the
                # config constraint; tune.report should also be called before returning.
                result = {
                    "pred_time": 0,
                    "wall_clock_time": None,
                    "metric_for_logging": np.inf,
                    "val_loss": np.inf,
                    "trained_estimator": None,
                }
                if is_report is True:
                    tune.report(**result)
            return result

        if self._use_ray is not False:
            from ray.tune import with_parameters

            return with_parameters(
                train,
                state=self._state,
            )
        elif self._use_spark:
            from flaml.tune.spark.utils import with_parameters

            return with_parameters(train, state=self._state, is_report=False)
        else:
            return partial(
                train,
                state=self._state,
            )
    @property
    def metric_constraints(self) -> list:
        """Metric constraints.

        Returns:
            A list of the metric constraints.
        """
        return self._metric_constraints

    def _prepare_data(self, eval_method, split_ratio, n_splits):
        self._state.task.prepare_data(
            self._state,
            self._X_train_all,
            self._y_train_all,
            self._auto_augment,
            eval_method,
            self._split_type,
            split_ratio,
            n_splits,
            self._df,
            self._sample_weight_full,
        )
        self.data_size_full = len(self._state.y_train_all)
    def fit(
        self,
        X_train=None,
        y_train=None,
        dataframe=None,
        label=None,
        metric=None,
        task: Optional[Union[str, Task]] = None,
        n_jobs=None,
        # gpu_per_trial=0,
        log_file_name=None,
        estimator_list=None,
        time_budget=None,
        max_iter=None,
        sample=None,
        ensemble=None,
        eval_method=None,
        log_type=None,
        model_history=None,
        split_ratio=None,
        n_splits=None,
        log_training_metric=None,
        mem_thres=None,
        pred_time_limit=None,
        train_time_limit=None,
        X_val=None,
        y_val=None,
        sample_weight_val=None,
        groups_val=None,
        groups=None,
        verbose=None,
        retrain_full=None,
        split_type=None,
        learner_selector=None,
        hpo_method=None,
        starting_points=None,
        seed=None,
        n_concurrent_trials=None,
        keep_search_state=None,
        preserve_checkpoint=True,
        early_stop=None,
        force_cancel=None,
        append_log=None,
        auto_augment=None,
        min_sample_size=None,
        use_ray=None,
        use_spark=None,
        free_mem_ratio=0,
        metric_constraints=None,
        custom_hp=None,
        cv_score_agg_func=None,
        skip_transform=None,
        mlflow_logging=None,
        fit_kwargs_by_estimator=None,
        **fit_kwargs,
    ):
        """Find a model for a given task.

        Args:
            X_train: A numpy array or a pandas dataframe of training data in
                shape (n, m). For time series forecast tasks, the first column of X_train
                must be the timestamp column (datetime type). Other columns in
                the dataframe are assumed to be exogenous variables (categorical or numeric).
                When using ray, X_train can be a ray.ObjectRef.
            y_train: A numpy array or a pandas series of labels in shape (n,).
            dataframe: A dataframe of training data including label column.
                For time series forecast tasks, dataframe must be specified and must have
                at least two columns, timestamp and label, where the first
                column is the timestamp column (datetime type). Other columns in
                the dataframe are assumed to be exogenous variables (categorical or numeric).
                When using ray, dataframe can be a ray.ObjectRef.
            label: A str of the label column name, e.g., 'label';
                Note: If X_train and y_train are provided,
                dataframe and label are ignored;
                If not, dataframe and label must be provided.
            metric: A string of the metric name or a function,
                e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_weighted',
                'roc_auc_ovo_weighted', 'roc_auc_ovr_weighted', 'f1', 'micro_f1', 'macro_f1',
                'log_loss', 'mae', 'mse', 'r2', 'mape'. Default is 'auto'.
                If passing a customized metric function, the function needs to
                have the following input arguments:

                ```python
                def custom_metric(
                    X_test, y_test, estimator, labels,
                    X_train, y_train, weight_test=None, weight_train=None,
                    config=None, groups_test=None, groups_train=None,
                ):
                    return metric_to_minimize, metrics_to_log
                ```

                which returns a float number as the minimization objective,
                and a dictionary as the metrics to log. E.g.,

                ```python
                def custom_metric(
                    X_val, y_val, estimator, labels,
                    X_train, y_train, weight_val=None, weight_train=None,
                    *args,
                ):
                    from sklearn.metrics import log_loss
                    import time

                    start = time.time()
                    y_pred = estimator.predict_proba(X_val)
                    pred_time = (time.time() - start) / len(X_val)
                    val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
                    y_pred = estimator.predict_proba(X_train)
                    train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
                    alpha = 0.5
                    return val_loss * (1 + alpha) - alpha * train_loss, {
                        "val_loss": val_loss,
                        "train_loss": train_loss,
                        "pred_time": pred_time,
                    }
                ```

            task: A string of the task type, e.g.,
                'classification', 'regression', 'ts_forecast_regression',
                'ts_forecast_classification', 'rank', 'seq-classification',
                'seq-regression', 'summarization', or an instance of the Task class.
            n_jobs: An integer of the number of threads for training | default=-1.
                Use all available resources when n_jobs == -1.
            log_file_name: A string of the log file name | default="". To disable logging,
                set it to be an empty string "".
            estimator_list: A list of strings for estimator names, or 'auto'.
                e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```.
            time_budget: A float number of the time budget in seconds.
                Use -1 if no time limit.
            max_iter: An integer of the maximal number of iterations.
                NOTE: when both time_budget and max_iter are unspecified,
                only one model will be trained per estimator.
            sample: A boolean of whether to sample the training data during
                search.
            ensemble: boolean or dict | default=False. Whether to perform
                ensemble after search. Can be a dict with keys 'passthrough'
                and 'final_estimator' to specify the passthrough and
                final_estimator in the stacker. The dict can also contain
                'n_jobs' as the key to specify the number of jobs for the stacker.
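                E.g., a minimal sketch with an illustrative final estimator:

                ```python
                from sklearn.linear_model import LogisticRegression

                automl.fit(
                    X_train,
                    y_train,
                    task="classification",
                    ensemble={
                        "final_estimator": LogisticRegression(),
                        "passthrough": False,
                    },
                )
                ```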
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
            log_type: A string of the log type, one of
                ['better', 'all'].
                'better' only logs configs with better loss than previous iters
                'all' logs all the tried configs.
            model_history: A boolean of whether to keep the trained best
                model per estimator. Make sure memory is large enough if setting to True.
                Default value is False: best_model_for_estimator would return an
                untrained model for non-best learner.
            log_training_metric: A boolean of whether to log the training
                metric for each model.
            mem_thres: A float of the memory size constraint in bytes.
            pred_time_limit: A float of the prediction latency constraint in seconds.
                It refers to the average prediction time per row in validation data.
            train_time_limit: None or a float of the training time constraint in seconds.
            X_val: None or a numpy array or a pandas dataframe of validation data.
            y_val: None or a numpy array or a pandas series of validation labels.
            sample_weight_val: None or a numpy array of the sample weight of
                validation data of the same shape as y_val.
            groups_val: None or array-like | group labels (with matching length
                to y_val) or group counts (with sum equal to length of y_val)
                for validation data. Need to be consistent with groups.
            groups: None or array-like | Group labels (with matching length to
                y_train) or group counts (with sum equal to length of y_train)
                for training data.
            verbose: int, default=3 | Controls the verbosity, higher means more
                messages.
            retrain_full: bool or str, default=True | whether to retrain the
                selected model on the full training data when using holdout.
                True - retrain only after search finishes; False - no retraining;
                'budget' - do best effort to retrain without violating the time
                budget.
            split_type: str or splitter object, default="auto" | the data split type.
                * A valid splitter object is an instance of a derived class of scikit-learn
                [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
                and have ``split`` and ``get_n_splits`` methods with the same signatures.
                Set eval_method to "cv" to use the splitter object; see the sketch below.
                * Valid str options depend on different tasks.
                For classification tasks, valid choices are
                ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
                "auto" -> uniform.
                For time series forecast tasks, must be "auto" or 'time'.
                For ranking task, must be "auto" or 'group'.
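                A minimal sketch of passing a splitter object (scikit-learn's
                KFold here is an illustrative choice; eval_method must be "cv"):

                ```python
                from sklearn.model_selection import KFold

                automl.fit(
                    X_train,
                    y_train,
                    eval_method="cv",
                    split_type=KFold(n_splits=5, shuffle=True, random_state=42),
                )
                ```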
            hpo_method: str, default="auto" | The hyperparameter
                optimization method. By default, CFO is used for sequential
                search and BlendSearch is used for parallel search.
                No need to set when using flaml's default search space or using
                a simple customized search space. When set to 'bs', BlendSearch
                is used. BlendSearch can be tried when the search space is
                complex, for example, containing multiple disjoint, discontinuous
                subspaces. When set to 'random', random search is used.
            starting_points: A dictionary or a str to specify the starting hyperparameter
                config for the estimators | default="data".
                If str:
                    - if "data", use data-dependent defaults;
                    - if "data:path" use data-dependent defaults which are stored at path;
                    - if "static", use data-independent defaults.
                If dict, keys are the name of the estimators, and values are the starting
                hyperparameter configurations for the corresponding estimators.
                The value can be a single hyperparameter configuration dict or a list
                of hyperparameter configuration dicts.
                In the following code example, we get starting_points from the
                `automl` object and use them in the `new_automl` object.
                e.g.,

                ```python
                from flaml import AutoML
                automl = AutoML()
                X_train, y_train = load_iris(return_X_y=True)
                automl.fit(X_train, y_train)
                starting_points = automl.best_config_per_estimator

                new_automl = AutoML()
                new_automl.fit(X_train, y_train, starting_points=starting_points)
                ```

            seed: int or None, default=None | The random seed for hpo.
            n_concurrent_trials: [Experimental] int, default=1 | The number of
                concurrent trials. When n_concurrent_trials > 1, flaml performs
                [parallel tuning](../../Use-Cases/Task-Oriented-AutoML#parallel-tuning)
                and installation of ray or spark is required: `pip install flaml[ray]`
                or `pip install flaml[spark]`. Please check
                [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)
                for more details about installing Spark.
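                E.g., a sketch of parallel tuning with Spark as the backend
                (the trial count and budget are illustrative):

                ```python
                automl.fit(X_train, y_train, n_concurrent_trials=4, use_spark=True, time_budget=120)
                ```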
            keep_search_state: boolean, default=False | Whether to keep data needed
                for model search after fit(). By default the state is deleted for
                space saving.
            preserve_checkpoint: boolean, default=True | Whether to preserve the saved checkpoint
                on disk when deleting automl. By default the checkpoint is preserved.
            early_stop: boolean, default=False | Whether to stop early if the
                search is considered to converge.
            force_cancel: boolean, default=False | Whether to forcibly cancel the PySpark job if it runs overtime.
            append_log: boolean, default=False | Whether to directly append the log
                records to the input log file if it exists.
            auto_augment: boolean, default=True | Whether to automatically
                augment rare classes.
            min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
                size when sample=True.
            use_ray: boolean or dict.
                If boolean: default=False | Whether to use ray to run the training
                in separate processes. This can be used to prevent OOM for large
                datasets, but will incur more overhead in time.
                If dict: the dict contains the keywords arguments to be passed to
                [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).
            use_spark: boolean, default=False | Whether to use spark to run the training
                in parallel spark jobs. This can be used to accelerate training on large models
                and large datasets, but will incur more overhead in time and thus slow down
                training in some cases.
            free_mem_ratio: float between 0 and 1, default=0. The free memory ratio to keep during training.
            metric_constraints: list, default=[] | The list of metric constraints.
                Each element in this list is a 3-tuple, which shall be expressed
                in the following format: the first element of the 3-tuple is the name of the
                metric, the second element is the inequality sign chosen from ">=" and "<=",
                and the third element is the constraint value. E.g., `('precision', '>=', 0.9)`.
                Note that all the metric names in metric_constraints need to be reported via
                the metrics_to_log dictionary returned by a customized metric function.
                The customized metric function shall be provided via the `metric` key word argument
                of the fit() function or the automl constructor.
                Find examples in this [test](https://github.com/microsoft/FLAML/tree/main/test/automl/test_constraints.py).
                If `pred_time_limit` is provided as one of keyword arguments to fit() function or
                the automl constructor, flaml will automatically (and under the hood)
                add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
                specifies a constraint on the prediction latency in seconds. See the sketch below.
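                A minimal sketch (assuming `custom_metric` logs "precision" in
                its metrics_to_log dict):

                ```python
                automl.fit(
                    X_train,
                    y_train,
                    metric=custom_metric,
                    metric_constraints=[("precision", ">=", 0.9)],
                )
                ```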
            custom_hp: dict, default=None | The custom search space specified by user.
                Each key is the estimator name, each value is a dict of the custom search space for that estimator.
                Notice the domain of the custom search space can either be a value or a sample.Domain object.

                ```python
                custom_hp = {
                    "transformer_ms": {
                        "model_path": {
                            "domain": "albert-base-v2",
                        },
                        "learning_rate": {
                            "domain": tune.choice([1e-4, 1e-5]),
                        },
                    }
                }
                ```

            cv_score_agg_func: customized cross-validation scores aggregate function.
                Default to average metrics across folds. If specified, this function needs to
                have the following input arguments:

                * val_loss_folds: list of floats, the loss scores of each fold;
                * log_metrics_folds: list of dicts/floats, the metrics of each fold to log.

                This function should return the final aggregate result of all folds: a float number
                of the minimization objective, and a dictionary as the metrics to log or None.
                E.g.,

                ```python
                def cv_score_agg_func(val_loss_folds, log_metrics_folds):
                    metric_to_minimize = sum(val_loss_folds) / len(val_loss_folds)
                    metrics_to_log = None
                    for single_fold in log_metrics_folds:
                        if metrics_to_log is None:
                            metrics_to_log = single_fold
                        elif isinstance(metrics_to_log, dict):
                            metrics_to_log = {k: metrics_to_log[k] + v for k, v in single_fold.items()}
                        else:
                            metrics_to_log += single_fold
                    if metrics_to_log:
                        n = len(val_loss_folds)
                        metrics_to_log = (
                            {k: v / n for k, v in metrics_to_log.items()}
                            if isinstance(metrics_to_log, dict)
                            else metrics_to_log / n
                        )
                    return metric_to_minimize, metrics_to_log
                ```

            skip_transform: boolean, default=False | Whether to skip the data pre-processing
                (transformation) prior to modeling.
            mlflow_logging: boolean, default=None | Whether to log the training results to mlflow.
                Default value is None, which means the logging decision is made based on
                AutoML.__init__'s mlflow_logging argument.
                This requires mlflow to be installed and to have an active mlflow run.
                FLAML will create nested runs.
            fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
                For TransformersEstimator, available fit_kwargs can be found from
                [TrainingArgumentsForAuto](nlp/huggingface/training_args).
                e.g.,

                ```python
                fit_kwargs_by_estimator = {
                    "transformer": {
                        "output_dir": "test/data/output/",
                        "fp16": False,
                    },
                    "tft": {
                        "max_encoder_length": 1,
                        "min_encoder_length": 1,
                        "static_categoricals": [],
                        "static_reals": [],
                        "time_varying_known_categoricals": [],
                        "time_varying_known_reals": [],
                        "time_varying_unknown_categoricals": [],
                        "time_varying_unknown_reals": [],
                        "variable_groups": {},
                        "lags": {},
                    }
                }
                ```

            **fit_kwargs: Other key word arguments to pass to fit() function of
                the searched learners, such as sample_weight. Below are a few examples of
                estimator-specific parameters:
                period: int | forecast horizon for all time series forecast tasks.
                gpu_per_trial: float, default=0 | A float of the number of gpus per trial,
                    only used by TransformersEstimator, XGBoostSklearnEstimator, and
                    TemporalFusionTransformerEstimator.
                group_ids: list of strings of column names identifying a time series, only
                    used by TemporalFusionTransformerEstimator, required for
                    'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object
                    from PyTorchForecasting.
                    For other parameters to describe your dataset, refer to
                    [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).
                    To specify your variables, use `static_categoricals`, `static_reals`,
                    `time_varying_known_categoricals`, `time_varying_known_reals`,
                    `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,
                    `variable_groups`. To provide more information on your data, use
                    `max_encoder_length`, `min_encoder_length`, `lags`.
                log_dir: str, default = "lightning_logs" | Folder into which to log results
                    for tensorboard, only used by TemporalFusionTransformerEstimator.
                max_epochs: int, default = 20 | Maximum number of epochs to run training,
                    only used by TemporalFusionTransformerEstimator.
                batch_size: int, default = 64 | Batch size for training model, only
                    used by TemporalFusionTransformerEstimator.
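
                As a simple sketch of passing a generic fit keyword argument
                (here sample_weight; `w` is an illustrative array aligned with y_train):

                ```python
                automl.fit(X_train, y_train, task="classification", sample_weight=w)
                ```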
        """
        self._state._start_time_flag = self._start_time_flag = time.time()
        task = task or self._settings.get("task")
        if isinstance(task, str):
            task = task_factory(task, X_train, y_train)
        self._state.task = task
        self._estimator_type = "classifier" if task.is_classification() else "regressor"
        time_budget = time_budget or self._settings.get("time_budget")
        n_jobs = n_jobs or self._settings.get("n_jobs")
        gpu_per_trial = fit_kwargs.get("gpu_per_trial", 0)
        eval_method = eval_method or self._settings.get("eval_method")
        split_ratio = split_ratio or self._settings.get("split_ratio")
        n_splits = n_splits or self._settings.get("n_splits")
        auto_augment = self._settings.get("auto_augment") if auto_augment is None else auto_augment
        metric = metric or self._settings.get("metric")
        estimator_list = estimator_list or self._settings.get("estimator_list")
        log_file_name = self._settings.get("log_file_name") if log_file_name is None else log_file_name
        max_iter = self._settings.get("max_iter") if max_iter is None else max_iter
        sample_is_none = sample is None
        if sample_is_none:
            sample = self._settings.get("sample")
        ensemble = self._settings.get("ensemble") if ensemble is None else ensemble
        log_type = log_type or self._settings.get("log_type")
        model_history = self._settings.get("model_history") if model_history is None else model_history
        log_training_metric = (
            self._settings.get("log_training_metric") if log_training_metric is None else log_training_metric
        )
        mem_thres = mem_thres or self._settings.get("mem_thres")
        pred_time_limit = pred_time_limit or self._settings.get("pred_time_limit")
        train_time_limit = train_time_limit or self._settings.get("train_time_limit")
        self._metric_constraints = metric_constraints or self._settings.get("metric_constraints")
        if np.isfinite(pred_time_limit):
            self._metric_constraints.append(("pred_time", "<=", pred_time_limit))
        verbose = self._settings.get("verbose") if verbose is None else verbose
        retrain_full = self._settings.get("retrain_full") if retrain_full is None else retrain_full
        split_type = split_type or self._settings.get("split_type")
        hpo_method = hpo_method or self._settings.get("hpo_method")
        learner_selector = learner_selector or self._settings.get("learner_selector")
        no_starting_points = starting_points is None
        if no_starting_points:
            starting_points = self._settings.get("starting_points")
        n_concurrent_trials = n_concurrent_trials or self._settings.get("n_concurrent_trials")
        keep_search_state = self._settings.get("keep_search_state") if keep_search_state is None else keep_search_state
        self.preserve_checkpoint = (
            self._settings.get("preserve_checkpoint") if preserve_checkpoint is None else preserve_checkpoint
        )
        early_stop = self._settings.get("early_stop") if early_stop is None else early_stop
        force_cancel = self._settings.get("force_cancel") if force_cancel is None else force_cancel
        # no search budget is provided?
        no_budget = time_budget < 0 and max_iter is None and not early_stop
        append_log = self._settings.get("append_log") if append_log is None else append_log
        min_sample_size = min_sample_size or self._settings.get("min_sample_size")
        use_ray = self._settings.get("use_ray") if use_ray is None else use_ray
        use_spark = self._settings.get("use_spark") if use_spark is None else use_spark
        if use_spark and use_ray is not False:
            raise ValueError("use_spark and use_ray cannot be both True.")
        elif use_spark:
            spark_available, spark_error_msg = check_spark()
            if not spark_available:
                raise spark_error_msg

        old_level = logger.getEffectiveLevel()
        self.verbose = verbose
        logger.setLevel(50 - verbose * 10)
        if not logger.handlers:
            # Add the console handler.
            _ch = logging.StreamHandler(stream=sys.stdout)
            _ch.setFormatter(logger_formatter)
            logger.addHandler(_ch)

        if not use_ray and not use_spark and n_concurrent_trials > 1:
            if ray_available:
                logger.warning(
                    "n_concurrent_trials > 1 is only supported when using Ray or Spark. "
                    "Ray installed, setting use_ray to True. If you want to use Spark, set use_spark to True."
                )
                use_ray = True
            else:
                spark_available, _ = check_spark()
                if spark_available:
                    logger.warning(
                        "n_concurrent_trials > 1 is only supported when using Ray or Spark. "
                        "Spark installed, setting use_spark to True. If you want to use Ray, set use_ray to True."
                    )
                    use_spark = True
                else:
                    logger.warning(
                        "n_concurrent_trials > 1 is only supported when using Ray or Spark. "
                        "Neither Ray nor Spark installed, setting n_concurrent_trials to 1."
                    )
                    n_concurrent_trials = 1

        self._state.n_jobs = n_jobs
        self._n_concurrent_trials = n_concurrent_trials
        self._early_stop = early_stop
        self._use_spark = use_spark
        self._force_cancel = force_cancel
        self._use_ray = use_ray
        # use the following condition if we have an estimation of average_trial_time and average_trial_overhead
        # self._use_ray = use_ray or n_concurrent_trials > (average_trial_time + average_trial_overhead) / (average_trial_time)

        if self._use_ray is not False:
            import ray

            n_cpus = ray.is_initialized() and ray.available_resources()["CPU"] or os.cpu_count()

            self._state.resources_per_trial = (
                # when using gpu, default cpu is 1 per job; otherwise, default cpu is n_cpus / n_concurrent_trials
                (
                    {
                        "cpu": max(int((n_cpus - 2) / 2 / n_concurrent_trials), 1),
                        "gpu": gpu_per_trial,
                    }
                    if gpu_per_trial == 0
                    else {"cpu": 1, "gpu": gpu_per_trial}
                )
                if n_jobs < 0
                else {"cpu": n_jobs, "gpu": gpu_per_trial}
            )

            if isinstance(X_train, ray.ObjectRef):
                X_train = ray.get(X_train)
            elif isinstance(dataframe, ray.ObjectRef):
                dataframe = ray.get(dataframe)
        else:
            # TODO: Integrate with Spark
            self._state.resources_per_trial = {"cpu": n_jobs} if n_jobs > 0 else {"cpu": 1}
        self._state.free_mem_ratio = self._settings.get("free_mem_ratio") if free_mem_ratio is None else free_mem_ratio
        self._state.task = task
        self._state.log_training_metric = log_training_metric

        self._state.fit_kwargs = fit_kwargs
        custom_hp = custom_hp or self._settings.get("custom_hp")
        self._skip_transform = self._settings.get("skip_transform") if skip_transform is None else skip_transform
        self._mlflow_logging = self._settings.get("mlflow_logging") if mlflow_logging is None else mlflow_logging
        fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator")
        self._state.fit_kwargs_by_estimator = fit_kwargs_by_estimator.copy()  # shallow copy of fit_kwargs_by_estimator
        self._state.weight_val = sample_weight_val

        task.validate_data(
            self,
            self._state,
            X_train,
            y_train,
            dataframe,
            label,
            X_val,
            y_val,
            groups_val,
            groups,
        )
        self._search_states = {}  # key: estimator name; value: SearchState
        self._random = np.random.RandomState(RANDOM_SEED)
        self._seed = seed if seed is not None else 20
        self._learner_selector = learner_selector
        logger.info(f"task = {task}")
        self._split_type = self._state.task.decide_split_type(
            split_type,
            self._y_train_all,
            self._state.fit_kwargs,
            self._state.groups,
        )
        logger.info(f"Data split method: {self._split_type}")
        eval_method = self._decide_eval_method(eval_method, time_budget)
        self._state.eval_method = eval_method
        logger.info("Evaluation method: {}".format(eval_method))
        self._state.cv_score_agg_func = cv_score_agg_func or self._settings.get("cv_score_agg_func")

        self._retrain_in_budget = retrain_full == "budget" and (eval_method == "holdout" and self._state.X_val is None)
        self._auto_augment = auto_augment

        _sample_size_from_starting_points = {}
        if isinstance(starting_points, dict):
            for _estimator, _point_per_estimator in starting_points.items():
                sample_size = (
                    _point_per_estimator
                    and isinstance(_point_per_estimator, dict)
                    and _point_per_estimator.get("FLAML_sample_size")
                )
                if sample_size:
                    _sample_size_from_starting_points[_estimator] = sample_size
                elif _point_per_estimator and isinstance(_point_per_estimator, list):
                    _sample_size_set = set(
                        [
                            config["FLAML_sample_size"]
                            for config in _point_per_estimator
                            if "FLAML_sample_size" in config
                        ]
                    )
                    if _sample_size_set:
                        _sample_size_from_starting_points[_estimator] = min(_sample_size_set)
                        if len(_sample_size_set) > 1:
                            logger.warning(
                                "Using the min FLAML_sample_size of all the provided starting points for estimator {}. (Provided FLAML_sample_size are: {})".format(
                                    _estimator, _sample_size_set
                                )
                            )
        if not sample and isinstance(starting_points, dict):
            assert (
                not _sample_size_from_starting_points
            ), "When subsampling is disabled, do not include FLAML_sample_size in the starting point."
        self._min_sample_size = _sample_size_from_starting_points or min_sample_size
        self._min_sample_size_input = min_sample_size
        self._prepare_data(eval_method, split_ratio, n_splits)

        # TODO pull this to task as decide_sample_size
        if isinstance(self._min_sample_size, dict):
            self._sample = {
                (
                    k,
                    sample
                    and not task.is_rank()
                    and eval_method != "cv"
                    and (self._min_sample_size[k] * SAMPLE_MULTIPLY_FACTOR < self._state.data_size[0]),
                )
                for k in self._min_sample_size.keys()
            }
        else:
            self._sample = (
                sample
                and not task.is_rank()
                and eval_method != "cv"
                and (self._min_sample_size * SAMPLE_MULTIPLY_FACTOR < self._state.data_size[0])
            )

        metric = task.default_metric(metric)
        self._state.metric = metric

        # TODO pull this to task
        def is_to_reverse_metric(metric, task):
            if metric.startswith("ndcg"):
                return True, f"1-{metric}"
            if metric in [
                "r2",
                "accuracy",
                "roc_auc",
                "roc_auc_ovr",
                "roc_auc_ovo",
                "roc_auc_weighted",
                "roc_auc_ovr_weighted",
                "roc_auc_ovo_weighted",
                "f1",
                "ap",
                "micro_f1",
                "macro_f1",
            ]:
                return True, f"1-{metric}"
            if task.is_nlp():
                from flaml.automl.ml import huggingface_metric_to_mode

                if metric in huggingface_metric_to_mode and huggingface_metric_to_mode[metric] == "max":
                    return True, f"-{metric}"
            return False, None

        if isinstance(metric, str):
            is_reverse, reverse_metric = is_to_reverse_metric(metric, task)
            if is_reverse:
                error_metric = reverse_metric
            else:
                error_metric = metric
        else:
            error_metric = "customized metric"
        logger.info(f"Minimizing error metric: {error_metric}")

        is_spark_dataframe = isinstance(X_train, psDataFrame) or isinstance(dataframe, psDataFrame)
        estimator_list = task.default_estimator_list(estimator_list, is_spark_dataframe)

        if is_spark_dataframe and self._use_spark:
            # For spark dataframe, use_spark must be False because spark models are trained in parallel themselves
            self._use_spark = False
            logger.warning(
                "Spark dataframes support only spark.ml type models, which will be trained "
                "with spark themselves, no need to start spark trials in flaml. "
                "`use_spark` is set to False."
            )

        # When no search budget is specified
        if no_budget:
            max_iter = len(estimator_list)
            self._learner_selector = "roundrobin"
            if sample_is_none:
                self._sample = False
            if no_starting_points:
                starting_points = "data"
            logger.warning(
                "No search budget is provided via time_budget or max_iter. "
                "Training only one model per estimator. "
                "Zero-shot AutoML is used for certain tasks and estimators. "
                "To tune hyperparameters for each estimator, "
                "please provide budget either via time_budget or max_iter."
            )
        elif max_iter is None:
            # set to a large number
            max_iter = 1000000
        self._state.retrain_final = (
            retrain_full is True
            and eval_method == "holdout"
            and (X_val is None or self._use_ray is not False)
            or eval_method == "cv"
            and (max_iter > 0 or retrain_full is True)
            or max_iter == 1
        )
        # add custom learner
        for estimator_name in estimator_list:
            if estimator_name not in self._state.learner_classes:
                self.add_learner(
                    estimator_name,
                    get_estimator_class(self._state.task, estimator_name),
                )
        # set up learner search space
        if isinstance(starting_points, str) and starting_points.startswith("data"):
            from flaml.default import suggest_config

            location = starting_points[5:]
            starting_points = {}
            for estimator_name in estimator_list:
                try:
                    configs = suggest_config(
                        self._state.task,
                        self._X_train_all,
                        self._y_train_all,
                        estimator_name,
                        location,
                        k=1,
                    )
                    starting_points[estimator_name] = [x["hyperparameters"] for x in configs]
                except FileNotFoundError:
                    pass
            try:
                learner = suggest_learner(
                    self._state.task,
                    self._X_train_all,
                    self._y_train_all,
                    estimator_list=estimator_list,
                    location=location,
                )
                if learner != estimator_list[0]:
                    estimator_list.remove(learner)
                    estimator_list.insert(0, learner)
            except FileNotFoundError:
                pass

        self._state.time_budget = time_budget
        starting_points = {} if starting_points == "static" else starting_points
        for estimator_name in estimator_list:
            estimator_class = self._state.learner_classes[estimator_name]
            estimator_class.init()
            this_estimator_kwargs = self._state.fit_kwargs_by_estimator.get(estimator_name)
            if this_estimator_kwargs:
                # make another shallow copy of the value (a dict obj), so user's fit_kwargs_by_estimator won't be updated
                this_estimator_kwargs = this_estimator_kwargs.copy()
                this_estimator_kwargs.update(
                    self._state.fit_kwargs
                )  # update the shallow copy of fit_kwargs to fit_kwargs_by_estimator
                self._state.fit_kwargs_by_estimator[
                    estimator_name
                ] = this_estimator_kwargs  # set self._state.fit_kwargs_by_estimator[estimator_name] to the update, so only self._state.fit_kwargs_by_estimator will be updated
            else:
                self._state.fit_kwargs_by_estimator[estimator_name] = self._state.fit_kwargs

            self._search_states[estimator_name] = SearchState(
                learner_class=estimator_class,
                data_size=self._state.data_size,
                task=self._state.task,
                starting_point=starting_points.get(estimator_name),
                period=self._state.fit_kwargs.get(
                    "period"
                ),  # NOTE: this is after kwargs is updated to fit_kwargs_by_estimator
                custom_hp=custom_hp and custom_hp.get(estimator_name),
                max_iter=max_iter / len(estimator_list) if self._learner_selector == "roundrobin" else max_iter,
                budget=self._state.time_budget,
            )
        logger.info("List of ML learners in AutoML Run: {}".format(estimator_list))
        self.estimator_list = estimator_list
        self._active_estimators = estimator_list.copy()
        self._ensemble = ensemble
        self._max_iter = max_iter
        self._mem_thres = mem_thres
        self._pred_time_limit = pred_time_limit
        self._state.train_time_limit = train_time_limit
        self._log_type = log_type
        self.split_ratio = split_ratio
        self._state.model_history = model_history
        self._hpo_method = (
            hpo_method
            if hpo_method != "auto"
            else (
                "bs"
                if n_concurrent_trials > 1
                or (self._use_ray is not False or self._use_spark)
                and len(estimator_list) > 1
                else "cfo"
            )
        )
        if log_file_name:
            with training_log_writer(log_file_name, append_log) as save_helper:
                self._training_log = save_helper
                self._search()
        else:
            self._training_log = None
            self._search()
        if self._best_estimator:
            logger.info("fit succeeded")
            logger.info(f"Time taken to find the best model: {self._time_taken_best_iter}")
            if (
                self._hpo_method in ("cfo", "bs")
                and self._state.time_budget > 0
                and (self._time_taken_best_iter >= self._state.time_budget * 0.7)
                and not all(
                    state.search_alg and state.search_alg.searcher.is_ls_ever_converged
                    for state in self._search_states.values()
                )
            ):
                logger.warning(
                    "Time taken to find the best model is {0:.0f}% of the "
                    "provided time budget and not all estimators' hyperparameter "
                    "search converged. Consider increasing the time budget.".format(
                        self._time_taken_best_iter / self._state.time_budget * 100
                    )
                )

        if not keep_search_state:
            # release space
            del self._X_train_all, self._y_train_all, self._state.kf
            del self._state.X_train, self._state.X_train_all, self._state.X_val
            del self._state.y_train, self._state.y_train_all, self._state.y_val
            del (
                self._sample_weight_full,
                self._state.fit_kwargs_by_estimator,
                self._state.fit_kwargs,
            )  # NOTE: this is after kwargs is updated to fit_kwargs_by_estimator
            del self._state.groups, self._state.groups_all, self._state.groups_val
        logger.setLevel(old_level)
2021-08-23 19:36:51 -04:00
def _search_parallel ( self ) :
2022-12-24 00:18:49 +08:00
if self . _use_ray is not False :
try :
from ray import __version__ as ray_version
assert ray_version > = " 1.10.0 "
if ray_version . startswith ( " 1. " ) :
from ray . tune . suggest import ConcurrencyLimiter
else :
from ray . tune . search import ConcurrencyLimiter
import ray
except ( ImportError , AssertionError ) :
2023-04-10 21:50:40 +02:00
raise ImportError ( " use_ray=True requires installation of ray. " " Please run pip install flaml[ray] " )
2022-12-24 00:18:49 +08:00
else :
from flaml . tune . searcher . suggestion import ConcurrencyLimiter
2021-09-10 16:39:16 -07:00
if self . _hpo_method in ( " cfo " , " grid " ) :
2021-08-23 19:36:51 -04:00
from flaml import CFO as SearchAlgo
2021-09-10 16:39:16 -07:00
elif " bs " == self . _hpo_method :
2021-08-23 19:36:51 -04:00
from flaml import BlendSearch as SearchAlgo
2021-09-10 16:39:16 -07:00
elif " random " == self . _hpo_method :
2022-09-13 19:13:06 -07:00
from flaml import RandomSearch as SearchAlgo
2022-03-20 22:03:02 -04:00
elif " optuna " == self . _hpo_method :
2022-12-24 00:18:49 +08:00
if self . _use_ray is not False :
try :
from ray import __version__ as ray_version
2022-03-20 22:03:02 -04:00
2022-12-24 00:18:49 +08:00
assert ray_version > = " 1.10.0 "
if ray_version . startswith ( " 1. " ) :
from ray . tune . suggest . optuna import OptunaSearch as SearchAlgo
else :
from ray . tune . search . optuna import OptunaSearch as SearchAlgo
except ( ImportError , AssertionError ) :
from flaml . tune . searcher . suggestion import (
OptunaSearch as SearchAlgo ,
)
else :
2022-10-04 16:03:22 -07:00
from flaml . tune . searcher . suggestion import OptunaSearch as SearchAlgo
2021-08-23 19:36:51 -04:00
else :
raise NotImplementedError (
2023-04-10 21:50:40 +02:00
f " hpo_method= { self . _hpo_method } is not recognized. " " ' auto ' , ' cfo ' and ' bs ' are supported. "
2021-09-10 16:39:16 -07:00
)
        space = self.search_space
        self._state.time_from_start = time.time() - self._start_time_flag
        time_budget_s = self._state.time_budget - self._state.time_from_start if self._state.time_budget >= 0 else None
        if self._hpo_method != "optuna":
            min_resource = self.min_resource
            if isinstance(min_resource, dict):
                _min_resource_set = set(min_resource.values())
                min_resource_all_estimator = min(_min_resource_set)
                if len(_min_resource_set) > 1:
                    logger.warning(
                        "Using the min FLAML_sample_size of all the provided starting points "
                        "as the starting sample size in the case of parallel search."
                    )
            else:
                min_resource_all_estimator = min_resource
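            # For example (hypothetical values), starting points may carry
            # per-estimator sample sizes such as {"lgbm": 10000, "xgboost": 40000};
            # parallel search shares one resource schedule across estimators, so
            # the smallest value (10000 here) becomes the common starting sample size.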
            search_alg = SearchAlgo(
                metric="val_loss",
                space=space,
                low_cost_partial_config=self.low_cost_partial_config,
                points_to_evaluate=self.points_to_evaluate,
                cat_hp_cost=self.cat_hp_cost,
                resource_attr=self.resource_attr,
                min_resource=min_resource_all_estimator,
                max_resource=self.max_resource,
                config_constraints=[(partial(size, self._state.learner_classes), "<=", self._mem_thres)],
                metric_constraints=self.metric_constraints,
                seed=self._seed,
                time_budget_s=time_budget_s,
                num_samples=self._max_iter,
                allow_empty_config=True,
            )
        else:
            # when hpo_method is optuna, the search space and the initial config
            # dimensions sometimes do not match; remove the extra keys from each
            # initial config to stay consistent with the converted search space
            converted_space = SearchAlgo.convert_search_space(space)
            removed_keys = set(space.keys()).difference(converted_space.keys())
            new_points_to_evaluate = []
            for point in self.points_to_evaluate:
                r = point.copy()
                for each_key in removed_keys:
                    r.pop(each_key, None)  # tolerate partial initial configs
                new_points_to_evaluate.append(r)
            search_alg = SearchAlgo(
                metric="val_loss",
                mode="min",
                points_to_evaluate=[p for p in new_points_to_evaluate if len(p) == len(converted_space)],
            )
        search_alg = ConcurrencyLimiter(search_alg, self._n_concurrent_trials)
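        # The limiter caps the number of trials running at once; trials beyond
        # self._n_concurrent_trials wait for a free slot instead of all
        # launching at the same time.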
        resources_per_trial = self._state.resources_per_trial

        if self._use_spark:
            # use spark as the parallel backend
            analysis = tune.run(
                self.trainable,
                search_alg=search_alg,
                config=space,
                metric="val_loss",
                mode="min",
                time_budget_s=time_budget_s,
                num_samples=self._max_iter,
                verbose=max(self.verbose - 2, 0),
                use_ray=False,
                use_spark=True,
                force_cancel=self._force_cancel,
                # raise_on_failed_trial=False,
                # keep_checkpoints_num=1,
                # checkpoint_score_attr="min-val_loss",
            )
        else:
            # use ray as the parallel backend
            analysis = ray.tune.run(
                self.trainable,
                search_alg=search_alg,
                config=space,
                metric="val_loss",
                mode="min",
                resources_per_trial=resources_per_trial,
                time_budget_s=time_budget_s,
                num_samples=self._max_iter,
                verbose=max(self.verbose - 2, 0),
                raise_on_failed_trial=False,
                keep_checkpoints_num=1,
                checkpoint_score_attr="min-val_loss",
                **(self._use_ray if isinstance(self._use_ray, dict) else {}),
            )
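        # Both backends return an analysis object holding one last_result per
        # trial; the Spark path schedules trials through flaml.tune, while the
        # Ray path delegates to ray.tune and honors resources_per_trial.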
        # logger.info([trial.last_result for trial in analysis.trials])
        trials = sorted(
            (
                trial
                for trial in analysis.trials
                if trial.last_result and trial.last_result.get("wall_clock_time") is not None
            ),
            key=lambda x: x.last_result["wall_clock_time"],
        )
        for self._track_iter, trial in enumerate(trials):
            result = trial.last_result
            better = False
            if result:
                config = result["config"]
                estimator = config.get("ml", config)["learner"]
                search_state = self._search_states[estimator]
                search_state.update(result, 0)
                wall_time = result.get("wall_clock_time")
                if wall_time is not None:
                    self._state.time_from_start = wall_time
                self._iter_per_learner[estimator] += 1
                if search_state.sample_size == self._state.data_size[0]:
                    if not self._fullsize_reached:
                        self._fullsize_reached = True
                    if search_state.best_loss < self._state.best_loss:
                        self._state.best_loss = search_state.best_loss
                        self._best_estimator = estimator
                        self._config_history[self._track_iter] = (
                            self._best_estimator,
                            config,
                            self._time_taken_best_iter,
                        )
                        self._trained_estimator = search_state.trained_estimator
                        self._best_iteration = self._track_iter
                        self._time_taken_best_iter = self._state.time_from_start
                        better = True
                        self._search_states[estimator].best_config = config
            if better or self._log_type == "all":
                self._log_trial(search_state, estimator)

    def _log_trial(self, search_state, estimator):
        if self._training_log:
            self._training_log.append(
                self._iter_per_learner[estimator],
                search_state.metric_for_logging,
                search_state.trial_time,
                self._state.time_from_start,
                search_state.val_loss,
                search_state.config,
                estimator,
                search_state.sample_size,
            )
        if self._mlflow_logging and mlflow is not None and mlflow.active_run():
            with mlflow.start_run(nested=True):
                mlflow.log_metric("iter_counter", self._track_iter)
                if (search_state.metric_for_logging is not None) and (
                    "intermediate_results" in search_state.metric_for_logging
                ):
                    for each_entry in search_state.metric_for_logging["intermediate_results"]:
                        with mlflow.start_run(nested=True):
                            mlflow.log_metrics(each_entry)
                            mlflow.log_metric("iter_counter", self._iter_per_learner[estimator])
                    del search_state.metric_for_logging["intermediate_results"]
                if search_state.metric_for_logging:
                    mlflow.log_metrics(search_state.metric_for_logging)
                mlflow.log_metric("trial_time", search_state.trial_time)
                mlflow.log_metric("wall_clock_time", self._state.time_from_start)
                mlflow.log_metric("validation_loss", search_state.val_loss)
                mlflow.log_params(search_state.config)
                mlflow.log_param("learner", estimator)
                mlflow.log_param("sample_size", search_state.sample_size)
                mlflow.log_metric("best_validation_loss", search_state.best_loss)
                mlflow.log_param("best_config", search_state.best_config)
                mlflow.log_param("best_learner", self._best_estimator)
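    # A rough sketch of the MLflow layout produced by _log_trial (values are
    # hypothetical): each logged trial is a nested run under the active run,
    # with params such as learner="lgbm" and sample_size=10000, and metrics
    # such as validation_loss, trial_time and wall_clock_time; intermediate
    # results, when present, become their own nested runs.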

    def _search_sequential(self):
        try:
            from ray import __version__ as ray_version

            assert ray_version >= "1.10.0"
            if ray_version.startswith("1."):
                from ray.tune.suggest import ConcurrencyLimiter
            else:
                from ray.tune.search import ConcurrencyLimiter
        except (ImportError, AssertionError):
            from flaml.tune.searcher.suggestion import ConcurrencyLimiter
        if self._hpo_method in ("cfo", "grid"):
            from flaml import CFO as SearchAlgo
        elif "optuna" == self._hpo_method:
            try:
                from ray import __version__ as ray_version

                assert ray_version >= "1.10.0"
                if ray_version.startswith("1."):
                    from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo
                else:
                    from ray.tune.search.optuna import OptunaSearch as SearchAlgo
            except (ImportError, AssertionError):
                from flaml.tune.searcher.suggestion import OptunaSearch as SearchAlgo
        elif "bs" == self._hpo_method:
            from flaml import BlendSearch as SearchAlgo
        elif "random" == self._hpo_method:
            from flaml.tune.searcher import RandomSearch as SearchAlgo
        elif "cfocat" == self._hpo_method:
            from flaml.tune.searcher.cfo_cat import CFOCat as SearchAlgo
        else:
            raise NotImplementedError(
                f"hpo_method={self._hpo_method} is not recognized. "
                "'cfo', 'grid', 'bs', 'random', 'cfocat' and 'optuna' are supported."
            )
        est_retrain_time = next_trial_time = 0
        best_config_sig = None
        better = True  # whether we find a better model in one trial
        for self._track_iter in range(self._max_iter):
            if self._estimator_index is None:
                estimator = self._active_estimators[0]
            else:
                estimator = self._select_estimator(self._active_estimators)
                if not estimator:
                    break
            logger.info(f"iteration {self._track_iter}, current learner {estimator}")
            search_state = self._search_states[estimator]
            self._state.time_from_start = time.time() - self._start_time_flag
            time_left = self._state.time_budget - self._state.time_from_start
            budget_left = (
                time_left
                if not self._retrain_in_budget
                or better
                or (not self.best_estimator)
                or self._search_states[self.best_estimator].sample_size < self._state.data_size[0]
                else time_left - est_retrain_time
            )
            if not search_state.search_alg:
                search_state.training_function = partial(
                    AutoMLState._compute_with_config_base,
                    state=self._state,
                    estimator=estimator,
                )
                search_space = search_state.search_space
                if self._sample:
                    resource_attr = "FLAML_sample_size"
                    min_resource = (
                        self._min_sample_size[estimator]
                        if isinstance(self._min_sample_size, dict) and estimator in self._min_sample_size
                        else self._min_sample_size_input
                    )
                    max_resource = self._state.data_size[0]
                else:
                    resource_attr = min_resource = max_resource = None
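                # When sampling is on, the training-set size is the trial
                # resource: the searcher starts each learner at min_resource
                # rows and grows the sample for promising configs, up to
                # max_resource (the full data size).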
                learner_class = self._state.learner_classes.get(estimator)
                if "grid" == self._hpo_method:  # for synthetic exp only
                    points_to_evaluate = []
                    space = search_space
                    keys = list(space.keys())
                    domain0, domain1 = space[keys[0]], space[keys[1]]
                    for x1 in range(domain0.lower, domain0.upper + 1):
                        for x2 in range(domain1.lower, domain1.upper + 1):
                            points_to_evaluate.append(
                                {
                                    keys[0]: x1,
                                    keys[1]: x2,
                                }
                            )
                    self._max_iter_per_learner = len(points_to_evaluate)
                    low_cost_partial_config = None
                else:
                    points_to_evaluate = search_state.init_config.copy()
                    low_cost_partial_config = search_state.low_cost_partial_config
                time_budget_s = (
                    min(budget_left, self._state.train_time_limit or np.inf) if self._state.time_budget >= 0 else None
                )
                if self._hpo_method in ("bs", "cfo", "grid", "cfocat", "random"):
                    algo = SearchAlgo(
                        metric="val_loss",
                        mode="min",
                        space=search_space,
                        points_to_evaluate=points_to_evaluate,
                        low_cost_partial_config=low_cost_partial_config,
                        cat_hp_cost=search_state.cat_hp_cost,
                        resource_attr=resource_attr,
                        min_resource=min_resource,
                        max_resource=max_resource,
                        config_constraints=[(learner_class.size, "<=", self._mem_thres)],
                        metric_constraints=self.metric_constraints,
                        seed=self._seed,
                        allow_empty_config=True,
                        time_budget_s=time_budget_s,
                        num_samples=self._max_iter,
                    )
                else:
                    # when hpo_method is optuna, the search space and the initial
                    # config dimensions sometimes do not match; remove the extra
                    # keys from each initial config to stay consistent with the
                    # converted search space
                    converted_space = SearchAlgo.convert_search_space(search_space)
                    removed_keys = set(search_space.keys()).difference(converted_space.keys())
                    new_points_to_evaluate = []
                    for point in points_to_evaluate:
                        r = point.copy()
                        for each_key in removed_keys:
                            r.pop(each_key, None)  # tolerate partial initial configs
                        new_points_to_evaluate.append(r)
                    points_to_evaluate = new_points_to_evaluate
                    algo = SearchAlgo(
                        metric="val_loss",
                        mode="min",
                        space=search_space,
                        points_to_evaluate=[p for p in points_to_evaluate if len(p) == len(search_space)],
                    )
                search_state.search_alg = ConcurrencyLimiter(algo, max_concurrent=1)
                # search_state.search_alg = algo
            else:
                search_space = None
                if self._hpo_method in ("bs", "cfo", "cfocat"):
                    search_state.search_alg.searcher.set_search_properties(
                        metric=None,
                        mode=None,
                        metric_target=self._state.best_loss,
                    )
            start_run_time = time.time()
            analysis = tune.run(
                search_state.training_function,
                search_alg=search_state.search_alg,
                time_budget_s=time_budget_s,
                verbose=max(self.verbose - 3, 0),
                use_ray=False,
                use_spark=False,
            )
            time_used = time.time() - start_run_time
            better = False
            if analysis.trials:
                result = analysis.trials[-1].last_result
                search_state.update(result, time_used=time_used)
                if self._estimator_index is None:
                    # initialize the ECI (estimated cost for improvement) of each
                    # learner, scaled from the first learner's observed cost
                    eci_base = search_state.init_eci
                    self._eci.append(search_state.estimated_cost4improvement)
                    for e in self.estimator_list[1:]:
                        self._eci.append(self._search_states[e].init_eci / eci_base * self._eci[0])
                    self._estimator_index = 0
                    min_budget = max(10 * self._eci[0], sum(self._eci))
                    max_budget = 10000 * self._eci[0]
                    if search_state.sample_size:
                        ratio = search_state.data_size[0] / search_state.sample_size
                        min_budget *= ratio
                        max_budget *= ratio
                    logger.info(
                        f"Estimated sufficient time budget={max_budget:.0f}s. "
                        f"Estimated necessary time budget={min_budget:.0f}s."
                    )
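                    # Worked example with hypothetical numbers: if the first
                    # learner's ECI is 2s and the per-learner ECIs sum to 15s,
                    # then min_budget = max(10 * 2, 15) = 20s and
                    # max_budget = 10000 * 2 = 20000s; with a 10% sample
                    # (ratio = 10) these scale to 200s and 200000s.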
                wall_time = result.get("wall_clock_time")
                if wall_time is not None:
                    self._state.time_from_start = wall_time
                # logger.info(f"{self._search_states[estimator].sample_size}, {data_size}")
                if search_state.sample_size == self._state.data_size[0]:
                    self._iter_per_learner_fullsize[estimator] += 1
                    self._fullsize_reached = True
                self._iter_per_learner[estimator] += 1
                if search_state.best_loss < self._state.best_loss:
                    best_config_sig = estimator + search_state.get_hist_config_sig(
                        self.data_size_full, search_state.best_config
                    )
                    self._state.best_loss = search_state.best_loss
                    self._best_estimator = estimator
                    est_retrain_time = (
                        search_state.est_retrain_time(self.data_size_full)
                        if (best_config_sig not in self._retrained_config)
                        else 0
                    )
                    self._config_history[self._track_iter] = (
                        estimator,
                        search_state.best_config,
                        self._state.time_from_start,
                    )
                    if self._trained_estimator:
                        self._trained_estimator.cleanup()
                        del self._trained_estimator
                        self._trained_estimator = None
                    if not self._state.retrain_final:
                        self._trained_estimator = search_state.trained_estimator
                    self._best_iteration = self._track_iter
                    self._time_taken_best_iter = self._state.time_from_start
                    better = True
                    next_trial_time = search_state.time2eval_best
                if (
                    search_state.trained_estimator
                    and not self._state.model_history
                    and search_state.trained_estimator != self._trained_estimator
                ):
                    search_state.trained_estimator.cleanup()
                if better or self._log_type == "all":
                    self._log_trial(search_state, estimator)
                logger.info(
                    " at {:.1f}s,\testimator {}'s best error={:.4f},\tbest estimator {}'s best error={:.4f}".format(
                        self._state.time_from_start,
                        estimator,
                        search_state.best_loss,
                        self._best_estimator,
                        self._state.best_loss,
                    )
                )
                if (
                    self._hpo_method in ("cfo", "bs")
                    and all(
                        state.search_alg and state.search_alg.searcher.is_ls_ever_converged
                        for state in self._search_states.values()
                    )
                    and (self._state.time_from_start > self._warn_threshold * self._time_taken_best_iter)
                ):
                    logger.warning(
                        "The hyperparameter local search of every estimator has "
                        "converged at least once, and the total search time "
                        f"exceeds {self._warn_threshold} times the time taken "
                        "to find the best model."
                    )
                    if self._early_stop:
                        logger.warning("Stopping search as early_stop is set to True.")
                        break
                    self._warn_threshold *= 10
2021-08-23 19:36:51 -04:00
else :
2021-09-10 16:39:16 -07:00
logger . info ( f " stop trying learner { estimator } " )
2021-08-23 19:36:51 -04:00
if self . _estimator_index is not None :
self . _active_estimators . remove ( estimator )
self . _estimator_index - = 1
2021-09-23 10:49:02 -07:00
search_state . search_alg . searcher . _is_ls_ever_converged = True
            if (
                self._retrain_in_budget
                and best_config_sig
                and est_retrain_time
                and not better
                and self._search_states[self._best_estimator].sample_size == self._state.data_size[0]
                and (
                    est_retrain_time
                    <= self._state.time_budget - self._state.time_from_start
                    <= est_retrain_time + next_trial_time
                )
            ):
                state = self._search_states[self._best_estimator]
                self._trained_estimator, retrain_time = self._state._train_with_config(
                    self._best_estimator,
                    state.best_config,
                    self.data_size_full,
                )
                logger.info("retrain {} for {:.1f}s".format(self._best_estimator, retrain_time))
                self._retrained_config[best_config_sig] = state.best_config_train_time = retrain_time
                est_retrain_time = 0
            self._state.time_from_start = time.time() - self._start_time_flag
            if self._state.time_from_start >= self._state.time_budget >= 0 or not self._active_estimators:
                break
            if self._ensemble and self._best_estimator:
                time_left = self._state.time_budget - self._state.time_from_start
                time_ensemble = self._search_states[self._best_estimator].time2eval_best
                if time_left < time_ensemble < 2 * time_left:
                    break
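    # Notes on the loop above: the retrain-in-budget step fires only when the
    # remaining budget fits a full-data retrain but not a retrain plus another
    # trial (est_retrain_time <= time_left <= est_retrain_time + next_trial_time);
    # the loop exits when the budget is spent, when no active estimators remain,
    # or, with ensembling enabled, when the time left is only enough to fit the
    # ensemble itself.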

    def _search(self):
        # initialize the search_states
        self._eci = []
        self._state.best_loss = float("+inf")
        self._state.time_from_start = 0
        self._estimator_index = None
        self._best_iteration = 0
        self._time_taken_best_iter = 0
        self._config_history = {}
        self._max_iter_per_learner = 10000
        self._iter_per_learner = {e: 0 for e in self.estimator_list}
        self._iter_per_learner_fullsize = {e: 0 for e in self.estimator_list}
        self._fullsize_reached = False
        self._trained_estimator = None
        self._best_estimator = None
        self._retrained_config = {}
        self._warn_threshold = 10
        self._selected = None
        self.modelcount = 0

        if self._max_iter < 2 and self.estimator_list and self._state.retrain_final:
            # when max_iter is 1, no need to search
            self.modelcount = self._max_iter
            self._max_iter = 0
            self._best_estimator = estimator = self.estimator_list[0]
            self._selected = state = self._search_states[estimator]
            state.best_config_sample_size = self._state.data_size[0]
            state.best_config = state.init_config[0] if state.init_config else {}
        elif self._use_ray is False and self._use_spark is False:
            self._search_sequential()
        else:
            self._search_parallel()
        # Add a checkpoint for the current best config to the log.
        if self._training_log:
            self._training_log.checkpoint()
        self._state.time_from_start = time.time() - self._start_time_flag
        if self._best_estimator:
            self._selected = self._search_states[self._best_estimator]
            self.modelcount = sum(search_state.total_iter for search_state in self._search_states.values())
            if self._trained_estimator:
                logger.info(f"selected model: {self._trained_estimator.model}")
            estimators = []
            if self._ensemble and self._state.task in (
                "binary",
                "multiclass",
                "regression",
            ):
                search_states = list(x for x in self._search_states.items() if x[1].best_config)
                search_states.sort(key=lambda x: x[1].best_loss)
                estimators = [
                    (
                        x[0],
                        x[1].learner_class(
                            task=self._state.task,
                            n_jobs=self._state.n_jobs,
                            **AutoMLState.sanitize(x[1].best_config),
                        ),
                    )
                    for x in search_states[:2]
                ]
                estimators += [
                    (
                        x[0],
                        x[1].learner_class(
                            task=self._state.task,
                            n_jobs=self._state.n_jobs,
                            **AutoMLState.sanitize(x[1].best_config),
                        ),
                    )
                    for x in search_states[2:]
                    if x[1].best_loss < 4 * self._selected.best_loss
                ]
                logger.info([(estimator[0], estimator[1].params) for estimator in estimators])
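                # Membership rule: the two lowest-loss learners always join the
                # ensemble; any other learner joins only if its best loss is
                # within 4x of the overall best.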
                if len(estimators) > 1:
                    if self._state.task.is_classification():
                        from sklearn.ensemble import StackingClassifier as Stacker
                    else:
                        from sklearn.ensemble import StackingRegressor as Stacker
                    if self._use_ray is not False:
                        import ray

                        n_cpus = ray.is_initialized() and ray.available_resources()["CPU"] or os.cpu_count()
                    elif self._use_spark:
                        from flaml.tune.spark.utils import get_n_cpus

                        n_cpus = get_n_cpus()
                    else:
                        n_cpus = os.cpu_count()
                    ensemble_n_jobs = (
                        -self._state.n_jobs  # maximize total parallelization degree
                        if abs(self._state.n_jobs) == 1  # 1 and -1 correspond to min/max parallelization
                        else max(1, int(n_cpus / 2 / self._state.n_jobs))
                        # the total degree of parallelization =
                        # parallelization degree per estimator * parallelization degree of the ensemble
                    )
                    if isinstance(self._ensemble, dict):
                        final_estimator = self._ensemble.get("final_estimator", self._trained_estimator)
                        passthrough = self._ensemble.get("passthrough", True)
                        ensemble_n_jobs = self._ensemble.get("n_jobs", ensemble_n_jobs)
                    else:
                        final_estimator = self._trained_estimator
                        passthrough = True
                    stacker = Stacker(
                        estimators,
                        final_estimator,
                        n_jobs=ensemble_n_jobs,
                        passthrough=passthrough,
                    )
                    sample_weight_dict = (
                        (self._sample_weight_full is not None) and {"sample_weight": self._sample_weight_full} or {}
                    )
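                    # For example (hypothetical values), a user-supplied spec like
                    #     ensemble={"final_estimator": some_estimator,
                    #               "passthrough": False, "n_jobs": 4}
                    # overrides the defaults chosen above.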
                    for e in estimators:
                        e[1].__class__.init()
                    import joblib

                    try:
                        logger.info("Building ensemble with tuned estimators")
                        stacker.fit(
                            self._X_train_all,
                            self._y_train_all,
                            **sample_weight_dict,  # NOTE: _search is after kwargs is updated to fit_kwargs_by_estimator
                        )
                        logger.info(f"ensemble: {stacker}")
                        self._trained_estimator = stacker
                        self._trained_estimator.model = stacker
                    except ValueError as e:
                        if passthrough:
                            logger.warning(
                                "Using passthrough=False for the ensemble because the data contain categorical features."
                            )
                            stacker = Stacker(
                                estimators,
                                final_estimator,
                                n_jobs=self._state.n_jobs,
                                passthrough=False,
                            )
                            stacker.fit(
                                self._X_train_all,
                                self._y_train_all,
                                **sample_weight_dict,  # NOTE: _search is after kwargs is updated to fit_kwargs_by_estimator
                            )
                            logger.info(f"ensemble: {stacker}")
                            self._trained_estimator = stacker
                            self._trained_estimator.model = stacker
                        else:
                            raise e
                    except joblib.externals.loky.process_executor.TerminatedWorkerError:
                        logger.error(
                            "Not enough memory to build the ensemble. "
                            "Please try increasing available RAM, decreasing n_jobs for the ensemble, or disabling the ensemble."
                        )
        elif self._state.retrain_final:
            # reset the time budget for retraining
            if self._max_iter > 1:
                self._state.time_budget = -1
            if (
                self._state.task in TS_FORECAST
                or self._trained_estimator is None
                or self._trained_estimator.model is None
                or (
                    self._state.time_budget < 0
                    or self._state.time_budget - self._state.time_from_start
                    > self._selected.est_retrain_time(self.data_size_full)
                )
                and self._selected.best_config_sample_size == self._state.data_size[0]
            ):
                state = self._search_states[self._best_estimator]
                (
                    self._trained_estimator,
                    retrain_time,
                ) = self._state._train_with_config(
                    self._best_estimator,
                    state.best_config,
                    self.data_size_full,
                )
                logger.info("retrain {} for {:.1f}s".format(self._best_estimator, retrain_time))
                state.best_config_train_time = retrain_time
                if self._trained_estimator:
                    logger.info(f"retrained model: {self._trained_estimator.model}")
            else:
                logger.info("not retraining because the time budget is too small.")

    def __del__(self):
        if (
            hasattr(self, "_trained_estimator")
            and self._trained_estimator
            and hasattr(self._trained_estimator, "cleanup")
        ):
            if self.preserve_checkpoint is False:
                self._trained_estimator.cleanup()
            del self._trained_estimator

    def _select_estimator(self, estimator_list):
        if self._learner_selector == "roundrobin":
            self._estimator_index += 1
            if self._estimator_index == len(estimator_list):
                self._estimator_index = 0
            return estimator_list[self._estimator_index]
        min_estimated_cost, selected = np.inf, None
        inv = []
        untried_exists = False
        for i, estimator in enumerate(estimator_list):
            if estimator in self._search_states and (
                self._search_states[estimator].sample_size
            ):  # sample_size=None meaning no result
                search_state = self._search_states[estimator]
                if (
                    self._state.time_budget >= 0
                    and self._search_states[estimator].time2eval_best
                    > self._state.time_budget - self._state.time_from_start
                    or self._iter_per_learner_fullsize[estimator] >= self._max_iter_per_learner
                ):
                    inv.append(0)
                    continue
                estimated_cost = search_state.estimated_cost4improvement
                if search_state.sample_size < self._state.data_size[0] and self._state.time_budget >= 0:
                    estimated_cost = min(
                        estimated_cost,
                        search_state.time2eval_best
                        * min(
                            SAMPLE_MULTIPLY_FACTOR,
                            self._state.data_size[0] / search_state.sample_size,
                        ),
                    )
                gap = search_state.best_loss - self._state.best_loss
                if gap > 0 and not self._ensemble:
                    delta_loss = (search_state.best_loss_old - search_state.best_loss) or search_state.best_loss
                    delta_time = (search_state.total_time_used - search_state.time_best_found_old) or 1e-10
                    speed = delta_loss / delta_time
                    if speed:
                        estimated_cost = max(2 * gap / speed, estimated_cost)
                estimated_cost = estimated_cost or 1e-9
                inv.append(1 / estimated_cost)
            else:
                estimated_cost = self._eci[i]
                inv.append(0)
                untried_exists = True
            if estimated_cost < min_estimated_cost:
                min_estimated_cost = estimated_cost
                selected = estimator
        if untried_exists or not selected:
            state = self._search_states.get(selected)
            if not (state and state.sample_size):
                return selected
        s = sum(inv)
        p = self._random.rand()
        q = 0
        for i in range(len(inv)):
            if inv[i]:
                q += inv[i] / s
                if p < q:
                    return estimator_list[i]
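        # Worked example with hypothetical numbers: three tried learners with
        # estimated costs 1s, 4s and 5s give inv = [1.0, 0.25, 0.2] and
        # s = 1.45, i.e. selection probabilities of roughly 0.69, 0.17 and
        # 0.14, so learners that look cheaper to improve are sampled more
        # often; an untried learner is returned directly instead whenever its
        # initial ECI is the smallest.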