mirror of
				https://github.com/microsoft/autogen.git
				synced 2025-10-31 17:59:50 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			3665 lines
		
	
	
		
			163 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			3665 lines
		
	
	
		
			163 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # !
 | |
| #  * Copyright (c) FLAML authors. All rights reserved.
 | |
| #  * Licensed under the MIT License. See LICENSE file in the
 | |
| #  * project root for license information.
 | |
| import time
 | |
| import os
 | |
| from typing import Callable, Optional, List, Union, Any
 | |
| import inspect
 | |
| from functools import partial
 | |
| import numpy as np
 | |
| from scipy.sparse import issparse
 | |
| from sklearn.model_selection import (
 | |
|     train_test_split,
 | |
|     RepeatedStratifiedKFold,
 | |
|     RepeatedKFold,
 | |
|     GroupKFold,
 | |
|     TimeSeriesSplit,
 | |
|     GroupShuffleSplit,
 | |
| )
 | |
| from sklearn.utils import shuffle
 | |
| from sklearn.base import BaseEstimator
 | |
| import pandas as pd
 | |
| import logging
 | |
| import json
 | |
| from .ml import (
 | |
|     compute_estimator,
 | |
|     train_estimator,
 | |
|     get_estimator_class,
 | |
|     get_classification_objective,
 | |
| )
 | |
| from .config import (
 | |
|     MIN_SAMPLE_TRAIN,
 | |
|     MEM_THRES,
 | |
|     RANDOM_SEED,
 | |
|     SMALL_LARGE_THRES,
 | |
|     CV_HOLDOUT_THRESHOLD,
 | |
|     SPLIT_RATIO,
 | |
|     N_SPLITS,
 | |
|     SAMPLE_MULTIPLY_FACTOR,
 | |
| )
 | |
| from .data import (
 | |
|     concat,
 | |
|     CLASSIFICATION,
 | |
|     TOKENCLASSIFICATION,
 | |
|     TS_FORECAST,
 | |
|     TS_FORECASTREGRESSION,
 | |
|     TS_FORECASTPANEL,
 | |
|     TS_TIMESTAMP_COL,
 | |
|     REGRESSION,
 | |
|     _is_nlp_task,
 | |
|     NLG_TASKS,
 | |
| )
 | |
| from . import tune
 | |
| from .training_log import training_log_reader, training_log_writer
 | |
| from flaml.default.suggest import suggest_learner
 | |
| 
 | |
# Module-level logger; handlers and level are configured elsewhere
# (presumably by the AutoML class) — this module only emits records.
logger = logging.getLogger(__name__)
# Formatter intended for FLAML's own handler: includes logger name,
# timestamp, line number, and level.
logger_formatter = logging.Formatter(
    "[%(name)s: %(asctime)s] {%(lineno)d} %(levelname)s - %(message)s", "%m-%d %H:%M:%S"
)

# mlflow is an optional dependency; fall back to None when it is not
# installed so callers can test `if mlflow:` before using it.
try:
    import mlflow
except ImportError:
    mlflow = None
 | |
| 
 | |
| 
 | |
class SearchState:
    """Per-estimator search state.

    Tracks, for one learner class, the hyperparameter search space, the
    initial / low-cost configurations, and running statistics of the search
    (best loss so far, time and iteration of the best find, total time used,
    the currently trained estimator, etc.). Instances are updated by the
    AutoML search loop via `update()`.
    """

    @property
    def search_space(self):
        # The searchable hyperparameter domains (entries whose domain is None
        # are excluded in __init__).
        return self._search_space_domain

    @property
    def estimated_cost4improvement(self):
        # Estimated cost (in seconds) to improve on the current best:
        # the larger of the time gap between the last two improvements and
        # the time already spent since the last improvement.
        return max(
            self.time_best_found - self.time_best_found_old,
            self.total_time_used - self.time_best_found,
        )

    def valid_starting_point_one_dim(self, value_one_dim, domain_one_dim):
        """Check one hyperparameter of a starting point against its space entry.

        For each hp in the starting point, check the following 3 conditions:
        (1) If the type of the starting point does not match the required type
            in the search space, return False.
        (2) If the starting point is not in the required search space,
            return False.
        (3) If the search space is a value instead of a domain, and the value
            is not equal to the starting point, return False.
        Notice (2) includes the case that the starting point is not in the
        user-specified search space custom_hp.
        """
        from .tune.space import sample

        if isinstance(domain_one_dim, sample.Domain):
            # Recover the value type the domain expects from the annotation of
            # the first parameter of its is_valid() method.
            renamed_type = list(
                inspect.signature(domain_one_dim.is_valid).parameters.values()
            )[0].annotation
            # Type matches when: the domain accepts Any, the value is an
            # instance of the expected type, or an int value is offered for a
            # float domain (int is acceptable where float is required).
            type_match = (
                renamed_type == Any
                or isinstance(value_one_dim, renamed_type)
                or isinstance(value_one_dim, int)
                and renamed_type is float
            )
            if not (type_match and domain_one_dim.is_valid(value_one_dim)):
                return False
        elif value_one_dim != domain_one_dim:
            # The space entry is a fixed value: the starting point must equal it.
            return False
        return True

    def valid_starting_point(self, starting_point, search_space):
        """Return True iff every hp in `starting_point` is valid for `search_space`.

        The resource dimension "FLAML_sample_size" is not a searched
        hyperparameter and is skipped.
        """
        return all(
            self.valid_starting_point_one_dim(value, search_space[name].get("domain"))
            for name, value in starting_point.items()
            if name != "FLAML_sample_size"
        )

    def __init__(
        self,
        learner_class,
        data_size,
        task,
        starting_point=None,
        period=None,
        custom_hp=None,
        max_iter=None,
    ):
        """Build the search state for one estimator.

        Args:
            learner_class: The estimator class; must provide
                cost_relative2lgbm() and search_space().
            data_size: Size of the training data; data_size[0] is the number
                of rows.
            task: The task name; time-series forecast tasks additionally pass
                the prediction horizon to the search space.
            starting_point: A dict (one config) or list of dicts (several
                configs) to start the search from, or None.
            period: Forecast horizon, only used when task is in TS_FORECAST.
            custom_hp: Optional user overrides merged into the search space.
            max_iter: The iteration budget; used to decide whether starting
                points are worth validating.
        """
        # Initial estimated cost of this learner relative to LightGBM.
        self.init_eci = learner_class.cost_relative2lgbm()
        self._search_space_domain = {}
        self.init_config = {}
        self.low_cost_partial_config = {}
        self.cat_hp_cost = {}
        self.data_size = data_size
        self.ls_ever_converged = False
        self.learner_class = learner_class
        if task in TS_FORECAST:
            # Forecast tasks need the prediction horizon to size the space.
            search_space = learner_class.search_space(
                data_size=data_size, task=task, pred_horizon=period
            )
        else:
            search_space = learner_class.search_space(data_size=data_size, task=task)

        if custom_hp is not None:
            # Per-hyperparameter dicts in custom_hp replace the built-in ones.
            search_space.update(custom_hp)

        if (
            isinstance(starting_point, dict)
            and max_iter
            > 1  # If the number of starting point is larger than max iter, avoid the checking
            and not self.valid_starting_point(starting_point, search_space)
        ):
            # Invalid single starting point: drop it and warn.
            logger.warning(
                "Starting point {} removed because it is outside of the search space".format(
                    starting_point
                )
            )
            starting_point = None
        elif isinstance(starting_point, list) and max_iter > len(
            starting_point
        ):  # If the number of starting point is larger than max iter, avoid the checking
            # Keep only the starting points that are valid in the search space.
            starting_point_len = len(starting_point)
            starting_point = [
                x for x in starting_point if self.valid_starting_point(x, search_space)
            ]
            if starting_point_len > len(starting_point):
                logger.warning(
                    "Starting points outside of the search space are removed. "
                    f"Remaining starting points: {starting_point}"
                )
            starting_point = starting_point or None

        for name, space in search_space.items():
            assert (
                "domain" in space
            ), f"{name}'s domain is missing in the search space spec {space}"
            if space["domain"] is None:
                # don't search this hp
                continue
            self._search_space_domain[name] = space["domain"]

            if "low_cost_init_value" in space:
                self.low_cost_partial_config[name] = space["low_cost_init_value"]
            if "cat_hp_cost" in space:
                self.cat_hp_cost[name] = space["cat_hp_cost"]
            # if a starting point is provided, set the init config to be
            # the starting point provided
            if (
                isinstance(starting_point, dict)
                and starting_point.get(name) is not None
            ):
                self.init_config[name] = starting_point[name]
            elif (
                not isinstance(starting_point, list)
                and "init_value" in space
                and self.valid_starting_point_one_dim(
                    space["init_value"], space["domain"]
                )
            ):  # If starting point is list, no need to check the validity of self.init_config w.r.t search space
                self.init_config[name] = space[
                    "init_value"
                ]  # If starting_point is list, no need to assign value to self.init_config here

        if isinstance(starting_point, list):
            # A list of starting points becomes the init config list as-is.
            self.init_config = starting_point

        self._hp_names = list(self._search_space_domain.keys())
        self.search_alg = None
        self.best_config = None
        self.best_result = None
        self.best_loss = self.best_loss_old = np.inf
        self.total_time_used = 0
        self.total_iter = 0
        self.base_eci = None
        self.time_best_found = self.time_best_found_old = 0
        self.time2eval_best = 0
        self.time2eval_best_old = 0
        self.trained_estimator = None
        self.sample_size = None
        self.trial_time = 0

    def update(self, result, time_used):
        """Fold one trial result into the running statistics.

        Args:
            result: The dict reported by the trial (keys: "config",
                "val_loss", "metric_for_logging", "time_total_s",
                "trained_estimator"), or a falsy value for a failed trial.
            time_used: Wall-clock seconds consumed by the trial.
        """
        if result:
            config = result["config"]
            if config and "FLAML_sample_size" in config:
                self.sample_size = config["FLAML_sample_size"]
            else:
                # No explicit resource dimension: the full data was used.
                self.sample_size = self.data_size[0]
            obj = result["val_loss"]
            metric_for_logging = result["metric_for_logging"]
            time2eval = result["time_total_s"]
            trained_estimator = result["trained_estimator"]
            del result["trained_estimator"]  # free up RAM
            # Number of iterations actually used by an early-stopped
            # estimator (falsy when the estimator has no ITER_HP).
            n_iter = (
                trained_estimator
                and hasattr(trained_estimator, "ITER_HP")
                and trained_estimator.params[trained_estimator.ITER_HP]
            )
            if n_iter:
                # Record the effective iteration count back into the config.
                config[trained_estimator.ITER_HP] = n_iter
        else:
            # Failed trial: infinite loss, no estimator.
            obj, time2eval, trained_estimator = np.inf, 0.0, None
            metric_for_logging = config = None
        self.trial_time = time2eval
        self.total_time_used += time_used
        self.total_iter += 1

        if self.base_eci is None:
            self.base_eci = time_used
        if (obj is not None) and (obj < self.best_loss):
            # New best: shift current best stats into the *_old slots.
            # When there was no previous best, seed best_loss_old with 2*obj.
            self.best_loss_old = self.best_loss if self.best_loss < np.inf else 2 * obj
            self.best_loss = obj
            self.best_result = result
            self.time_best_found_old = self.time_best_found
            self.time_best_found = self.total_time_used
            self.iter_best_found = self.total_iter
            self.best_config = config
            self.best_config_sample_size = self.sample_size
            self.best_config_train_time = time_used
            if time2eval:
                self.time2eval_best_old = self.time2eval_best
                self.time2eval_best = time2eval
            # Release the previously kept estimator before replacing it.
            if (
                self.trained_estimator
                and trained_estimator
                and self.trained_estimator != trained_estimator
            ):
                self.trained_estimator.cleanup()
            if trained_estimator:
                self.trained_estimator = trained_estimator
        elif trained_estimator:
            # Not an improvement: free the estimator's resources.
            trained_estimator.cleanup()
        self.metric_for_logging = metric_for_logging
        self.val_loss, self.config = obj, config

    def get_hist_config_sig(self, sample_size, config):
        """Return a signature string "<sample_size>_<hp value tuple>" for dedup/history lookup."""
        config_values = tuple([config[k] for k in self._hp_names])
        config_sig = str(sample_size) + "_" + str(config_values)
        return config_sig

    def est_retrain_time(self, retrain_sample_size):
        """Estimate the time to retrain the best config on `retrain_sample_size` rows.

        Scales the best config's evaluation time linearly with sample size.
        Requires that a best config (and its sample size) has been found.
        """
        assert (
            self.best_config_sample_size is not None
        ), "need to first get best_config_sample_size"
        return self.time2eval_best * retrain_sample_size / self.best_config_sample_size
| 
 | |
| 
 | |
class AutoMLState:
    """Mutable shared state for one AutoML fit/search run.

    The data attributes read here (X_train, y_train, fit_kwargs,
    fit_kwargs_by_estimator, time_budget, learner_classes, ...) are assigned
    elsewhere — presumably by the AutoML class — before these methods are
    called; this class only consumes them.
    """

    def _prepare_sample_train_data(self, sample_size):
        """Slice training data, sample weights, and groups to `sample_size` rows.

        Returns:
            Tuple (sampled_X_train, sampled_y_train, sampled_weight, groups);
            sampled_weight and groups are None when not applicable.
        """
        sampled_weight = groups = None
        if sample_size <= self.data_size[0]:
            # Take a prefix of the training split (DataFrame needs .iloc).
            if isinstance(self.X_train, pd.DataFrame):
                sampled_X_train = self.X_train.iloc[:sample_size]
            else:
                sampled_X_train = self.X_train[:sample_size]
            sampled_y_train = self.y_train[:sample_size]
            weight = self.fit_kwargs.get(
                "sample_weight"
            )  # NOTE: _prepare_sample_train_data is before kwargs is updated to fit_kwargs_by_estimator
            if weight is not None:
                sampled_weight = weight[:sample_size]
            if self.groups is not None:
                groups = self.groups[:sample_size]
        else:
            # Requested size exceeds the training split: use the full data.
            sampled_X_train = self.X_train_all
            sampled_y_train = self.y_train_all
            if (
                "sample_weight" in self.fit_kwargs
            ):  # NOTE: _prepare_sample_train_data is before kwargs is updated to fit_kwargs_by_estimator
                sampled_weight = self.sample_weight_all
            if self.groups is not None:
                groups = self.groups_all
        return sampled_X_train, sampled_y_train, sampled_weight, groups

    @staticmethod
    def _compute_with_config_base(config_w_resource, state, estimator):
        """Evaluate one hyperparameter config — the trial body run by the tuner.

        Trains and evaluates `estimator` with `config_w_resource` (which may
        carry the "FLAML_sample_size" resource dimension) on `state`'s data,
        reports the result via tune.report(), and returns it.
        """
        if "FLAML_sample_size" in config_w_resource:
            sample_size = int(config_w_resource["FLAML_sample_size"])
        else:
            sample_size = state.data_size[0]

        this_estimator_kwargs = state.fit_kwargs_by_estimator.get(
            estimator
        ).copy()  # NOTE: _compute_with_config_base is after kwargs is updated to fit_kwargs_by_estimator
        (
            sampled_X_train,
            sampled_y_train,
            sampled_weight,
            groups,
        ) = state._prepare_sample_train_data(sample_size)
        if sampled_weight is not None:
            # Save the full weights so they can be restored after this trial.
            weight = this_estimator_kwargs["sample_weight"]
            this_estimator_kwargs["sample_weight"] = sampled_weight
        if groups is not None:
            this_estimator_kwargs["groups"] = groups
        config = config_w_resource.copy()
        if "FLAML_sample_size" in config:
            # The resource dimension is not an estimator hyperparameter.
            del config["FLAML_sample_size"]
        # Remaining time budget for this trial: the full remaining budget when
        # training on all data; otherwise half the remaining budget, scaled by
        # the fraction of data used.
        budget = (
            None
            if state.time_budget is None
            else state.time_budget - state.time_from_start
            if sample_size == state.data_size[0]
            else (state.time_budget - state.time_from_start)
            / 2
            * sample_size
            / state.data_size[0]
        )

        (
            trained_estimator,
            val_loss,
            metric_for_logging,
            _,
            pred_time,
        ) = compute_estimator(
            sampled_X_train,
            sampled_y_train,
            state.X_val,
            state.y_val,
            state.weight_val,
            state.groups_val,
            state.train_time_limit
            if budget is None
            else min(budget, state.train_time_limit),
            state.kf,
            config,
            state.task,
            estimator,
            state.eval_method,
            state.metric,
            state.best_loss,
            state.n_jobs,
            state.learner_classes.get(estimator),
            state.log_training_metric,
            this_estimator_kwargs,
        )
        if state.retrain_final and not state.model_history:
            # The model will be retrained at the end anyway and history is
            # not kept, so free the trial's model now.
            trained_estimator.cleanup()

        result = {
            "pred_time": pred_time,
            "wall_clock_time": time.time() - state._start_time_flag,
            "metric_for_logging": metric_for_logging,
            "val_loss": val_loss,
            "trained_estimator": trained_estimator,
        }
        if sampled_weight is not None:
            # Restore the full sample weights for subsequent trials.
            this_estimator_kwargs["sample_weight"] = weight
        tune.report(**result)
        return result

    def sanitize(self, config: dict) -> dict:
        """Make a config ready for passing to estimator.

        Unwraps a nested "ml" sub-config if present and strips the
        "FLAML_sample_size" and "learner" keys, which are search-level
        metadata rather than estimator hyperparameters.
        """
        config = config.get("ml", config).copy()
        if "FLAML_sample_size" in config:
            del config["FLAML_sample_size"]
        if "learner" in config:
            del config["learner"]
        return config

    def _train_with_config(
        self,
        estimator,
        config_w_resource,
        sample_size=None,
    ):
        """Train `estimator` with a given config (no evaluation).

        Args:
            estimator: The estimator name.
            config_w_resource: The config dict, possibly including
                "FLAML_sample_size" / "learner" metadata.
            sample_size: Number of training rows; defaults to the config's
                "FLAML_sample_size" or the full training set.

        Returns:
            Tuple (trained estimator, training time in seconds).
        """
        if not sample_size:
            sample_size = config_w_resource.get(
                "FLAML_sample_size", len(self.y_train_all)
            )
        config = self.sanitize(config_w_resource)

        this_estimator_kwargs = self.fit_kwargs_by_estimator.get(
            estimator
        ).copy()  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
        (
            sampled_X_train,
            sampled_y_train,
            sampled_weight,
            groups,
        ) = self._prepare_sample_train_data(sample_size)
        if sampled_weight is not None:
            # Save the full weights so they can be restored after training.
            weight = this_estimator_kwargs[
                "sample_weight"
            ]  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
            this_estimator_kwargs[
                "sample_weight"
            ] = sampled_weight  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
        if groups is not None:
            this_estimator_kwargs[
                "groups"
            ] = groups  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator

        # Remaining overall time budget (None means unlimited).
        budget = (
            None
            if self.time_budget is None
            else self.time_budget - self.time_from_start
        )

        estimator, train_time = train_estimator(
            X_train=sampled_X_train,
            y_train=sampled_y_train,
            config_dic=config,
            task=self.task,
            estimator_name=estimator,
            n_jobs=self.n_jobs,
            estimator_class=self.learner_classes.get(estimator),
            budget=budget,
            fit_kwargs=this_estimator_kwargs,  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator
            eval_metric=self.metric if hasattr(self, "metric") else "train_time",
        )

        if sampled_weight is not None:
            # Restore the full sample weights.
            this_estimator_kwargs[
                "sample_weight"
            ] = weight  # NOTE: _train_with_config is after kwargs is updated to fit_kwargs_by_estimator

        return estimator, train_time
 | |
| 
 | |
| 
 | |
def size(state: AutoMLState, config: dict) -> float:
    """Size function.

    Args:
        state: The AutoMLState holding the registered learner classes.
        config: A config dict, possibly nested under an "ml" key, with a
            "learner" entry naming the estimator.

    Returns:
        The mem size in bytes for a config.
    """
    # Unwrap a nested "ml" sub-config when present; otherwise use as-is.
    flat_config = config.get("ml", config)
    learner_name = flat_config["learner"]
    learner_cls = state.learner_classes.get(learner_name)
    return learner_cls.size(flat_config)
 | |
| 
 | |
| 
 | |
| class AutoML(BaseEstimator):
 | |
|     """The AutoML class.
 | |
|     Example:
 | |
| 
 | |
|     ```python
 | |
|     automl = AutoML()
 | |
|     automl_settings = {
 | |
|         "time_budget": 60,
 | |
|         "metric": 'accuracy',
 | |
|         "task": 'classification',
 | |
|         "log_file_name": 'mylog.log',
 | |
|     }
 | |
|     automl.fit(X_train = X_train, y_train = y_train, **automl_settings)
 | |
|     ```
 | |
| 
 | |
|     """
 | |
| 
 | |
|     from .version import __version__
 | |
| 
 | |
|     def __init__(self, **settings):
 | |
|         """Constructor.
 | |
| 
 | |
|         Many settings in fit() can be passed to the constructor too.
 | |
|         If an argument in fit() is provided, it will override the setting passed to the constructor.
 | |
|         If an argument in fit() is not provided but provided in the constructor, the value passed to the constructor will be used.
 | |
| 
 | |
|         Args:
 | |
|             metric: A string of the metric name or a function,
 | |
|                 e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
 | |
|                 'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
 | |
|                 'mape'. Default is 'auto'.
 | |
|                 If passing a customized metric function, the function needs to
 | |
|                 have the follwing signature:
 | |
| 
 | |
|         ```python
 | |
|         def custom_metric(
 | |
|             X_test, y_test, estimator, labels,
 | |
|             X_train, y_train, weight_test=None, weight_train=None,
 | |
|             config=None, groups_test=None, groups_train=None,
 | |
|         ):
 | |
|             return metric_to_minimize, metrics_to_log
 | |
|         ```
 | |
|                 which returns a float number as the minimization objective,
 | |
|                 and a dictionary as the metrics to log. E.g.,
 | |
| 
 | |
|         ```python
 | |
|         def custom_metric(
 | |
|             X_val, y_val, estimator, labels,
 | |
|             X_train, y_train, weight_val=None, weight_train=None,
 | |
|             *args,
 | |
|         ):
 | |
|             from sklearn.metrics import log_loss
 | |
|             import time
 | |
| 
 | |
|             start = time.time()
 | |
|             y_pred = estimator.predict_proba(X_val)
 | |
|             pred_time = (time.time() - start) / len(X_val)
 | |
|             val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
 | |
|             y_pred = estimator.predict_proba(X_train)
 | |
|             train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
 | |
|             alpha = 0.5
 | |
|             return val_loss * (1 + alpha) - alpha * train_loss, {
 | |
|                 "val_loss": val_loss,
 | |
|                 "train_loss": train_loss,
 | |
|                 "pred_time": pred_time,
 | |
|             }
 | |
|         ```
 | |
|             task: A string of the task type, e.g.,
 | |
|                 'classification', 'regression', 'ts_forecast', 'rank',
 | |
|                 'seq-classification', 'seq-regression', 'summarization'.
 | |
|             n_jobs: An integer of the number of threads for training | default=-1.
 | |
|                 Use all available resources when n_jobs == -1.
 | |
|             log_file_name: A string of the log file name | default="". To disable logging,
 | |
|                 set it to be an empty string "".
 | |
|             estimator_list: A list of strings for estimator names, or 'auto'.
 | |
|                 e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```.
 | |
|             time_budget: A float number of the time budget in seconds.
 | |
|                 Use -1 if no time limit.
 | |
|             max_iter: An integer of the maximal number of iterations.
 | |
|             sample: A boolean of whether to sample the training data during
 | |
|                 search.
 | |
|             ensemble: boolean or dict | default=False. Whether to perform
 | |
|                 ensemble after search. Can be a dict with keys 'passthrough'
 | |
|                 and 'final_estimator' to specify the passthrough and
 | |
|                 final_estimator in the stacker. The dict can also contain
 | |
|                 'n_jobs' as the key to specify the number of jobs for the stacker.
 | |
|             eval_method: A string of resampling strategy, one of
 | |
|                 ['auto', 'cv', 'holdout'].
 | |
|             split_ratio: A float of the valiation data percentage for holdout.
 | |
|             n_splits: An integer of the number of folds for cross - validation.
 | |
|             log_type: A string of the log type, one of
 | |
|                 ['better', 'all'].
 | |
|                 'better' only logs configs with better loss than previos iters
 | |
|                 'all' logs all the tried configs.
 | |
|             model_history: A boolean of whether to keep the best
 | |
|                 model per estimator. Make sure memory is large enough if setting to True.
 | |
|             log_training_metric: A boolean of whether to log the training
 | |
|                 metric for each model.
 | |
|             mem_thres: A float of the memory size constraint in bytes.
 | |
|             pred_time_limit: A float of the prediction latency constraint in seconds.
 | |
|                 It refers to the average prediction time per row in validation data.
 | |
|             train_time_limit: A float of the training time constraint in seconds.
 | |
|             verbose: int, default=3 | Controls the verbosity, higher means more
 | |
|                 messages.
 | |
|             retrain_full: bool or str, default=True | whether to retrain the
 | |
|                 selected model on the full training data when using holdout.
 | |
|                 True - retrain only after search finishes; False - no retraining;
 | |
|                 'budget' - do best effort to retrain without violating the time
 | |
|                 budget.
 | |
|             split_type: str or splitter object, default="auto" | the data split type.
 | |
|                 * A valid splitter object is an instance of a derived class of scikit-learn
 | |
|                 [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
 | |
|                 and have ``split`` and ``get_n_splits`` methods with the same signatures.
 | |
|                 Set eval_method to "cv" to use the splitter object.
 | |
|                 * Valid str options depend on different tasks.
 | |
|                 For classification tasks, valid choices are
 | |
|                     ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
 | |
|                 For regression tasks, valid choices are ["auto", 'uniform', 'time'].
 | |
|                     "auto" -> uniform.
 | |
|                 For time series forecast tasks, must be "auto" or 'time'.
 | |
|                 For ranking task, must be "auto" or 'group'.
 | |
|             hpo_method: str, default="auto" | The hyperparameter
 | |
|                 optimization method. By default, CFO is used for sequential
 | |
|                 search and BlendSearch is used for parallel search.
 | |
|                 No need to set when using flaml's default search space or using
 | |
|                 a simple customized search space. When set to 'bs', BlendSearch
 | |
|                 is used. BlendSearch can be tried when the search space is
 | |
|                 complex, for example, containing multiple disjoint, discontinuous
 | |
|                 subspaces. When set to 'random', random search is used.
 | |
|             starting_points: A dictionary or a str to specify the starting hyperparameter
 | |
|                 config for the estimators | default="static".
 | |
|                 If str:
 | |
|                     - if "data", use data-dependent defaults;
 | |
|                     - if "data:path" use data-dependent defaults which are stored at path;
 | |
|                     - if "static", use data-independent defaults.
 | |
|                 If dict, keys are the name of the estimators, and values are the starting
 | |
|                 hyperparamter configurations for the corresponding estimators.
 | |
|                 The value can be a single hyperparamter configuration dict or a list
 | |
|                 of hyperparamter configuration dicts.
 | |
|                 In the following code example, we get starting_points from the
 | |
|                 `automl` object and use them in the `new_automl` object.
 | |
|                 e.g.,
 | |
| 
 | |
|         ```python
 | |
|         from flaml import AutoML
 | |
|         automl = AutoML()
 | |
|         X_train, y_train = load_iris(return_X_y=True)
 | |
|         automl.fit(X_train, y_train)
 | |
|         starting_points = automl.best_config_per_estimator
 | |
| 
 | |
|         new_automl = AutoML()
 | |
|         new_automl.fit(X_train, y_train, starting_points=starting_points)
 | |
|         ```
 | |
| 
 | |
|             seed: int or None, default=None | The random seed for hpo.
 | |
|             n_concurrent_trials: [Experimental] int, default=1 | The number of
 | |
|                 concurrent trials. When n_concurrent_trials > 1, flaml performes
 | |
|                 [parallel tuning](../Use-Cases/Task-Oriented-AutoML#parallel-tuning)
 | |
|                 and installation of ray is required: `pip install flaml[ray]`.
 | |
|             keep_search_state: boolean, default=False | Whether to keep data needed
 | |
|                 for model search after fit(). By default the state is deleted for
 | |
|                 space saving.
 | |
|             preserve_checkpoint: boolean, default=True | Whether to preserve the saved checkpoint
 | |
|                 on disk when deleting automl. By default the checkpoint is preserved.
 | |
|             early_stop: boolean, default=False | Whether to stop early if the
 | |
|                 search is considered to converge.
 | |
|             append_log: boolean, default=False | Whetehr to directly append the log
 | |
|                 records to the input log file if it exists.
 | |
|             auto_augment: boolean, default=True | Whether to automatically
 | |
|                 augment rare classes.
 | |
|             min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
 | |
|                 size when sample=True.
 | |
|             use_ray: boolean or dict.
 | |
|                 If boolean: default=False | Whether to use ray to run the training
 | |
|                 in separate processes. This can be used to prevent OOM for large
 | |
|                 datasets, but will incur more overhead in time.
 | |
|                 If dict: the dict contains the keywords arguments to be passed to
 | |
|                 [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).
 | |
|             metric_constraints: list, default=[] | The list of metric constraints.
 | |
|                 Each element in this list is a 3-tuple, which shall be expressed
 | |
|                 in the following format: the first element of the 3-tuple is the name of the
 | |
|                 metric, the second element is the inequality sign chosen from ">=" and "<=",
 | |
|                 and the third element is the constraint value. E.g., `('val_loss', '<=', 0.1)`.
 | |
|                 Note that all the metric names in metric_constraints need to be reported via
 | |
|                 the metrics_to_log dictionary returned by a customized metric function.
 | |
|                 The customized metric function shall be provided via the `metric` key word
 | |
|                 argument of the fit() function or the automl constructor.
 | |
|                 Find an example in the 4th constraint type in this [doc](../Use-Cases/Task-Oriented-AutoML#constraint).
 | |
|                 If `pred_time_limit` is provided as one of keyword arguments to fit() function or
 | |
|                 the automl constructor, flaml will automatically (and under the hood)
 | |
|                 add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
 | |
|                 specifies a constraint about the prediction latency constraint in seconds.
 | |
|             custom_hp: dict, default=None | The custom search space specified by user.
 | |
|                 It is a nested dict with keys being the estimator names, and values being dicts
 | |
|                 per estimator search space. In the per estimator search space dict,
 | |
|                 the keys are the hyperparameter names, and values are dicts of info ("domain",
 | |
|                 "init_value", and "low_cost_init_value") about the search space associated with
 | |
|                 the hyperparameter (i.e., per hyperparameter search space dict). When custom_hp
 | |
|                 is provided, the built-in search space which is also a nested dict of per estimator
 | |
|                 search space dict, will be updated with custom_hp. Note that during this nested dict update,
 | |
|                 the per hyperparameter search space dicts will be replaced (instead of updated) by the ones
 | |
|                 provided in custom_hp. Note that the value for "domain" can either be a constant
 | |
|                 or a sample.Domain object.
 | |
|                 e.g.,
 | |
| 
 | |
|         ```python
 | |
|         custom_hp = {
 | |
|              "transformer_ms": {
 | |
|                  "model_path": {
 | |
|                      "domain": "albert-base-v2",
 | |
|                  },
 | |
|                  "learning_rate": {
 | |
|                      "domain": tune.choice([1e-4, 1e-5]),
 | |
|                  }
 | |
|              }
 | |
|          }
 | |
|         ```
 | |
|             skip_transform: boolean, default=False | Whether to skip the pre-processing of data prior to modeling.
 | |
|             fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
 | |
|                 e.g.,
 | |
| 
 | |
|         ```python
 | |
|         fit_kwargs_by_estimator = {
 | |
|             "transformer": {
 | |
|                 "output_dir": "test/data/output/",
 | |
|                 "fp16": False,
 | |
|             }
 | |
|         }
 | |
|         ```
 | |
| 
 | |
|         """
 | |
|         self._track_iter = 0
 | |
|         self._state = AutoMLState()
 | |
|         self._state.learner_classes = {}
 | |
|         self._settings = settings
 | |
|         # no budget by default
 | |
|         settings["time_budget"] = settings.get("time_budget", -1)
 | |
|         settings["task"] = settings.get("task", "classification")
 | |
|         settings["n_jobs"] = settings.get("n_jobs", -1)
 | |
|         settings["eval_method"] = settings.get("eval_method", "auto")
 | |
|         settings["split_ratio"] = settings.get("split_ratio", SPLIT_RATIO)
 | |
|         settings["n_splits"] = settings.get("n_splits", N_SPLITS)
 | |
|         settings["auto_augment"] = settings.get("auto_augment", True)
 | |
|         settings["metric"] = settings.get("metric", "auto")
 | |
|         settings["estimator_list"] = settings.get("estimator_list", "auto")
 | |
|         settings["log_file_name"] = settings.get("log_file_name", "")
 | |
|         settings["max_iter"] = settings.get("max_iter")  # no budget by default
 | |
|         settings["sample"] = settings.get("sample", True)
 | |
|         settings["ensemble"] = settings.get("ensemble", False)
 | |
|         settings["log_type"] = settings.get("log_type", "better")
 | |
|         settings["model_history"] = settings.get("model_history", False)
 | |
|         settings["log_training_metric"] = settings.get("log_training_metric", False)
 | |
|         settings["mem_thres"] = settings.get("mem_thres", MEM_THRES)
 | |
|         settings["pred_time_limit"] = settings.get("pred_time_limit", np.inf)
 | |
|         settings["train_time_limit"] = settings.get("train_time_limit", np.inf)
 | |
|         settings["verbose"] = settings.get("verbose", 3)
 | |
|         settings["retrain_full"] = settings.get("retrain_full", True)
 | |
|         settings["split_type"] = settings.get("split_type", "auto")
 | |
|         settings["hpo_method"] = settings.get("hpo_method", "auto")
 | |
|         settings["learner_selector"] = settings.get("learner_selector", "sample")
 | |
|         settings["starting_points"] = settings.get("starting_points", "static")
 | |
|         settings["n_concurrent_trials"] = settings.get("n_concurrent_trials", 1)
 | |
|         settings["keep_search_state"] = settings.get("keep_search_state", False)
 | |
|         settings["preserve_checkpoint"] = settings.get("preserve_checkpoint", True)
 | |
|         settings["early_stop"] = settings.get("early_stop", False)
 | |
|         settings["append_log"] = settings.get("append_log", False)
 | |
|         settings["min_sample_size"] = settings.get("min_sample_size", MIN_SAMPLE_TRAIN)
 | |
|         settings["use_ray"] = settings.get("use_ray", False)
 | |
|         settings["metric_constraints"] = settings.get("metric_constraints", [])
 | |
|         settings["fit_kwargs_by_estimator"] = settings.get(
 | |
|             "fit_kwargs_by_estimator", {}
 | |
|         )
 | |
|         settings["custom_hp"] = settings.get("custom_hp", {})
 | |
|         settings["skip_transform"] = settings.get("skip_transform", False)
 | |
| 
 | |
|         self._estimator_type = (
 | |
|             "classifier" if settings["task"] in CLASSIFICATION else "regressor"
 | |
|         )
 | |
| 
 | |
|     def get_params(self, deep=False):
 | |
|         return self._settings.copy()
 | |
| 
 | |
|     @property
 | |
|     def config_history(self):
 | |
|         """A dictionary of iter->(estimator, config, time),
 | |
|         storing the best estimator, config, and the time when the best
 | |
|         model is updated each time.
 | |
|         """
 | |
|         return self._config_history
 | |
| 
 | |
|     @property
 | |
|     def model(self):
 | |
|         """An object with `predict()` and `predict_proba()` method (for
 | |
|         classification), storing the best trained model.
 | |
|         """
 | |
|         return self.__dict__.get("_trained_estimator")
 | |
| 
 | |
|     def best_model_for_estimator(self, estimator_name):
 | |
|         """Return the best model found for a particular estimator.
 | |
| 
 | |
|         Args:
 | |
|             estimator_name: a str of the estimator's name.
 | |
| 
 | |
|         Returns:
 | |
|             An object storing the best model for estimator_name.
 | |
|             If `model_history` was set to False during fit(), then the returned model
 | |
|             is untrained unless estimator_name is the best estimator.
 | |
|             If `model_history` was set to True, then the returned model is trained.
 | |
|         """
 | |
|         state = self._search_states.get(estimator_name)
 | |
|         return state and getattr(state, "trained_estimator", None)
 | |
| 
 | |
|     @property
 | |
|     def best_estimator(self):
 | |
|         """A string indicating the best estimator found."""
 | |
|         return self._best_estimator
 | |
| 
 | |
|     @property
 | |
|     def best_iteration(self):
 | |
|         """An integer of the iteration number where the best
 | |
|         config is found."""
 | |
|         return self._best_iteration
 | |
| 
 | |
|     @property
 | |
|     def best_config(self):
 | |
|         """A dictionary of the best configuration."""
 | |
|         state = self._search_states.get(self._best_estimator)
 | |
|         return state and getattr(state, "best_config", None)
 | |
| 
 | |
|     @property
 | |
|     def best_config_per_estimator(self):
 | |
|         """A dictionary of all estimators' best configuration."""
 | |
|         return {
 | |
|             e: e_search_state.best_config
 | |
|             for e, e_search_state in self._search_states.items()
 | |
|         }
 | |
| 
 | |
|     @property
 | |
|     def best_loss_per_estimator(self):
 | |
|         """A dictionary of all estimators' best loss."""
 | |
|         return {
 | |
|             e: e_search_state.best_loss
 | |
|             for e, e_search_state in self._search_states.items()
 | |
|         }
 | |
| 
 | |
|     @property
 | |
|     def best_loss(self):
 | |
|         """A float of the best loss found."""
 | |
|         return self._state.best_loss
 | |
| 
 | |
|     @property
 | |
|     def best_result(self):
 | |
|         """Result dictionary for model trained with the best config."""
 | |
|         state = self._search_states.get(self._best_estimator)
 | |
|         return state and getattr(state, "best_result", None)
 | |
| 
 | |
|     @property
 | |
|     def metrics_for_best_config(self):
 | |
|         """Returns a float of the best loss, and a dictionary of the auxiliary metrics to log
 | |
|         associated with the best config. These two objects correspond to the returned
 | |
|         objects by the customized metric function for the config with the best loss."""
 | |
|         state = self._search_states.get(self._best_estimator)
 | |
|         return self._state.best_loss, state and getattr(state, "best_result", {}).get(
 | |
|             "metric_for_logging"
 | |
|         )
 | |
| 
 | |
|     @property
 | |
|     def best_config_train_time(self):
 | |
|         """A float of the seconds taken by training the best config."""
 | |
|         return getattr(
 | |
|             self._search_states[self._best_estimator], "best_config_train_time", None
 | |
|         )
 | |
| 
 | |
|     def save_best_config(self, filename):
 | |
|         best = {
 | |
|             "class": self.best_estimator,
 | |
|             "hyperparameters": self.best_config,
 | |
|         }
 | |
|         os.makedirs(os.path.dirname(filename), exist_ok=True)
 | |
|         with open(filename, "w") as f:
 | |
|             json.dump(best, f)
 | |
| 
 | |
|     @property
 | |
|     def classes_(self):
 | |
|         """A numpy array of shape (n_classes,) for class labels."""
 | |
|         attr = getattr(self, "_label_transformer", None)
 | |
|         if attr:
 | |
|             return attr.classes_
 | |
|         attr = getattr(self, "_trained_estimator", None)
 | |
|         if attr:
 | |
|             return attr.classes_
 | |
|         return None
 | |
| 
 | |
|     @property
 | |
|     def n_features_in_(self):
 | |
|         return self._trained_estimator.n_features_in_
 | |
| 
 | |
|     @property
 | |
|     def feature_names_in_(self):
 | |
|         attr = getattr(self, "_trained_estimator", None)
 | |
|         attr = attr and getattr(attr, "feature_names_in_", None)
 | |
|         if attr is not None:
 | |
|             return attr
 | |
|         return getattr(self, "_feature_names_in_", None)
 | |
| 
 | |
|     @property
 | |
|     def feature_importances_(self):
 | |
|         attr = getattr(self, "_trained_estimator", None)
 | |
|         attr = attr and getattr(attr, "feature_importances_", None)
 | |
|         return attr
 | |
| 
 | |
|     @property
 | |
|     def time_to_find_best_model(self) -> float:
 | |
|         """Time taken to find best model in seconds."""
 | |
|         return self.__dict__.get("_time_taken_best_iter")
 | |
| 
 | |
|     def score(self, X: pd.DataFrame, y: pd.Series, **kwargs):
 | |
|         estimator = getattr(self, "_trained_estimator", None)
 | |
|         if estimator is None:
 | |
|             logger.warning(
 | |
|                 "No estimator is trained. Please run fit with enough budget."
 | |
|             )
 | |
|             return None
 | |
|         X = self._preprocess(X)
 | |
|         if self._label_transformer:
 | |
|             y = self._label_transformer.transform(y)
 | |
|         return estimator.score(X, y, **kwargs)
 | |
| 
 | |
|     def predict(
 | |
|         self,
 | |
|         X: Union[np.array, pd.DataFrame, List[str], List[List[str]]],
 | |
|         **pred_kwargs,
 | |
|     ):
 | |
|         """Predict label from features.
 | |
| 
 | |
|         Args:
 | |
|             X: A numpy array of featurized instances, shape n * m,
 | |
|                 or for time series forcast tasks:
 | |
|                     a pandas dataframe with the first column containing
 | |
|                     timestamp values (datetime type) or an integer n for
 | |
|                     the predict steps (only valid when the estimator is
 | |
|                     arima or sarimax). Other columns in the dataframe
 | |
|                     are assumed to be exogenous variables (categorical
 | |
|                     or numeric).
 | |
|             **pred_kwargs: Other key word arguments to pass to predict() function of
 | |
|                 the searched learners, such as per_device_eval_batch_size.
 | |
| 
 | |
|         ```python
 | |
|         multivariate_X_test = pd.DataFrame({
 | |
|             'timeStamp': pd.date_range(start='1/1/2022', end='1/07/2022'),
 | |
|             'categorical_col': ['yes', 'yes', 'no', 'no', 'yes', 'no', 'yes'],
 | |
|             'continuous_col': [105, 107, 120, 118, 110, 112, 115]
 | |
|         })
 | |
|         model.predict(multivariate_X_test)
 | |
|         ```
 | |
| 
 | |
|         Returns:
 | |
|             A array-like of shape n * 1: each element is a predicted
 | |
|             label for an instance.
 | |
|         """
 | |
|         estimator = getattr(self, "_trained_estimator", None)
 | |
|         if estimator is None:
 | |
|             logger.warning(
 | |
|                 "No estimator is trained. Please run fit with enough budget."
 | |
|             )
 | |
|             return None
 | |
|         X = self._preprocess(X)
 | |
|         y_pred = estimator.predict(X, **pred_kwargs)
 | |
|         if (
 | |
|             isinstance(y_pred, np.ndarray)
 | |
|             and y_pred.ndim > 1
 | |
|             and isinstance(y_pred, np.ndarray)
 | |
|         ):
 | |
|             y_pred = y_pred.flatten()
 | |
|         if self._label_transformer:
 | |
|             return self._label_transformer.inverse_transform(
 | |
|                 pd.Series(y_pred.astype(int))
 | |
|             )
 | |
|         else:
 | |
|             return y_pred
 | |
| 
 | |
|     def predict_proba(self, X, **pred_kwargs):
 | |
|         """Predict the probability of each class from features, only works for
 | |
|         classification problems.
 | |
| 
 | |
|         Args:
 | |
|             X: A numpy array of featurized instances, shape n * m.
 | |
|             **pred_kwargs: Other key word arguments to pass to predict_proba() function of
 | |
|                 the searched learners, such as per_device_eval_batch_size.
 | |
| 
 | |
|         Returns:
 | |
|             A numpy array of shape n * c. c is the  # classes. Each element at
 | |
|             (i, j) is the probability for instance i to be in class j.
 | |
|         """
 | |
|         estimator = getattr(self, "_trained_estimator", None)
 | |
|         if estimator is None:
 | |
|             logger.warning(
 | |
|                 "No estimator is trained. Please run fit with enough budget."
 | |
|             )
 | |
|             return None
 | |
|         X = self._preprocess(X)
 | |
|         proba = self._trained_estimator.predict_proba(X, **pred_kwargs)
 | |
|         return proba
 | |
| 
 | |
    def _preprocess(self, X):
        """Transform raw prediction input into the representation used at fit time.

        Accepts a list (presumably text input for NLP tasks — columns are
        matched against the transformer's recorded string columns), an int
        (forecast step count, returned unchanged), a scipy sparse matrix,
        a numpy array, or a DataFrame. Applies the fitted DataTransformer
        when one exists.
        """
        if isinstance(X, List):
            try:
                if isinstance(X[0], List):
                    # List of rows -> transpose into a list of columns.
                    X = [x for x in zip(*X)]
                # Rebuild a DataFrame keyed by the string-column names the
                # DataTransformer recorded during fit. A flat list is treated
                # as a single instance (each element wrapped in a 1-item list).
                X = pd.DataFrame(
                    dict(
                        [
                            (self._transformer._str_columns[idx], X[idx])
                            if isinstance(X[0], List)
                            else (self._transformer._str_columns[idx], [X[idx]])
                            for idx in range(len(X))
                        ]
                    )
                )
            except IndexError:
                # More columns supplied than _str_columns recorded at fit time.
                raise IndexError(
                    "Test data contains more columns than training data, exiting"
                )
        elif isinstance(X, int):
            # Number of forecast steps (arima/sarimax) — nothing to transform.
            return X
        elif issparse(X):
            # Normalize sparse input to CSR for downstream estimators.
            X = X.tocsr()
        if self._state.task in TS_FORECAST:
            # Forecast estimators expect a DataFrame.
            X = pd.DataFrame(X)
        if self._transformer:
            X = self._transformer.transform(X)
        return X
 | |
| 
 | |
|     def _validate_ts_data(
 | |
|         self,
 | |
|         dataframe,
 | |
|         y_train_all=None,
 | |
|     ):
 | |
|         assert (
 | |
|             dataframe[dataframe.columns[0]].dtype.name == "datetime64[ns]"
 | |
|         ), f"For '{TS_FORECAST}' task, the first column must contain timestamp values."
 | |
|         if y_train_all is not None:
 | |
|             y_df = (
 | |
|                 pd.DataFrame(y_train_all)
 | |
|                 if isinstance(y_train_all, pd.Series)
 | |
|                 else pd.DataFrame(y_train_all, columns=["labels"])
 | |
|             )
 | |
|             dataframe = dataframe.join(y_df)
 | |
|         duplicates = dataframe.duplicated()
 | |
|         if any(duplicates):
 | |
|             logger.warning(
 | |
|                 "Duplicate timestamp values found in timestamp column. "
 | |
|                 f"\n{dataframe.loc[duplicates, dataframe][dataframe.columns[0]]}"
 | |
|             )
 | |
|             dataframe = dataframe.drop_duplicates()
 | |
|             logger.warning("Removed duplicate rows based on all columns")
 | |
|             assert (
 | |
|                 dataframe[[dataframe.columns[0]]].duplicated() is None
 | |
|             ), "Duplicate timestamp values with different values for other columns."
 | |
|         ts_series = pd.to_datetime(dataframe[dataframe.columns[0]])
 | |
|         inferred_freq = pd.infer_freq(ts_series)
 | |
|         if inferred_freq is None:
 | |
|             logger.warning(
 | |
|                 "Missing timestamps detected. To avoid error with estimators, set estimator list to ['prophet']. "
 | |
|             )
 | |
|         if y_train_all is not None:
 | |
|             return dataframe.iloc[:, :-1], dataframe.iloc[:, -1]
 | |
|         return dataframe
 | |
| 
 | |
    def _validate_data(
        self,
        X_train_all,
        y_train_all,
        dataframe,
        label,
        X_val=None,
        y_val=None,
        groups_val=None,
        groups=None,
    ):
        """Validate training/validation input and populate transformed state.

        Accepts either (X_train_all, y_train_all) or (dataframe, label).
        Side effects: sets self._df, self._nrow, self._ndim, self._transformer,
        self._label_transformer, self._X_train_all, self._y_train_all,
        self._feature_names_in_, self._sample_weight_full, and
        self._state.{X_val,y_val,groups,groups_val}.

        Raises:
            ValueError: when neither input pair is provided.
        """

        if X_train_all is not None and y_train_all is not None:
            # --- X/y form of the input ---
            assert (
                isinstance(X_train_all, np.ndarray)
                or issparse(X_train_all)
                or isinstance(X_train_all, pd.DataFrame)
            ), (
                "X_train_all must be a numpy array, a pandas dataframe, "
                "or Scipy sparse matrix."
            )
            assert isinstance(y_train_all, np.ndarray) or isinstance(
                y_train_all, pd.Series
            ), "y_train_all must be a numpy array or a pandas series."
            assert (
                X_train_all.size != 0 and y_train_all.size != 0
            ), "Input data must not be empty."
            # Promote a 1d feature array to a single-column 2d array.
            if isinstance(X_train_all, np.ndarray) and len(X_train_all.shape) == 1:
                X_train_all = np.reshape(X_train_all, (X_train_all.size, 1))
            if isinstance(y_train_all, np.ndarray):
                y_train_all = y_train_all.flatten()
            assert (
                X_train_all.shape[0] == y_train_all.shape[0]
            ), "# rows in X_train must match length of y_train."
            self._df = isinstance(X_train_all, pd.DataFrame)
            self._nrow, self._ndim = X_train_all.shape
            if self._state.task in TS_FORECAST:
                # Timestamp validation / de-duplication for forecasting tasks.
                X_train_all = pd.DataFrame(X_train_all)
                X_train_all, y_train_all = self._validate_ts_data(
                    X_train_all, y_train_all
                )
            X, y = X_train_all, y_train_all
        elif dataframe is not None and label is not None:
            # --- dataframe + label-column form of the input ---
            assert isinstance(
                dataframe, pd.DataFrame
            ), "dataframe must be a pandas DataFrame"
            assert label in dataframe.columns, "label must a column name in dataframe"
            self._df = True
            if self._state.task in TS_FORECAST:
                dataframe = self._validate_ts_data(dataframe)
            X = dataframe.drop(columns=label)
            self._nrow, self._ndim = X.shape
            y = dataframe[label]
        else:
            raise ValueError("either X_train+y_train or dataframe+label are required")

        # check the validity of input dimensions for NLP tasks, so need to check _is_nlp_task not estimator
        if _is_nlp_task(self._state.task):
            from .nlp.utils import is_a_list_of_str

            is_all_str = True
            is_all_list = True
            for column in X.columns:
                assert X[column].dtype.name in (
                    "object",
                    "string",
                ), "If the task is an NLP task, X can only contain text columns"
                for each_cell in X[column]:
                    if each_cell is not None:
                        # Classify the cell: raw string, token-id list, or
                        # string list; all cells must agree on one mode.
                        is_str = isinstance(each_cell, str)
                        is_list_of_int = isinstance(each_cell, list) and all(
                            isinstance(x, int) for x in each_cell
                        )
                        is_list_of_str = is_a_list_of_str(each_cell)
                        if self._state.task == TOKENCLASSIFICATION:
                            # NOTE(review): the assert message below is a tuple
                            # (trailing commas); it displays oddly but the
                            # assert condition itself is correct.
                            assert is_list_of_str, (
                                "For the token-classification task, the input column needs to be a list of string,"
                                "instead of string, e.g., ['EU', 'rejects','German', 'call','to','boycott','British','lamb','.',].",
                                "For more examples, please refer to test/nlp/test_autohf_tokenclassification.py",
                            )
                        else:
                            assert is_str or is_list_of_int, (
                                "Each column of the input must either be str (untokenized) "
                                "or a list of integers (tokenized)"
                            )
                        is_all_str &= is_str
                        is_all_list &= is_list_of_int or is_list_of_str
            assert is_all_str or is_all_list, (
                "Currently FLAML only supports two modes for NLP: either all columns of X are string (non-tokenized), "
                "or all columns of X are integer ids (tokenized)"
            )

        if issparse(X_train_all) or self._skip_transform:
            # Sparse input (or explicit opt-out): store data untransformed.
            self._transformer = self._label_transformer = False
            self._X_train_all, self._y_train_all = X, y
        else:
            from .data import DataTransformer

            self._transformer = DataTransformer()

            self._X_train_all, self._y_train_all = self._transformer.fit_transform(
                X, y, self._state.task
            )
            self._label_transformer = self._transformer.label_transformer
            if self._state.task == TOKENCLASSIFICATION:
                if hasattr(self._label_transformer, "label_list"):
                    self._state.fit_kwargs.update(
                        {"label_list": self._label_transformer.label_list}
                    )
                elif "label_list" not in self._state.fit_kwargs:
                    for each_fit_kwargs in self._state.fit_kwargs_by_estimator.values():
                        assert (
                            "label_list" in each_fit_kwargs
                        ), "For the token-classification task, you must either (1) pass token labels; or (2) pass id labels and the label list. "
                        # NOTE(review): the string literal below is a dangling
                        # no-op statement — it looks intended to be part of the
                        # assert message above but is not concatenated to it.
                        "Please refer to the documentation for more details: https://microsoft.github.io/FLAML/docs/Examples/AutoML-NLP#a-simple-token-classification-example"
            self._feature_names_in_ = (
                self._X_train_all.columns.to_list()
                if hasattr(self._X_train_all, "columns")
                else None
            )

        self._sample_weight_full = self._state.fit_kwargs.get(
            "sample_weight"
        )  # NOTE: _validate_data is before kwargs is updated to fit_kwargs_by_estimator
        if X_val is not None and y_val is not None:
            # --- optional user-provided validation split ---
            assert (
                isinstance(X_val, np.ndarray)
                or issparse(X_val)
                or isinstance(X_val, pd.DataFrame)
            ), (
                "X_val must be None, a numpy array, a pandas dataframe, "
                "or Scipy sparse matrix."
            )
            assert isinstance(y_val, np.ndarray) or isinstance(
                y_val, pd.Series
            ), "y_val must be None, a numpy array or a pandas series."
            assert X_val.size != 0 and y_val.size != 0, (
                "Validation data are expected to be nonempty. "
                "Use None for X_val and y_val if no validation data."
            )
            if isinstance(y_val, np.ndarray):
                y_val = y_val.flatten()
            assert (
                X_val.shape[0] == y_val.shape[0]
            ), "# rows in X_val must match length of y_val."
            if self._transformer:
                self._state.X_val = self._transformer.transform(X_val)
            else:
                self._state.X_val = X_val
            # If it's NLG_TASKS, y_val is a pandas series containing the output sequence tokens,
            # so we cannot use label_transformer.transform to process it
            if self._label_transformer:
                self._state.y_val = self._label_transformer.transform(y_val)
            else:
                self._state.y_val = y_val
        else:
            self._state.X_val = self._state.y_val = None
        if groups is not None and len(groups) != self._nrow:
            # groups is given as group counts
            self._state.groups = np.concatenate([[i] * c for i, c in enumerate(groups)])
            assert (
                len(self._state.groups) == self._nrow
            ), "the sum of group counts must match the number of examples"
            self._state.groups_val = (
                np.concatenate([[i] * c for i, c in enumerate(groups_val)])
                if groups_val is not None
                else None
            )
        else:
            # groups already given per-example (or absent): store as-is.
            self._state.groups_val = groups_val
            self._state.groups = groups
 | |
| 
 | |
|     def _prepare_data(self, eval_method, split_ratio, n_splits):
 | |
| 
 | |
|         X_val, y_val = self._state.X_val, self._state.y_val
 | |
|         if issparse(X_val):
 | |
|             X_val = X_val.tocsr()
 | |
|         X_train_all, y_train_all = self._X_train_all, self._y_train_all
 | |
|         if issparse(X_train_all):
 | |
|             X_train_all = X_train_all.tocsr()
 | |
|         if (
 | |
|             self._state.task in CLASSIFICATION
 | |
|             and self._auto_augment
 | |
|             and self._state.fit_kwargs.get("sample_weight")
 | |
|             is None  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|             and self._split_type in ["stratified", "uniform"]
 | |
|             and self._state.task != TOKENCLASSIFICATION
 | |
|         ):
 | |
|             # logger.info(f"label {pd.unique(y_train_all)}")
 | |
|             label_set, counts = np.unique(y_train_all, return_counts=True)
 | |
|             # augment rare classes
 | |
|             rare_threshld = 20
 | |
|             rare = counts < rare_threshld
 | |
|             rare_label, rare_counts = label_set[rare], counts[rare]
 | |
|             for i, label in enumerate(rare_label):
 | |
|                 count = rare_count = rare_counts[i]
 | |
|                 rare_index = y_train_all == label
 | |
|                 n = len(y_train_all)
 | |
|                 while count < rare_threshld:
 | |
|                     if self._df:
 | |
|                         X_train_all = concat(
 | |
|                             X_train_all, X_train_all.iloc[:n].loc[rare_index]
 | |
|                         )
 | |
|                     else:
 | |
|                         X_train_all = concat(
 | |
|                             X_train_all, X_train_all[:n][rare_index, :]
 | |
|                         )
 | |
|                     if isinstance(y_train_all, pd.Series):
 | |
|                         y_train_all = concat(
 | |
|                             y_train_all, y_train_all.iloc[:n].loc[rare_index]
 | |
|                         )
 | |
|                     else:
 | |
|                         y_train_all = np.concatenate(
 | |
|                             [y_train_all, y_train_all[:n][rare_index]]
 | |
|                         )
 | |
|                     count += rare_count
 | |
|                 logger.info(f"class {label} augmented from {rare_count} to {count}")
 | |
|         SHUFFLE_SPLIT_TYPES = ["uniform", "stratified"]
 | |
|         if self._split_type in SHUFFLE_SPLIT_TYPES:
 | |
|             if self._sample_weight_full is not None:
 | |
|                 X_train_all, y_train_all, self._state.sample_weight_all = shuffle(
 | |
|                     X_train_all,
 | |
|                     y_train_all,
 | |
|                     self._sample_weight_full,
 | |
|                     random_state=RANDOM_SEED,
 | |
|                 )
 | |
|                 self._state.fit_kwargs[
 | |
|                     "sample_weight"
 | |
|                 ] = (
 | |
|                     self._state.sample_weight_all
 | |
|                 )  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|             else:
 | |
|                 X_train_all, y_train_all = shuffle(
 | |
|                     X_train_all, y_train_all, random_state=RANDOM_SEED
 | |
|                 )
 | |
|             if self._df:
 | |
|                 X_train_all.reset_index(drop=True, inplace=True)
 | |
|                 if isinstance(y_train_all, pd.Series):
 | |
|                     y_train_all.reset_index(drop=True, inplace=True)
 | |
| 
 | |
|         X_train, y_train = X_train_all, y_train_all
 | |
|         self._state.groups_all = self._state.groups
 | |
|         if X_val is None and eval_method == "holdout":
 | |
|             # if eval_method = holdout, make holdout data
 | |
|             if self._split_type == "time":
 | |
|                 if self._state.task in TS_FORECAST:
 | |
|                     period = self._state.fit_kwargs[
 | |
|                         "period"
 | |
|                     ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                     if self._state.task == TS_FORECASTPANEL:
 | |
|                         X_train_all["time_idx"] -= X_train_all["time_idx"].min()
 | |
|                         X_train_all["time_idx"] = X_train_all["time_idx"].astype("int")
 | |
|                         ids = self._state.fit_kwargs["group_ids"].copy()
 | |
|                         ids.append(TS_TIMESTAMP_COL)
 | |
|                         ids.append("time_idx")
 | |
|                         y_train_all = pd.DataFrame(y_train_all)
 | |
|                         y_train_all[ids] = X_train_all[ids]
 | |
|                         X_train_all = X_train_all.sort_values(ids)
 | |
|                         y_train_all = y_train_all.sort_values(ids)
 | |
|                         training_cutoff = X_train_all["time_idx"].max() - period
 | |
|                         X_train = X_train_all[lambda x: x.time_idx <= training_cutoff]
 | |
|                         y_train = y_train_all[
 | |
|                             lambda x: x.time_idx <= training_cutoff
 | |
|                         ].drop(columns=ids)
 | |
|                         X_val = X_train_all[lambda x: x.time_idx > training_cutoff]
 | |
|                         y_val = y_train_all[
 | |
|                             lambda x: x.time_idx > training_cutoff
 | |
|                         ].drop(columns=ids)
 | |
|                     else:
 | |
|                         num_samples = X_train_all.shape[0]
 | |
|                         assert (
 | |
|                             period < num_samples
 | |
|                         ), f"period={period}>#examples={num_samples}"
 | |
|                         split_idx = num_samples - period
 | |
|                         X_train = X_train_all[:split_idx]
 | |
|                         y_train = y_train_all[:split_idx]
 | |
|                         X_val = X_train_all[split_idx:]
 | |
|                         y_val = y_train_all[split_idx:]
 | |
|                 else:
 | |
|                     if (
 | |
|                         "sample_weight" in self._state.fit_kwargs
 | |
|                     ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                         (
 | |
|                             X_train,
 | |
|                             X_val,
 | |
|                             y_train,
 | |
|                             y_val,
 | |
|                             self._state.fit_kwargs[
 | |
|                                 "sample_weight"
 | |
|                             ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                             self._state.weight_val,
 | |
|                         ) = train_test_split(
 | |
|                             X_train_all,
 | |
|                             y_train_all,
 | |
|                             self._state.fit_kwargs[
 | |
|                                 "sample_weight"
 | |
|                             ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                             test_size=split_ratio,
 | |
|                             shuffle=False,
 | |
|                         )
 | |
|                     else:
 | |
|                         X_train, X_val, y_train, y_val = train_test_split(
 | |
|                             X_train_all,
 | |
|                             y_train_all,
 | |
|                             test_size=split_ratio,
 | |
|                             shuffle=False,
 | |
|                         )
 | |
|             elif self._split_type == "group":
 | |
|                 gss = GroupShuffleSplit(
 | |
|                     n_splits=1, test_size=split_ratio, random_state=RANDOM_SEED
 | |
|                 )
 | |
|                 for train_idx, val_idx in gss.split(
 | |
|                     X_train_all, y_train_all, self._state.groups_all
 | |
|                 ):
 | |
|                     if self._df:
 | |
|                         X_train = X_train_all.iloc[train_idx]
 | |
|                         X_val = X_train_all.iloc[val_idx]
 | |
|                     else:
 | |
|                         X_train, X_val = X_train_all[train_idx], X_train_all[val_idx]
 | |
|                     y_train, y_val = y_train_all[train_idx], y_train_all[val_idx]
 | |
|                     self._state.groups = self._state.groups_all[train_idx]
 | |
|                     self._state.groups_val = self._state.groups_all[val_idx]
 | |
|             elif self._state.task in CLASSIFICATION:
 | |
|                 # for classification, make sure the labels are complete in both
 | |
|                 # training and validation data
 | |
|                 label_set, first = np.unique(y_train_all, return_index=True)
 | |
|                 rest = []
 | |
|                 last = 0
 | |
|                 first.sort()
 | |
|                 for i in range(len(first)):
 | |
|                     rest.extend(range(last, first[i]))
 | |
|                     last = first[i] + 1
 | |
|                 rest.extend(range(last, len(y_train_all)))
 | |
|                 X_first = X_train_all.iloc[first] if self._df else X_train_all[first]
 | |
|                 X_rest = X_train_all.iloc[rest] if self._df else X_train_all[rest]
 | |
|                 y_rest = y_train_all[rest]
 | |
|                 stratify = y_rest if self._split_type == "stratified" else None
 | |
|                 if (
 | |
|                     "sample_weight" in self._state.fit_kwargs
 | |
|                 ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                     (
 | |
|                         X_train,
 | |
|                         X_val,
 | |
|                         y_train,
 | |
|                         y_val,
 | |
|                         weight_train,
 | |
|                         weight_val,
 | |
|                     ) = train_test_split(
 | |
|                         X_rest,
 | |
|                         y_rest,
 | |
|                         self._state.fit_kwargs["sample_weight"][
 | |
|                             rest
 | |
|                         ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                         test_size=split_ratio,
 | |
|                         random_state=RANDOM_SEED,
 | |
|                     )
 | |
|                     weight1 = self._state.fit_kwargs["sample_weight"][
 | |
|                         first
 | |
|                     ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                     self._state.weight_val = concat(weight1, weight_val)
 | |
|                     self._state.fit_kwargs[
 | |
|                         "sample_weight"
 | |
|                     ] = concat(  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                         weight1, weight_train
 | |
|                     )
 | |
|                 else:
 | |
|                     X_train, X_val, y_train, y_val = train_test_split(
 | |
|                         X_rest,
 | |
|                         y_rest,
 | |
|                         test_size=split_ratio,
 | |
|                         stratify=stratify,
 | |
|                         random_state=RANDOM_SEED,
 | |
|                     )
 | |
|                 X_train = concat(X_first, X_train)
 | |
|                 y_train = (
 | |
|                     concat(label_set, y_train)
 | |
|                     if self._df
 | |
|                     else np.concatenate([label_set, y_train])
 | |
|                 )
 | |
|                 X_val = concat(X_first, X_val)
 | |
|                 y_val = (
 | |
|                     concat(label_set, y_val)
 | |
|                     if self._df
 | |
|                     else np.concatenate([label_set, y_val])
 | |
|                 )
 | |
|             elif self._state.task in REGRESSION:
 | |
|                 if (
 | |
|                     "sample_weight" in self._state.fit_kwargs
 | |
|                 ):  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                     (
 | |
|                         X_train,
 | |
|                         X_val,
 | |
|                         y_train,
 | |
|                         y_val,
 | |
|                         self._state.fit_kwargs[
 | |
|                             "sample_weight"
 | |
|                         ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                         self._state.weight_val,
 | |
|                     ) = train_test_split(
 | |
|                         X_train_all,
 | |
|                         y_train_all,
 | |
|                         self._state.fit_kwargs[
 | |
|                             "sample_weight"
 | |
|                         ],  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                         test_size=split_ratio,
 | |
|                         random_state=RANDOM_SEED,
 | |
|                     )
 | |
|                 else:
 | |
|                     X_train, X_val, y_train, y_val = train_test_split(
 | |
|                         X_train_all,
 | |
|                         y_train_all,
 | |
|                         test_size=split_ratio,
 | |
|                         random_state=RANDOM_SEED,
 | |
|                     )
 | |
|         self._state.data_size = X_train.shape
 | |
|         self.data_size_full = len(y_train_all)
 | |
|         self._state.X_train, self._state.y_train = X_train, y_train
 | |
|         self._state.X_val, self._state.y_val = X_val, y_val
 | |
|         self._state.X_train_all = X_train_all
 | |
|         self._state.y_train_all = y_train_all
 | |
|         if eval_method == "holdout":
 | |
|             self._state.kf = None
 | |
|             return
 | |
|         if self._split_type == "group":
 | |
|             # logger.info("Using GroupKFold")
 | |
|             assert (
 | |
|                 len(self._state.groups_all) == y_train_all.size
 | |
|             ), "the length of groups must match the number of examples"
 | |
|             assert (
 | |
|                 len(np.unique(self._state.groups_all)) >= n_splits
 | |
|             ), "the number of groups must be equal or larger than n_splits"
 | |
|             self._state.kf = GroupKFold(n_splits)
 | |
|         elif self._split_type == "stratified":
 | |
|             # logger.info("Using StratifiedKFold")
 | |
|             assert y_train_all.size >= n_splits, (
 | |
|                 f"{n_splits}-fold cross validation"
 | |
|                 f" requires input data with at least {n_splits} examples."
 | |
|             )
 | |
|             assert y_train_all.size >= 2 * n_splits, (
 | |
|                 f"{n_splits}-fold cross validation with metric=r2 "
 | |
|                 f"requires input data with at least {n_splits*2} examples."
 | |
|             )
 | |
|             self._state.kf = RepeatedStratifiedKFold(
 | |
|                 n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED
 | |
|             )
 | |
|         elif self._split_type == "time":
 | |
|             # logger.info("Using TimeSeriesSplit")
 | |
|             if (
 | |
|                 self._state.task in TS_FORECAST
 | |
|                 and self._state.task is not TS_FORECASTPANEL
 | |
|             ):
 | |
|                 period = self._state.fit_kwargs[
 | |
|                     "period"
 | |
|                 ]  # NOTE: _prepare_data is before kwargs is updated to fit_kwargs_by_estimator
 | |
|                 if period * (n_splits + 1) > y_train_all.size:
 | |
|                     n_splits = int(y_train_all.size / period - 1)
 | |
|                     assert n_splits >= 2, (
 | |
|                         f"cross validation for forecasting period={period}"
 | |
|                         f" requires input data with at least {3 * period} examples."
 | |
|                     )
 | |
|                     logger.info(f"Using nsplits={n_splits} due to data size limit.")
 | |
|                 self._state.kf = TimeSeriesSplit(n_splits=n_splits, test_size=period)
 | |
|             elif self._state.task is TS_FORECASTPANEL:
 | |
|                 n_groups = X_train.groupby(
 | |
|                     self._state.fit_kwargs.get("group_ids")
 | |
|                 ).ngroups
 | |
|                 period = self._state.fit_kwargs.get("period")
 | |
|                 self._state.kf = TimeSeriesSplit(
 | |
|                     n_splits=n_splits, test_size=period * n_groups
 | |
|                 )
 | |
|             else:
 | |
|                 self._state.kf = TimeSeriesSplit(n_splits=n_splits)
 | |
|         elif isinstance(self._split_type, str):
 | |
|             # logger.info("Using RepeatedKFold")
 | |
|             self._state.kf = RepeatedKFold(
 | |
|                 n_splits=n_splits, n_repeats=1, random_state=RANDOM_SEED
 | |
|             )
 | |
|         else:
 | |
|             # logger.info("Using splitter object")
 | |
|             self._state.kf = self._split_type
 | |
|         if isinstance(self._state.kf, GroupKFold):
 | |
|             # self._split_type is either "group" or a GroupKFold object
 | |
|             self._state.kf.groups = self._state.groups_all
 | |
| 
 | |
|     def add_learner(self, learner_name, learner_class):
 | |
|         """Add a customized learner.
 | |
| 
 | |
|         Args:
 | |
|             learner_name: A string of the learner's name.
 | |
|             learner_class: A subclass of flaml.model.BaseEstimator.
 | |
|         """
 | |
|         self._state.learner_classes[learner_name] = learner_class
 | |
| 
 | |
|     def get_estimator_from_log(self, log_file_name, record_id, task):
 | |
|         """Get the estimator from log file.
 | |
| 
 | |
|         Args:
 | |
|             log_file_name: A string of the log file name.
 | |
|             record_id: An integer of the record ID in the file,
 | |
|                 0 corresponds to the first trial.
 | |
|             task: A string of the task type,
 | |
|                 'binary', 'multiclass', 'regression', 'ts_forecast', 'rank'.
 | |
| 
 | |
|         Returns:
 | |
|             An estimator object for the given configuration.
 | |
|         """
 | |
| 
 | |
|         with training_log_reader(log_file_name) as reader:
 | |
|             record = reader.get_record(record_id)
 | |
|             estimator = record.learner
 | |
|             config = record.config
 | |
| 
 | |
|         estimator, _ = train_estimator(
 | |
|             X_train=None,
 | |
|             y_train=None,
 | |
|             config_dic=config,
 | |
|             task=task,
 | |
|             estimator_name=estimator,
 | |
|             estimator_class=self._state.learner_classes.get(estimator),
 | |
|             eval_metric="train_time",
 | |
|         )
 | |
|         return estimator
 | |
| 
 | |
    def retrain_from_log(
        self,
        log_file_name,
        X_train=None,
        y_train=None,
        dataframe=None,
        label=None,
        time_budget=np.inf,
        task=None,
        eval_method=None,
        split_ratio=None,
        n_splits=None,
        split_type=None,
        groups=None,
        n_jobs=-1,
        # gpu_per_trial=0,
        train_best=True,
        train_full=False,
        record_id=-1,
        auto_augment=None,
        custom_hp=None,
        skip_transform=None,
        preserve_checkpoint=True,
        fit_kwargs_by_estimator=None,
        **fit_kwargs,
    ):
        """Retrain from log file.

        This function is intended to retrain the logged configurations.
        NOTE: In some rare case, the last config is early stopped to meet time_budget and it's the best config.
        But the logged config's ITER_HP (e.g., n_estimators) is not reduced.

        Args:
            log_file_name: A string of the log file name.
            X_train: A numpy array or dataframe of training data in shape n*m.
                For time series forecast tasks, the first column of X_train
                must be the timestamp column (datetime type). Other
                columns in the dataframe are assumed to be exogenous
                variables (categorical or numeric).
            y_train: A numpy array or series of labels in shape n*1.
            dataframe: A dataframe of training data including label column.
                For time series forecast tasks, dataframe must be specified and should
                have at least two columns: timestamp and label, where the first
                column is the timestamp column (datetime type). Other columns
                in the dataframe are assumed to be exogenous variables
                (categorical or numeric).
            label: A str of the label column name, e.g., 'label';
                Note: If X_train and y_train are provided,
                dataframe and label are ignored;
                If not, dataframe and label must be provided.
            time_budget: A float number of the time budget in seconds.
            task: A string of the task type, e.g.,
                'classification', 'regression', 'ts_forecast', 'rank',
                'seq-classification', 'seq-regression', 'summarization'.
            eval_method: A string of resampling strategy, one of
                ['auto', 'cv', 'holdout'].
            split_ratio: A float of the validation data percentage for holdout.
            n_splits: An integer of the number of folds for cross-validation.
            split_type: str or splitter object, default="auto" | the data split type.
                * A valid splitter object is an instance of a derived class of scikit-learn
                [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
                and have ``split`` and ``get_n_splits`` methods with the same signatures.
                Set eval_method to "cv" to use the splitter object.
                * Valid str options depend on different tasks.
                For classification tasks, valid choices are
                    ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
                For regression tasks, valid choices are ["auto", 'uniform', 'time'].
                    "auto" -> uniform.
                For time series forecast tasks, must be "auto" or 'time'.
                For ranking task, must be "auto" or 'group'.
            groups: None or array-like | Group labels (with matching length to
                y_train) or groups counts (with sum equal to length of y_train)
                for training data.
            n_jobs: An integer of the number of threads for training | default=-1.
                Use all available resources when n_jobs == -1.
            train_best: A boolean of whether to train the best config in the
                time budget; if false, train the last config in the budget.
            train_full: A boolean of whether to train on the full data. If true,
                eval_method and sample_size in the log file will be ignored.
            record_id: the ID of the training log record from which the model will
                be retrained. By default `record_id = -1` which means this will be
                ignored. `record_id = 0` corresponds to the first trial, and
                when `record_id >= 0`, `time_budget` will be ignored.
            auto_augment: boolean, default=True | Whether to automatically
                augment rare classes.
            custom_hp: dict, default=None | The custom search space specified by user
                Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
                domain of the custom search space can either be a value or a sample.Domain object.

        ```python
        custom_hp = {
            "transformer_ms": {
                "model_path": {
                    "domain": "albert-base-v2",
                },
                "learning_rate": {
                    "domain": tune.choice([1e-4, 1e-5]),
                }
            }
        }
        ```
            fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
                e.g.,

        ```python
        fit_kwargs_by_estimator = {
            "transformer": {
                "output_dir": "test/data/output/",
                "fp16": False,
            }
        }
        ```

            **fit_kwargs: Other key word arguments to pass to fit() function of
                the searched learners, such as sample_weight. Below are a few examples of
                estimator-specific parameters:
                    period: int | forecast horizon for all time series forecast tasks.
                    gpu_per_trial: float, default = 0 | A float of the number of gpus per trial,
                        only used by TransformersEstimator, XGBoostSklearnEstimator, and
                        TemporalFusionTransformerEstimator.
                    group_ids: list of strings of column names identifying a time series, only
                        used by TemporalFusionTransformerEstimator, required for
                        'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object
                        from PyTorchForecasting.
                        For other parameters to describe your dataset, refer to
                        [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).
                        To specify your variables, use `static_categoricals`, `static_reals`,
                        `time_varying_known_categoricals`, `time_varying_known_reals`,
                        `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,
                        `variable_groups`. To provide more information on your data, use
                        `max_encoder_length`, `min_encoder_length`, `lags`.
                    log_dir: str, default = "lightning_logs" | Folder into which to log results
                        for tensorboard, only used by TemporalFusionTransformerEstimator.
                    max_epochs: int, default = 20 | Maximum number of epochs to run training,
                        only used by TemporalFusionTransformerEstimator.
                    batch_size: int, default = 64 | Batch size for training model, only
                        used by TemporalFusionTransformerEstimator.
        """
        # Any option the caller leaves unset falls back to the settings
        # captured when this AutoML instance was constructed.
        task = task or self._settings.get("task")
        eval_method = eval_method or self._settings.get("eval_method")
        split_ratio = split_ratio or self._settings.get("split_ratio")
        n_splits = n_splits or self._settings.get("n_splits")
        split_type = split_type or self._settings.get("split_type")
        auto_augment = (
            self._settings.get("auto_augment") if auto_augment is None else auto_augment
        )
        self._state.task = task
        self._estimator_type = "classifier" if task in CLASSIFICATION else "regressor"

        self._state.fit_kwargs = fit_kwargs
        self._state.custom_hp = custom_hp or self._settings.get("custom_hp")
        self._skip_transform = (
            self._settings.get("skip_transform")
            if skip_transform is None
            else skip_transform
        )
        self._state.fit_kwargs_by_estimator = (
            fit_kwargs_by_estimator or self._settings.get("fit_kwargs_by_estimator")
        )
        self.preserve_checkpoint = (
            self._settings.get("preserve_checkpoint")
            if preserve_checkpoint is None
            else preserve_checkpoint
        )
        self._validate_data(X_train, y_train, dataframe, label, groups=groups)

        logger.info("log file name {}".format(log_file_name))

        best_config = None
        best_val_loss = float("+inf")
        best_estimator = None
        sample_size = None
        time_used = 0.0
        training_duration = 0
        best = None
        # Select the record to retrain: either the explicitly requested
        # record_id, or a scan of all records logged within time_budget for
        # the best (train_best) / last (not train_best) configuration.
        with training_log_reader(log_file_name) as reader:
            if record_id >= 0:
                best = reader.get_record(record_id)
            else:
                for record in reader.records():
                    time_used = record.wall_clock_time
                    if time_used > time_budget:
                        break
                    training_duration = time_used
                    val_loss = record.validation_loss
                    if val_loss <= best_val_loss or not train_best:
                        if val_loss == best_val_loss and train_best:
                            # Tie on loss: prefer the record trained on more data.
                            # NOTE(review): if the very first record ties the
                            # initial best_val_loss (inf), sample_size is still
                            # None here and this comparison would raise —
                            # presumably logged losses are always finite; confirm.
                            size = record.sample_size
                            if size > sample_size:
                                best = record
                                best_val_loss = val_loss
                                sample_size = size
                        else:
                            best = record
                            size = record.sample_size
                            best_val_loss = val_loss
                            sample_size = size
                if not training_duration:
                    # No record finished within the budget: install a dummy
                    # estimator and bail out.
                    logger.warning(
                        f"No estimator found within time_budget={time_budget}"
                    )
                    from .model import BaseEstimator as Estimator

                    self._trained_estimator = Estimator()
                    return training_duration
        if not best:
            return
        best_estimator = best.learner
        best_config = best.config
        sample_size = len(self._y_train_all) if train_full else best.sample_size

        # Merge the per-call **fit_kwargs into the chosen estimator's entry of
        # fit_kwargs_by_estimator without mutating the user's original dict.
        this_estimator_kwargs = self._state.fit_kwargs_by_estimator.get(best_estimator)
        if this_estimator_kwargs:
            this_estimator_kwargs = (
                this_estimator_kwargs.copy()
            )  # make another shallow copy of the value (a dict obj), so user's fit_kwargs_by_estimator won't be updated
            this_estimator_kwargs.update(self._state.fit_kwargs)
            self._state.fit_kwargs_by_estimator[best_estimator] = this_estimator_kwargs
        else:
            self._state.fit_kwargs_by_estimator[best_estimator] = self._state.fit_kwargs

        logger.info(
            "estimator = {}, config = {}, #training instances = {}".format(
                best_estimator, best_config, sample_size
            )
        )
        # Partially copied from fit() function
        # Initilize some attributes required for retrain_from_log
        self._decide_split_type(split_type)
        eval_method = self._decide_eval_method(eval_method, time_budget)
        self.modelcount = 0
        self._auto_augment = auto_augment
        self._prepare_data(eval_method, split_ratio, n_splits)
        self._state.time_budget = None
        self._state.n_jobs = n_jobs
        import os

        # n_jobs < 0 means "use available resources": half the CPUs
        # (os.cpu_count() >> 1), floored at 1.
        self._state.resources_per_trial = (
            {
                "cpu": max(1, os.cpu_count() >> 1),
                "gpu": fit_kwargs.get("gpu_per_trial", 0),
            }
            if self._state.n_jobs < 0
            else {"cpu": self._state.n_jobs, "gpu": fit_kwargs.get("gpu_per_trial", 0)}
        )
        # Retrain the selected configuration and keep the fitted model.
        self._trained_estimator = self._state._train_with_config(
            best_estimator,
            best_config,
            sample_size=sample_size,
        )[0]
        logger.info("retrain from log succeeded")
        return training_duration
 | |
| 
 | |
|     def _decide_split_type(self, split_type):
 | |
|         if self._state.task == "classification":
 | |
|             self._state.task = get_classification_objective(
 | |
|                 len(np.unique(self._y_train_all))
 | |
|             )
 | |
|         if not isinstance(split_type, str):
 | |
|             assert hasattr(split_type, "split") and hasattr(
 | |
|                 split_type, "get_n_splits"
 | |
|             ), "split_type must be a string or a splitter object with split and get_n_splits methods."
 | |
|             assert (
 | |
|                 not isinstance(split_type, GroupKFold) or self._state.groups is not None
 | |
|             ), "GroupKFold requires groups to be provided."
 | |
|             self._split_type = split_type
 | |
|         elif self._state.task in CLASSIFICATION:
 | |
|             assert split_type in ["auto", "stratified", "uniform", "time", "group"]
 | |
|             self._split_type = (
 | |
|                 split_type
 | |
|                 if split_type != "auto"
 | |
|                 else self._state.groups is None and "stratified" or "group"
 | |
|             )
 | |
|         elif self._state.task in REGRESSION:
 | |
|             assert split_type in ["auto", "uniform", "time", "group"]
 | |
|             self._split_type = split_type if split_type != "auto" else "uniform"
 | |
|         elif self._state.task in TS_FORECAST:
 | |
|             assert split_type in ["auto", "time"]
 | |
|             self._split_type = "time"
 | |
|             assert isinstance(
 | |
|                 self._state.fit_kwargs.get("period"),
 | |
|                 int,  # NOTE: _decide_split_type is before kwargs is updated to fit_kwargs_by_estimator
 | |
|             ), f"missing a required integer 'period' for '{TS_FORECAST}' task."
 | |
|             if self._state.fit_kwargs.get("group_ids"):
 | |
|                 self._state.task == TS_FORECASTPANEL
 | |
|                 assert isinstance(
 | |
|                     self._state.fit_kwargs.get("group_ids"), list
 | |
|                 ), f"missing a required List[str] 'group_ids' for '{TS_FORECASTPANEL}' task."
 | |
|         elif self._state.task == "rank":
 | |
|             assert (
 | |
|                 self._state.groups is not None
 | |
|             ), "groups must be specified for ranking task."
 | |
|             assert split_type in ["auto", "group"]
 | |
|             self._split_type = "group"
 | |
|         elif self._state.task in NLG_TASKS:
 | |
|             assert split_type in ["auto", "uniform", "time", "group"]
 | |
|             self._split_type = split_type if split_type != "auto" else "uniform"
 | |
| 
 | |
|     def _decide_eval_method(self, eval_method, time_budget):
 | |
|         if not isinstance(self._split_type, str):
 | |
|             assert eval_method in [
 | |
|                 "auto",
 | |
|                 "cv",
 | |
|             ], "eval_method must be 'auto' or 'cv' for custom data splitter."
 | |
|             assert (
 | |
|                 self._state.X_val is None
 | |
|             ), "custom splitter and custom validation data can't be used together."
 | |
|             return "cv"
 | |
|         if self._state.X_val is not None:
 | |
|             assert eval_method in [
 | |
|                 "auto",
 | |
|                 "holdout",
 | |
|             ], "eval_method must be 'auto' or 'holdout' for custom validation data."
 | |
|             return "holdout"
 | |
|         if eval_method != "auto":
 | |
|             assert eval_method in [
 | |
|                 "holdout",
 | |
|                 "cv",
 | |
|             ], "eval_method must be 'holdout', 'cv' or 'auto'."
 | |
|             return eval_method
 | |
|         nrow, dim = self._nrow, self._ndim
 | |
|         if (
 | |
|             time_budget is None
 | |
|             or nrow * dim / 0.9 < SMALL_LARGE_THRES * (time_budget / 3600)
 | |
|             and nrow < CV_HOLDOUT_THRESHOLD
 | |
|         ):
 | |
|             # time allows or sampling can be used and cv is necessary
 | |
|             return "cv"
 | |
|         else:
 | |
|             return "holdout"
 | |
| 
 | |
|     @property
 | |
|     def search_space(self) -> dict:
 | |
|         """Search space.
 | |
| 
 | |
|         Must be called after fit(...)
 | |
|         (use max_iter=0 and retrain_final=False to prevent actual fitting).
 | |
| 
 | |
|         Returns:
 | |
|             A dict of the search space.
 | |
|         """
 | |
|         estimator_list = self.estimator_list
 | |
|         if len(estimator_list) == 1:
 | |
|             estimator = estimator_list[0]
 | |
|             space = self._search_states[estimator].search_space.copy()
 | |
|             space["learner"] = estimator
 | |
|             return space
 | |
|         choices = []
 | |
|         for estimator in estimator_list:
 | |
|             space = self._search_states[estimator].search_space.copy()
 | |
|             space["learner"] = estimator
 | |
|             choices.append(space)
 | |
|         return {"ml": tune.choice(choices)}
 | |
| 
 | |
|     @property
 | |
|     def low_cost_partial_config(self) -> dict:
 | |
|         """Low cost partial config.
 | |
| 
 | |
|         Returns:
 | |
|             A dict.
 | |
|             (a) if there is only one estimator in estimator_list, each key is a
 | |
|             hyperparameter name.
 | |
|             (b) otherwise, it is a nested dict with 'ml' as the key, and
 | |
|             a list of the low_cost_partial_configs as the value, corresponding
 | |
|             to each learner's low_cost_partial_config; the estimator index as
 | |
|             an integer corresponding to the cheapest learner is appended to the
 | |
|             list at the end.
 | |
|         """
 | |
|         if len(self.estimator_list) == 1:
 | |
|             estimator = self.estimator_list[0]
 | |
|             c = self._search_states[estimator].low_cost_partial_config
 | |
|             return c
 | |
|         else:
 | |
|             configs = []
 | |
|             for estimator in self.estimator_list:
 | |
|                 c = self._search_states[estimator].low_cost_partial_config
 | |
|                 configs.append(c)
 | |
|             configs.append(
 | |
|                 np.argmin(
 | |
|                     [
 | |
|                         self._state.learner_classes.get(estimator).cost_relative2lgbm()
 | |
|                         for estimator in self.estimator_list
 | |
|                     ]
 | |
|                 )
 | |
|             )
 | |
|             config = {"ml": configs}
 | |
|         return config
 | |
| 
 | |
|     @property
 | |
|     def cat_hp_cost(self) -> dict:
 | |
|         """Categorical hyperparameter cost
 | |
| 
 | |
|         Returns:
 | |
|             A dict.
 | |
|             (a) if there is only one estimator in estimator_list, each key is a
 | |
|             hyperparameter name.
 | |
|             (b) otherwise, it is a nested dict with 'ml' as the key, and
 | |
|             a list of the cat_hp_cost's as the value, corresponding
 | |
|             to each learner's cat_hp_cost; the cost relative to lgbm for each
 | |
|             learner (as a list itself) is appended to the list at the end.
 | |
|         """
 | |
|         if len(self.estimator_list) == 1:
 | |
|             estimator = self.estimator_list[0]
 | |
|             c = self._search_states[estimator].cat_hp_cost
 | |
|             return c
 | |
|         else:
 | |
|             configs = []
 | |
|             for estimator in self.estimator_list:
 | |
|                 c = self._search_states[estimator].cat_hp_cost
 | |
|                 configs.append(c)
 | |
|             configs.append(
 | |
|                 [
 | |
|                     self._state.learner_classes.get(estimator).cost_relative2lgbm()
 | |
|                     for estimator in self.estimator_list
 | |
|                 ]
 | |
|             )
 | |
|             config = {"ml": configs}
 | |
|         return config
 | |
| 
 | |
|     @property
 | |
|     def points_to_evaluate(self) -> dict:
 | |
|         """Initial points to evaluate.
 | |
| 
 | |
|         Returns:
 | |
|             A list of dicts. Each dict is the initial point for each learner.
 | |
|         """
 | |
|         points = []
 | |
|         for estimator in self.estimator_list:
 | |
|             if isinstance(self._search_states[estimator].init_config, list):
 | |
|                 configs = self._search_states[estimator].init_config
 | |
|             else:
 | |
|                 configs = [self._search_states[estimator].init_config]
 | |
|             for config in configs:
 | |
|                 config["learner"] = estimator
 | |
|                 if len(self.estimator_list) > 1:
 | |
|                     points.append({"ml": config})
 | |
|                 else:
 | |
|                     points.append(config)
 | |
|         return points
 | |
| 
 | |
|     @property
 | |
|     def resource_attr(self) -> Optional[str]:
 | |
|         """Attribute of the resource dimension.
 | |
| 
 | |
|         Returns:
 | |
|             A string for the sample size attribute
 | |
|             (the resource attribute in AutoML) or None.
 | |
|         """
 | |
|         return "FLAML_sample_size" if self._sample else None
 | |
| 
 | |
|     @property
 | |
|     def min_resource(self) -> Optional[float]:
 | |
|         """Attribute for pruning.
 | |
| 
 | |
|         Returns:
 | |
|             A float for the minimal sample size or None.
 | |
|         """
 | |
|         return self._min_sample_size if self._sample else None
 | |
| 
 | |
|     @property
 | |
|     def max_resource(self) -> Optional[float]:
 | |
|         """Attribute for pruning.
 | |
| 
 | |
|         Returns:
 | |
|             A float for the maximal sample size or None.
 | |
|         """
 | |
|         return self._state.data_size[0] if self._sample else None
 | |
| 
 | |
|     def pickle(self, output_file_name):
 | |
|         import pickle
 | |
| 
 | |
|         estimator_to_training_function = {}
 | |
|         for estimator in self.estimator_list:
 | |
|             search_state = self._search_states[estimator]
 | |
|             if hasattr(search_state, "training_function"):
 | |
|                 estimator_to_training_function[
 | |
|                     estimator
 | |
|                 ] = search_state.training_function
 | |
|                 del search_state.training_function
 | |
| 
 | |
|         with open(output_file_name, "wb") as f:
 | |
|             pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
 | |
| 
 | |
|     @property
 | |
|     def trainable(self) -> Callable[[dict], Optional[float]]:
 | |
|         """Training function.
 | |
|         Returns:
 | |
|             A function that evaluates each config and returns the loss.
 | |
|         """
 | |
|         self._state.time_from_start = 0
 | |
|         states = self._search_states
 | |
|         mem_res = self._mem_thres
 | |
| 
 | |
|         def train(config: dict, state):
 | |
| 
 | |
|             sample_size = config.get("FLAML_sample_size")
 | |
|             config = config.get("ml", config).copy()
 | |
|             if sample_size:
 | |
|                 config["FLAML_sample_size"] = sample_size
 | |
|             estimator = config["learner"]
 | |
|             # check memory constraints before training
 | |
|             if states[estimator].learner_class.size(config) <= mem_res:
 | |
|                 del config["learner"]
 | |
|                 result = AutoMLState._compute_with_config_base(
 | |
|                     config, state=state, estimator=estimator
 | |
|                 )
 | |
|             else:
 | |
|                 # If search algorithm is not in flaml, it does not handle the config constraint, should also tune.report before return
 | |
|                 result = {
 | |
|                     "pred_time": 0,
 | |
|                     "wall_clock_time": None,
 | |
|                     "metric_for_logging": np.inf,
 | |
|                     "val_loss": np.inf,
 | |
|                     "trained_estimator": None,
 | |
|                 }
 | |
|             tune.report(**result)
 | |
|             return result
 | |
| 
 | |
|         if self._use_ray is not False:
 | |
|             from ray.tune import with_parameters
 | |
| 
 | |
|             return with_parameters(
 | |
|                 train,
 | |
|                 state=self._state,
 | |
|             )
 | |
|         else:
 | |
|             return partial(
 | |
|                 train,
 | |
|                 state=self._state,
 | |
|             )
 | |
| 
 | |
|     @property
 | |
|     def metric_constraints(self) -> list:
 | |
|         """Metric constraints.
 | |
| 
 | |
|         Returns:
 | |
|             A list of the metric constraints.
 | |
|         """
 | |
|         return self._metric_constraints
 | |
| 
 | |
|     def fit(
 | |
|         self,
 | |
|         X_train=None,
 | |
|         y_train=None,
 | |
|         dataframe=None,
 | |
|         label=None,
 | |
|         metric=None,
 | |
|         task=None,
 | |
|         n_jobs=None,
 | |
|         # gpu_per_trial=0,
 | |
|         log_file_name=None,
 | |
|         estimator_list=None,
 | |
|         time_budget=None,
 | |
|         max_iter=None,
 | |
|         sample=None,
 | |
|         ensemble=None,
 | |
|         eval_method=None,
 | |
|         log_type=None,
 | |
|         model_history=None,
 | |
|         split_ratio=None,
 | |
|         n_splits=None,
 | |
|         log_training_metric=None,
 | |
|         mem_thres=None,
 | |
|         pred_time_limit=None,
 | |
|         train_time_limit=None,
 | |
|         X_val=None,
 | |
|         y_val=None,
 | |
|         sample_weight_val=None,
 | |
|         groups_val=None,
 | |
|         groups=None,
 | |
|         verbose=None,
 | |
|         retrain_full=None,
 | |
|         split_type=None,
 | |
|         learner_selector=None,
 | |
|         hpo_method=None,
 | |
|         starting_points=None,
 | |
|         seed=None,
 | |
|         n_concurrent_trials=None,
 | |
|         keep_search_state=None,
 | |
|         preserve_checkpoint=True,
 | |
|         early_stop=None,
 | |
|         append_log=None,
 | |
|         auto_augment=None,
 | |
|         min_sample_size=None,
 | |
|         use_ray=None,
 | |
|         metric_constraints=None,
 | |
|         custom_hp=None,
 | |
|         skip_transform=None,
 | |
|         fit_kwargs_by_estimator=None,
 | |
|         **fit_kwargs,
 | |
|     ):
 | |
|         """Find a model for a given task.
 | |
| 
 | |
|         Args:
 | |
|             X_train: A numpy array or a pandas dataframe of training data in
 | |
|                 shape (n, m). For time series forecsat tasks, the first column of X_train
 | |
|                 must be the timestamp column (datetime type). Other columns in
 | |
|                 the dataframe are assumed to be exogenous variables (categorical or numeric).
 | |
|                 When using ray, X_train can be a ray.ObjectRef.
 | |
|             y_train: A numpy array or a pandas series of labels in shape (n, ).
 | |
|             dataframe: A dataframe of training data including label column.
 | |
|                 For time series forecast tasks, dataframe must be specified and must have
 | |
|                 at least two columns, timestamp and label, where the first
 | |
|                 column is the timestamp column (datetime type). Other columns in
 | |
|                 the dataframe are assumed to be exogenous variables (categorical or numeric).
 | |
|                 When using ray, dataframe can be a ray.ObjectRef.
 | |
|             label: A str of the label column name for, e.g., 'label';
 | |
|                 Note: If X_train and y_train are provided,
 | |
|                 dataframe and label are ignored;
 | |
|                 If not, dataframe and label must be provided.
 | |
|             metric: A string of the metric name or a function,
 | |
|                 e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
 | |
|                 'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
 | |
|                 'mape'. Default is 'auto'.
 | |
|                 If passing a customized metric function, the function needs to
 | |
|                 have the following signature:
 | |
| 
 | |
|         ```python
 | |
|         def custom_metric(
 | |
|             X_test, y_test, estimator, labels,
 | |
|             X_train, y_train, weight_test=None, weight_train=None,
 | |
|             config=None, groups_test=None, groups_train=None,
 | |
|         ):
 | |
|             return metric_to_minimize, metrics_to_log
 | |
|         ```
 | |
|                 which returns a float number as the minimization objective,
 | |
|                 and a dictionary as the metrics to log. E.g.,
 | |
| 
 | |
|         ```python
 | |
|         def custom_metric(
 | |
|             X_val, y_val, estimator, labels,
 | |
|             X_train, y_train, weight_val=None, weight_train=None,
 | |
|             *args,
 | |
|         ):
 | |
|             from sklearn.metrics import log_loss
 | |
|             import time
 | |
| 
 | |
|             start = time.time()
 | |
|             y_pred = estimator.predict_proba(X_val)
 | |
|             pred_time = (time.time() - start) / len(X_val)
 | |
|             val_loss = log_loss(y_val, y_pred, labels=labels, sample_weight=weight_val)
 | |
|             y_pred = estimator.predict_proba(X_train)
 | |
|             train_loss = log_loss(y_train, y_pred, labels=labels, sample_weight=weight_train)
 | |
|             alpha = 0.5
 | |
|             return val_loss * (1 + alpha) - alpha * train_loss, {
 | |
|                 "val_loss": val_loss,
 | |
|                 "train_loss": train_loss,
 | |
|                 "pred_time": pred_time,
 | |
|             }
 | |
|         ```
 | |
|             task: A string of the task type, e.g.,
 | |
|                 'classification', 'regression', 'ts_forecast_regression',
 | |
|                 'ts_forecast_classification', 'ts_forecast_panel', 'rank', 'seq-classification',
 | |
|                 'seq-regression', 'summarization'.
 | |
|             n_jobs: An integer of the number of threads for training | default=-1.
 | |
|                 Use all available resources when n_jobs == -1.
 | |
|             log_file_name: A string of the log file name | default="". To disable logging,
 | |
|                 set it to be an empty string "".
 | |
|             estimator_list: A list of strings for estimator names, or 'auto'.
 | |
|                 e.g., ```['lgbm', 'xgboost', 'xgb_limitdepth', 'catboost', 'rf', 'extra_tree']```.
 | |
|             time_budget: A float number of the time budget in seconds.
 | |
|                 Use -1 if no time limit.
 | |
|             max_iter: An integer of the maximal number of iterations.
 | |
|                 NOTE: when both time_budget and max_iter are unspecified,
 | |
|                 only one model will be trained per estimator.
 | |
|             sample: A boolean of whether to sample the training data during
 | |
|                 search.
 | |
|             ensemble: boolean or dict | default=False. Whether to perform
 | |
|                 ensemble after search. Can be a dict with keys 'passthrough'
 | |
|                 and 'final_estimator' to specify the passthrough and
 | |
|                 final_estimator in the stacker. The dict can also contain
 | |
|                 'n_jobs' as the key to specify the number of jobs for the stacker.
 | |
|             eval_method: A string of resampling strategy, one of
 | |
|                 ['auto', 'cv', 'holdout'].
 | |
|             split_ratio: A float of the valiation data percentage for holdout.
 | |
|             n_splits: An integer of the number of folds for cross - validation.
 | |
|             log_type: A string of the log type, one of
 | |
|                 ['better', 'all'].
 | |
|                 'better' only logs configs with better loss than previos iters
 | |
|                 'all' logs all the tried configs.
 | |
|             model_history: A boolean of whether to keep the trained best
 | |
|                 model per estimator. Make sure memory is large enough if setting to True.
 | |
|                 Default value is False: best_model_for_estimator would return a
 | |
|                 untrained model for non-best learner.
 | |
|             log_training_metric: A boolean of whether to log the training
 | |
|                 metric for each model.
 | |
|             mem_thres: A float of the memory size constraint in bytes.
 | |
|             pred_time_limit: A float of the prediction latency constraint in seconds.
 | |
|                 It refers to the average prediction time per row in validation data.
 | |
|             train_time_limit: A float of the training time constraint in seconds.
 | |
|             X_val: None or a numpy array or a pandas dataframe of validation data.
 | |
|             y_val: None or a numpy array or a pandas series of validation labels.
 | |
|             sample_weight_val: None or a numpy array of the sample weight of
 | |
|                 validation data of the same shape as y_val.
 | |
|             groups_val: None or array-like | group labels (with matching length
 | |
|                 to y_val) or group counts (with sum equal to length of y_val)
 | |
|                 for validation data. Need to be consistent with groups.
 | |
|             groups: None or array-like | Group labels (with matching length to
 | |
|                 y_train) or groups counts (with sum equal to length of y_train)
 | |
|                 for training data.
 | |
|             verbose: int, default=3 | Controls the verbosity, higher means more
 | |
|                 messages.
 | |
|             retrain_full: bool or str, default=True | whether to retrain the
 | |
|                 selected model on the full training data when using holdout.
 | |
|                 True - retrain only after search finishes; False - no retraining;
 | |
|                 'budget' - do best effort to retrain without violating the time
 | |
|                 budget.
 | |
|             split_type: str or splitter object, default="auto" | the data split type.
 | |
|                 * A valid splitter object is an instance of a derived class of scikit-learn
 | |
|                 [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold)
 | |
|                 and have ``split`` and ``get_n_splits`` methods with the same signatures.
 | |
|                 Set eval_method to "cv" to use the splitter object.
 | |
|                 * Valid str options depend on different tasks.
 | |
|                 For classification tasks, valid choices are
 | |
|                     ["auto", 'stratified', 'uniform', 'time', 'group']. "auto" -> stratified.
 | |
|                 For regression tasks, valid choices are ["auto", 'uniform', 'time'].
 | |
|                     "auto" -> uniform.
 | |
|                 For time series forecast tasks, must be "auto" or 'time'.
 | |
|                 For ranking task, must be "auto" or 'group'.
 | |
|             hpo_method: str, default="auto" | The hyperparameter
 | |
|                 optimization method. By default, CFO is used for sequential
 | |
|                 search and BlendSearch is used for parallel search.
 | |
|                 No need to set when using flaml's default search space or using
 | |
|                 a simple customized search space. When set to 'bs', BlendSearch
 | |
|                 is used. BlendSearch can be tried when the search space is
 | |
|                 complex, for example, containing multiple disjoint, discontinuous
 | |
|                 subspaces. When set to 'random', random search is used.
 | |
|             starting_points: A dictionary or a str to specify the starting hyperparameter
 | |
|                 config for the estimators | default="data".
 | |
|                 If str:
 | |
|                     - if "data", use data-dependent defaults;
 | |
|                     - if "data:path" use data-dependent defaults which are stored at path;
 | |
|                     - if "static", use data-independent defaults.
 | |
|                 If dict, keys are the name of the estimators, and values are the starting
 | |
|                 hyperparamter configurations for the corresponding estimators.
 | |
|                 The value can be a single hyperparamter configuration dict or a list
 | |
|                 of hyperparamter configuration dicts.
 | |
|                 In the following code example, we get starting_points from the
 | |
|                 `automl` object and use them in the `new_automl` object.
 | |
|                 e.g.,
 | |
| 
 | |
|         ```python
 | |
|         from flaml import AutoML
 | |
|         automl = AutoML()
 | |
|         X_train, y_train = load_iris(return_X_y=True)
 | |
|         automl.fit(X_train, y_train)
 | |
|         starting_points = automl.best_config_per_estimator
 | |
| 
 | |
|         new_automl = AutoML()
 | |
|         new_automl.fit(X_train, y_train, starting_points=starting_points)
 | |
|         ```
 | |
| 
 | |
|             seed: int or None, default=None | The random seed for hpo.
 | |
|             n_concurrent_trials: [Experimental] int, default=1 | The number of
 | |
|                 concurrent trials. When n_concurrent_trials > 1, flaml performes
 | |
|                 [parallel tuning](../Use-Cases/Task-Oriented-AutoML#parallel-tuning)
 | |
|                 and installation of ray is required: `pip install flaml[ray]`.
 | |
|             keep_search_state: boolean, default=False | Whether to keep data needed
 | |
|                 for model search after fit(). By default the state is deleted for
 | |
|                 space saving.
 | |
|             preserve_checkpoint: boolean, default=True | Whether to preserve the saved checkpoint
 | |
|                 on disk when deleting automl. By default the checkpoint is preserved.
 | |
|             early_stop: boolean, default=False | Whether to stop early if the
 | |
|                 search is considered to converge.
 | |
|             append_log: boolean, default=False | Whetehr to directly append the log
 | |
|                 records to the input log file if it exists.
 | |
|             auto_augment: boolean, default=True | Whether to automatically
 | |
|                 augment rare classes.
 | |
|             min_sample_size: int, default=MIN_SAMPLE_TRAIN | the minimal sample
 | |
|                 size when sample=True.
 | |
|             use_ray: boolean or dict.
 | |
|                 If boolean: default=False | Whether to use ray to run the training
 | |
|                 in separate processes. This can be used to prevent OOM for large
 | |
|                 datasets, but will incur more overhead in time.
 | |
|                 If dict: the dict contains the keywords arguments to be passed to
 | |
|                 [ray.tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html).
 | |
|             metric_constraints: list, default=[] | The list of metric constraints.
 | |
|                 Each element in this list is a 3-tuple, which shall be expressed
 | |
|                 in the following format: the first element of the 3-tuple is the name of the
 | |
|                 metric, the second element is the inequality sign chosen from ">=" and "<=",
 | |
|                 and the third element is the constraint value. E.g., `('precision', '>=', 0.9)`.
 | |
|                 Note that all the metric names in metric_constraints need to be reported via
 | |
|                 the metrics_to_log dictionary returned by a customized metric function.
 | |
|                 The customized metric function shall be provided via the `metric` key word argument
 | |
|                 of the fit() function or the automl constructor.
 | |
|                 Find examples in this [test](https://github.com/microsoft/FLAML/tree/main/test/automl/test_constraints.py).
 | |
|                 If `pred_time_limit` is provided as one of keyword arguments to fit() function or
 | |
|                 the automl constructor, flaml will automatically (and under the hood)
 | |
|                 add it as an additional element in the metric_constraints. Essentially 'pred_time_limit'
 | |
|                 specifies a constraint about the prediction latency constraint in seconds.
 | |
|             custom_hp: dict, default=None | The custom search space specified by user
 | |
|                 Each key is the estimator name, each value is a dict of the custom search space for that estimator. Notice the
 | |
|                 domain of the custom search space can either be a value of a sample.Domain object.
 | |
| 
 | |
| 
 | |
| 
 | |
|         ```python
 | |
|         custom_hp = {
 | |
|             "transformer_ms": {
 | |
|                 "model_path": {
 | |
|                     "domain": "albert-base-v2",
 | |
|                 },
 | |
|                 "learning_rate": {
 | |
|                     "domain": tune.choice([1e-4, 1e-5]),
 | |
|                 }
 | |
|             }
 | |
|         }
 | |
|         ```
 | |
| 
 | |
|         skip_transform: boolean, default=False | Whether to pre-process data prior to modeling.
 | |
|         fit_kwargs_by_estimator: dict, default=None | The user specified keywords arguments, grouped by estimator name.
 | |
|                 For TransformersEstimator, available fit_kwargs can be found from
 | |
|                 [TrainingArgumentsForAuto](nlp/huggingface/training_args).
 | |
|                 e.g.,
 | |
| 
 | |
|         ```python
 | |
|         fit_kwargs_by_estimator = {
 | |
|             "transformer": {
 | |
|                 "output_dir": "test/data/output/",
 | |
|                 "fp16": False,
 | |
|             },
 | |
|             "tft": {
 | |
|                 "max_encoder_length": 1,
 | |
|                 "min_encoder_length": 1,
 | |
|                 "static_categoricals": [],
 | |
|                 "static_reals": [],
 | |
|                 "time_varying_known_categoricals": [],
 | |
|                 "time_varying_known_reals": [],
 | |
|                 "time_varying_unknown_categoricals": [],
 | |
|                 "time_varying_unknown_reals": [],
 | |
|                 "variable_groups": {},
 | |
|                 "lags": {},
 | |
|             }
 | |
|         }
 | |
|         ```
 | |
| 
 | |
|             **fit_kwargs: Other key word arguments to pass to fit() function of
 | |
|                 the searched learners, such as sample_weight. Below are a few examples of
 | |
|                 estimator-specific parameters:
 | |
|                     period: int | forecast horizon for all time series forecast tasks.
 | |
|                     gpu_per_trial: float, default = 0 | A float of the number of gpus per trial,
 | |
|                         only used by TransformersEstimator, XGBoostSklearnEstimator, and
 | |
|                         TemporalFusionTransformerEstimator.
 | |
|                     group_ids: list of strings of column names identifying a time series, only
 | |
|                         used by TemporalFusionTransformerEstimator, required for
 | |
|                         'ts_forecast_panel' task. `group_ids` is a parameter for TimeSeriesDataSet object
 | |
|                         from PyTorchForecasting.
 | |
|                         For other parameters to describe your dataset, refer to
 | |
|                         [TimeSeriesDataSet PyTorchForecasting](https://pytorch-forecasting.readthedocs.io/en/stable/api/pytorch_forecasting.data.timeseries.TimeSeriesDataSet.html).
 | |
|                         To specify your variables, use `static_categoricals`, `static_reals`,
 | |
|                         `time_varying_known_categoricals`, `time_varying_known_reals`,
 | |
|                         `time_varying_unknown_categoricals`, `time_varying_unknown_reals`,
 | |
|                         `variable_groups`. To provide more information on your data, use
 | |
|                         `max_encoder_length`, `min_encoder_length`, `lags`.
 | |
|                     log_dir: str, default = "lightning_logs" | Folder into which to log results
 | |
|                         for tensorboard, only used by TemporalFusionTransformerEstimator.
 | |
|                     max_epochs: int, default = 20 | Maximum number of epochs to run training,
 | |
|                         only used by TemporalFusionTransformerEstimator.
 | |
|                     batch_size: int, default = 64 | Batch size for training model, only
 | |
|                         used by TemporalFusionTransformerEstimator.
 | |
|         """
 | |
| 
 | |
|         self._state._start_time_flag = self._start_time_flag = time.time()
 | |
|         task = task or self._settings.get("task")
 | |
|         self._estimator_type = "classifier" if task in CLASSIFICATION else "regressor"
 | |
|         time_budget = time_budget or self._settings.get("time_budget")
 | |
|         n_jobs = n_jobs or self._settings.get("n_jobs")
 | |
|         gpu_per_trial = fit_kwargs.get("gpu_per_trial", 0)
 | |
|         eval_method = eval_method or self._settings.get("eval_method")
 | |
|         split_ratio = split_ratio or self._settings.get("split_ratio")
 | |
|         n_splits = n_splits or self._settings.get("n_splits")
 | |
|         auto_augment = (
 | |
|             self._settings.get("auto_augment") if auto_augment is None else auto_augment
 | |
|         )
 | |
|         metric = metric or self._settings.get("metric")
 | |
|         estimator_list = estimator_list or self._settings.get("estimator_list")
 | |
|         log_file_name = (
 | |
|             self._settings.get("log_file_name")
 | |
|             if log_file_name is None
 | |
|             else log_file_name
 | |
|         )
 | |
|         max_iter = self._settings.get("max_iter") if max_iter is None else max_iter
 | |
|         sample_is_none = sample is None
 | |
|         if sample_is_none:
 | |
|             sample = self._settings.get("sample")
 | |
|         ensemble = self._settings.get("ensemble") if ensemble is None else ensemble
 | |
|         log_type = log_type or self._settings.get("log_type")
 | |
|         model_history = (
 | |
|             self._settings.get("model_history")
 | |
|             if model_history is None
 | |
|             else model_history
 | |
|         )
 | |
|         log_training_metric = (
 | |
|             self._settings.get("log_training_metric")
 | |
|             if log_training_metric is None
 | |
|             else log_training_metric
 | |
|         )
 | |
|         mem_thres = mem_thres or self._settings.get("mem_thres")
 | |
|         pred_time_limit = pred_time_limit or self._settings.get("pred_time_limit")
 | |
|         train_time_limit = train_time_limit or self._settings.get("train_time_limit")
 | |
|         self._metric_constraints = metric_constraints or self._settings.get(
 | |
|             "metric_constraints"
 | |
|         )
 | |
|         if np.isfinite(pred_time_limit):
 | |
|             self._metric_constraints.append(("pred_time", "<=", pred_time_limit))
 | |
|         verbose = self._settings.get("verbose") if verbose is None else verbose
 | |
|         retrain_full = (
 | |
|             self._settings.get("retrain_full") if retrain_full is None else retrain_full
 | |
|         )
 | |
|         split_type = split_type or self._settings.get("split_type")
 | |
|         hpo_method = hpo_method or self._settings.get("hpo_method")
 | |
|         learner_selector = learner_selector or self._settings.get("learner_selector")
 | |
|         no_starting_points = starting_points is None
 | |
|         if no_starting_points:
 | |
|             starting_points = self._settings.get("starting_points")
 | |
|         n_concurrent_trials = n_concurrent_trials or self._settings.get(
 | |
|             "n_concurrent_trials"
 | |
|         )
 | |
|         keep_search_state = (
 | |
|             self._settings.get("keep_search_state")
 | |
|             if keep_search_state is None
 | |
|             else keep_search_state
 | |
|         )
 | |
|         self.preserve_checkpoint = (
 | |
|             self._settings.get("preserve_checkpoint")
 | |
|             if preserve_checkpoint is None
 | |
|             else preserve_checkpoint
 | |
|         )
 | |
|         early_stop = (
 | |
|             self._settings.get("early_stop") if early_stop is None else early_stop
 | |
|         )
 | |
|         # no search budget is provided?
 | |
|         no_budget = time_budget == -1 and max_iter is None and not early_stop
 | |
|         append_log = (
 | |
|             self._settings.get("append_log") if append_log is None else append_log
 | |
|         )
 | |
|         min_sample_size = min_sample_size or self._settings.get("min_sample_size")
 | |
|         use_ray = self._settings.get("use_ray") if use_ray is None else use_ray
 | |
|         self._state.n_jobs = n_jobs
 | |
|         self._n_concurrent_trials = n_concurrent_trials
 | |
|         self._early_stop = early_stop
 | |
|         self._use_ray = use_ray or n_concurrent_trials > 1
 | |
|         # use the following condition if we have an estimation of average_trial_time and average_trial_overhead
 | |
|         # self._use_ray = use_ray or n_concurrent_trials > ( average_trail_time + average_trial_overhead) / (average_trial_time)
 | |
|         if self._use_ray is not False:
 | |
|             import ray
 | |
| 
 | |
|             n_cpus = (
 | |
|                 ray.is_initialized()
 | |
|                 and ray.available_resources()["CPU"]
 | |
|                 or os.cpu_count()
 | |
|             )
 | |
| 
 | |
|             self._state.resources_per_trial = (
 | |
|                 # when using gpu, default cpu is 1 per job; otherwise, default cpu is n_cpus / n_concurrent_trials
 | |
|                 (
 | |
|                     {
 | |
|                         "cpu": max(int((n_cpus - 2) / 2 / n_concurrent_trials), 1),
 | |
|                         "gpu": gpu_per_trial,
 | |
|                     }
 | |
|                     if gpu_per_trial == 0
 | |
|                     else {"cpu": 1, "gpu": gpu_per_trial}
 | |
|                 )
 | |
|                 if n_jobs < 0
 | |
|                 else {"cpu": n_jobs, "gpu": gpu_per_trial}
 | |
|             )
 | |
| 
 | |
|             if isinstance(X_train, ray.ObjectRef):
 | |
|                 X_train = ray.get(X_train)
 | |
|             elif isinstance(dataframe, ray.ObjectRef):
 | |
|                 dataframe = ray.get(dataframe)
 | |
| 
 | |
|         self._state.task = task
 | |
|         self._state.log_training_metric = log_training_metric
 | |
| 
 | |
|         self._state.fit_kwargs = fit_kwargs
 | |
|         custom_hp = custom_hp or self._settings.get("custom_hp")
 | |
|         self._skip_transform = (
 | |
|             self._settings.get("skip_transform")
 | |
|             if skip_transform is None
 | |
|             else skip_transform
 | |
|         )
 | |
|         fit_kwargs_by_estimator = fit_kwargs_by_estimator or self._settings.get(
 | |
|             "fit_kwargs_by_estimator"
 | |
|         )
 | |
|         self._state.fit_kwargs_by_estimator = (
 | |
|             fit_kwargs_by_estimator.copy()
 | |
|         )  # shallow copy of fit_kwargs_by_estimator
 | |
|         self._state.weight_val = sample_weight_val
 | |
| 
 | |
|         self._validate_data(
 | |
|             X_train, y_train, dataframe, label, X_val, y_val, groups_val, groups
 | |
|         )
 | |
|         self._search_states = {}  # key: estimator name; value: SearchState
 | |
|         self._random = np.random.RandomState(RANDOM_SEED)
 | |
|         self._seed = seed if seed is not None else 20
 | |
|         self._learner_selector = learner_selector
 | |
|         old_level = logger.getEffectiveLevel()
 | |
|         self.verbose = verbose
 | |
|         logger.setLevel(50 - verbose * 10)
 | |
|         if not logger.handlers:
 | |
|             # Add the console handler.
 | |
|             _ch = logging.StreamHandler()
 | |
|             _ch.setFormatter(logger_formatter)
 | |
|             logger.addHandler(_ch)
 | |
|         logger.info(f"task = {task}")
 | |
|         self._decide_split_type(split_type)
 | |
|         logger.info(f"Data split method: {self._split_type}")
 | |
|         eval_method = self._decide_eval_method(eval_method, time_budget)
 | |
|         self._state.eval_method = eval_method
 | |
|         logger.info("Evaluation method: {}".format(eval_method))
 | |
| 
 | |
|         self._retrain_in_budget = retrain_full == "budget" and (
 | |
|             eval_method == "holdout" and self._state.X_val is None
 | |
|         )
 | |
|         self._auto_augment = auto_augment
 | |
| 
 | |
|         _sample_size_from_starting_points = {}
 | |
|         if isinstance(starting_points, dict):
 | |
|             for _estimator, _point_per_estimator in starting_points.items():
 | |
|                 sample_size = (
 | |
|                     _point_per_estimator
 | |
|                     and isinstance(_point_per_estimator, dict)
 | |
|                     and _point_per_estimator.get("FLAML_sample_size")
 | |
|                 )
 | |
|                 if sample_size:
 | |
|                     _sample_size_from_starting_points[_estimator] = sample_size
 | |
|                 elif _point_per_estimator and isinstance(_point_per_estimator, list):
 | |
|                     _sample_size_set = set(
 | |
|                         [
 | |
|                             config["FLAML_sample_size"]
 | |
|                             for config in _point_per_estimator
 | |
|                             if "FLAML_sample_size" in config
 | |
|                         ]
 | |
|                     )
 | |
|                     if _sample_size_set:
 | |
|                         _sample_size_from_starting_points[_estimator] = min(
 | |
|                             _sample_size_set
 | |
|                         )
 | |
|                     if len(_sample_size_set) > 1:
 | |
|                         logger.warning(
 | |
|                             "Using the min FLAML_sample_size of all the provided starting points for estimator {}. (Provided FLAML_sample_size are: {})".format(
 | |
|                                 _estimator, _sample_size_set
 | |
|                             )
 | |
|                         )
 | |
| 
 | |
|         if not sample and isinstance(starting_points, dict):
 | |
|             assert (
 | |
|                 not _sample_size_from_starting_points
 | |
|             ), "When subsampling is disabled, do not include FLAML_sample_size in the starting point."
 | |
|         self._min_sample_size = _sample_size_from_starting_points or min_sample_size
 | |
|         self._min_sample_size_input = min_sample_size
 | |
|         self._prepare_data(eval_method, split_ratio, n_splits)
 | |
| 
 | |
|         if isinstance(self._min_sample_size, dict):
 | |
|             self._sample = {
 | |
|                 (
 | |
|                     k,
 | |
|                     sample
 | |
|                     and task != "rank"
 | |
|                     and eval_method != "cv"
 | |
|                     and (
 | |
|                         self._min_sample_size[k] * SAMPLE_MULTIPLY_FACTOR
 | |
|                         < self._state.data_size[0]
 | |
|                     ),
 | |
|                 )
 | |
|                 for k in self._min_sample_size.keys()
 | |
|             }
 | |
|         else:
 | |
|             self._sample = (
 | |
|                 sample
 | |
|                 and task != "rank"
 | |
|                 and eval_method != "cv"
 | |
|                 and (
 | |
|                     self._min_sample_size * SAMPLE_MULTIPLY_FACTOR
 | |
|                     < self._state.data_size[0]
 | |
|                 )
 | |
|             )
 | |
|         if "auto" == metric:
 | |
|             if _is_nlp_task(self._state.task):
 | |
|                 from .nlp.utils import load_default_huggingface_metric_for_task
 | |
| 
 | |
|                 metric = load_default_huggingface_metric_for_task(self._state.task)
 | |
|             elif "binary" in self._state.task:
 | |
|                 metric = "roc_auc"
 | |
|             elif "multiclass" in self._state.task:
 | |
|                 metric = "log_loss"
 | |
|             elif self._state.task in TS_FORECAST:
 | |
|                 metric = "mape"
 | |
|             elif self._state.task == "rank":
 | |
|                 metric = "ndcg"
 | |
|             else:
 | |
|                 metric = "r2"
 | |
| 
 | |
|         self._state.metric = metric
 | |
| 
 | |
|         def is_to_reverse_metric(metric, task):
 | |
|             if metric.startswith("ndcg"):
 | |
|                 return True, f"1-{metric}"
 | |
|             if metric in [
 | |
|                 "r2",
 | |
|                 "accuracy",
 | |
|                 "roc_auc",
 | |
|                 "roc_auc_ovr",
 | |
|                 "roc_auc_ovo",
 | |
|                 "f1",
 | |
|                 "ap",
 | |
|                 "micro_f1",
 | |
|                 "macro_f1",
 | |
|             ]:
 | |
|                 return True, f"1-{metric}"
 | |
|             if _is_nlp_task(task):
 | |
|                 from .ml import huggingface_metric_to_mode
 | |
| 
 | |
|                 if (
 | |
|                     metric in huggingface_metric_to_mode
 | |
|                     and huggingface_metric_to_mode[metric] == "max"
 | |
|                 ):
 | |
|                     return True, f"-{metric}"
 | |
|             return False, None
 | |
| 
 | |
|         if isinstance(metric, str):
 | |
|             is_reverse, reverse_metric = is_to_reverse_metric(metric, task)
 | |
|             if is_reverse:
 | |
|                 error_metric = reverse_metric
 | |
|             else:
 | |
|                 error_metric = metric
 | |
|         else:
 | |
|             error_metric = "customized metric"
 | |
|         logger.info(f"Minimizing error metric: {error_metric}")
 | |
| 
 | |
|         if "auto" == estimator_list:
 | |
|             if self._state.task == "rank":
 | |
|                 estimator_list = ["lgbm", "xgboost", "xgb_limitdepth"]
 | |
|             elif _is_nlp_task(self._state.task):
 | |
|                 estimator_list = ["transformer"]
 | |
|             elif self._state.task == TS_FORECASTPANEL:
 | |
|                 estimator_list = ["tft"]
 | |
|             else:
 | |
|                 try:
 | |
|                     import catboost
 | |
| 
 | |
|                     estimator_list = [
 | |
|                         "lgbm",
 | |
|                         "rf",
 | |
|                         "catboost",
 | |
|                         "xgboost",
 | |
|                         "extra_tree",
 | |
|                         "xgb_limitdepth",
 | |
|                     ]
 | |
|                 except ImportError:
 | |
|                     estimator_list = [
 | |
|                         "lgbm",
 | |
|                         "rf",
 | |
|                         "xgboost",
 | |
|                         "extra_tree",
 | |
|                         "xgb_limitdepth",
 | |
|                     ]
 | |
|                 if self._state.task in TS_FORECAST:
 | |
|                     # catboost is removed because it has a `name` parameter, making it incompatible with hcrystalball
 | |
|                     if "catboost" in estimator_list:
 | |
|                         estimator_list.remove("catboost")
 | |
|                     if self._state.task in TS_FORECASTREGRESSION:
 | |
|                         try:
 | |
|                             import prophet
 | |
| 
 | |
|                             estimator_list += ["prophet", "arima", "sarimax"]
 | |
|                         except ImportError:
 | |
|                             estimator_list += ["arima", "sarimax"]
 | |
|                 elif "regression" != self._state.task:
 | |
|                     estimator_list += ["lrl1"]
 | |
|         # When no search budget is specified
 | |
|         if no_budget:
 | |
|             max_iter = len(estimator_list)
 | |
|             self._learner_selector = "roundrobin"
 | |
|             if sample_is_none:
 | |
|                 self._sample = False
 | |
|             if no_starting_points:
 | |
|                 starting_points = "data"
 | |
|             logger.warning(
 | |
|                 "No search budget is provided via time_budget or max_iter."
 | |
|                 " Training only one model per estimator."
 | |
|                 " To tune hyperparameters for each estimator,"
 | |
|                 " please provide budget either via time_budget or max_iter."
 | |
|             )
 | |
|         elif max_iter is None:
 | |
|             # set to a large number
 | |
|             max_iter = 1000000
 | |
|         self._state.retrain_final = (
 | |
|             retrain_full is True
 | |
|             and eval_method == "holdout"
 | |
|             and (X_val is None or self._use_ray is not False)
 | |
|             or eval_method == "cv"
 | |
|             and (max_iter > 0 or retrain_full is True)
 | |
|             or max_iter == 1
 | |
|         )
 | |
|         # add custom learner
 | |
|         for estimator_name in estimator_list:
 | |
|             if estimator_name not in self._state.learner_classes:
 | |
|                 self.add_learner(
 | |
|                     estimator_name,
 | |
|                     get_estimator_class(self._state.task, estimator_name),
 | |
|                 )
 | |
|         # set up learner search space
 | |
|         if isinstance(starting_points, str) and starting_points.startswith("data"):
 | |
|             from flaml.default import suggest_config
 | |
| 
 | |
|             location = starting_points[5:]
 | |
|             starting_points = {}
 | |
|             for estimator_name in estimator_list:
 | |
|                 try:
 | |
|                     configs = suggest_config(
 | |
|                         self._state.task,
 | |
|                         self._X_train_all,
 | |
|                         self._y_train_all,
 | |
|                         estimator_name,
 | |
|                         location,
 | |
|                         k=1,
 | |
|                     )
 | |
|                     starting_points[estimator_name] = [
 | |
|                         x["hyperparameters"] for x in configs
 | |
|                     ]
 | |
|                 except FileNotFoundError:
 | |
|                     pass
 | |
|             try:
 | |
|                 learner = suggest_learner(
 | |
|                     self._state.task,
 | |
|                     self._X_train_all,
 | |
|                     self._y_train_all,
 | |
|                     estimator_list=estimator_list,
 | |
|                     location=location,
 | |
|                 )
 | |
|                 if learner != estimator_list[0]:
 | |
|                     estimator_list.remove(learner)
 | |
|                     estimator_list.insert(0, learner)
 | |
|             except FileNotFoundError:
 | |
|                 pass
 | |
| 
 | |
|         starting_points = {} if starting_points == "static" else starting_points
 | |
| 
 | |
|         for estimator_name in estimator_list:
 | |
|             estimator_class = self._state.learner_classes[estimator_name]
 | |
|             estimator_class.init()
 | |
|             this_estimator_kwargs = self._state.fit_kwargs_by_estimator.get(
 | |
|                 estimator_name
 | |
|             )
 | |
|             if this_estimator_kwargs:
 | |
|                 # make another shallow copy of the value (a dict obj), so user's fit_kwargs_by_estimator won't be updated
 | |
|                 this_estimator_kwargs = this_estimator_kwargs.copy()
 | |
|                 this_estimator_kwargs.update(
 | |
|                     self._state.fit_kwargs
 | |
|                 )  # update the shallow copy of fit_kwargs to fit_kwargs_by_estimator
 | |
|                 self._state.fit_kwargs_by_estimator[
 | |
|                     estimator_name
 | |
|                 ] = this_estimator_kwargs  # set self._state.fit_kwargs_by_estimator[estimator_name] to the update, so only self._state.fit_kwargs_by_estimator will be updated
 | |
|             else:
 | |
|                 self._state.fit_kwargs_by_estimator[
 | |
|                     estimator_name
 | |
|                 ] = self._state.fit_kwargs
 | |
| 
 | |
|             self._search_states[estimator_name] = SearchState(
 | |
|                 learner_class=estimator_class,
 | |
|                 data_size=self._state.data_size,
 | |
|                 task=self._state.task,
 | |
|                 starting_point=starting_points.get(estimator_name),
 | |
|                 period=self._state.fit_kwargs.get(
 | |
|                     "period"
 | |
|                 ),  # NOTE: this is after kwargs is updated to fit_kwargs_by_estimator
 | |
|                 custom_hp=custom_hp and custom_hp.get(estimator_name),
 | |
|                 max_iter=max_iter,
 | |
|             )
 | |
|         logger.info("List of ML learners in AutoML Run: {}".format(estimator_list))
 | |
|         self.estimator_list = estimator_list
 | |
|         self._state.time_budget = time_budget if time_budget > 0 else 1e10
 | |
|         self._active_estimators = estimator_list.copy()
 | |
|         self._ensemble = ensemble
 | |
|         self._max_iter = max_iter
 | |
|         self._mem_thres = mem_thres
 | |
|         self._pred_time_limit = pred_time_limit
 | |
|         self._state.train_time_limit = train_time_limit
 | |
|         self._log_type = log_type
 | |
|         self.split_ratio = split_ratio
 | |
|         self._state.model_history = model_history
 | |
|         self._hpo_method = (
 | |
|             hpo_method
 | |
|             if hpo_method != "auto"
 | |
|             else (
 | |
|                 "bs"
 | |
|                 if n_concurrent_trials > 1
 | |
|                 or self._use_ray is not False
 | |
|                 and len(estimator_list) > 1
 | |
|                 else "cfo"
 | |
|             )
 | |
|         )
 | |
|         if log_file_name:
 | |
|             with training_log_writer(log_file_name, append_log) as save_helper:
 | |
|                 self._training_log = save_helper
 | |
|                 self._search()
 | |
|         else:
 | |
|             self._training_log = None
 | |
|             self._search()
 | |
|         if self._best_estimator:
 | |
|             logger.info("fit succeeded")
 | |
|             logger.info(
 | |
|                 f"Time taken to find the best model: {self._time_taken_best_iter}"
 | |
|             )
 | |
|             if (
 | |
|                 self._hpo_method in ("cfo", "bs")
 | |
|                 and (self._time_taken_best_iter >= self._state.time_budget * 0.7)
 | |
|                 and not all(
 | |
|                     state.search_alg and state.search_alg.searcher.is_ls_ever_converged
 | |
|                     for state in self._search_states.values()
 | |
|                 )
 | |
|             ):
 | |
|                 logger.warning(
 | |
|                     "Time taken to find the best model is {0:.0f}% of the "
 | |
|                     "provided time budget and not all estimators' hyperparameter "
 | |
|                     "search converged. Consider increasing the time budget.".format(
 | |
|                         self._time_taken_best_iter / self._state.time_budget * 100
 | |
|                     )
 | |
|                 )
 | |
| 
 | |
|         if not keep_search_state:
 | |
|             # release space
 | |
|             del self._X_train_all, self._y_train_all, self._state.kf
 | |
|             del self._state.X_train, self._state.X_train_all, self._state.X_val
 | |
|             del self._state.y_train, self._state.y_train_all, self._state.y_val
 | |
|             del (
 | |
|                 self._sample_weight_full,
 | |
|                 self._state.fit_kwargs_by_estimator,
 | |
|                 self._state.fit_kwargs,
 | |
|             )  # NOTE: this is after kwargs is updated to fit_kwargs_by_estimator
 | |
|             del self._state.groups, self._state.groups_all, self._state.groups_val
 | |
|         logger.setLevel(old_level)
 | |
| 
 | |
    def _search_parallel(self):
        """Run hyperparameter search with ray.tune across concurrent trials.

        Builds a search algorithm according to ``self._hpo_method``, launches
        ``ray.tune.run`` over ``self.search_space`` with the configured time
        budget and resources, then replays the finished trials in wall-clock
        order to update per-estimator search states and the best-model
        bookkeeping on ``self``.

        Raises:
            ImportError: If ray (>=1.10.0) is not installed.
            NotImplementedError: If ``self._hpo_method`` is not recognized.
        """
        # Parallel search requires ray; fail early with an actionable message.
        try:
            from ray import __version__ as ray_version

            assert ray_version >= "1.10.0"
            import ray
            from ray.tune.suggest import ConcurrencyLimiter
        except (ImportError, AssertionError):
            raise ImportError(
                "n_concurrent_trial>1 or use_ray=True requires installation of ray. "
                "Please run pip install flaml[ray]"
            )
        # Resolve the search algorithm class lazily, based on the hpo method.
        if self._hpo_method in ("cfo", "grid"):
            from flaml import CFO as SearchAlgo
        elif "bs" == self._hpo_method:
            from flaml import BlendSearch as SearchAlgo
        elif "random" == self._hpo_method:
            from ray.tune.suggest import BasicVariantGenerator as SearchAlgo
            from ray.tune.sample import Domain
        elif "optuna" == self._hpo_method:
            try:
                from ray import __version__ as ray_version

                assert ray_version >= "1.10.0"
                from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo
            except (ImportError, AssertionError):
                # Fall back to the bundled OptunaSearch when ray's is unavailable.
                from .searcher.suggestion import OptunaSearch as SearchAlgo
        else:
            raise NotImplementedError(
                f"hpo_method={self._hpo_method} is not recognized. "
                "'auto', 'cfo' and 'bs' are supported."
            )
        space = self.search_space
        if self._hpo_method == "random":
            # Any point in points_to_evaluate must consist of hyperparameters
            # that are tunable, which can be identified by checking whether
            # the corresponding value in the search space is an instance of
            # the 'Domain' class from flaml or ray.tune.
            points_to_evaluate = self.points_to_evaluate.copy()
            to_del = []
            for k, v in space.items():
                if not isinstance(v, Domain):
                    to_del.append(k)
            # Delete in a second pass so the space dict isn't mutated while
            # being iterated.
            for k in to_del:
                for p in points_to_evaluate:
                    if k in p:
                        del p[k]
            search_alg = SearchAlgo(
                max_concurrent=self._n_concurrent_trials,
                points_to_evaluate=points_to_evaluate,
            )
        else:
            # Give the searcher only the remaining time budget.
            self._state.time_from_start = time.time() - self._start_time_flag
            time_left = self._state.time_budget - self._state.time_from_start
            if self._hpo_method != "optuna":
                # CFO/BlendSearch support resource-aware (sample-size) search.
                min_resource = self.min_resource
                if isinstance(min_resource, dict):
                    # Per-estimator min sample sizes: collapse to one value
                    # since a single searcher drives all estimators here.
                    _min_resource_set = set(min_resource.values())
                    min_resource_all_estimator = min(_min_resource_set)
                    if len(_min_resource_set) > 1:
                        logger.warning(
                            "Using the min FLAML_sample_size of all the provided starting points as the starting sample size in the case of parallel search."
                        )
                else:
                    min_resource_all_estimator = min_resource
                search_alg = SearchAlgo(
                    metric="val_loss",
                    space=space,
                    low_cost_partial_config=self.low_cost_partial_config,
                    points_to_evaluate=self.points_to_evaluate,
                    cat_hp_cost=self.cat_hp_cost,
                    resource_attr=self.resource_attr,
                    min_resource=min_resource_all_estimator,
                    max_resource=self.max_resource,
                    # Reject configs whose estimated size exceeds the memory
                    # threshold.
                    config_constraints=[
                        (partial(size, self._state), "<=", self._mem_thres)
                    ],
                    metric_constraints=self.metric_constraints,
                    seed=self._seed,
                    time_budget_s=time_left,
                )
            else:
                # For OptunaSearch, the search space and the initial config
                # dimensions sometimes do not match; remove the extra keys from
                # the initial points to be consistent with the converted space.
                converted_space = SearchAlgo.convert_search_space(space)

                removed_keys = set(space.keys()).difference(converted_space.keys())
                new_points_to_evaluate = []
                for idx in range(len(self.points_to_evaluate)):
                    r = self.points_to_evaluate[idx].copy()
                    for each_key in removed_keys:
                        r.pop(each_key)
                    new_points_to_evaluate.append(r)

                search_alg = SearchAlgo(
                    metric="val_loss",
                    mode="min",
                    # Keep only points that fully specify the converted space.
                    points_to_evaluate=[
                        p
                        for p in new_points_to_evaluate
                        if len(p) == len(converted_space)
                    ],
                )
            # Cap in-flight trials at the configured concurrency.
            search_alg = ConcurrencyLimiter(search_alg, self._n_concurrent_trials)
        resources_per_trial = self._state.resources_per_trial

        analysis = ray.tune.run(
            self.trainable,
            search_alg=search_alg,
            config=space,
            metric="val_loss",
            mode="min",
            resources_per_trial=resources_per_trial,
            time_budget_s=self._state.time_budget,
            num_samples=self._max_iter,
            verbose=max(self.verbose - 2, 0),
            raise_on_failed_trial=False,
            keep_checkpoints_num=1,
            checkpoint_score_attr="min-val_loss",
            # When use_ray is a dict, it carries extra kwargs for tune.run.
            **self._use_ray if isinstance(self._use_ray, dict) else {},
        )
        # logger.info([trial.last_result for trial in analysis.trials])
        # Replay finished trials in wall-clock order so that best-iteration
        # bookkeeping matches the sequential-search semantics.
        trials = sorted(
            (
                trial
                for trial in analysis.trials
                if trial.last_result
                and trial.last_result.get("wall_clock_time") is not None
            ),
            key=lambda x: x.last_result["wall_clock_time"],
        )
        for self._track_iter, trial in enumerate(trials):
            result = trial.last_result
            better = False  # whether this trial improved the best loss
            if result:
                config = result["config"]
                # Config may be flat or nested under the "ml" key.
                estimator = config.get("ml", config)["learner"]
                search_state = self._search_states[estimator]
                search_state.update(result, 0)
                wall_time = result.get("wall_clock_time")
                if wall_time is not None:
                    self._state.time_from_start = wall_time
                self._iter_per_learner[estimator] += 1
                if search_state.sample_size == self._state.data_size[0]:
                    if not self._fullsize_reached:
                        self._fullsize_reached = True
                if search_state.best_loss < self._state.best_loss:
                    # New global best: record estimator, config and timing.
                    self._state.best_loss = search_state.best_loss
                    self._best_estimator = estimator
                    self._config_history[self._track_iter] = (
                        self._best_estimator,
                        config,
                        self._time_taken_best_iter,
                    )
                    self._trained_estimator = search_state.trained_estimator
                    self._best_iteration = self._track_iter
                    self._time_taken_best_iter = self._state.time_from_start
                    better = True
                    self._search_states[estimator].best_config = config
                if better or self._log_type == "all":
                    self._log_trial(search_state, estimator)
 | |
    def _log_trial(self, search_state, estimator):
        """Record one finished trial for `estimator` to the training log and, when an
        mlflow run is active, to mlflow as a nested run.

        Args:
            search_state: The SearchState of the learner whose trial just finished;
                provides the config, losses, timing and metrics to log.
            estimator: Name of the learner the trial belongs to.
        """
        # Append to the flaml training log file, if one was configured.
        if self._training_log:
            self._training_log.append(
                self._iter_per_learner[estimator],
                search_state.metric_for_logging,
                search_state.trial_time,
                self._state.time_from_start,
                search_state.val_loss,
                search_state.config,
                estimator,
                search_state.sample_size,
            )
        # Mirror the same information to mlflow only when the caller already
        # opened a run; each trial becomes a nested run under it.
        if mlflow is not None and mlflow.active_run():
            with mlflow.start_run(nested=True):
                mlflow.log_metric("iter_counter", self._track_iter)
                if (search_state.metric_for_logging is not None) and (
                    "intermediate_results" in search_state.metric_for_logging
                ):
                    # Each intermediate result gets its own doubly-nested run;
                    # the entry is then removed so the plain log_metrics call
                    # below does not receive a non-scalar value.
                    for each_entry in search_state.metric_for_logging[
                        "intermediate_results"
                    ]:
                        with mlflow.start_run(nested=True):
                            mlflow.log_metrics(each_entry)
                            mlflow.log_metric(
                                "iter_counter", self._iter_per_learner[estimator]
                            )
                    del search_state.metric_for_logging["intermediate_results"]
                if search_state.metric_for_logging:
                    mlflow.log_metrics(search_state.metric_for_logging)
                mlflow.log_metric("trial_time", search_state.trial_time)
                mlflow.log_metric("wall_clock_time", self._state.time_from_start)
                mlflow.log_metric("validation_loss", search_state.val_loss)
                mlflow.log_param("config", search_state.config)
                mlflow.log_param("learner", estimator)
                mlflow.log_param("sample_size", search_state.sample_size)
                mlflow.log_metric("best_validation_loss", search_state.best_loss)
                mlflow.log_param("best_config", search_state.best_config)
                mlflow.log_param("best_learner", self._best_estimator)

 | |
    def _search_sequential(self):
        """Run the sequential (non-ray) hyperparameter search loop.

        Iterates up to `self._max_iter` times; in each iteration a learner is
        selected, one trial is run via `tune.run` under the remaining time
        budget, and the best model/config bookkeeping is updated. May retrain
        the best config within the budget when `self._retrain_in_budget` is set.
        """
        # Resolve the ConcurrencyLimiter: prefer ray's implementation when a
        # compatible ray is installed, otherwise fall back to the local one.
        try:
            from ray import __version__ as ray_version

            assert ray_version >= "1.10.0"
            from ray.tune.suggest import ConcurrencyLimiter
        except (ImportError, AssertionError):
            from .searcher.suggestion import ConcurrencyLimiter
        # Map the configured hpo_method name to a search algorithm class.
        if self._hpo_method in ("cfo", "grid"):
            from flaml import CFO as SearchAlgo
        elif "optuna" == self._hpo_method:
            try:
                from ray import __version__ as ray_version

                assert ray_version >= "1.10.0"
                from ray.tune.suggest.optuna import OptunaSearch as SearchAlgo
            except (ImportError, AssertionError):
                from .searcher.suggestion import OptunaSearch as SearchAlgo
        elif "bs" == self._hpo_method:
            from flaml import BlendSearch as SearchAlgo
        elif "random" == self._hpo_method:
            from flaml.searcher import RandomSearch as SearchAlgo
        elif "cfocat" == self._hpo_method:
            from flaml.searcher.cfo_cat import CFOCat as SearchAlgo
        else:
            raise NotImplementedError(
                f"hpo_method={self._hpo_method} is not recognized. "
                "'cfo' and 'bs' are supported."
            )

        est_retrain_time = next_trial_time = 0
        best_config_sig = None
        better = True  # whether we find a better model in one trial
        for self._track_iter in range(self._max_iter):
            # The first iteration always uses the first active learner;
            # afterwards the selector picks by estimated cost of improvement.
            if self._estimator_index is None:
                estimator = self._active_estimators[0]
            else:
                estimator = self._select_estimator(self._active_estimators)
                if not estimator:
                    break
            logger.info(f"iteration {self._track_iter}, current learner {estimator}")
            search_state = self._search_states[estimator]
            self._state.time_from_start = time.time() - self._start_time_flag
            time_left = self._state.time_budget - self._state.time_from_start
            # Reserve time for a final retrain of the best config, unless a
            # better model was just found or full-size data is not reached yet.
            budget_left = (
                time_left
                if not self._retrain_in_budget
                or better
                or (not self.best_estimator)
                or self._search_states[self.best_estimator].sample_size
                < self._state.data_size[0]
                else time_left - est_retrain_time
            )
            if not search_state.search_alg:
                # First trial for this learner: build its search algorithm.
                search_state.training_function = partial(
                    AutoMLState._compute_with_config_base,
                    state=self._state,
                    estimator=estimator,
                )
                search_space = search_state.search_space
                if self._sample:
                    # Treat the sample size as a tunable resource.
                    resource_attr = "FLAML_sample_size"
                    min_resource = (
                        self._min_sample_size[estimator]
                        if isinstance(self._min_sample_size, dict)
                        and estimator in self._min_sample_size
                        else self._min_sample_size_input
                    )
                    max_resource = self._state.data_size[0]
                else:
                    resource_attr = min_resource = max_resource = None
                learner_class = self._state.learner_classes.get(estimator)
                if "grid" == self._hpo_method:  # for synthetic exp only
                    # Enumerate a full 2-D integer grid as the points to try.
                    points_to_evaluate = []
                    space = search_space
                    keys = list(space.keys())
                    domain0, domain1 = space[keys[0]], space[keys[1]]
                    for x1 in range(domain0.lower, domain0.upper + 1):
                        for x2 in range(domain1.lower, domain1.upper + 1):
                            points_to_evaluate.append(
                                {
                                    keys[0]: x1,
                                    keys[1]: x2,
                                }
                            )
                    self._max_iter_per_learner = len(points_to_evaluate)
                    low_cost_partial_config = None
                else:
                    points_to_evaluate = (
                        search_state.init_config
                        if isinstance(search_state.init_config, list)
                        else [search_state.init_config]
                    )

                    low_cost_partial_config = search_state.low_cost_partial_config
                if self._hpo_method in ("bs", "cfo", "grid", "cfocat", "random"):
                    algo = SearchAlgo(
                        metric="val_loss",
                        mode="min",
                        space=search_space,
                        points_to_evaluate=points_to_evaluate,
                        low_cost_partial_config=low_cost_partial_config,
                        cat_hp_cost=search_state.cat_hp_cost,
                        resource_attr=resource_attr,
                        min_resource=min_resource,
                        max_resource=max_resource,
                        config_constraints=[
                            (learner_class.size, "<=", self._mem_thres)
                        ],
                        metric_constraints=self.metric_constraints,
                        seed=self._seed,
                    )
                else:
                    # if self._hpo_method is bo, sometimes the search space and the initial config dimension do not match
                    # need to remove the extra keys from the search space to be consistent with the initial config
                    converted_space = SearchAlgo.convert_search_space(search_space)
                    removed_keys = set(search_space.keys()).difference(
                        converted_space.keys()
                    )
                    new_points_to_evaluate = []
                    for idx in range(len(points_to_evaluate)):
                        r = points_to_evaluate[idx].copy()
                        for each_key in removed_keys:
                            r.pop(each_key)
                        new_points_to_evaluate.append(r)
                    points_to_evaluate = new_points_to_evaluate

                    algo = SearchAlgo(
                        metric="val_loss",
                        mode="min",
                        space=search_space,
                        points_to_evaluate=[
                            p for p in points_to_evaluate if len(p) == len(search_space)
                        ],
                    )
                # Sequential search: at most one concurrent trial.
                search_state.search_alg = ConcurrencyLimiter(algo, max_concurrent=1)
                # search_state.search_alg = algo
            else:
                search_space = None
                if self._hpo_method in ("bs", "cfo", "cfocat"):
                    # Inform the searcher of the global best loss so far.
                    search_state.search_alg.searcher.set_search_properties(
                        metric=None,
                        mode=None,
                        metric_target=self._state.best_loss,
                    )
            start_run_time = time.time()
            analysis = tune.run(
                search_state.training_function,
                search_alg=search_state.search_alg,
                time_budget_s=min(budget_left, self._state.train_time_limit),
                verbose=max(self.verbose - 3, 0),
                use_ray=False,
            )
            time_used = time.time() - start_run_time
            better = False
            if analysis.trials:
                result = analysis.trials[-1].last_result
                search_state.update(result, time_used=time_used)
                if self._estimator_index is None:
                    # update init eci estimate
                    eci_base = search_state.init_eci
                    self._eci.append(search_state.estimated_cost4improvement)
                    for e in self.estimator_list[1:]:
                        self._eci.append(
                            self._search_states[e].init_eci / eci_base * self._eci[0]
                        )
                    self._estimator_index = 0
                    min_budget = max(10 * self._eci[0], sum(self._eci))
                    max_budget = 10000 * self._eci[0]
                    if search_state.sample_size:
                        # Scale the budget estimates up to full data size.
                        ratio = search_state.data_size[0] / search_state.sample_size
                        min_budget *= ratio
                        max_budget *= ratio
                    logger.info(
                        f"Estimated sufficient time budget={max_budget:.0f}s."
                        f" Estimated necessary time budget={min_budget:.0f}s."
                    )
                wall_time = result.get("wall_clock_time")
                if wall_time is not None:
                    self._state.time_from_start = wall_time
                # logger.info(f"{self._search_states[estimator].sample_size}, {data_size}")
                if search_state.sample_size == self._state.data_size[0]:
                    self._iter_per_learner_fullsize[estimator] += 1
                    self._fullsize_reached = True
                self._iter_per_learner[estimator] += 1
                if search_state.best_loss < self._state.best_loss:
                    # New global best: update bookkeeping and estimate the
                    # retrain time for the (possibly reserved) final retrain.
                    best_config_sig = estimator + search_state.get_hist_config_sig(
                        self.data_size_full, search_state.best_config
                    )
                    self._state.best_loss = search_state.best_loss
                    self._best_estimator = estimator
                    est_retrain_time = (
                        search_state.est_retrain_time(self.data_size_full)
                        if (best_config_sig not in self._retrained_config)
                        else 0
                    )
                    self._config_history[self._track_iter] = (
                        estimator,
                        search_state.best_config,
                        self._state.time_from_start,
                    )
                    if self._trained_estimator:
                        self._trained_estimator.cleanup()
                        del self._trained_estimator
                        self._trained_estimator = None
                    if not self._state.retrain_final:
                        self._trained_estimator = search_state.trained_estimator
                    self._best_iteration = self._track_iter
                    self._time_taken_best_iter = self._state.time_from_start
                    better = True
                    next_trial_time = search_state.time2eval_best
                if (
                    search_state.trained_estimator
                    and not self._state.model_history
                    and search_state.trained_estimator != self._trained_estimator
                ):
                    # Free the per-trial model when history is not kept.
                    search_state.trained_estimator.cleanup()
                if better or self._log_type == "all":
                    self._log_trial(search_state, estimator)

                logger.info(
                    " at {:.1f}s,\testimator {}'s best error={:.4f},\tbest estimator {}'s best error={:.4f}".format(
                        self._state.time_from_start,
                        estimator,
                        search_state.best_loss,
                        self._best_estimator,
                        self._state.best_loss,
                    )
                )
                if (
                    self._hpo_method in ("cfo", "bs")
                    and all(
                        state.search_alg
                        and state.search_alg.searcher.is_ls_ever_converged
                        for state in self._search_states.values()
                    )
                    and (
                        self._state.time_from_start
                        > self._warn_threshold * self._time_taken_best_iter
                    )
                ):
                    logger.warning(
                        "All estimator hyperparameters local search has "
                        "converged at least once, and the total search time "
                        f"exceeds {self._warn_threshold} times the time taken "
                        "to find the best model."
                    )
                    if self._early_stop:
                        logger.warning("Stopping search as early_stop is set to True.")
                        break
                    self._warn_threshold *= 10
            else:
                # No trial ran (e.g. out of budget for this learner): stop
                # trying it and keep the selector index consistent.
                logger.info(f"stop trying learner {estimator}")
                if self._estimator_index is not None:
                    self._active_estimators.remove(estimator)
                    self._estimator_index -= 1
                search_state.search_alg.searcher._is_ls_ever_converged = True
            # Retrain within the budget when no improvement happened, the best
            # learner already used the full data, and there is just enough
            # time left for the retrain but not for another trial after it.
            if (
                self._retrain_in_budget
                and best_config_sig
                and est_retrain_time
                and not better
                and self._search_states[self._best_estimator].sample_size
                == self._state.data_size[0]
                and (
                    est_retrain_time
                    <= self._state.time_budget - self._state.time_from_start
                    <= est_retrain_time + next_trial_time
                )
            ):
                state = self._search_states[self._best_estimator]
                self._trained_estimator, retrain_time = self._state._train_with_config(
                    self._best_estimator,
                    state.best_config,
                    self.data_size_full,
                )
                logger.info(
                    "retrain {} for {:.1f}s".format(self._best_estimator, retrain_time)
                )
                self._retrained_config[
                    best_config_sig
                ] = state.best_config_train_time = retrain_time
                est_retrain_time = 0
            self._state.time_from_start = time.time() - self._start_time_flag
            if (
                self._state.time_from_start >= self._state.time_budget
                or not self._active_estimators
            ):
                break
            if self._ensemble and self._best_estimator:
                # Stop early if the remaining time cannot fit an ensemble
                # evaluation but one more trial could consume it.
                time_left = self._state.time_budget - self._state.time_from_start
                time_ensemble = self._search_states[self._best_estimator].time2eval_best
                if time_left < time_ensemble < 2 * time_left:
                    break

 | |
    def _search(self):
        """Entry point of the model search.

        Initializes per-run bookkeeping, dispatches to the sequential or
        parallel search loop (or skips searching entirely when max_iter < 2),
        then optionally builds a stacking ensemble or retrains the best
        configuration on the full data.
        """
        # initialize the search_states
        self._eci = []
        self._state.best_loss = float("+inf")
        self._state.time_from_start = 0
        self._estimator_index = None
        self._best_iteration = 0
        self._time_taken_best_iter = 0
        self._config_history = {}
        self._max_iter_per_learner = 10000
        self._iter_per_learner = dict([(e, 0) for e in self.estimator_list])
        self._iter_per_learner_fullsize = dict([(e, 0) for e in self.estimator_list])
        self._fullsize_reached = False
        self._trained_estimator = None
        self._best_estimator = None
        self._retrained_config = {}
        self._warn_threshold = 10
        self._selected = None
        self.modelcount = 0
        if self._max_iter < 2 and self.estimator_list and self._state.retrain_final:
            # when max_iter is 1, no need to search
            # TODO: otherwise, need to make sure SearchStates.init_config is inside search space
            self.modelcount = self._max_iter
            self._max_iter = 0
            self._best_estimator = estimator = self.estimator_list[0]
            self._selected = state = self._search_states[estimator]
            state.best_config_sample_size = self._state.data_size[0]
            state.best_config = (
                state.init_config
                if isinstance(state.init_config, dict)
                else state.init_config[0]
            )
        elif self._use_ray is False:
            self._search_sequential()
        else:
            self._search_parallel()
        # Add a checkpoint for the current best config to the log.
        if self._training_log:
            self._training_log.checkpoint()
        self._state.time_from_start = time.time() - self._start_time_flag
        if self._best_estimator:
            self._selected = self._search_states[self._best_estimator]
            self.modelcount = sum(
                search_state.total_iter for search_state in self._search_states.values()
            )
            if self._trained_estimator:
                logger.info(f"selected model: {self._trained_estimator.model}")
            estimators = []
            if self._ensemble and self._state.task in (
                "binary",
                "multiclass",
                "regression",
            ):
                # Candidate base learners: the two best, plus any other whose
                # loss is within 4x of the selected learner's best loss.
                search_states = list(
                    x for x in self._search_states.items() if x[1].best_config
                )
                search_states.sort(key=lambda x: x[1].best_loss)
                estimators = [
                    (
                        x[0],
                        x[1].learner_class(
                            task=self._state.task,
                            n_jobs=self._state.n_jobs,
                            **self._state.sanitize(x[1].best_config),
                        ),
                    )
                    for x in search_states[:2]
                ]
                estimators += [
                    (
                        x[0],
                        x[1].learner_class(
                            task=self._state.task,
                            n_jobs=self._state.n_jobs,
                            **self._state.sanitize(x[1].best_config),
                        ),
                    )
                    for x in search_states[2:]
                    if x[1].best_loss < 4 * self._selected.best_loss
                ]
                logger.info(
                    [(estimator[0], estimator[1].params) for estimator in estimators]
                )
            if len(estimators) > 1:
                # Build a stacking ensemble from the tuned estimators.
                if self._state.task in CLASSIFICATION:
                    from sklearn.ensemble import StackingClassifier as Stacker
                else:
                    from sklearn.ensemble import StackingRegressor as Stacker
                if self._use_ray is not False:
                    import ray

                    n_cpus = (
                        ray.is_initialized()
                        and ray.available_resources()["CPU"]
                        or os.cpu_count()
                    )
                else:
                    n_cpus = os.cpu_count()
                ensemble_n_jobs = (
                    -self._state.n_jobs  # maximize total parallelization degree
                    if abs(self._state.n_jobs)
                    == 1  # 1 and -1 correspond to min/max parallelization
                    else max(1, int(n_cpus / 2 / self._state.n_jobs))
                    # the total degree of parallelization = parallelization degree per estimator * parallelization degree of ensemble
                )
                if isinstance(self._ensemble, dict):
                    # A dict-valued `ensemble` can override the stacker knobs.
                    final_estimator = self._ensemble.get(
                        "final_estimator", self._trained_estimator
                    )
                    passthrough = self._ensemble.get("passthrough", True)
                    ensemble_n_jobs = self._ensemble.get("n_jobs", ensemble_n_jobs)
                else:
                    final_estimator = self._trained_estimator
                    passthrough = True
                stacker = Stacker(
                    estimators,
                    final_estimator,
                    n_jobs=ensemble_n_jobs,
                    passthrough=passthrough,
                )
                sample_weight_dict = (
                    (self._sample_weight_full is not None)
                    and {"sample_weight": self._sample_weight_full}
                    or {}
                )
                for e in estimators:
                    # Class-level (re)initialization hook of each learner.
                    e[1].__class__.init()
                import joblib

                try:
                    logger.info("Building ensemble with tuned estimators")
                    stacker.fit(
                        self._X_train_all,
                        self._y_train_all,
                        **sample_weight_dict,  # NOTE: _search is after kwargs is updated to fit_kwargs_by_estimator
                    )
                    logger.info(f"ensemble: {stacker}")
                    self._trained_estimator = stacker
                    self._trained_estimator.model = stacker
                except ValueError as e:
                    if passthrough:
                        # Retry once without passthrough; categorical features
                        # in the passthrough columns can make fitting fail.
                        logger.warning(
                            "Using passthrough=False for ensemble because the data contain categorical features."
                        )
                        stacker = Stacker(
                            estimators,
                            final_estimator,
                            n_jobs=self._state.n_jobs,
                            passthrough=False,
                        )
                        stacker.fit(
                            self._X_train_all,
                            self._y_train_all,
                            **sample_weight_dict,  # NOTE: _search is after kwargs is updated to fit_kwargs_by_estimator
                        )
                        logger.info(f"ensemble: {stacker}")
                        self._trained_estimator = stacker
                        self._trained_estimator.model = stacker
                    else:
                        raise e
                except joblib.externals.loky.process_executor.TerminatedWorkerError:
                    logger.error(
                        "No enough memory to build the ensemble."
                        " Please try increasing available RAM, decreasing n_jobs for ensemble, or disabling ensemble."
                    )
            elif self._state.retrain_final:
                # reset time budget for retraining
                if self._max_iter > 1:
                    self._state.time_from_start -= self._state.time_budget
                if (
                    self._state.task in TS_FORECAST
                    or self._trained_estimator is None
                    or self._trained_estimator.model is None
                    or (
                        self._state.time_budget - self._state.time_from_start
                        > self._selected.est_retrain_time(self.data_size_full)
                        and self._selected.best_config_sample_size
                        == self._state.data_size[0]
                    )
                ):
                    state = self._search_states[self._best_estimator]
                    (
                        self._trained_estimator,
                        retrain_time,
                    ) = self._state._train_with_config(
                        self._best_estimator,
                        state.best_config,
                        self.data_size_full,
                    )
                    logger.info(
                        "retrain {} for {:.1f}s".format(
                            self._best_estimator, retrain_time
                        )
                    )
                    state.best_config_train_time = retrain_time
                    if self._trained_estimator:
                        logger.info(f"retrained model: {self._trained_estimator.model}")
                else:
                    logger.info("not retraining because the time budget is too small.")

 | |
|     def __del__(self):
 | |
|         if (
 | |
|             hasattr(self, "_trained_estimator")
 | |
|             and self._trained_estimator
 | |
|             and hasattr(self._trained_estimator, "cleanup")
 | |
|         ):
 | |
|             if self.preserve_checkpoint is False:
 | |
|                 self._trained_estimator.cleanup()
 | |
|             del self._trained_estimator
 | |
| 
 | |
|     def _select_estimator(self, estimator_list):
 | |
|         if self._learner_selector == "roundrobin":
 | |
|             self._estimator_index += 1
 | |
|             if self._estimator_index == len(estimator_list):
 | |
|                 self._estimator_index = 0
 | |
|             return estimator_list[self._estimator_index]
 | |
|         min_estimated_cost, selected = np.Inf, None
 | |
|         inv = []
 | |
|         untried_exists = False
 | |
|         for i, estimator in enumerate(estimator_list):
 | |
|             if estimator in self._search_states and (
 | |
|                 self._search_states[estimator].sample_size
 | |
|             ):  # sample_size=None meaning no result
 | |
|                 search_state = self._search_states[estimator]
 | |
|                 if (
 | |
|                     self._search_states[estimator].time2eval_best
 | |
|                     > self._state.time_budget - self._state.time_from_start
 | |
|                     or self._iter_per_learner_fullsize[estimator]
 | |
|                     >= self._max_iter_per_learner
 | |
|                 ):
 | |
|                     inv.append(0)
 | |
|                     continue
 | |
|                 estimated_cost = search_state.estimated_cost4improvement
 | |
|                 if search_state.sample_size < self._state.data_size[0]:
 | |
|                     estimated_cost = min(
 | |
|                         estimated_cost,
 | |
|                         search_state.time2eval_best
 | |
|                         * min(
 | |
|                             SAMPLE_MULTIPLY_FACTOR,
 | |
|                             self._state.data_size[0] / search_state.sample_size,
 | |
|                         ),
 | |
|                     )
 | |
|                 gap = search_state.best_loss - self._state.best_loss
 | |
|                 if gap > 0 and not self._ensemble:
 | |
|                     delta_loss = (
 | |
|                         search_state.best_loss_old - search_state.best_loss
 | |
|                     ) or search_state.best_loss
 | |
|                     delta_time = (
 | |
|                         search_state.total_time_used - search_state.time_best_found_old
 | |
|                     ) or 1e-10
 | |
|                     speed = delta_loss / delta_time
 | |
|                     if speed:
 | |
|                         estimated_cost = max(2 * gap / speed, estimated_cost)
 | |
|                 estimated_cost = estimated_cost or 1e-9
 | |
|                 inv.append(1 / estimated_cost)
 | |
|             else:
 | |
|                 estimated_cost = self._eci[i]
 | |
|                 inv.append(0)
 | |
|                 untried_exists = True
 | |
|             if estimated_cost < min_estimated_cost:
 | |
|                 min_estimated_cost = estimated_cost
 | |
|                 selected = estimator
 | |
|         if untried_exists or not selected:
 | |
|             state = self._search_states.get(selected)
 | |
|             if not (state and state.sample_size):
 | |
|                 return selected
 | |
|         s = sum(inv)
 | |
|         p = self._random.rand()
 | |
|         q = 0
 | |
|         for i in range(len(inv)):
 | |
|             if inv[i]:
 | |
|                 q += inv[i] / s
 | |
|                 if p < q:
 | |
|                     return estimator_list[i]
 | 
