# !
#  * Copyright (c) FLAML authors. All rights reserved.
#  * Licensed under the MIT License. See LICENSE file in the
#  * project root for license information.
from contextlib import contextmanager
from functools import partial
import signal
import os
from typing import Callable, List, Union
import numpy as np
import time
import logging
import shutil
import sys
import math
from flaml import tune
from flaml.automl.data import (
    group_counts,
)
from flaml.automl.task.task import (
    Task,
    SEQCLASSIFICATION,
    SEQREGRESSION,
    TOKENCLASSIFICATION,
    SUMMARIZATION,
    NLG_TASKS,
)
from flaml.automl.task.factory import task_factory

try:
    from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
    from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.dummy import DummyClassifier, DummyRegressor
except ImportError:
    pass

try:
    from scipy.sparse import issparse
except ImportError:
    pass

from flaml.automl.spark import psDataFrame, sparkDataFrame, psSeries, ERROR as SPARK_ERROR, DataFrame, Series
from flaml.automl.spark.utils import len_labels, to_pandas_on_spark
from flaml.automl.spark.configs import (
    ParamList_LightGBM_Classifier,
    ParamList_LightGBM_Regressor,
    ParamList_LightGBM_Ranker,
)

if DataFrame is not None:
    from pandas import to_datetime

try:
    import psutil
except ImportError:
    psutil = None
try:
    import resource
except ImportError:
    resource = None

logger = logging.getLogger("flaml.automl")
# FREE_MEM_RATIO = 0.2


def TimeoutHandler(sig, frame):
    raise TimeoutError(sig, frame)


@contextmanager
def limit_resource(memory_limit, time_limit):
    if memory_limit > 0:
        soft, hard = resource.getrlimit(resource.RLIMIT_AS)
        if soft < 0 and (hard < 0 or memory_limit <= hard) or memory_limit < soft:
            try:
                resource.setrlimit(resource.RLIMIT_AS, (int(memory_limit), hard))
            except ValueError:
                # According to https://bugs.python.org/issue40518, it's a mac-specific error.
                pass
    main_thread = False
    if time_limit is not None:
        try:
            signal.signal(signal.SIGALRM, TimeoutHandler)
            signal.alarm(int(time_limit) or 1)
            main_thread = True
        except ValueError:
            pass
    try:
        yield
    finally:
        if main_thread:
            signal.alarm(0)
        if memory_limit > 0:
            resource.setrlimit(resource.RLIMIT_AS, (soft, hard))
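
# A minimal usage sketch of `limit_resource` (illustrative only; the budgets
# below are hypothetical). Exceeding the memory limit raises MemoryError from
# the failed allocation; exceeding the time limit raises TimeoutError via
# TimeoutHandler:
#
#     with limit_resource(memory_limit=2 * 1024**3, time_limit=60):
#         model.fit(X_train, y_train)
#
# Note: RLIMIT_AS and SIGALRM are POSIX-only, and signal.alarm only works in
# the main thread, so this sketch assumes a Unix-like platform.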


class BaseEstimator:
    """The abstract class for all learners.

    Typical examples:
    * XGBoostEstimator: for regression.
    * XGBoostSklearnEstimator: for classification.
    * LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
        for both regression and classification.
    """

    def __init__(self, task="binary", **config):
        """Constructor.

        Args:
            task: A string of the task type, one of
                'binary', 'multiclass', 'regression', 'rank', 'seq-classification',
                'seq-regression', 'token-classification', 'multichoice-classification',
                'summarization', 'ts_forecast', 'ts_forecast_classification'.
            config: A dictionary containing the hyperparameter names, 'n_jobs' as keys.
                n_jobs is the number of parallel threads.
        """
        self._task = task if isinstance(task, Task) else task_factory(task, None, None)
        self.params = self.config2params(config)
        self.estimator_class = self._model = None
        if "_estimator_type" in config:
            self._estimator_type = self.params.pop("_estimator_type")
        else:
            self._estimator_type = "classifier" if self._task.is_classification() else "regressor"

    def get_params(self, deep=False):
        params = self.params.copy()
        params["task"] = self._task
        if hasattr(self, "_estimator_type"):
            params["_estimator_type"] = self._estimator_type
        return params

    @property
    def classes_(self):
        return self._model.classes_

    @property
    def n_features_in_(self):
        return self._model.n_features_in_

    @property
    def model(self):
        """Trained model after fit() is called, or None before fit() is called."""
        return self._model

    @property
    def estimator(self):
        """Trained model after fit() is called, or None before fit() is called."""
        return self._model

    @property
    def feature_names_in_(self):
        """
        If self._model has the attribute feature_names_in_, return it;
        otherwise, if self._model has the attribute feature_name_, return it;
        otherwise, if self._model has the attribute feature_names, return it;
        otherwise, if self._model has the method get_booster, return the feature names;
        otherwise, return None.
        """
        if hasattr(self._model, "feature_names_in_"):  # for sklearn, xgboost>=1.6
            return self._model.feature_names_in_
        if hasattr(self._model, "feature_name_"):  # for lightgbm
            return self._model.feature_name_
        if hasattr(self._model, "feature_names"):  # for XGBoostEstimator
            return self._model.feature_names
        if hasattr(self._model, "get_booster"):
            # get feature names for xgboost<1.6
            # https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.Booster.feature_names
            booster = self._model.get_booster()
            return booster.feature_names
        return None

    @property
    def feature_importances_(self):
        """
        If self._model has the attribute feature_importances_, return it;
        otherwise, if self._model has the attribute coef_, return it;
        otherwise, return None.
        """
        if hasattr(self._model, "feature_importances_"):
            # for sklearn, lightgbm, catboost, xgboost
            return self._model.feature_importances_
        elif hasattr(self._model, "coef_"):  # for linear models
            return self._model.coef_
        else:
            return None

    def _preprocess(self, X):
        return X

    def _fit(self, X_train, y_train, **kwargs):
        current_time = time.time()
        if "groups" in kwargs:
            kwargs = kwargs.copy()
            groups = kwargs.pop("groups")
            if self._task == "rank":
                kwargs["group"] = group_counts(groups)
                # groups_val = kwargs.get('groups_val')
                # if groups_val is not None:
                #     kwargs['eval_group'] = [group_counts(groups_val)]
                #     kwargs['eval_set'] = [
                #         (kwargs['X_val'], kwargs['y_val'])]
                #     kwargs['verbose'] = False
                #     del kwargs['groups_val'], kwargs['X_val'], kwargs['y_val']
        X_train = self._preprocess(X_train)
        model = self.estimator_class(**self.params)
        if logger.level == logging.DEBUG:
            # xgboost 1.6 doesn't display all the params in the model str
            logger.debug(f"flaml.model - {model} fit started with params {self.params}")
        model.fit(X_train, y_train, **kwargs)
        if logger.level == logging.DEBUG:
            logger.debug(f"flaml.model - {model} fit finished")
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
        """Train the model from given training data.

        Args:
            X_train: A numpy array or a dataframe of training data in shape n*m.
            y_train: A numpy array or a series of labels in shape n*1.
            budget: A float of the time budget in seconds.
            free_mem_ratio: A float between 0 and 1 for the free memory ratio to keep during training.

        Returns:
            train_time: A float of the training time in seconds.
        """
        if (
            getattr(self, "limit_resource", None)
            and resource is not None
            and (budget is not None or psutil is not None)
        ):
            start_time = time.time()
            mem = psutil.virtual_memory() if psutil is not None else None
            try:
                with limit_resource(
                    mem.available * (1 - free_mem_ratio) + psutil.Process(os.getpid()).memory_info().rss
                    if mem is not None
                    else -1,
                    budget,
                ):
                    train_time = self._fit(X_train, y_train, **kwargs)
            except (MemoryError, TimeoutError) as e:
                logger.warning(f"{e.__class__} {e}")
                if self._task.is_classification():
                    model = DummyClassifier()
                else:
                    model = DummyRegressor()
                X_train = self._preprocess(X_train)
                model.fit(X_train, y_train)
                self._model = model
                train_time = time.time() - start_time
        else:
            train_time = self._fit(X_train, y_train, **kwargs)
        return train_time
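
    # A minimal usage sketch of fit() (illustrative; `MyEstimator` is a
    # hypothetical subclass that sets `estimator_class` and the
    # `limit_resource` attribute checked above):
    #
    #     est = MyEstimator(task="binary", n_estimators=4)
    #     train_time = est.fit(X_train, y_train, budget=60, free_mem_ratio=0.2)
    #
    # If training exceeds the time or memory budget, fit() falls back to a
    # sklearn DummyClassifier/DummyRegressor so a usable model still exists.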

    def predict(self, X, **kwargs):
        """Predict label from features.

        Args:
            X: A numpy array or a dataframe of featurized instances, shape n*m.

        Returns:
            A numpy array of shape n*1.
            Each element is the label for an instance.
        """
        if self._model is not None:
            X = self._preprocess(X)
            return self._model.predict(X, **kwargs)
        else:
            logger.warning("Estimator is not fit yet. Please run fit() before predict().")
            return np.ones(X.shape[0])

    def predict_proba(self, X, **kwargs):
        """Predict the probability of each class from features.

        Only works for classification problems.

        Args:
            X: A numpy array of featurized instances, shape n*m.

        Returns:
            A numpy array of shape n*c. c is the # classes.
            Each element at (i,j) is the probability for instance i to be in
                class j.
        """
        assert self._task.is_classification(), "predict_proba() only for classification."

        X = self._preprocess(X)
        return self._model.predict_proba(X, **kwargs)

    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
        """Report the evaluation score of a trained estimator.

        Args:
            X_val: A pandas dataframe of the validation input data.
            y_val: A pandas series of the validation label.
            kwargs: keyword arguments of the evaluation function, for example:
                - metric: A string of the metric name or a function,
                e.g., 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo',
                'f1', 'micro_f1', 'macro_f1', 'log_loss', 'mae', 'mse', 'r2',
                'mape'. Default is 'auto'.
                If metric is given, the score will report the user-specified metric.
                If metric is not given, the metric is set to accuracy for classification and r2
                for regression.
                You can also pass a customized metric function; for examples on how to pass a
                customized metric function, please check
                [test/nlp/test_autohf_custom_metric.py](https://github.com/microsoft/FLAML/blob/main/test/nlp/test_autohf_custom_metric.py) and
                [test/automl/test_multiclass.py](https://github.com/microsoft/FLAML/blob/main/test/automl/test_multiclass.py).

        Returns:
            The evaluation score on the validation dataset.
        """
        from .ml import metric_loss_score
        from .ml import is_min_metric

        if self._model is not None:
            if self._task == "rank":
                raise NotImplementedError("AutoML.score() is not implemented for ranking")
            else:
                X_val = self._preprocess(X_val)
                metric = kwargs.pop("metric", None)
                if metric:
                    y_pred = self.predict(X_val, **kwargs)
                    if is_min_metric(metric):
                        return metric_loss_score(metric, y_pred, y_val)
                    else:
                        return 1.0 - metric_loss_score(metric, y_pred, y_val)
                else:
                    return self._model.score(X_val, y_val, **kwargs)
        else:
            logger.warning("Estimator is not fit yet. Please run fit() before score().")
            return 0.0

    def cleanup(self):
        del self._model
        self._model = None

    @classmethod
    def search_space(cls, data_size, task, **params):
        """[required method] search space.

        Args:
            data_size: A tuple of two integers, number of rows and columns.
            task: A str of the task type, e.g., "binary", "multiclass", "regression".

        Returns:
            A dictionary of the search space.
            Each key is the name of a hyperparameter, and value is a dict with
                its domain (required) and low_cost_init_value, init_value,
                cat_hp_cost (if applicable).
                e.g., ```{'domain': tune.randint(lower=1, upper=10), 'init_value': 1}```.
        """
        return {}

    @classmethod
    def size(cls, config: dict) -> float:
        """[optional method] memory size of the estimator in bytes.

        Args:
            config: A dict of the hyperparameter config.

        Returns:
            A float of the memory size required by the estimator to train the
            given config.
        """
        return 1.0

    @classmethod
    def cost_relative2lgbm(cls) -> float:
        """[optional method] relative cost compared to lightgbm."""
        return 1.0

    @classmethod
    def init(cls):
        """[optional method] initialize the class."""
        pass

    def config2params(self, config: dict) -> dict:
        """[optional method] config dict to params dict.

        Args:
            config: A dict of the hyperparameter config.

        Returns:
            A dict that will be passed to self.estimator_class's constructor.
        """
        params = config.copy()
        if "FLAML_sample_size" in params:
            params.pop("FLAML_sample_size")
        return params
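
    # A minimal sketch of a custom learner built from the [required]/[optional]
    # methods above (hypothetical `MyLGBMEstimator`; registering it with an
    # AutoML instance via `add_learner` follows the FLAML documentation):
    #
    #     class MyLGBMEstimator(BaseEstimator):
    #         def __init__(self, task="binary", **config):
    #             super().__init__(task, **config)
    #             from lightgbm import LGBMClassifier, LGBMRegressor
    #             self.estimator_class = (
    #                 LGBMClassifier if self._task.is_classification() else LGBMRegressor
    #             )
    #
    #         @classmethod
    #         def search_space(cls, data_size, task, **params):
    #             return {
    #                 "n_estimators": {
    #                     "domain": tune.lograndint(lower=4, upper=32768),
    #                     "init_value": 4,
    #                     "low_cost_init_value": 4,
    #                 }
    #             }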


class SparkEstimator(BaseEstimator):
    """The base class for fine-tuning spark models, using pyspark.ml and SynapseML API."""

    def __init__(self, task="binary", **config):
        if SPARK_ERROR:
            raise SPARK_ERROR
        super().__init__(task, **config)
        self.df_train = None

    def _preprocess(
        self,
        X_train: Union[psDataFrame, sparkDataFrame],
        y_train: psSeries = None,
        index_col: str = "tmp_index_col",
        return_label: bool = False,
    ):
        # TODO: optimize this, support pyspark.sql.DataFrame
        if y_train is not None:
            self.df_train = X_train.join(y_train)
        else:
            self.df_train = X_train
        if isinstance(self.df_train, psDataFrame):
            self.df_train = self.df_train.to_spark(index_col=index_col)
        if return_label:
            return self.df_train, y_train.name
        else:
            return self.df_train

    def fit(
        self,
        X_train: psDataFrame,
        y_train: psSeries = None,
        budget=None,
        free_mem_ratio=0,
        index_col: str = "tmp_index_col",
        **kwargs,
    ):
        """Train the model from given training data.

        Args:
            X_train: A pyspark.pandas DataFrame of training data in shape n*m.
            y_train: A pyspark.pandas Series in shape n*1. None if X_train is a
                pyspark.pandas DataFrame that contains y_train.
            budget: A float of the time budget in seconds.
            free_mem_ratio: A float between 0 and 1 for the free memory ratio to keep during training.

        Returns:
            train_time: A float of the training time in seconds.
        """
        df_train, label_col = self._preprocess(X_train, y_train, index_col=index_col, return_label=True)
        kwargs["labelCol"] = label_col
        train_time = self._fit(df_train, **kwargs)
        return train_time

    def _fit(self, df_train: sparkDataFrame, **kwargs):
        current_time = time.time()
        pipeline_model = self.estimator_class(**self.params, **kwargs)
        if logger.level == logging.DEBUG:
            logger.debug(f"flaml.model - {pipeline_model} fit started with params {self.params}")
        pipeline_model.fit(df_train)
        if logger.level == logging.DEBUG:
            logger.debug(f"flaml.model - {pipeline_model} fit finished")
        train_time = time.time() - current_time
        self._model = pipeline_model
        return train_time

    def predict(self, X, index_col="tmp_index_col", return_all=False, **kwargs):
        """Predict label from features.

        Args:
            X: A pyspark or pyspark.pandas dataframe of featurized instances, shape n*m.
            index_col: A str of the index column name. Default to "tmp_index_col".
            return_all: A bool of whether to return all the prediction results. Default to False.

        Returns:
            A pyspark.pandas series of shape n*1 if return_all is False. Otherwise, a pyspark.pandas dataframe.
        """
        if self._model is not None:
            X = self._preprocess(X, index_col=index_col)
            predictions = to_pandas_on_spark(self._model.transform(X), index_col=index_col)
            predictions.index.name = None
            pred_y = predictions["prediction"]
            if return_all:
                return predictions
            else:
                return pred_y
        else:
            logger.warning("Estimator is not fit yet. Please run fit() before predict().")
            return np.ones(X.shape[0])

    def predict_proba(self, X, index_col="tmp_index_col", return_all=False, **kwargs):
        """Predict the probability of each class from features.

        Only works for classification problems.

        Args:
            X: A pyspark or pyspark.pandas dataframe of featurized instances, shape n*m.
            index_col: A str of the index column name. Default to "tmp_index_col".
            return_all: A bool of whether to return all the prediction results. Default to False.

        Returns:
            A pyspark.pandas dataframe of shape n*c. c is the # classes.
            Each element at (i,j) is the probability for instance i to be in
                class j.
        """
        assert self._task.is_classification(), "predict_proba() only for classification."
        if self._model is not None:
            X = self._preprocess(X, index_col=index_col)
            predictions = to_pandas_on_spark(self._model.transform(X), index_col=index_col)
            predictions.index.name = None
            pred_y = predictions["probability"]

            if return_all:
                return predictions
            else:
                return pred_y
        else:
            logger.warning("Estimator is not fit yet. Please run fit() before predict_proba().")
            return np.ones(X.shape[0])


class SparkLGBMEstimator(SparkEstimator):
    """The class for fine-tuning the Spark version of LightGBM models, using SynapseML API."""

    ITER_HP = "numIterations"
    DEFAULT_ITER = 100

    @classmethod
    def search_space(cls, data_size, **params):
        upper = max(5, min(32768, int(data_size[0])))  # upper must be larger than lower
        # https://github.com/microsoft/SynapseML/blob/master/lightgbm/src/main/scala/com/microsoft/azure/synapse/ml/lightgbm/LightGBMBase.scala
        return {
            "numIterations": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "numLeaves": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "minDataInLeaf": {
                "domain": tune.lograndint(lower=2, upper=2**7 + 1),
                "init_value": 20,
            },
            "learningRate": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1.0),
                "init_value": 0.1,
            },
            "log_max_bin": {  # log transformed with base 2
                "domain": tune.lograndint(lower=3, upper=11),
                "init_value": 8,
            },
            "featureFraction": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "lambdaL1": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1 / 1024,
            },
            "lambdaL2": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1.0,
            },
        }

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        if "n_jobs" in params:
            params.pop("n_jobs")
        if "log_max_bin" in params:
            params["maxBin"] = (1 << params.pop("log_max_bin")) - 1
        return params
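
    # Worked example for config2params: log_max_bin=8 maps to
    # maxBin = (1 << 8) - 1 = 255, LightGBM's default number of histogram bins.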

    @classmethod
    def size(cls, config):
        num_leaves = int(round(config.get("numLeaves") or 1 << config.get("maxDepth", 16)))
        n_estimators = int(round(config["numIterations"]))
        return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        err_msg = (
            "SynapseML is not installed. Please refer to [SynapseML]"
            + "(https://github.com/microsoft/SynapseML) for installation instructions."
        )
        if "regression" == task:
            try:
                from synapse.ml.lightgbm import LightGBMRegressor
            except ImportError:
                raise ImportError(err_msg)

            self.estimator_class = LightGBMRegressor
            self.estimator_params = ParamList_LightGBM_Regressor
        elif "rank" == task:
            try:
                from synapse.ml.lightgbm import LightGBMRanker
            except ImportError:
                raise ImportError(err_msg)

            self.estimator_class = LightGBMRanker
            self.estimator_params = ParamList_LightGBM_Ranker
        else:
            try:
                from synapse.ml.lightgbm import LightGBMClassifier
            except ImportError:
                raise ImportError(err_msg)

            self.estimator_class = LightGBMClassifier
            self.estimator_params = ParamList_LightGBM_Classifier
        self._time_per_iter = None
        self._train_size = 0
        self._mem_per_iter = -1
        self.model_classes_ = None
        self.model_n_classes_ = None

    def fit(
        self,
        X_train,
        y_train=None,
        budget=None,
        free_mem_ratio=0,
        index_col="tmp_index_col",
        **kwargs,
    ):
        start_time = time.time()
        if self.model_n_classes_ is None and self._task not in ["regression", "rank"]:
            self.model_n_classes_, self.model_classes_ = len_labels(y_train, return_labels=True)
        df_train, label_col = self._preprocess(X_train, y_train, index_col=index_col, return_label=True)
        # n_iter = self.params.get(self.ITER_HP, self.DEFAULT_ITER)
        # trained = False
        # mem0 = psutil.virtual_memory().available if psutil is not None else 1
        _kwargs = kwargs.copy()
        if self._task not in ["regression", "rank"] and "objective" not in _kwargs:
            _kwargs["objective"] = "binary" if self.model_n_classes_ == 2 else "multiclass"
        for k in list(_kwargs.keys()):
            if k not in self.estimator_params:
                logger.warning(f"[SparkLGBMEstimator] [Warning] Ignored unknown parameter: {k}")
                _kwargs.pop(k)
        # TODO: find a better estimation of early stopping
        # if (
        #     (not self._time_per_iter or abs(self._train_size - df_train.count()) > 4)
        #     and budget is not None
        #     or self._mem_per_iter < 0
        #     and psutil is not None
        # ) and n_iter > 1:
        #     self.params[self.ITER_HP] = 1
        #     self._t1 = self._fit(df_train, **_kwargs)
        #     if budget is not None and self._t1 >= budget or n_iter == 1:
        #         return self._t1
        #     mem1 = psutil.virtual_memory().available if psutil is not None else 1
        #     self._mem1 = mem0 - mem1
        #     self.params[self.ITER_HP] = min(n_iter, 4)
        #     self._t2 = self._fit(df_train, **_kwargs)
        #     mem2 = psutil.virtual_memory().available if psutil is not None else 1
        #     self._mem2 = max(mem0 - mem2, self._mem1)
        #     self._mem_per_iter = min(self._mem1, self._mem2 / self.params[self.ITER_HP])
        #     self._time_per_iter = (
        #         (self._t2 - self._t1) / (self.params[self.ITER_HP] - 1)
        #         if self._t2 > self._t1
        #         else self._t1
        #         if self._t1
        #         else 0.001
        #     )
        #     self._train_size = df_train.count()
        #     if (
        #         budget is not None
        #         and self._t1 + self._t2 >= budget
        #         or n_iter == self.params[self.ITER_HP]
        #     ):
        #         # self.params[self.ITER_HP] = n_iter
        #         return time.time() - start_time
        #     trained = True
        # if n_iter > 1:
        #     max_iter = min(
        #         n_iter,
        #         int(
        #             (budget - time.time() + start_time - self._t1) / self._time_per_iter
        #             + 1
        #         )
        #         if budget is not None
        #         else n_iter,
        #     )
        #     if trained and max_iter <= self.params[self.ITER_HP]:
        #         return time.time() - start_time
        #     # when not trained, train at least one iter
        #     self.params[self.ITER_HP] = max(max_iter, 1)
        _kwargs["labelCol"] = label_col
        self._fit(df_train, **_kwargs)
        train_time = time.time() - start_time
        return train_time

    def _fit(self, df_train: sparkDataFrame, **kwargs):
        current_time = time.time()
        model = self.estimator_class(**self.params, **kwargs)
        if logger.level == logging.DEBUG:
            logger.debug(f"flaml.model - {model} fit started with params {self.params}")
        self._model = model.fit(df_train)
        self._model.classes_ = self.model_classes_
        self._model.n_classes_ = self.model_n_classes_
        if logger.level == logging.DEBUG:
            logger.debug(f"flaml.model - {model} fit finished")
        train_time = time.time() - current_time
        return train_time
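
    # A minimal end-to-end sketch (illustrative only; assumes an active Spark
    # session with SynapseML installed and a hypothetical Spark DataFrame `df`
    # that has a "label" column):
    #
    #     from flaml.automl.spark.utils import to_pandas_on_spark
    #     psdf = to_pandas_on_spark(df)  # convert to a pyspark.pandas DataFrame
    #     X, y = psdf.drop(columns=["label"]), psdf["label"]
    #     est = SparkLGBMEstimator(task="binary", numIterations=20)
    #     est.fit(X, y)
    #     pred = est.predict(X)  # a pyspark.pandas Series of predictions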


class TransformersEstimator(BaseEstimator):
    """The class for fine-tuning language models, using huggingface transformers API."""

    ITER_HP = "global_max_steps"

    def __init__(self, task="seq-classification", **config):
        super().__init__(task, **config)
        import uuid

        self.trial_id = str(uuid.uuid1().hex)[:8]
        if task not in NLG_TASKS:  # TODO: not in NLG_TASKS
            from .nlp.huggingface.training_args import (
                TrainingArgumentsForAuto as TrainingArguments,
            )
        else:
            from .nlp.huggingface.training_args import (
                Seq2SeqTrainingArgumentsForAuto as TrainingArguments,
            )
        self._TrainingArguments = TrainingArguments

    @classmethod
    def search_space(cls, data_size, task, **params):
        search_space_dict = {
            "learning_rate": {
                "domain": tune.loguniform(1e-6, 1e-4),
                "init_value": 1e-5,
            },
            "num_train_epochs": {
                "domain": tune.choice([1, 2, 3, 4, 5]),
                "init_value": 3,  # to be consistent with roberta
                "low_cost_init_value": 1,
            },
            "per_device_train_batch_size": {
                "domain": tune.choice([4, 8, 16, 32, 64]),
                "init_value": 32,
                "low_cost_init_value": 64,
            },
            "seed": {
                "domain": tune.choice(range(1, 40)),
                "init_value": 20,
            },
            "global_max_steps": {
                "domain": sys.maxsize,
                "init_value": sys.maxsize,
            },
        }

        return search_space_dict

    @property
    def fp16(self):
        return self._kwargs.get("gpu_per_trial") and self._training_args.fp16

    @property
    def no_cuda(self):
        return not self._kwargs.get("gpu_per_trial")

    def _set_training_args(self, **kwargs):
        from .nlp.utils import date_str, Counter

        for key, val in kwargs.items():
            assert key not in self.params, (
                "Since {} is in the search space, it cannot exist in 'custom_fit_kwargs' at the same time."
                "If you need to fix the value of {} to {}, the only way is to add a single-value domain in the search "
                "space by adding:\n '{}': {{ 'domain': {} }} to 'custom_hp'. For example:"
                'automl_settings["custom_hp"] = {{ "transformer": {{ "model_path": {{ "domain" : '
                '"google/electra-small-discriminator" }} }} }}'.format(key, key, val, key, val)
            )

        """
            If the user has specified any custom args for TrainingArguments, update these arguments
        """
        self._training_args = self._TrainingArguments(**kwargs)

        """
            Update the attributes in TrainingArguments with self.params values
        """
        for key, val in self.params.items():
            if hasattr(self._training_args, key):
                setattr(self._training_args, key, val)

        """
            Update the attributes in TrainingArguments that depend on the values of self.params
        """
        local_dir = os.path.join(self._training_args.output_dir, "train_{}".format(date_str()))
        if self._use_ray is True:
            import ray

            self._training_args.output_dir = ray.tune.get_trial_dir()
        else:
            self._training_args.output_dir = Counter.get_trial_fold_name(local_dir, self.params, self.trial_id)

        self._training_args.fp16 = self.fp16
        self._training_args.no_cuda = self.no_cuda

        if self._task == TOKENCLASSIFICATION and self._training_args.max_seq_length is not None:
            logger.warning(
                "For token classification task, FLAML currently does not support customizing the max_seq_length, max_seq_length will be reset to None."
            )
            setattr(self._training_args, "max_seq_length", None)

    def _tokenize_text(self, X, y=None, **kwargs):
        from .nlp.huggingface.utils import tokenize_text
        from .nlp.utils import is_a_list_of_str

        is_str = str(X.dtypes[0]) in ("string", "str")
        is_list_of_str = is_a_list_of_str(X[list(X.keys())[0]].to_list()[0])

        if is_str or is_list_of_str:
            return tokenize_text(
                X=X,
                Y=y,
                task=self._task,
                hf_args=self._training_args,
                tokenizer=self.tokenizer,
            )
        else:
            return X, y

    def _model_init(self):
        from .nlp.huggingface.utils import load_model

        this_model = load_model(
            checkpoint_path=self._training_args.model_path,
            task=self._task,
            num_labels=self.num_labels,
        )
        return this_model

    def _preprocess_data(self, X, y):
        from datasets import Dataset

        processed_X, processed_y_df = self._tokenize_text(X=X, y=y, **self._kwargs)
        # convert y from pd.DataFrame back to pd.Series
        processed_y = processed_y_df.iloc[:, 0]

        processed_dataset = Dataset.from_pandas(processed_X.join(processed_y_df))

        return processed_dataset, processed_X, processed_y

    @property
    def num_labels(self):
        if self._task == SEQREGRESSION:
            return 1
        elif self._task == SEQCLASSIFICATION:
            return len(set(self._y_train))
        elif self._task == TOKENCLASSIFICATION:
            return len(self._training_args.label_list)
        else:
            return None

    @property
    def tokenizer(self):
        from transformers import AutoTokenizer

        if self._task == SUMMARIZATION:
            return AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path=self._training_args.model_path,
                cache_dir=None,
                use_fast=True,
                revision="main",
                use_auth_token=None,
            )
        else:
            return AutoTokenizer.from_pretrained(
                self._training_args.model_path,
                use_fast=True,
                add_prefix_space=self._add_prefix_space,
            )

    @property
    def data_collator(self):
        from flaml.automl.task.task import Task
        from flaml.automl.nlp.huggingface.data_collator import (
            task_to_datacollator_class,
        )

        data_collator_class = task_to_datacollator_class.get(
            self._task.name if isinstance(self._task, Task) else self._task
        )

        if data_collator_class:
            kwargs = {
                "model": self._model_init(),
                # need to set model, or there's ValueError: Expected input batch_size (..) to match target batch_size (..)
                "label_pad_token_id": -100,  # pad with token id -100
                "pad_to_multiple_of": 8,
                # pad to multiple of 8, because quoting Transformers: "This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta)"
                "tokenizer": self.tokenizer,
            }

            for key in list(kwargs.keys()):
                if key not in data_collator_class.__dict__.keys() and key != "tokenizer":
                    del kwargs[key]
            return data_collator_class(**kwargs)
        else:
            return None

    def fit(
        self,
        X_train: DataFrame,
        y_train: Series,
        budget=None,
        free_mem_ratio=0,
        X_val=None,
        y_val=None,
        gpu_per_trial=None,
        metric=None,
        **kwargs,
    ):
        import transformers

        transformers.logging.set_verbosity_error()

        from transformers import TrainerCallback
        from transformers.trainer_utils import set_seed
        from .nlp.huggingface.trainer import TrainerForAuto

        try:
            from ray.tune import is_session_enabled

            self._use_ray = is_session_enabled()
        except ImportError:
            self._use_ray = False

        this_params = self.params
        self._kwargs = kwargs

        self._X_train, self._y_train = X_train, y_train
        self._set_training_args(**kwargs)
        self._add_prefix_space = (
            "roberta" in self._training_args.model_path
        )  # If using a roberta model, must set add_prefix_space to True to avoid the assertion error at
        # https://github.com/huggingface/transformers/blob/main/src/transformers/models/roberta/tokenization_roberta_fast.py#L249

        train_dataset, self._X_train, self._y_train = self._preprocess_data(X_train, y_train)
        if X_val is not None:
            eval_dataset, self._X_val, self._y_val = self._preprocess_data(X_val, y_val)
        else:
            eval_dataset, self._X_val, self._y_val = None, None, None

        set_seed(self.params.get("seed", self._training_args.seed))
        self._metric = metric

        class EarlyStoppingCallbackForAuto(TrainerCallback):
            def on_train_begin(self, args, state, control, **callback_kwargs):
                self.train_begin_time = time.time()

            def on_step_begin(self, args, state, control, **callback_kwargs):
                self.step_begin_time = time.time()

            def on_step_end(self, args, state, control, **callback_kwargs):
                if state.global_step == 1:
                    self.time_per_iter = time.time() - self.step_begin_time
                if (
                    budget
                    and (time.time() + self.time_per_iter > self.train_begin_time + budget)
                    or state.global_step >= this_params[TransformersEstimator.ITER_HP]
                ):
                    control.should_training_stop = True
                    control.should_save = True
                    control.should_evaluate = True
                return control

            def on_epoch_end(self, args, state, control, **callback_kwargs):
                if control.should_training_stop or state.epoch + 1 >= args.num_train_epochs:
                    control.should_save = True
                    control.should_evaluate = True

        self._trainer = TrainerForAuto(
            args=self._training_args,
            model_init=self._model_init,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            compute_metrics=self._compute_metrics_by_dataset_name,
            callbacks=[EarlyStoppingCallbackForAuto],
        )

        if self._task in NLG_TASKS:
            setattr(self._trainer, "_is_seq2seq", True)

        """
            When not using ray for tuning, set the limit of CUDA_VISIBLE_DEVICES to math.ceil(gpu_per_trial),
            so each estimator does not see all the GPUs
        """
        if gpu_per_trial is not None:
            tmp_cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
            self._trainer.args._n_gpu = gpu_per_trial

            # if gpu_per_trial == 0:
            #     os.environ["CUDA_VISIBLE_DEVICES"] = ""
            if tmp_cuda_visible_devices.count(",") != math.ceil(gpu_per_trial) - 1:
                os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(x) for x in range(math.ceil(gpu_per_trial))])

        start_time = time.time()
        self._trainer.train()

        if gpu_per_trial is not None:
            os.environ["CUDA_VISIBLE_DEVICES"] = tmp_cuda_visible_devices

        self.params[self.ITER_HP] = self._trainer.state.global_step

        self._checkpoint_path = self._select_checkpoint(self._trainer)
        self._ckpt_remains = list(self._trainer.ckpt_to_metric.keys())

        if hasattr(self._trainer, "intermediate_results"):
            self.intermediate_results = [
                x[1] for x in sorted(self._trainer.intermediate_results.items(), key=lambda x: x[0])
            ]
        self._trainer = None

        return time.time() - start_time
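
    # A minimal usage sketch of fit()/predict() (illustrative only; the model
    # path and output_dir are hypothetical kwargs assumed to be forwarded to
    # TrainingArgumentsForAuto, and `transformers`/`datasets` must be installed):
    #
    #     est = TransformersEstimator(task="seq-classification", num_train_epochs=1)
    #     est.fit(
    #         X_train, y_train,
    #         metric="accuracy",
    #         model_path="google/electra-small-discriminator",
    #         output_dir="/tmp/flaml_hf",
    #     )
    #     y_pred = est.predict(X_test)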

    def _delete_one_ckpt(self, ckpt_location):
        if self._use_ray is False:
            if os.path.exists(ckpt_location):
                shutil.rmtree(ckpt_location)

    def cleanup(self):
        super().cleanup()
        if hasattr(self, "_ckpt_remains"):
            for each_ckpt in self._ckpt_remains:
                self._delete_one_ckpt(each_ckpt)

    def _select_checkpoint(self, trainer):
        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

        if trainer.ckpt_to_metric:
            best_ckpt, _ = min(trainer.ckpt_to_metric.items(), key=lambda x: x[1]["eval_automl_metric"])
            best_ckpt_global_step = trainer.ckpt_to_global_step[best_ckpt]
            for each_ckpt in list(trainer.ckpt_to_metric):
                if each_ckpt != best_ckpt:
                    del trainer.ckpt_to_metric[each_ckpt]
                    del trainer.ckpt_to_global_step[each_ckpt]
                    self._delete_one_ckpt(each_ckpt)
        else:
            best_ckpt_global_step = trainer.state.global_step
            best_ckpt = os.path.join(
                trainer.args.output_dir,
                f"{PREFIX_CHECKPOINT_DIR}-{best_ckpt_global_step}",
            )
        self.params[self.ITER_HP] = best_ckpt_global_step
        logger.debug(trainer.state.global_step)
        logger.debug(trainer.ckpt_to_global_step)
        return best_ckpt

    def _compute_metrics_by_dataset_name(self, eval_pred):
        # TODO: call self._metric(eval_pred, self)
        if isinstance(self._metric, str):
            from .ml import metric_loss_score
            from .nlp.huggingface.utils import postprocess_prediction_and_true

            predictions, y_true = eval_pred
            # postprocess the matrix prediction and ground truth into a user-readable format, e.g., for summarization, decode into text
            processed_predictions, processed_y_true = postprocess_prediction_and_true(
                task=self._task,
                y_pred=predictions,
                tokenizer=self.tokenizer,
                hf_args=self._training_args,
                y_true=y_true,
            )
            metric_dict = {
                "automl_metric": metric_loss_score(
                    metric_name=self._metric,
                    y_processed_predict=processed_predictions,
                    y_processed_true=processed_y_true,
                    labels=self._training_args.label_list,
                )
            }
        else:
            # TODO: debug to see how a custom metric can take both tokenized (here) and untokenized input (ml.py)
            loss, metric_dict = self._metric(
                X_test=self._X_val,
                y_test=self._y_val,
                estimator=self,
                labels=None,
                X_train=self._X_train,
                y_train=self._y_train,
            )
            metric_dict["automl_metric"] = loss

        return metric_dict

    def _init_model_for_predict(self):
        from .nlp.huggingface.trainer import TrainerForAuto

        """
            Need to reinit training_args because of a bug in deepspeed: if not reinit, the deepspeed config will be inconsistent
            with the HF config https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py#L947
        """
        training_args = self._TrainingArguments(local_rank=-1, model_path=self._checkpoint_path, fp16=self.fp16)
        for key, val in self._training_args.__dict__.items():
            if key not in ("local_rank", "model_path", "fp16"):
                setattr(training_args, key, val)
        self._training_args = training_args

        new_trainer = TrainerForAuto(
            model=self._model_init(),
            args=self._training_args,
            data_collator=self.data_collator,
            compute_metrics=self._compute_metrics_by_dataset_name,
        )
        if self._task in NLG_TASKS:
            setattr(new_trainer, "_is_seq2seq", True)
        return new_trainer

    def predict_proba(self, X, **pred_kwargs):
        from datasets import Dataset

        if pred_kwargs:
            for key, val in pred_kwargs.items():
                setattr(self._training_args, key, val)

        assert self._task.is_classification(), "predict_proba() only for classification tasks."

        X_test, _ = self._tokenize_text(X, **self._kwargs)
        test_dataset = Dataset.from_pandas(X_test)

        new_trainer = self._init_model_for_predict()
        try:
            predictions = new_trainer.predict(test_dataset).predictions
        except ZeroDivisionError:
            logger.warning("Zero division error appeared in HuggingFace Transformers.")
            predictions = None
        return predictions

    def score(self, X_val: DataFrame, y_val: Series, **kwargs):
        import transformers

        transformers.logging.set_verbosity_error()

        self._metric = kwargs["metric"]

        eval_dataset, X_val, y_val = self._preprocess_data(X_val, y_val)

        new_trainer = self._init_model_for_predict()
        return new_trainer.evaluate(eval_dataset)

    def predict(self, X, **pred_kwargs):
        import transformers
        from datasets import Dataset
        from .nlp.huggingface.utils import postprocess_prediction_and_true

        transformers.logging.set_verbosity_error()

        if pred_kwargs:
            for key, val in pred_kwargs.items():
                setattr(self._training_args, key, val)

        X_test, _ = self._tokenize_text(X, **self._kwargs)
        test_dataset = Dataset.from_pandas(X_test)

        new_trainer = self._init_model_for_predict()

        kwargs = {} if self._task not in NLG_TASKS else {"metric_key_prefix": "predict"}
        try:
            predictions = new_trainer.predict(test_dataset, **kwargs).predictions
        except ZeroDivisionError:
            logger.warning("Zero division error appeared in HuggingFace Transformers.")
            predictions = None
        post_y_pred, _ = postprocess_prediction_and_true(
            task=self._task,
            y_pred=predictions,
            tokenizer=self.tokenizer,
            hf_args=self._training_args,
            X=X,
        )
        return post_y_pred

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params[TransformersEstimator.ITER_HP] = params.get(TransformersEstimator.ITER_HP, sys.maxsize)
        return params


class TransformersEstimatorModelSelection(TransformersEstimator):
    def __init__(self, task="seq-classification", **config):
        super().__init__(task, **config)

    @classmethod
    def search_space(cls, data_size, task, **params):
        search_space_dict = TransformersEstimator.search_space(data_size, task, **params)

        """
            For model selection, use the same search space regardless of the memory constraint.
            If OOM occurs, the user should change the search space themselves.
        """

        search_space_dict["model_path"] = {
            "domain": tune.choice(
                [
                    "google/electra-base-discriminator",
                    "bert-base-uncased",
                    "roberta-base",
                    "facebook/muppet-roberta-base",
                    "google/electra-small-discriminator",
                ]
            ),
            "init_value": "facebook/muppet-roberta-base",
        }
        return search_space_dict


class SKLearnEstimator(BaseEstimator):
    """
    The base class for tuning scikit-learn estimators.

    Subclasses can modify the function signature of ``__init__`` to
    ignore the values in ``config`` that are not relevant to the constructor
    of their underlying estimator. For example, some regressors in ``scikit-learn``
    don't accept the ``n_jobs`` parameter contained in ``config``. For these,
    one can add ``n_jobs=None,`` before ``**config`` to make sure ``config`` doesn't
    contain an ``n_jobs`` key.
    """

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)

    def _preprocess(self, X):
        if isinstance(X, DataFrame):
            cat_columns = X.select_dtypes(include=["category"]).columns
            if not cat_columns.empty:
                X = X.copy()
                X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # the numpy array is not of a numeric dtype
            X = DataFrame(X)
            for col in X.columns:
                if isinstance(X[col][0], str):
                    X[col] = X[col].astype("category").cat.codes
            X = X.to_numpy()
        return X
| 
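# A minimal sketch (illustrative only, not part of FLAML) of the pattern the
# SKLearnEstimator docstring describes: swallow ``n_jobs`` so it never reaches
# an underlying estimator whose constructor does not accept it. ``Lasso`` is
# just an assumed example of such an estimator.
#
# class LassoEstimator(SKLearnEstimator):
#     def __init__(self, task="regression", n_jobs=None, **config):
#         super().__init__(task, **config)
#         from sklearn.linear_model import Lasso
#
#         self.estimator_class = Lasso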

class LGBMEstimator(BaseEstimator):
    """The class for tuning LGBM, using sklearn API."""

    ITER_HP = "n_estimators"
    HAS_CALLBACK = True
    DEFAULT_ITER = 100

    @classmethod
    def search_space(cls, data_size, **params):
        upper = max(5, min(32768, int(data_size[0])))  # upper must be larger than lower
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "num_leaves": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "min_child_samples": {
                "domain": tune.lograndint(lower=2, upper=2**7 + 1),
                "init_value": 20,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1.0),
                "init_value": 0.1,
            },
            "log_max_bin": {  # log transformed with base 2
                "domain": tune.lograndint(lower=3, upper=11),
                "init_value": 8,
            },
            "colsample_bytree": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "reg_alpha": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1 / 1024,
            },
            "reg_lambda": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1.0,
            },
        }

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        if "log_max_bin" in params:
            # e.g., log_max_bin=8 gives max_bin=(1 << 8) - 1 = 255, LightGBM's default
            params["max_bin"] = (1 << params.pop("log_max_bin")) - 1
        return params

    @classmethod
    def size(cls, config):
        # Rough model size in bytes: per tree, about 3 values per leaf plus 4 per
        # split node (a binary tree with k leaves has k - 1 splits), 8 bytes each.
        num_leaves = int(
            round(config.get("num_leaves") or config.get("max_leaves") or 1 << config.get("max_depth", 16))
        )
        n_estimators = int(round(config["n_estimators"]))
        return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        if "verbose" not in self.params:
            self.params["verbose"] = -1

        if self._task.is_classification():
            from lightgbm import LGBMClassifier

            self.estimator_class = LGBMClassifier

        elif task == "rank":
            from lightgbm import LGBMRanker

            self.estimator_class = LGBMRanker
        else:
            from lightgbm import LGBMRegressor

            self.estimator_class = LGBMRegressor

        self._time_per_iter = None
        self._train_size = 0
        self._mem_per_iter = -1
        self.HAS_CALLBACK = self.HAS_CALLBACK and self._callbacks(0, 0, 0) is not None

    def _preprocess(self, X):
        if not isinstance(X, DataFrame) and issparse(X) and np.issubdtype(X.dtype, np.integer):
            X = X.astype(float)
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # numpy array is not of numeric dtype
            X = DataFrame(X)
            for col in X.columns:
                if isinstance(X[col][0], str):
                    X[col] = X[col].astype("category").cat.codes
            X = X.to_numpy()
        return X

    def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
        start_time = time.time()
        deadline = start_time + budget if budget else np.inf
        n_iter = self.params.get(self.ITER_HP, self.DEFAULT_ITER)
        trained = False
        if not self.HAS_CALLBACK:
            mem0 = psutil.virtual_memory().available if psutil is not None else 1
            # Without callbacks, estimate the time and memory cost per iteration
            # (by fitting with 1 and then up to 4 iterations) whenever a budget is
            # given or the per-iteration memory cost is still unknown.
            if (
                (not self._time_per_iter or abs(self._train_size - X_train.shape[0]) > 4)
                and budget is not None
                or self._mem_per_iter < 0
                and psutil is not None
            ) and n_iter > 1:
                self.params[self.ITER_HP] = 1
                self._t1 = self._fit(X_train, y_train, **kwargs)
                if budget is not None and self._t1 >= budget or n_iter == 1:
                    return self._t1
                mem1 = psutil.virtual_memory().available if psutil is not None else 1
                self._mem1 = mem0 - mem1
                self.params[self.ITER_HP] = min(n_iter, 4)
                self._t2 = self._fit(X_train, y_train, **kwargs)
                mem2 = psutil.virtual_memory().available if psutil is not None else 1
                self._mem2 = max(mem0 - mem2, self._mem1)
                # if self._mem1 <= 0:
                #     self._mem_per_iter = self._mem2 / (self.params[self.ITER_HP] + 1)
                # elif self._mem2 <= 0:
                #     self._mem_per_iter = self._mem1
                # else:
                self._mem_per_iter = min(self._mem1, self._mem2 / self.params[self.ITER_HP])
                # if self._mem_per_iter <= 1 and psutil is not None:
                #     n_iter = self.params[self.ITER_HP]
                self._time_per_iter = (
                    (self._t2 - self._t1) / (self.params[self.ITER_HP] - 1)
                    if self._t2 > self._t1
                    else self._t1
                    if self._t1
                    else 0.001
                )
                self._train_size = X_train.shape[0]
                if budget is not None and self._t1 + self._t2 >= budget or n_iter == self.params[self.ITER_HP]:
                    # self.params[self.ITER_HP] = n_iter
                    return time.time() - start_time
                trained = True
            # logger.debug(mem0)
            # logger.debug(self._mem_per_iter)
            if n_iter > 1:
                # Cap the number of iterations by both the remaining time budget
                # and the estimated memory headroom.
                max_iter = min(
                    n_iter,
                    int((budget - time.time() + start_time - self._t1) / self._time_per_iter + 1)
                    if budget is not None
                    else n_iter,
                    int((1 - free_mem_ratio) * mem0 / self._mem_per_iter)
                    if psutil is not None and self._mem_per_iter > 0
                    else n_iter,
                )
                if trained and max_iter <= self.params[self.ITER_HP]:
                    return time.time() - start_time
                # when not trained, train at least one iter
                self.params[self.ITER_HP] = max(max_iter, 1)
        if self.HAS_CALLBACK:
            kwargs_callbacks = kwargs.get("callbacks")
            if kwargs_callbacks:
                callbacks = kwargs_callbacks + self._callbacks(start_time, deadline, free_mem_ratio)
                kwargs.pop("callbacks")
            else:
                callbacks = self._callbacks(start_time, deadline, free_mem_ratio)
            if isinstance(self, XGBoostSklearnEstimator):
                from xgboost import __version__

                if __version__ >= "1.6.0":
                    # since xgboost>=1.6.0, callbacks can't be passed in fit()
                    self.params["callbacks"] = callbacks
                    callbacks = None
            self._fit(
                X_train,
                y_train,
                callbacks=callbacks,
                **kwargs,
            )
            if callbacks is None:
                # for xgboost>=1.6.0, pop callbacks to enable pickle
                callbacks = self.params.pop("callbacks")
                self._model.set_params(callbacks=callbacks[:-1])
            best_iteration = (
                self._model.get_booster().best_iteration
                if isinstance(self, XGBoostSklearnEstimator)
                else self._model.best_iteration_
            )
            if best_iteration is not None:
                self._model.set_params(n_estimators=best_iteration + 1)
        else:
            self._fit(X_train, y_train, **kwargs)
        train_time = time.time() - start_time
        return train_time

    def _callbacks(self, start_time, deadline, free_mem_ratio) -> List[Callable]:
        return [partial(self._callback, start_time, deadline, free_mem_ratio)]

    def _callback(self, start_time, deadline, free_mem_ratio, env) -> None:
        from lightgbm.callback import EarlyStopException

        now = time.time()
        if env.iteration == 0:
            self._time_per_iter = now - start_time
        # Stop when the next iteration is projected to miss the deadline, or
        # when available memory drops below the requested free-memory ratio.
        if now + self._time_per_iter > deadline:
            raise EarlyStopException(env.iteration, env.evaluation_result_list)
        if psutil is not None:
            mem = psutil.virtual_memory()
            if mem.available / mem.total < free_mem_ratio:
                raise EarlyStopException(env.iteration, env.evaluation_result_list)

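# Illustrative usage sketch (assumed data, not part of FLAML): ``X`` and ``y``
# are an assumed numeric feature matrix and label vector.
#
# est = LGBMEstimator(task="binary", n_estimators=100, num_leaves=31)
# train_time = est.fit(X, y, budget=10)  # budget-aware: may stop early via callback
# y_pred = est.predict(X)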

class XGBoostEstimator(SKLearnEstimator):
    """The class for tuning XGBoost regressor, not using sklearn API."""

    DEFAULT_ITER = 10

    @classmethod
    def search_space(cls, data_size, **params):
        upper = max(5, min(32768, int(data_size[0])))  # upper must be larger than lower
        return {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_leaves": {
                "domain": tune.lograndint(lower=4, upper=upper),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_depth": {
                "domain": tune.choice([0, 6, 12]),
                "init_value": 0,
            },
            "min_child_weight": {
                "domain": tune.loguniform(lower=0.001, upper=128),
                "init_value": 1.0,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1.0),
                "init_value": 0.1,
            },
            "subsample": {
                "domain": tune.uniform(lower=0.1, upper=1.0),
                "init_value": 1.0,
            },
            "colsample_bylevel": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "colsample_bytree": {
                "domain": tune.uniform(lower=0.01, upper=1.0),
                "init_value": 1.0,
            },
            "reg_alpha": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1 / 1024,
            },
            "reg_lambda": {
                "domain": tune.loguniform(lower=1 / 1024, upper=1024),
                "init_value": 1.0,
            },
        }

    @classmethod
    def size(cls, config):
        return LGBMEstimator.size(config)

    @classmethod
    def cost_relative2lgbm(cls):
        return 1.6

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        max_depth = params["max_depth"] = params.get("max_depth", 0)
        if max_depth == 0:
            # max_depth=0 means unlimited depth, which requires leaf-wise
            # ("lossguide") growth with the "hist" tree method in XGBoost.
            params["grow_policy"] = params.get("grow_policy", "lossguide")
            params["tree_method"] = params.get("tree_method", "hist")
        # params["booster"] = params.get("booster", "gbtree")
        params["use_label_encoder"] = params.get("use_label_encoder", False)
        if "n_jobs" in config:
            params["nthread"] = params.pop("n_jobs")
        return params

    def __init__(
        self,
        task="regression",
        **config,
    ):
        super().__init__(task, **config)
        self.params["verbosity"] = 0

    def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
        import xgboost as xgb

        start_time = time.time()
        deadline = start_time + budget if budget else np.inf
        if issparse(X_train):
            if xgb.__version__ < "1.6.0":
                # "auto" fails for sparse input since xgboost 1.6.0
                self.params["tree_method"] = "auto"
        else:
            X_train = self._preprocess(X_train)
        if "sample_weight" in kwargs:
            dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs["sample_weight"])
        else:
            dtrain = xgb.DMatrix(X_train, label=y_train)

        objective = self.params.get("objective")
        # A callable objective is passed to xgb.train via ``obj``; a string
        # objective stays in params.
        if isinstance(objective, str):
            obj = None
        else:
            obj = objective
            if "objective" in self.params:
                del self.params["objective"]
        _n_estimators = self.params.pop("n_estimators")
        callbacks = XGBoostEstimator._callbacks(start_time, deadline, free_mem_ratio)
        if callbacks:
            self._model = xgb.train(
                self.params,
                dtrain,
                _n_estimators,
                obj=obj,
                callbacks=callbacks,
            )
            self.params["n_estimators"] = self._model.best_iteration + 1
        else:
            self._model = xgb.train(self.params, dtrain, _n_estimators, obj=obj)
            self.params["n_estimators"] = _n_estimators
        self.params["objective"] = objective
        del dtrain
        train_time = time.time() - start_time
        return train_time

    def predict(self, X, **kwargs):
        import xgboost as xgb

        if not issparse(X):
            X = self._preprocess(X)
        dtest = xgb.DMatrix(X)
        return super().predict(dtest, **kwargs)

    @classmethod
    def _callbacks(cls, start_time, deadline, free_mem_ratio):
        try:
            from xgboost.callback import TrainingCallback
        except ImportError:  # for xgboost<1.3
            return None

        class ResourceLimit(TrainingCallback):
            # Returning True from after_iteration tells xgboost to stop training.
            def after_iteration(self, model, epoch, evals_log) -> bool:
                now = time.time()
                if epoch == 0:
                    self._time_per_iter = now - start_time
                if now + self._time_per_iter > deadline:
                    return True
                if psutil is not None:
                    mem = psutil.virtual_memory()
                    if mem.available / mem.total < free_mem_ratio:
                        return True
                return False

        return [ResourceLimit()]

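# Illustrative usage sketch (assumed data, not part of FLAML): this estimator
# wraps xgboost's native train()/DMatrix API; predict() builds the DMatrix.
#
# est = XGBoostEstimator(task="regression", n_estimators=50)
# est.fit(X, y, budget=60)  # stops early via the ResourceLimit callback if needed
# y_pred = est.predict(X)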

class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
    """The class for tuning XGBoost with unlimited depth, using sklearn API."""

    DEFAULT_ITER = 10

    @classmethod
    def search_space(cls, data_size, **params):
        space = XGBoostEstimator.search_space(data_size)
        space.pop("max_depth")
        return space

    @classmethod
    def cost_relative2lgbm(cls):
        return XGBoostEstimator.cost_relative2lgbm()

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        max_depth = params["max_depth"] = params.get("max_depth", 0)
        if max_depth == 0:
            params["grow_policy"] = params.get("grow_policy", "lossguide")
            params["tree_method"] = params.get("tree_method", "hist")
        params["use_label_encoder"] = params.get("use_label_encoder", False)
        return params

    def __init__(
        self,
        task="binary",
        **config,
    ):
        super().__init__(task, **config)
        del self.params["verbose"]
        self.params["verbosity"] = 0
        import xgboost as xgb

        if "rank" == task:
            self.estimator_class = xgb.XGBRanker
        elif self._task.is_classification():
            self.estimator_class = xgb.XGBClassifier
        else:
            self.estimator_class = xgb.XGBRegressor

        self._xgb_version = xgb.__version__

    def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
        if issparse(X_train) and self._xgb_version < "1.6.0":
            # "auto" fails for sparse input since xgboost 1.6.0
            self.params["tree_method"] = "auto"
        if kwargs.get("gpu_per_trial"):
            self.params["tree_method"] = "gpu_hist"
            kwargs.pop("gpu_per_trial")
        return super().fit(X_train, y_train, budget, free_mem_ratio, **kwargs)

    def _callbacks(self, start_time, deadline, free_mem_ratio) -> List[Callable]:
        return XGBoostEstimator._callbacks(start_time, deadline, free_mem_ratio)


class XGBoostLimitDepthEstimator(XGBoostSklearnEstimator):
    """The class for tuning XGBoost with limited depth, using sklearn API."""

    @classmethod
    def search_space(cls, data_size, **params):
        space = XGBoostEstimator.search_space(data_size)
        space.pop("max_leaves")
        # cap the depth search by log2(#rows), e.g., 1e5 rows -> upper = 16
        upper = max(6, int(np.log2(data_size[0])))
        space["max_depth"] = {
            "domain": tune.randint(lower=1, upper=min(upper, 16)),
            "init_value": 6,
            "low_cost_init_value": 1,
        }
        space["learning_rate"]["init_value"] = 0.3
        space["n_estimators"]["init_value"] = 10
        return space

    @classmethod
    def cost_relative2lgbm(cls):
        return 64


class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):
    """The class for tuning Random Forest."""

    HAS_CALLBACK = False
    nrows = 101

    @classmethod
    def search_space(cls, data_size, task, **params):
        RandomForestEstimator.nrows = int(data_size[0])
        upper = min(2048, RandomForestEstimator.nrows)
        # "max_features" is a fraction of the feature count; the classification
        # init value 1/sqrt(n_features) mirrors sklearn's "sqrt" heuristic.
        init = 1 / np.sqrt(data_size[1]) if task.is_classification() else 1
        lower = min(0.1, init)
        space = {
            "n_estimators": {
                "domain": tune.lograndint(lower=4, upper=max(5, upper)),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
            "max_features": {
                "domain": tune.loguniform(lower=lower, upper=1.0),
                "init_value": init,
            },
            "max_leaves": {
                "domain": tune.lograndint(
                    lower=4,
                    upper=max(5, min(32768, RandomForestEstimator.nrows >> 1)),
                ),
                "init_value": 4,
                "low_cost_init_value": 4,
            },
        }
        if task.is_classification():
            space["criterion"] = {
                "domain": tune.choice(["gini", "entropy"]),
                # "init_value": "gini",
            }
        return space

    @classmethod
    def cost_relative2lgbm(cls):
        return 2

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        if "max_leaves" in params:
            params["max_leaf_nodes"] = params.get("max_leaf_nodes", params.pop("max_leaves"))
        if not self._task.is_classification() and "criterion" in config:
            params.pop("criterion")
        if "random_state" not in params:
            params["random_state"] = 12032022
        return params

    def __init__(
        self,
        task: Task,
        **params,
    ):
        super().__init__(task, **params)
        self.params["verbose"] = 0

        if self._task.is_classification():
            self.estimator_class = RandomForestClassifier
        else:
            self.estimator_class = RandomForestRegressor

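# Illustrative usage sketch (assumed data, not part of FLAML). Note that this
# class takes a Task object rather than a string, so task_factory is used:
#
# est = RandomForestEstimator(task_factory("binary"), n_estimators=50, max_features=0.5)
# est.fit(X, y, budget=20)
# y_pred = est.predict(X)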

class ExtraTreesEstimator(RandomForestEstimator):
    """The class for tuning Extra Trees."""

    @classmethod
    def cost_relative2lgbm(cls):
        return 1.9

    def __init__(self, task="binary", **params):
        if isinstance(task, str):
            from flaml.automl.task.factory import task_factory

            task = task_factory(task)
        super().__init__(task, **params)
        if task.is_regression():
            self.estimator_class = ExtraTreesRegressor
        else:
            self.estimator_class = ExtraTreesClassifier


class LRL1Classifier(SKLearnEstimator):
    """The class for tuning Logistic Regression with L1 regularization."""

    @classmethod
    def search_space(cls, **params):
        return {
            "C": {
                "domain": tune.loguniform(lower=0.03125, upper=32768.0),
                "init_value": 1.0,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 160

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params["tol"] = params.get("tol", 0.0001)
        # "saga" is used because sklearn's default "lbfgs" solver does not support L1.
        params["solver"] = params.get("solver", "saga")
        params["penalty"] = params.get("penalty", "l1")
        return params

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        assert self._task.is_classification(), "LogisticRegression is for classification tasks only"
        self.estimator_class = LogisticRegression


class LRL2Classifier(SKLearnEstimator):
    """The class for tuning Logistic Regression with L2 regularization."""

    limit_resource = True

    @classmethod
    def search_space(cls, **params):
        return LRL1Classifier.search_space(**params)

    @classmethod
    def cost_relative2lgbm(cls):
        return 25

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params["tol"] = params.get("tol", 0.0001)
        params["solver"] = params.get("solver", "lbfgs")
        params["penalty"] = params.get("penalty", "l2")
        return params

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        assert self._task.is_classification(), "LogisticRegression is for classification tasks only"
        self.estimator_class = LogisticRegression

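# Illustrative note (assumed data, not part of FLAML): both classes tune only
# the regularization strength C; the penalty/solver pairing is fixed in
# config2params to combinations scikit-learn supports.
#
# est = LRL2Classifier(task="binary", C=1.0)
# est.fit(X, y)
# y_pred = est.predict(X)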

class CatBoostEstimator(BaseEstimator):
    """The class for tuning CatBoost."""

    ITER_HP = "n_estimators"
    DEFAULT_ITER = 1000

    @classmethod
    def search_space(cls, data_size, **params):
        # Fewer early-stopping rounds for larger data, clamped to [12, 150]:
        # e.g., 100k rows -> round(1.5e6 / 1e5) = 15.
        upper = max(min(round(1500000 / data_size[0]), 150), 12)
        return {
            "early_stopping_rounds": {
                "domain": tune.lograndint(lower=10, upper=upper),
                "init_value": 10,
                "low_cost_init_value": 10,
            },
            "learning_rate": {
                "domain": tune.loguniform(lower=0.005, upper=0.2),
                "init_value": 0.1,
            },
            "n_estimators": {
                "domain": 8192,
                "init_value": 8192,
            },
        }

    @classmethod
    def size(cls, config):
        n_estimators = config.get("n_estimators", 8192)
        # CatBoost grows symmetric trees; with the default depth of 6 each tree
        # has at most 2**6 = 64 leaves.
        max_leaves = 64
        return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8

    @classmethod
    def cost_relative2lgbm(cls):
        return 15

    def _preprocess(self, X):
        if isinstance(X, DataFrame):
            cat_columns = X.select_dtypes(include=["category"]).columns
            if not cat_columns.empty:
                X = X.copy()
                # CatBoost accepts only int/str categorical values, so stringify
                # float category levels.
                X[cat_columns] = X[cat_columns].apply(
                    lambda x: x.cat.rename_categories([str(c) if isinstance(c, float) else c for c in x.cat.categories])
                )
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # numpy array is not of numeric dtype
            X = DataFrame(X)
            for col in X.columns:
                if isinstance(X[col][0], str):
                    X[col] = X[col].astype("category").cat.codes
            X = X.to_numpy()
        return X

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params["n_estimators"] = params.get("n_estimators", 8192)
        if "n_jobs" in params:
            params["thread_count"] = params.pop("n_jobs")
        return params

    def __init__(
        self,
        task="binary",
        **config,
    ):
        super().__init__(task, **config)
        self.params.update(
            {
                "verbose": config.get("verbose", False),
                "random_seed": config.get("random_seed", 10242048),
            }
        )
        if self._task.is_classification():
            from catboost import CatBoostClassifier

            self.estimator_class = CatBoostClassifier
        else:
            from catboost import CatBoostRegressor

            self.estimator_class = CatBoostRegressor

    def fit(self, X_train, y_train, budget=None, free_mem_ratio=0, **kwargs):
        start_time = time.time()
        deadline = start_time + budget if budget else np.inf
        train_dir = f"catboost_{str(start_time)}"
        X_train = self._preprocess(X_train)
        if isinstance(X_train, DataFrame):
            cat_features = list(X_train.select_dtypes(include="category").columns)
        else:
            cat_features = []
        use_best_model = kwargs.get("use_best_model", True)
        # Hold out the last min(10% of rows, 1000 rows) as the eval set when
        # use_best_model is enabled.
        n = max(int(len(y_train) * 0.9), len(y_train) - 1000) if use_best_model else len(y_train)
        X_tr, y_tr = X_train[:n], y_train[:n]
        from catboost import Pool, __version__

        eval_set = Pool(data=X_train[n:], label=y_train[n:], cat_features=cat_features) if use_best_model else None
        if "sample_weight" in kwargs:
            weight = kwargs["sample_weight"]
            if weight is not None:
                kwargs["sample_weight"] = weight[:n]
        else:
            weight = None

        model = self.estimator_class(train_dir=train_dir, **self.params)
        if __version__ >= "0.26":
            # catboost>=0.26 accepts callbacks in fit()
            model.fit(
                X_tr,
                y_tr,
                cat_features=cat_features,
                eval_set=eval_set,
                callbacks=CatBoostEstimator._callbacks(
                    start_time, deadline, free_mem_ratio if use_best_model else None
                ),
                **kwargs,
            )
        else:
            model.fit(
                X_tr,
                y_tr,
                cat_features=cat_features,
                eval_set=eval_set,
                **kwargs,
            )
        shutil.rmtree(train_dir, ignore_errors=True)
        if weight is not None:
            kwargs["sample_weight"] = weight
        self._model = model
        self.params[self.ITER_HP] = self._model.tree_count_
        train_time = time.time() - start_time
        return train_time

    @classmethod
    def _callbacks(cls, start_time, deadline, free_mem_ratio):
        class ResourceLimit:
            # Unlike the xgboost callback above, catboost's after_iteration
            # returns True to continue and False to stop.
            def after_iteration(self, info) -> bool:
                now = time.time()
                if info.iteration == 1:
                    self._time_per_iter = now - start_time
                if now + self._time_per_iter > deadline:
                    return False
                if psutil is not None and free_mem_ratio is not None:
                    mem = psutil.virtual_memory()
                    if mem.available / mem.total < free_mem_ratio:
                        return False
                return True  # can continue

        return [ResourceLimit()]

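# Illustrative usage sketch (assumed data, not part of FLAML):
#
# est = CatBoostEstimator(task="binary", learning_rate=0.1, early_stopping_rounds=10)
# est.fit(X, y, budget=30)            # holds out an eval set, keeps best iteration
# print(est.params["n_estimators"])   # updated to the trained tree count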

class KNeighborsEstimator(BaseEstimator):
    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(512, int(data_size[0] / 2))
        return {
            "n_neighbors": {
                "domain": tune.lograndint(lower=1, upper=max(2, upper)),
                "init_value": 5,
                "low_cost_init_value": 1,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 30

    def config2params(self, config: dict) -> dict:
        params = super().config2params(config)
        params["weights"] = params.get("weights", "distance")
        return params

    def __init__(self, task="binary", **config):
        super().__init__(task, **config)
        if self._task.is_classification():
            from sklearn.neighbors import KNeighborsClassifier

            self.estimator_class = KNeighborsClassifier
        else:
            from sklearn.neighbors import KNeighborsRegressor

            self.estimator_class = KNeighborsRegressor

    def _preprocess(self, X):
        if isinstance(X, DataFrame):
            cat_columns = X.select_dtypes(["category"]).columns
            if X.shape[1] == len(cat_columns):
                raise ValueError("KNeighborsEstimator requires at least one numeric feature")
            X = X.drop(cat_columns, axis=1)
        elif isinstance(X, np.ndarray) and X.dtype.kind not in "buif":
            # drop categorical columns if any
            X = DataFrame(X)
            cat_columns = []
            for col in X.columns:
                if isinstance(X[col][0], str):
                    cat_columns.append(col)
            X = X.drop(cat_columns, axis=1)
            X = X.to_numpy()
        return X

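# Illustrative usage sketch (assumed data, not part of FLAML): categorical
# columns are dropped in _preprocess, so ``X`` must keep at least one numeric
# feature.
#
# est = KNeighborsEstimator(task="binary", n_neighbors=5)
# est.fit(X, y)
# y_pred = est.predict(X)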

class suppress_stdout_stderr(object):
    def __init__(self):
        # Open a pair of null files
        self.null_fds = [os.open(os.devnull, os.O_RDWR) for _ in range(2)]
        # Save the actual stdout (1) and stderr (2) file descriptors.
        self.save_fds = (os.dup(1), os.dup(2))

    def __enter__(self):
        # Redirect stdout (1) and stderr (2) to the null files.
        os.dup2(self.null_fds[0], 1)
        os.dup2(self.null_fds[1], 2)

    def __exit__(self, *_):
        # Re-assign the real stdout/stderr back to (1) and (2)
        os.dup2(self.save_fds[0], 1)
        os.dup2(self.save_fds[1], 2)
        # Close the null files
        os.close(self.null_fds[0])
        os.close(self.null_fds[1])
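
# Illustrative usage sketch (not part of FLAML): because this redirects the
# process-level file descriptors 1 and 2, it also silences output from native
# (C/C++) libraries, not just Python-level prints. ``noisy_native_call`` is a
# hypothetical function.
#
# with suppress_stdout_stderr():
#     noisy_native_call()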