'''!
 * Copyright (c) 2020-2021 Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
'''

import numpy as np
import xgboost as xgb
import time
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import ExtraTreesRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor
from scipy.sparse import issparse
import pandas as pd
from . import tune

import logging

logger = logging.getLogger(__name__)


class BaseEstimator:
    '''The abstract class for all learners

    Typical examples:
        XGBoostEstimator: for regression
        XGBoostSklearnEstimator: for classification
        LGBMEstimator, RandomForestEstimator, LRL1Classifier, LRL2Classifier:
            for both regression and classification
    '''

    def __init__(self, task='binary:logistic', **params):
        '''Constructor

        Args:
            task: A string of the task type, one of
                'binary:logistic', 'multi:softmax', 'regression'
            n_jobs: An integer of the number of parallel threads
            params: A dictionary of the hyperparameter names and values
        '''
        self.params = params
        # set by subclasses; _model stays None until fit() is called
        self.estimator_class = self._model = None
        self._task = task
        if '_estimator_type' in params:
            self._estimator_type = params['_estimator_type']
        else:
            self._estimator_type = "regressor" if task == 'regression' \
                else "classifier"

    def get_params(self, deep=False):
        params = self.params.copy()
        params["task"] = self._task
        if hasattr(self, '_estimator_type'):
            params['_estimator_type'] = self._estimator_type
        return params

    @property
    def classes_(self):
        return self._model.classes_

    @property
    def n_features_in_(self):
        return self.model.n_features_in_

    @property
    def model(self):
        '''Trained model after fit() is called, or None before fit() is called
        '''
        return self._model

    def _preprocess(self, X):
        return X

    def _fit(self, X_train, y_train, **kwargs):
        current_time = time.time()
        X_train = self._preprocess(X_train)
        model = self.estimator_class(**self.params)
        model.fit(X_train, y_train, **kwargs)
        train_time = time.time() - current_time
        self._model = model
        return train_time

    def fit(self, X_train, y_train, budget=None, **kwargs):
        '''Train the model from given training data

        Args:
            X_train: A numpy array of training data in shape n*m
            y_train: A numpy array of labels in shape n*1
            budget: A float of the time budget in seconds

        Returns:
            train_time: A float of the training time in seconds
        '''
        return self._fit(X_train, y_train, **kwargs)

    def predict(self, X_test):
        '''Predict label from features

        Args:
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*1.
            Each element is the label for an instance
        '''
        if self._model is not None:
            X_test = self._preprocess(X_test)
            return self._model.predict(X_test)
        else:
            # no model has been trained; fall back to a constant label
            return np.ones(X_test.shape[0])

    def predict_proba(self, X_test):
        '''Predict the probability of each class from features

        Only works for classification problems

        Args:
            X_test: A numpy array of featurized instances, shape n*m

        Returns:
            A numpy array of shape n*c. c is the # classes
            Each element at (i,j) is the probability for instance i to be in
                class j
        '''
        if 'regression' in self._task:
            raise ValueError('Regression tasks do not support predict_proba')
        else:
            X_test = self._preprocess(X_test)
            return self._model.predict_proba(X_test)

    def cleanup(self):
        pass

    @classmethod
    def search_space(cls, **params):
        '''[required method] search space

        Returns:
            A dictionary of the search space.
            Each key is the name of a hyperparameter, and value is a dict with
                its domain and init_value (optional), cat_hp_cost (optional)
                e.g.,
                {'domain': tune.randint(lower=1, upper=10), 'init_value': 1}
        '''
        return {}

    @classmethod
    def size(cls, config):
        '''[optional method] memory size of the estimator in bytes

        Args:
            config: A dict of the hyperparameter config

        Returns:
            A float of the memory size required by the estimator to train the
                given config
        '''
        return 1.0

    @classmethod
    def cost_relative2lgbm(cls):
        '''[optional method] relative cost compared to lightgbm'''
        return 1.0


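# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the library): a new learner plugs in
# by subclassing BaseEstimator, setting `estimator_class`, and overriding
# `search_space`. The estimator choice (sklearn's Ridge) and its
# hyperparameter range below are hypothetical examples, not tuned defaults.
def _example_custom_estimator():
    from sklearn.linear_model import Ridge

    class _RidgeEstimator(BaseEstimator):

        @classmethod
        def search_space(cls, **params):
            return {
                'alpha': {
                    'domain': tune.loguniform(lower=1e-4, upper=10.0),
                    'init_value': 1.0,
                },
            }

        def __init__(self, task='regression', alpha=1.0, **params):
            super().__init__(task, **params)
            self.params = {'alpha': float(alpha)}
            self.estimator_class = Ridge

    return _RidgeEstimator

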
class SKLearnEstimator(BaseEstimator):

    def _preprocess(self, X):
        if isinstance(X, pd.DataFrame):
            X = X.copy()
            # sklearn estimators cannot consume pandas 'category' dtype;
            # replace each categorical column with its integer codes
            cat_columns = X.select_dtypes(include=['category']).columns
            X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)
        return X


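# Usage sketch (hypothetical data): SKLearnEstimator._preprocess replaces
# each pandas 'category' column with its integer codes before fitting.
def _example_category_preprocess():
    df = pd.DataFrame({
        'color': pd.Series(['red', 'blue', 'red'], dtype='category'),
        'size': [1.0, 2.0, 3.0],
    })
    # 'color' becomes codes [1, 0, 1]; 'size' is left unchanged
    return SKLearnEstimator(task='binary:logistic')._preprocess(df)

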
class LGBMEstimator(BaseEstimator):

    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(32768, int(data_size))
        return {
            'n_estimators': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'num_leaves': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'min_child_samples': {
                'domain': tune.qloguniform(lower=2, upper=2**7, q=1),
                'init_value': 20,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1.0),
                'init_value': 0.1,
            },
            'subsample': {
                'domain': tune.uniform(lower=0.1, upper=1.0),
                'init_value': 1.0,
            },
            'log_max_bin': {
                'domain': tune.qloguniform(lower=3, upper=10, q=1),
                'init_value': 8,
            },
            'colsample_bytree': {
                'domain': tune.uniform(lower=0.01, upper=1.0),
                'init_value': 1.0,
            },
            'reg_alpha': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1024),
                'init_value': 1 / 1024,
            },
            'reg_lambda': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1024),
                'init_value': 1.0,
            },
        }

    @classmethod
    def size(cls, config):
        num_leaves = int(round(config.get('num_leaves') or config['max_leaves']))
        n_estimators = int(round(config['n_estimators']))
        # rough per-tree footprint: 3 values per leaf, 4 per split node,
        # 8 bytes each
        return (num_leaves * 3 + (num_leaves - 1) * 4 + 1.0) * n_estimators * 8

    def __init__(
        self, task='binary:logistic', n_jobs=1,
        n_estimators=2, num_leaves=2, min_child_samples=20, learning_rate=0.1,
        subsample=1.0, reg_lambda=1.0, reg_alpha=0.0,
        colsample_bytree=1.0, log_max_bin=8, **params
    ):
        super().__init__(task, **params)
        # objective default: 'regression' for LGBMRegressor,
        # 'binary' or 'multiclass' for LGBMClassifier
        if 'regression' in task:
            objective = 'regression'
        elif 'binary' in task:
            objective = 'binary'
        elif 'multi' in task:
            objective = 'multiclass'
        else:
            objective = 'regression'
        self.params = {
            "n_estimators": int(round(n_estimators)),
            "num_leaves": int(round(num_leaves)),
            'objective': params.get("objective", objective),
            'n_jobs': n_jobs,
            'learning_rate': float(learning_rate),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_samples': int(round(min_child_samples)),
            'colsample_bytree': float(colsample_bytree),
            'subsample': float(subsample),
        }
        # max_bin is 2^log_max_bin - 1 when derived from log_max_bin
        self.params['max_bin'] = params['max_bin'] if 'max_bin' in params else (
            1 << int(round(log_max_bin))) - 1
        if 'regression' in task:
            self.estimator_class = LGBMRegressor
        else:
            self.estimator_class = LGBMClassifier
        self._time_per_iter = None
        self._train_size = 0

    def _preprocess(self, X):
        # LightGBM accepts sparse input, but integer-typed sparse matrices
        # need to be cast to float first
        if not isinstance(X, pd.DataFrame) and issparse(X) and np.issubdtype(
                X.dtype, np.integer):
            X = X.astype(float)
        return X

    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        n_iter = self.params["n_estimators"]
        if (not self._time_per_iter or abs(
                self._train_size - X_train.shape[0]) > 4) and budget is not None:
            # estimate the time per boosting iteration from a 1-round
            # and a 4-round fit
            self.params["n_estimators"] = 1
            self._t1 = self._fit(X_train, y_train, **kwargs)
            if self._t1 >= budget:
                self.params["n_estimators"] = n_iter
                return self._t1
            self.params["n_estimators"] = 4
            self._t2 = self._fit(X_train, y_train, **kwargs)
            self._time_per_iter = (self._t2 - self._t1) / (
                self.params["n_estimators"] - 1) if self._t2 > self._t1 \
                else self._t1 if self._t1 else 0.001
            self._train_size = X_train.shape[0]
            if self._t1 + self._t2 >= budget or n_iter == self.params[
                    "n_estimators"]:
                self.params["n_estimators"] = n_iter
                return time.time() - start_time
        if budget is not None:
            # cap the number of iterations by the remaining time budget
            self.params["n_estimators"] = min(n_iter, int(
                (budget - time.time() + start_time - self._t1)
                / self._time_per_iter + 1))
        if self.params["n_estimators"] > 0:
            self._fit(X_train, y_train, **kwargs)
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time


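# Usage sketch (hypothetical data and budget): LGBMEstimator.fit times a
# 1-iteration and a 4-iteration fit to estimate seconds per boosting round,
# then shrinks n_estimators so the final fit stays within `budget`.
def _example_budget_aware_fit(X_train, y_train):
    estimator = LGBMEstimator(task='regression', n_estimators=100)
    train_time = estimator.fit(X_train, y_train, budget=10)  # 10-second budget
    return estimator.predict(X_train), train_time

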
class XGBoostEstimator(SKLearnEstimator):
    ''' not using sklearn API, used for regression '''

    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(32768, int(data_size))
        return {
            'n_estimators': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'max_leaves': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'min_child_weight': {
                'domain': tune.loguniform(lower=0.001, upper=128),
                'init_value': 1,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1.0),
                'init_value': 0.1,
            },
            'subsample': {
                'domain': tune.uniform(lower=0.1, upper=1.0),
                'init_value': 1.0,
            },
            'colsample_bylevel': {
                'domain': tune.uniform(lower=0.01, upper=1.0),
                'init_value': 1.0,
            },
            'colsample_bytree': {
                'domain': tune.uniform(lower=0.01, upper=1.0),
                'init_value': 1.0,
            },
            'reg_alpha': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1024),
                'init_value': 1 / 1024,
            },
            'reg_lambda': {
                'domain': tune.loguniform(lower=1 / 1024, upper=1024),
                'init_value': 1.0,
            },
        }

    @classmethod
    def size(cls, config):
        return LGBMEstimator.size(config)

    @classmethod
    def cost_relative2lgbm(cls):
        return 1.6

    def __init__(
        self, task='regression', all_thread=False, n_jobs=1,
        n_estimators=4, max_leaves=4, subsample=1.0, min_child_weight=1,
        learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0, colsample_bylevel=1.0,
        colsample_bytree=1.0, tree_method='auto', **params
    ):
        super().__init__(task, **params)
        self._n_estimators = int(round(n_estimators))
        self.params = {
            'max_leaves': int(round(max_leaves)),
            'max_depth': params.get('max_depth', 0),
            'grow_policy': params.get("grow_policy", 'lossguide'),
            'tree_method': tree_method,
            'verbosity': params.get('verbosity', 0),
            'nthread': n_jobs,
            'learning_rate': float(learning_rate),
            'subsample': float(subsample),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_weight': float(min_child_weight),
            'booster': params.get('booster', 'gbtree'),
            'colsample_bylevel': float(colsample_bylevel),
            'colsample_bytree': float(colsample_bytree),
            'objective': params.get("objective"),
        }
        if all_thread:
            # let xgboost use all threads by leaving nthread unset
            del self.params['nthread']

    def get_params(self, deep=False):
        params = super().get_params()
        params["n_jobs"] = params['nthread']
        return params

    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        if not issparse(X_train):
            self.params['tree_method'] = 'hist'
            X_train = self._preprocess(X_train)
        if 'sample_weight' in kwargs:
            dtrain = xgb.DMatrix(X_train, label=y_train, weight=kwargs[
                'sample_weight'])
        else:
            dtrain = xgb.DMatrix(X_train, label=y_train)

        objective = self.params.get('objective')
        if isinstance(objective, str):
            obj = None
        else:
            # a callable objective is passed to xgb.train directly,
            # not through the params dict
            obj = objective
            if 'objective' in self.params:
                del self.params['objective']
        self._model = xgb.train(self.params, dtrain, self._n_estimators,
                                obj=obj)
        self.params['objective'] = objective
        del dtrain
        train_time = time.time() - start_time
        return train_time

    def predict(self, X_test):
        if not issparse(X_test):
            X_test = self._preprocess(X_test)
        dtest = xgb.DMatrix(X_test)
        return super().predict(dtest)


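# Sketch of the native-API path taken by XGBoostEstimator.fit (assumes dense
# numeric inputs): wrap the data in a DMatrix and call xgb.train for a fixed
# number of boosting rounds, bypassing the sklearn wrapper.
def _example_xgb_native_train(X_train, y_train):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    params = {'tree_method': 'hist', 'grow_policy': 'lossguide',
              'max_depth': 0, 'max_leaves': 4, 'verbosity': 0}
    return xgb.train(params, dtrain, num_boost_round=4)

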
class XGBoostSklearnEstimator(SKLearnEstimator, LGBMEstimator):
    ''' using sklearn API, used for classification '''

    @classmethod
    def search_space(cls, data_size, **params):
        return XGBoostEstimator.search_space(data_size)

    @classmethod
    def cost_relative2lgbm(cls):
        return XGBoostEstimator.cost_relative2lgbm()

    def __init__(
        self, task='binary:logistic', n_jobs=1,
        n_estimators=4, max_leaves=4, subsample=1.0,
        min_child_weight=1, learning_rate=0.1, reg_lambda=1.0, reg_alpha=0.0,
        colsample_bylevel=1.0, colsample_bytree=1.0, tree_method='hist',
        **params
    ):
        super().__init__(task, **params)
        self.params = {
            "n_estimators": int(round(n_estimators)),
            'max_leaves': int(round(max_leaves)),
            'max_depth': 0,
            'grow_policy': params.get("grow_policy", 'lossguide'),
            'tree_method': tree_method,
            'verbosity': 0,
            'n_jobs': n_jobs,
            'learning_rate': float(learning_rate),
            'subsample': float(subsample),
            'reg_alpha': float(reg_alpha),
            'reg_lambda': float(reg_lambda),
            'min_child_weight': float(min_child_weight),
            'booster': params.get('booster', 'gbtree'),
            'colsample_bylevel': float(colsample_bylevel),
            'colsample_bytree': float(colsample_bytree),
        }
        if 'regression' in task:
            self.estimator_class = xgb.XGBRegressor
        else:
            self.estimator_class = xgb.XGBClassifier
        self._time_per_iter = None
        self._train_size = 0

    def fit(self, X_train, y_train, budget=None, **kwargs):
        if issparse(X_train):
            self.params['tree_method'] = 'auto'
        return super().fit(X_train, y_train, budget, **kwargs)


class RandomForestEstimator(SKLearnEstimator, LGBMEstimator):

    @classmethod
    def search_space(cls, data_size, task, **params):
        upper = min(2048, int(data_size))
        space = {
            'n_estimators': {
                'domain': tune.qloguniform(lower=4, upper=upper, q=1),
                'init_value': 4,
                'low_cost_init_value': 4,
            },
            'max_features': {
                'domain': tune.loguniform(lower=0.1, upper=1.0),
                'init_value': 1.0,
            },
        }
        if task != 'regression':
            space['criterion'] = {
                'domain': tune.choice(['gini', 'entropy']),
                # 'init_value': 'gini',
            }
        return space

    @classmethod
    def size(cls, config):
        return 1.0

    @classmethod
    def cost_relative2lgbm(cls):
        return 2.0

    def __init__(
        self, task='binary:logistic', n_jobs=1,
        n_estimators=4, max_features=1.0, criterion='gini', **params
    ):
        super().__init__(task, **params)
        self.params = {
            "n_estimators": int(round(n_estimators)),
            "n_jobs": n_jobs,
            'max_features': float(max_features),
        }
        if 'regression' in task:
            self.estimator_class = RandomForestRegressor
        else:
            self.estimator_class = RandomForestClassifier
            # criterion is only tuned for classification
            self.params['criterion'] = criterion
        self._time_per_iter = None
        self._train_size = 0

    def get_params(self, deep=False):
        params = super().get_params()
        return params


class ExtraTreeEstimator(RandomForestEstimator):

    @classmethod
    def cost_relative2lgbm(cls):
        return 1.9

    def __init__(self, task='binary:logistic', **params):
        super().__init__(task, **params)
        if 'regression' in task:
            self.estimator_class = ExtraTreesRegressor
        else:
            self.estimator_class = ExtraTreesClassifier


class LRL1Classifier(SKLearnEstimator):

    @classmethod
    def search_space(cls, **params):
        return {
            'C': {
                'domain': tune.loguniform(lower=0.03125, upper=32768.0),
                'init_value': 1.0,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 160

    def __init__(
        self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
        **params
    ):
        super().__init__(task, **params)
        self.params = {
            'penalty': params.get("penalty", 'l1'),
            'tol': float(tol),
            'C': float(C),
            'solver': params.get("solver", 'saga'),
            'n_jobs': n_jobs,
        }
        if 'regression' in task:
            self.estimator_class = None
            raise NotImplementedError('LR does not support regression task')
        else:
            self.estimator_class = LogisticRegression


class LRL2Classifier(SKLearnEstimator):

    @classmethod
    def search_space(cls, **params):
        return LRL1Classifier.search_space(**params)

    @classmethod
    def cost_relative2lgbm(cls):
        return 25

    def __init__(
        self, task='binary:logistic', n_jobs=1, tol=0.0001, C=1.0,
        **params
    ):
        super().__init__(task, **params)
        self.params = {
            'penalty': params.get("penalty", 'l2'),
            'tol': float(tol),
            'C': float(C),
            'solver': params.get("solver", 'lbfgs'),
            'n_jobs': n_jobs,
        }
        if 'regression' in task:
            self.estimator_class = None
            raise NotImplementedError('LR does not support regression task')
        else:
            self.estimator_class = LogisticRegression


class CatBoostEstimator(BaseEstimator):

    _time_per_iter = None
    _train_size = 0

    @classmethod
    def search_space(cls, data_size, **params):
        upper = max(min(round(1500000 / data_size), 150), 11)
        return {
            'early_stopping_rounds': {
                'domain': tune.qloguniform(lower=10, upper=upper, q=1),
                'init_value': 10,
                'low_cost_init_value': 10,
            },
            'learning_rate': {
                'domain': tune.loguniform(lower=.005, upper=.2),
                'init_value': 0.1,
            },
        }

    @classmethod
    def size(cls, config):
        n_estimators = 8192
        max_leaves = 64
        return (max_leaves * 3 + (max_leaves - 1) * 4 + 1.0) * n_estimators * 8

    @classmethod
    def cost_relative2lgbm(cls):
        return 15

    def __init__(
        self, task='binary:logistic', n_jobs=1,
        n_estimators=8192, learning_rate=0.1, early_stopping_rounds=4, **params
    ):
        super().__init__(task, **params)
        self.params = {
            "early_stopping_rounds": int(round(early_stopping_rounds)),
            "n_estimators": n_estimators,
            'learning_rate': learning_rate,
            'thread_count': n_jobs,
            'verbose': params.get('verbose', False),
            'random_seed': params.get("random_seed", 10242048),
        }
        if 'regression' in task:
            from catboost import CatBoostRegressor
            self.estimator_class = CatBoostRegressor
        else:
            from catboost import CatBoostClassifier
            self.estimator_class = CatBoostClassifier

    def get_params(self, deep=False):
        params = super().get_params()
        params['n_jobs'] = params['thread_count']
        return params

    def fit(self, X_train, y_train, budget=None, **kwargs):
        start_time = time.time()
        n_iter = self.params["n_estimators"]
        if isinstance(X_train, pd.DataFrame):
            cat_features = list(X_train.select_dtypes(
                include='category').columns)
        else:
            cat_features = []
        from catboost import CatBoostError
        try:
            if (not CatBoostEstimator._time_per_iter or abs(
                    CatBoostEstimator._train_size - len(y_train)) > 4) and budget:
                # measure the time per iteration
                self.params["n_estimators"] = 1
                CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
                CatBoostEstimator._smallmodel.fit(
                    X_train, y_train, cat_features=cat_features, **kwargs)
                CatBoostEstimator._t1 = time.time() - start_time
                if CatBoostEstimator._t1 >= budget:
                    self.params["n_estimators"] = n_iter
                    self._model = CatBoostEstimator._smallmodel
                    return CatBoostEstimator._t1
                self.params["n_estimators"] = 4
                CatBoostEstimator._smallmodel = self.estimator_class(**self.params)
                CatBoostEstimator._smallmodel.fit(
                    X_train, y_train, cat_features=cat_features, **kwargs)
                CatBoostEstimator._time_per_iter = (
                    time.time() - start_time - CatBoostEstimator._t1) / (
                    self.params["n_estimators"] - 1)
                if CatBoostEstimator._time_per_iter <= 0:
                    CatBoostEstimator._time_per_iter = CatBoostEstimator._t1
                CatBoostEstimator._train_size = len(y_train)
                if time.time() - start_time >= budget or n_iter == self.params[
                        "n_estimators"]:
                    self.params["n_estimators"] = n_iter
                    self._model = CatBoostEstimator._smallmodel
                    return time.time() - start_time
            if budget:
                train_times = 1
                self.params["n_estimators"] = min(n_iter, int(
                    (budget - time.time() + start_time - CatBoostEstimator._t1)
                    / train_times / CatBoostEstimator._time_per_iter + 1))
                self._model = CatBoostEstimator._smallmodel
            if self.params["n_estimators"] > 0:
                # hold out the last 10% (at most 1000) rows as the eval set
                # for early stopping
                n = max(int(len(y_train) * 0.9), len(y_train) - 1000)
                X_tr, y_tr = X_train[:n], y_train[:n]
                if 'sample_weight' in kwargs:
                    weight = kwargs['sample_weight']
                    if weight is not None:
                        kwargs['sample_weight'] = weight[:n]
                else:
                    weight = None
                from catboost import Pool
                model = self.estimator_class(**self.params)
                model.fit(
                    X_tr, y_tr, cat_features=cat_features,
                    eval_set=Pool(
                        data=X_train[n:], label=y_train[n:],
                        cat_features=cat_features),
                    **kwargs)  # model.get_best_iteration()
                if weight is not None:
                    kwargs['sample_weight'] = weight
                self._model = model
        except CatBoostError:
            self._model = None
        self.params["n_estimators"] = n_iter
        train_time = time.time() - start_time
        return train_time


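# Sketch of the holdout rule inside CatBoostEstimator.fit: the last 10% of
# rows (capped at 1000) form the eval_set, so CatBoost's early stopping can
# pick the best iteration within the remaining budget.
def _example_catboost_holdout(n_rows):
    n = max(int(n_rows * 0.9), n_rows - 1000)
    return n  # rows [:n] train; rows [n:] validate

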
class KNeighborsEstimator(BaseEstimator):

    @classmethod
    def search_space(cls, data_size, **params):
        upper = min(512, int(data_size / 2))
        return {
            'n_neighbors': {
                'domain': tune.qloguniform(lower=1, upper=upper, q=1),
                'init_value': 5,
                'low_cost_init_value': 1,
            },
        }

    @classmethod
    def cost_relative2lgbm(cls):
        return 30

    def __init__(
        self, task='binary:logistic', n_jobs=1, n_neighbors=5, **params
    ):
        super().__init__(task, **params)
        self.params = {
            'n_neighbors': int(round(n_neighbors)),
            'weights': params.get('weights', 'distance'),
            'n_jobs': n_jobs,
        }
        if 'regression' in task:
            from sklearn.neighbors import KNeighborsRegressor
            self.estimator_class = KNeighborsRegressor
        else:
            from sklearn.neighbors import KNeighborsClassifier
            self.estimator_class = KNeighborsClassifier

    def _preprocess(self, X):
        if isinstance(X, pd.DataFrame):
            # drop categorical columns; distance is only defined on numerics
            cat_columns = X.select_dtypes(['category']).columns
            if X.shape[1] == len(cat_columns):
                raise ValueError(
                    "KNeighborsEstimator requires at least one numeric feature")
            X = X.drop(cat_columns, axis=1)
        return X