
* add basic support to Spark dataframe; add support to SynapseML LightGBM model; update to pyspark>=3.2.0 to leverage pandas_on_Spark API
* clean code, add TODOs
* add sample_train_data for pyspark.pandas dataframe, fix bugs
* improve some functions, fix bugs
* fix dict change size during iteration
* update model predict
* update LightGBM model, update test
* update SynapseML LightGBM params
* update SynapseML and tests
* update TODOs
* Added support for roc_auc for spark models
* Added support for score of spark estimator
* Added test for automl score of spark estimator
* Added cv support to pyspark.pandas dataframe
* Update test, fix bugs
* Added tests
* Updated docs, tests, added a notebook
* Fix bugs in non-spark env
* Fix bugs and improve tests
* Fix uninstall pyspark
* Fix test errors
* Fix java.lang.OutOfMemoryError: Java heap space
* Fix test_performance
* Update test_sparkml to test_0sparkml to use the expected spark conf
* Remove unnecessary widgets in notebook
* Fix iloc java.lang.StackOverflowError
* fix pre-commit
* Added params check for spark dataframes
* Refactor train_test_split code into a function
* Update train_test_split_pyspark
* Refactor if-else, remove unnecessary code
* Remove y from predict, remove mem control from n_iter compute
* Update workflow
* Improve _split_pyspark
* Fix test failure of too short training time
* Fix typos, improve docstrings
* Fix index errors of pandas_on_spark, add spark loss metric
* Fix typo of ndcgAtK
* Update NDCG metrics and tests
* Remove unused logger
* Use cache and count to ensure consistent indexes
* refactor for merging main
* fix errors of refactor
* Updated SparkLightGBMEstimator and cache
* Updated config2params
* Remove unused import
* Fix unknown parameters
* Update default_estimator_list
* Add unit tests for spark metrics
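The commit above adds pandas-on-Spark and SynapseML LightGBM support to FLAML's AutoML. Below is a minimal usage sketch; it assumes a running Spark session, the lgbm_spark estimator name, and the dataframe/label arguments of AutoML.fit, so treat the exact names as illustrative rather than as the definitive API introduced by this commit.

import pandas as pd
import pyspark.pandas as ps
from flaml import AutoML

# Toy data; any pandas DataFrame converted to pandas-on-Spark works the same way.
pdf = pd.DataFrame({"x1": range(100), "x2": range(100, 200), "y": [i % 2 for i in range(100)]})
psdf = ps.from_pandas(pdf)  # pandas-on-Spark DataFrame (requires pyspark>=3.2.0)

automl = AutoML()
automl.fit(
    dataframe=psdf,                  # pandas-on-Spark input enabled by this change
    label="y",
    task="classification",
    estimator_list=["lgbm_spark"],   # assumed name of the SynapseML LightGBM estimator
    time_budget=30,
)
print(automl.best_config)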
459 lines
17 KiB
Python
import inspect
import time
import os
from typing import Any, Optional

import numpy as np
import pandas as pd

from flaml import tune
from flaml.automl.logger import logger
from flaml.automl.ml import compute_estimator, train_estimator
from flaml.automl.task.task import TS_FORECAST

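# Spark-related utilities and pyspark.pandas are optional dependencies: when they are
# not installed, the imports below fall back to the plain (non-Spark) implementations.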
try:
    from flaml.automl.spark.utils import (
        train_test_split_pyspark,
        unique_pandas_on_spark,
        len_labels,
        unique_value_first_index,
    )
except ImportError:
    train_test_split_pyspark = None
    unique_pandas_on_spark = None
    from flaml.automl.utils import (
        len_labels,
        unique_value_first_index,
    )
try:
    os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
    import pyspark.pandas as ps
    from pyspark.pandas import DataFrame as psDataFrame, Series as psSeries
    from pyspark.pandas.config import set_option, reset_option
except ImportError:
    ps = None

    class psDataFrame:
        pass

    class psSeries:
        pass


class SearchState:
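    """Per-estimator search state: the hyperparameter search space, the best
    configuration and loss found so far, and bookkeeping of the time and
    iterations used during hyperparameter search."""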
    @property
    def search_space(self):
        return self._search_space_domain

    @property
    def estimated_cost4improvement(self):
        return max(
            self.time_best_found - self.time_best_found_old,
            self.total_time_used - self.time_best_found,
        )

    def valid_starting_point_one_dim(self, value_one_dim, domain_one_dim):
        from flaml.tune.space import sample

        """
        For each hp in the starting point, check the following 3 conditions:
        (1) If the type of the starting point does not match the required type in the search space, return False.
        (2) If the starting point is not in the required search space, return False.
        (3) If the search space is a fixed value instead of a domain, and the value is not equal to the starting point, return False.
        Note that (2) includes the case where the starting point is not in the user-specified search space custom_hp.
        """
        if isinstance(domain_one_dim, sample.Domain):
            renamed_type = list(
                inspect.signature(domain_one_dim.is_valid).parameters.values()
            )[0].annotation
            type_match = (
                renamed_type == Any
                or isinstance(value_one_dim, renamed_type)
                or isinstance(value_one_dim, int)
                and renamed_type is float
            )
            if not (type_match and domain_one_dim.is_valid(value_one_dim)):
                return False
        elif value_one_dim != domain_one_dim:
            return False
        return True

    def valid_starting_point(self, starting_point, search_space):
        return all(
            self.valid_starting_point_one_dim(value, search_space[name].get("domain"))
            for name, value in starting_point.items()
            if name != "FLAML_sample_size"
        )

    def __init__(
        self,
        learner_class,
        data_size,
        task,
        starting_point=None,
        period=None,
        custom_hp=None,
        max_iter=None,
        budget=None,
    ):
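        """Build the search space for `learner_class` on a dataset of size `data_size`
        for `task`, validate any provided `starting_point`(s), and initialize the
        bookkeeping attributes used during hyperparameter search."""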
        self.init_eci = learner_class.cost_relative2lgbm() if budget >= 0 else 1
        self._search_space_domain = {}
        self.init_config = None
        self.low_cost_partial_config = {}
        self.cat_hp_cost = {}
        self.data_size = data_size
        self.ls_ever_converged = False
        self.learner_class = learner_class
        self._budget = budget
        if task in TS_FORECAST:
            search_space = learner_class.search_space(
                data_size=data_size, task=task, pred_horizon=period
            )
        else:
            search_space = learner_class.search_space(data_size=data_size, task=task)

        if custom_hp is not None:
            search_space.update(custom_hp)

        if isinstance(starting_point, dict):
            starting_point = AutoMLState.sanitize(starting_point)
            if max_iter > 1 and not self.valid_starting_point(
                starting_point, search_space
            ):
                # If more than one iteration will be run, remove an invalid starting point
                logger.warning(
                    "Starting point {} removed because it is outside of the search space".format(
                        starting_point
                    )
                )
                starting_point = None
        elif isinstance(starting_point, list):
            starting_point = [AutoMLState.sanitize(x) for x in starting_point]
            if max_iter > len(starting_point):
                # Only validate the starting points when there are fewer of them than max_iter
                starting_point_len = len(starting_point)
                starting_point = [
                    x
                    for x in starting_point
                    if self.valid_starting_point(x, search_space)
                ]
                if starting_point_len > len(starting_point):
                    logger.warning(
                        "Starting points outside of the search space are removed. "
                        f"Remaining starting points for {learner_class}: {starting_point}"
                    )
                starting_point = starting_point or None

        for name, space in search_space.items():
            assert (
                "domain" in space
            ), f"{name}'s domain is missing in the search space spec {space}"
            if space["domain"] is None:
                # don't search this hp
                continue
            self._search_space_domain[name] = space["domain"]

            if "low_cost_init_value" in space:
                self.low_cost_partial_config[name] = space["low_cost_init_value"]
            if "cat_hp_cost" in space:
                self.cat_hp_cost[name] = space["cat_hp_cost"]
            # if a starting point is provided, set the init config to be
            # the starting point provided
            if (
                isinstance(starting_point, dict)
                and starting_point.get(name) is not None
            ):
                if self.init_config is None:
                    self.init_config = {}
                self.init_config[name] = starting_point[name]
            elif (
                not isinstance(starting_point, list)
                and "init_value" in space
                and self.valid_starting_point_one_dim(
                    space["init_value"], space["domain"]
                )
            ):
                if self.init_config is None:
                    self.init_config = {}
                self.init_config[name] = space["init_value"]

        if isinstance(starting_point, list):
            self.init_config = starting_point
        else:
            self.init_config = [] if self.init_config is None else [self.init_config]

        self._hp_names = list(self._search_space_domain.keys())
        self.search_alg = None
        self.best_config = None
        self.best_result = None
        self.best_loss = self.best_loss_old = np.inf
        self.total_time_used = 0
        self.total_iter = 0
        self.base_eci = None
        self.time_best_found = self.time_best_found_old = 0
        self.time2eval_best = 0
        self.time2eval_best_old = 0
        self.trained_estimator = None
        self.sample_size = None
        self.trial_time = 0

    def update(self, result, time_used):
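        """Record the result of one trial: update the time/iteration counters and,
        if the trial improved the best loss, the best config and its statistics."""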
        if result:
            config = result["config"]
            if config and "FLAML_sample_size" in config:
                self.sample_size = config["FLAML_sample_size"]
            else:
                self.sample_size = self.data_size[0]
            obj = result["val_loss"]
            metric_for_logging = result["metric_for_logging"]
            time2eval = result["time_total_s"]
            trained_estimator = result["trained_estimator"]
            del result["trained_estimator"]  # free up RAM
            n_iter = (
                trained_estimator
                and hasattr(trained_estimator, "ITER_HP")
                and trained_estimator.params.get(trained_estimator.ITER_HP)
            )
            if n_iter:
                if "ml" in config:
                    config["ml"][trained_estimator.ITER_HP] = n_iter
                else:
                    config[trained_estimator.ITER_HP] = n_iter
        else:
            obj, time2eval, trained_estimator = np.inf, 0.0, None
            metric_for_logging = config = None
        self.trial_time = time2eval
        self.total_time_used += time_used if self._budget >= 0 else 1
        self.total_iter += 1

        if self.base_eci is None:
            self.base_eci = time_used
        if (obj is not None) and (obj < self.best_loss):
            self.best_loss_old = self.best_loss if self.best_loss < np.inf else 2 * obj
            self.best_loss = obj
            self.best_result = result
            self.time_best_found_old = self.time_best_found
            self.time_best_found = self.total_time_used
            self.iter_best_found = self.total_iter
            self.best_config = config
            self.best_config_sample_size = self.sample_size
            self.best_config_train_time = time_used
            if time2eval:
                self.time2eval_best_old = self.time2eval_best
                self.time2eval_best = time2eval
            if (
                self.trained_estimator
                and trained_estimator
                and self.trained_estimator != trained_estimator
            ):
                self.trained_estimator.cleanup()
            if trained_estimator:
                self.trained_estimator = trained_estimator
        elif trained_estimator:
            trained_estimator.cleanup()
        self.metric_for_logging = metric_for_logging
        self.val_loss, self.config = obj, config

    def get_hist_config_sig(self, sample_size, config):
        config_values = tuple([config[k] for k in self._hp_names if k in config])
        config_sig = str(sample_size) + "_" + str(config_values)
        return config_sig

    def est_retrain_time(self, retrain_sample_size):
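        # Estimate the retraining time by scaling the evaluation time of the best
        # config linearly with the ratio of retrain_sample_size to the sample size
        # on which the best config was evaluated.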
        assert (
            self.best_config_sample_size is not None
        ), "need to first get best_config_sample_size"
        return self.time2eval_best * retrain_sample_size / self.best_config_sample_size


class AutoMLState:
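    """Holds the data and settings used during an AutoML run and provides helpers to
    evaluate a configuration (_compute_with_config_base) and to train an estimator
    with a given configuration (_train_with_config)."""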
    def _prepare_sample_train_data(self, sample_size: int):
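        """Slice the first `sample_size` rows of the training data (or return the full
        data when `sample_size` exceeds it), together with the matching sample weights
        and group labels when they are provided."""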
        sampled_weight = groups = None
        if sample_size <= self.data_size[0]:
            if isinstance(self.X_train, (pd.DataFrame, psDataFrame)):
                sampled_X_train = self.X_train.iloc[:sample_size]
            else:
                sampled_X_train = self.X_train[:sample_size]
            if isinstance(self.y_train, (pd.Series, psSeries)):
                sampled_y_train = self.y_train.iloc[:sample_size]
            else:
                sampled_y_train = self.y_train[:sample_size]
            weight = self.fit_kwargs.get(
                "sample_weight"
            )  # NOTE: _prepare_sample_train_data is called before kwargs is updated to fit_kwargs_by_estimator
            if weight is not None:
                sampled_weight = (
                    weight.iloc[:sample_size]
                    if isinstance(weight, (pd.Series, psSeries))
                    else weight[:sample_size]
                )
            if self.groups is not None:
                groups = (
                    self.groups.iloc[:sample_size]
                    if isinstance(self.groups, (pd.Series, psSeries))
                    else self.groups[:sample_size]
                )
        else:
            sampled_X_train = self.X_train_all
            sampled_y_train = self.y_train_all
            if (
                "sample_weight" in self.fit_kwargs
            ):  # NOTE: _prepare_sample_train_data is called before kwargs is updated to fit_kwargs_by_estimator
                sampled_weight = self.sample_weight_all
            if self.groups is not None:
                groups = self.groups_all
        return sampled_X_train, sampled_y_train, sampled_weight, groups

    @staticmethod
    def _compute_with_config_base(
        config_w_resource: dict,
        state: "AutoMLState",
        estimator: str,
        is_report: bool = True,
    ) -> dict:
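        """Evaluate one hyperparameter configuration (optionally on a subsample of the
        training data), report the result to flaml.tune when is_report is True, and
        return a dict with the validation loss, timing info, and trained estimator."""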
if "FLAML_sample_size" in config_w_resource:
|
|
sample_size = int(config_w_resource["FLAML_sample_size"])
|
|
else:
|
|
sample_size = state.data_size[0]
|
|
|
|
this_estimator_kwargs = state.fit_kwargs_by_estimator.get(
|
|
estimator
|
|
).copy() # NOTE: _compute_with_config_base is after kwargs is updated to fit_kwargs_by_estimator
|
|
(
|
|
sampled_X_train,
|
|
sampled_y_train,
|
|
sampled_weight,
|
|
groups,
|
|
) = state._prepare_sample_train_data(sample_size)
|
|
if sampled_weight is not None:
|
|
weight = this_estimator_kwargs["sample_weight"]
|
|
this_estimator_kwargs["sample_weight"] = sampled_weight
|
|
if groups is not None:
|
|
this_estimator_kwargs["groups"] = groups
|
|
config = config_w_resource.copy()
|
|
if "FLAML_sample_size" in config:
|
|
del config["FLAML_sample_size"]
|
|
budget = (
|
|
None
|
|
if state.time_budget < 0
|
|
else state.time_budget - state.time_from_start
|
|
if sample_size == state.data_size[0]
|
|
else (state.time_budget - state.time_from_start)
|
|
/ 2
|
|
* sample_size
|
|
/ state.data_size[0]
|
|
)
|
|
|
|
(
|
|
trained_estimator,
|
|
val_loss,
|
|
metric_for_logging,
|
|
_,
|
|
pred_time,
|
|
) = compute_estimator(
|
|
sampled_X_train,
|
|
sampled_y_train,
|
|
state.X_val,
|
|
state.y_val,
|
|
state.weight_val,
|
|
state.groups_val,
|
|
state.train_time_limit
|
|
if budget is None
|
|
else min(budget, state.train_time_limit or np.inf),
|
|
state.kf,
|
|
config,
|
|
state.task,
|
|
estimator,
|
|
state.eval_method,
|
|
state.metric,
|
|
state.best_loss,
|
|
state.n_jobs,
|
|
state.learner_classes.get(estimator),
|
|
state.cv_score_agg_func,
|
|
state.log_training_metric,
|
|
this_estimator_kwargs,
|
|
state.free_mem_ratio,
|
|
)
|
|
if state.retrain_final and not state.model_history:
|
|
trained_estimator.cleanup()
|
|
|
|
result = {
|
|
"pred_time": pred_time,
|
|
"wall_clock_time": time.time() - state._start_time_flag,
|
|
"metric_for_logging": metric_for_logging,
|
|
"val_loss": val_loss,
|
|
"trained_estimator": trained_estimator,
|
|
}
|
|
if sampled_weight is not None:
|
|
this_estimator_kwargs["sample_weight"] = weight
|
|
if is_report is True:
|
|
tune.report(**result)
|
|
return result
|
|
|
|
    @classmethod
    def sanitize(cls, config: dict) -> dict:
        """Make a config ready for passing to an estimator."""
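        # Hypothetical example: {"ml": {"n_estimators": 4, "learner": "lgbm"}, "FLAML_sample_size": 1000}
        # is sanitized to {"n_estimators": 4}.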
        config = config.get("ml", config).copy()
        config.pop("FLAML_sample_size", None)
        config.pop("learner", None)
        config.pop("_choice_", None)
        return config

    def _train_with_config(
        self,
        estimator: str,
        config_w_resource: dict,
        sample_size: Optional[int] = None,
    ):
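        """Train `estimator` with the given (sanitized) configuration on a subsample of
        size `sample_size`, within the remaining time budget, and return the trained
        estimator together with the training time."""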
        if not sample_size:
            sample_size = config_w_resource.get(
                "FLAML_sample_size", len(self.y_train_all)
            )
        config = AutoMLState.sanitize(config_w_resource)

        this_estimator_kwargs = self.fit_kwargs_by_estimator.get(
            estimator
        ).copy()  # NOTE: _train_with_config is called after kwargs is updated to fit_kwargs_by_estimator
        (
            sampled_X_train,
            sampled_y_train,
            sampled_weight,
            groups,
        ) = self._prepare_sample_train_data(sample_size)
        if sampled_weight is not None:
            weight = this_estimator_kwargs["sample_weight"]
            this_estimator_kwargs["sample_weight"] = sampled_weight
        if groups is not None:
            this_estimator_kwargs["groups"] = groups

        budget = (
            None if self.time_budget < 0 else self.time_budget - self.time_from_start
        )

        estimator, train_time = train_estimator(
            X_train=sampled_X_train,
            y_train=sampled_y_train,
            config_dic=config,
            task=self.task,
            estimator_name=estimator,
            n_jobs=self.n_jobs,
            estimator_class=self.learner_classes.get(estimator),
            budget=budget,
            fit_kwargs=this_estimator_kwargs,
            eval_metric=self.metric if hasattr(self, "metric") else "train_time",
            free_mem_ratio=self.free_mem_ratio,
        )

        if sampled_weight is not None:
            this_estimator_kwargs["sample_weight"] = weight

        return estimator, train_time