Merge branch 'main' into support_percentages

commit 60a3e85b98
Author: Shaokun (committed via GitHub)
Date:   2023-01-17 10:06:51 -05:00
16 changed files with 156 additions and 51 deletions

View File

@@ -2604,11 +2604,12 @@ class AutoML(BaseEstimator):
         min_sample_size = min_sample_size or self._settings.get("min_sample_size")
         use_ray = self._settings.get("use_ray") if use_ray is None else use_ray
         use_spark = self._settings.get("use_spark") if use_spark is None else use_spark
-        spark_available, spark_error_msg = check_spark()
         if use_spark and use_ray is not False:
             raise ValueError("use_spark and use_ray cannot be both True.")
-        elif use_spark and not spark_available:
-            raise spark_error_msg
+        elif use_spark:
+            spark_available, spark_error_msg = check_spark()
+            if not spark_available:
+                raise spark_error_msg

         old_level = logger.getEffectiveLevel()
         self.verbose = verbose
@@ -2626,18 +2627,20 @@ class AutoML(BaseEstimator):
                     "Ray installed, setting use_ray to True. If you want to use Spark, set use_spark to True."
                 )
                 use_ray = True
-            elif spark_available:
-                logger.warning(
-                    "n_concurrent_trials > 1 is only supported when using Ray or Spark. "
-                    "Spark installed, setting use_spark to True. If you want to use Ray, set use_ray to True."
-                )
-                use_spark = True
             else:
-                logger.warning(
-                    "n_concurrent_trials > 1 is only supported when using Ray or Spark. "
-                    "Neither Ray nor Spark installed, setting n_concurrent_trials to 1."
-                )
-                n_concurrent_trials = 1
+                spark_available, _ = check_spark()
+                if spark_available:
+                    logger.warning(
+                        "n_concurrent_trials > 1 is only supported when using Ray or Spark. "
+                        "Spark installed, setting use_spark to True. If you want to use Ray, set use_ray to True."
+                    )
+                    use_spark = True
+                else:
+                    logger.warning(
+                        "n_concurrent_trials > 1 is only supported when using Ray or Spark. "
+                        "Neither Ray nor Spark installed, setting n_concurrent_trials to 1."
+                    )
+                    n_concurrent_trials = 1

         self._state.n_jobs = n_jobs
         self._n_concurrent_trials = n_concurrent_trials
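Taken together, the two automl.py hunks make check_spark() lazy: it now runs only on code paths that may actually use Spark, rather than unconditionally at the top of fit(). A minimal sketch of how a caller reaches each branch (dataset and budgets are illustrative, not from this commit):

from flaml import AutoML
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
automl = AutoML()

# Raises ValueError: use_spark and use_ray cannot be both True.
# automl.fit(X, y, task="classification", time_budget=5, use_spark=True, use_ray=True)

# check_spark() is called only now; without PySpark installed, fit() raises
# the ImportError that check_spark() returned instead of failing eagerly.
# automl.fit(X, y, task="classification", time_budget=5, use_spark=True)

# With neither Ray nor Spark installed, n_concurrent_trials falls back to 1
# with a warning rather than an error.
automl.fit(X, y, task="classification", time_budget=5, n_concurrent_trials=2)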

View File

@@ -90,7 +90,8 @@ class BlendSearch(Searcher):
             needing to re-compute the trial. Must be the same or shorter length than
             points_to_evaluate. When provided, `mode` must be specified.
         time_budget_s: int or float | Time budget in seconds.
-        num_samples: int | The number of configs to try.
+        num_samples: int | The number of configs to try. -1 means no limit on the
+            number of configs to try.
         resource_attr: A string to specify the resource dimension and the best
             performance is assumed to be at the max_resource.
         min_resource: A float of the minimal resource to use for the resource_attr.
@@ -222,11 +223,12 @@ class BlendSearch(Searcher):
             else:
                 gs_space = space
             gs_seed = seed - 10 if (seed - 10) >= 0 else seed - 11 + (1 << 32)
+            self._gs_seed = gs_seed
             if experimental:
                 import optuna as ot

                 sampler = ot.samplers.TPESampler(
-                    seed=seed, multivariate=True, group=True
+                    seed=gs_seed, multivariate=True, group=True
                 )
             else:
                 sampler = None
@@ -306,7 +308,7 @@ class BlendSearch(Searcher):
                 space=self._gs._space,
                 metric=metric,
                 mode=mode,
-                sampler=self._gs._sampler,
+                seed=self._gs_seed,
             )
             self._gs.space = self._ls.space
         self._init_search()
@@ -322,11 +324,12 @@ class BlendSearch(Searcher):
                     self.cost_attr = self._ls.cost_attr = TIME_TOTAL_S
             if "metric_target" in spec:
                 self._metric_target = spec.get("metric_target")
-            if "num_samples" in spec:
+            num_samples = spec.get("num_samples")
+            if num_samples is not None:
                 self._num_samples = (
-                    spec["num_samples"]
-                    + len(self._result)
-                    + len(self._trial_proposed_by)
+                    (num_samples + len(self._result) + len(self._trial_proposed_by))
+                    if num_samples > 0  # 0 is currently treated the same as -1
+                    else num_samples
                 )

         return True
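These BlendSearch changes thread a single seed (self._gs_seed) through both the experimental TPESampler and the global searcher that set_search_properties rebuilds, which is what the new test_gs_reproducibility test later in this commit verifies. The num_samples accounting also now preserves -1 (and 0) as "no limit" instead of adding trial counts to it. A small sketch of that semantics (objective and space are illustrative):

from flaml import BlendSearch, tune

def objective(config):
    return {"score": -(config["x"] - 3) ** 2}

# num_samples=-1 means no limit on the number of configs to try;
# the run is bounded by time_budget_s instead.
analysis = tune.run(
    objective,
    config={"x": tune.uniform(-10, 10)},
    search_alg=BlendSearch(metric="score", mode="max"),
    metric="score",
    mode="max",
    num_samples=-1,
    time_budget_s=1,
)
print(len(analysis.trials), "trials finished within the 1-second budget")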

View File

@@ -547,8 +547,8 @@ def complete_config(
                     domain.categories[index],
                     flow2,
                     disturb,
-                    lower and lower[key][index],
-                    upper and upper[key][index],
+                    lower and lower.get(key) and lower[key][index],
+                    upper and upper.get(key) and upper[key][index],
                 )
                 assert (
                     "_choice_" not in subspace[key]
@@ -560,8 +560,8 @@ def complete_config(
                 space[key],
                 flow2,
                 disturb,
-                lower and lower[key],
-                upper and upper[key],
+                lower and lower.get(key),
+                upper and upper.get(key),
             )
             continue
         subspace[key] = domain
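The .get(key) guard matters for hierarchical spaces, where a nested key can be missing from the lower/upper bound dicts: the old lower[key] raised KeyError there, while the new form short-circuits to None. A standalone illustration of the pattern (the dict is illustrative, not a FLAML internal):

lower = {"x": [0.1]}  # no entry for "c"

# Old pattern: raises KeyError when the key is missing.
try:
    bound = lower and lower["c"]
except KeyError:
    bound = None

# New pattern: short-circuits to None for a missing key, no exception.
bound = lower and lower.get("c")
print(bound)  # None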

View File

@@ -31,23 +31,21 @@ def check_spark():
     Return (True, None) if the check passes, otherwise log the exception message and
     return (False, Exception(msg)). The exception can be raised by the caller.
     """
-    logger.warning("\ncheck Spark installation...This line should appear only once.\n")
+    logger.debug("\ncheck Spark installation...This line should appear only once.\n")
     if not _have_spark:
         msg = """use_spark=True requires installation of PySpark. Please run pip install flaml[spark]
 and check [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)
 for more details about installing Spark."""
-        logger.warning(msg)
         return False, ImportError(msg)

     if _spark_major_minor_version[0] < 3:
         msg = "Spark version must be >= 3.0 to use flaml[spark]"
-        logger.warning(msg)
         return False, ImportError(msg)

     try:
         SparkSession.builder.getOrCreate()
     except RuntimeError as e:
-        logger.warning(f"\nSparkSession is not available: {e}\n")
+        # logger.warning(f"\nSparkSession is not available: {e}\n")
         return False, RuntimeError(e)

     return True, None
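With the per-branch logger.warning calls removed and the installation message demoted to debug, the caller now owns the failure policy. A minimal caller sketch matching how AutoML.fit uses the function above (assuming the module path flaml.tune.spark.utils; adjust to where check_spark lives in your version):

from flaml.tune.spark.utils import check_spark

spark_available, spark_error_msg = check_spark()
if not spark_available:
    # Raise the returned exception, or log it and fall back to a
    # sequential search, as the n_concurrent_trials hunk above does.
    raise spark_error_msg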

View File

@@ -30,7 +30,7 @@ class SimpleTrial(Trial):
         self.config = config or {}
         self.status = Trial.PENDING
         self.start_time = None
-        self.last_result = {}
+        self.last_result = None
         self.last_update_time = -float("inf")
         self.custom_trial_name = None
         self.trainable_name = "trainable"
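The default changes from {} to None because both are falsy but only None can mark "the application returned no result", which the stop-on-None check in tune.py below relies on. A short demonstration of why {} was the wrong sentinel:

for last_result in ({}, None):
    # bool() cannot tell the two apart; an identity check can.
    print(bool(last_result), last_result is None)
# False False  -> {}: a real (but empty) result
# False True   -> None: the trial returned nothing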

View File

@@ -470,7 +470,9 @@ def run(
             logger.addHandler(old_handlers[0])
         if verbose > 0:
             if log_file_name:
-                os.makedirs(os.path.dirname(log_file_name), exist_ok=True)
+                dir_name = os.path.dirname(log_file_name)
+                if dir_name:
+                    os.makedirs(dir_name, exist_ok=True)
                 logger.addHandler(logging.FileHandler(log_file_name))
             elif not logger.hasHandlers():
                 # Add the console handler.
@@ -811,6 +813,10 @@ def run(
                 report(_metric=result)
                 _runner.stop_trial(trial_to_run)
                 num_failures = 0
+                if trial_to_run.last_result is None:
+                    # application stops tuning by returning None
+                    # TODO document this feature when it is finalized
+                    break
             else:
                 # break with upperbound_num_failures consecutive failures
                 num_failures += 1
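The dir_name guard fixes a crash for bare file names: os.path.dirname("tune.log") is the empty string, and os.makedirs("") raises FileNotFoundError. A quick demonstration:

import os

print(os.path.dirname("tune.log"))       # "" -- must not reach makedirs
print(os.path.dirname("logs/tune.log"))  # "logs"

dir_name = os.path.dirname("tune.log")
if dir_name:  # skip makedirs for a file in the current working directory
    os.makedirs(dir_name, exist_ok=True)

The second hunk, together with the last_result = None default above, lets an objective end the whole tuning run by returning None; the new test/tune/test_stop.py later in this commit exercises exactly that.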

View File

@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "1.1.2"

View File

@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {
     "slideshow": {
@@ -38,10 +39,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install flaml[notebook]\n",
-    "# From v0.6.6, catboost is made an optional dependency to build conda package.\n",
-    "# To install catboost, you can run:\n",
-    "# %pip install flaml[catboost]"
+    "# %pip install flaml[notebook]"
   ]
  },
  {
@@ -749,7 +747,8 @@
    "xgb = XGBClassifier()\n",
    "cat_columns = X_train.select_dtypes(include=['category']).columns\n",
    "X = X_train.copy()\n",
-    "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n"
+    "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n",
+    "y_train_xgb = y_train.astype(\"int\")"
   ]
  },
  {
@@ -758,7 +757,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "xgb.fit(X, y_train)"
+    "xgb.fit(X, y_train_xgb)"
   ]
  },
  {
@@ -769,7 +768,8 @@
    "source": [
     "X = X_test.copy()\n",
     "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n",
-    "y_pred_xgb = xgb.predict(X)"
+    "y_pred_xgb = xgb.predict(X)\n",
+    "y_test_xgb = y_test.astype(\"int\")\n"
   ]
  },
  {
@@ -788,7 +788,7 @@
    }
   ],
   "source": [
-    "print('default xgboost accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_xgb, y_test))\n",
+    "print('default xgboost accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_xgb, y_test_xgb))\n",
     "print('default lgbm accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_lgbm, y_test))\n",
     "print('flaml (10 min) accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))"
   ]
@@ -1283,7 +1283,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.15"
+   "version": "3.9.15 (main, Oct 26 2022, 03:47:43) \n[GCC 10.2.1 20210110]"
   },
   "vscode": {
    "interpreter": {

View File

@@ -28,7 +28,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install -U flaml openml;"
+    "# %pip install -U flaml openml;"
   ]
  },
  {
@@ -520,7 +520,7 @@
     "    \"task\": \"regression\",\n",
     "    \"starting_points\": \"data\",\n",
     "    \"estimator_list\": [\"lgbm\"],\n",
-    "    \"time_budget\": 600,\n",
+    "    \"time_budget\": 300,\n",
     "}\n",
     "automl.fit(X_train, y_train, **settings)"
   ]
@@ -545,7 +545,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.9.15 (main, Oct 26 2022, 03:47:43) \n[GCC 10.2.1 20210110]"
   }
  },
 "nbformat": 4,

View File

@@ -75,6 +75,7 @@ setuptools.setup(
             "joblibspark>=0.5.0",
             "nbconvert",
             "nbformat",
+            "ipykernel",
         ],
         "catboost": ["catboost>=0.26"],
         "blendsearch": ["optuna==2.8.0"],

View File

@@ -0,0 +1,45 @@
+import nbformat
+from nbconvert.preprocessors import ExecutePreprocessor
+from nbconvert.preprocessors import CellExecutionError
+import os
+import sys
+import pytest
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+
+def run_notebook(input_nb, output_nb="executed_notebook.ipynb", save=False):
+    try:
+        file_path = os.path.join(here, os.pardir, os.pardir, "notebook", input_nb)
+        with open(file_path) as f:
+            nb = nbformat.read(f, as_version=4)
+        ep = ExecutePreprocessor(timeout=3600, kernel_name="python3")
+        ep.preprocess(nb, {"metadata": {"path": here}})
+    except CellExecutionError:
+        raise
+    finally:
+        if save:
+            with open(os.path.join(here, output_nb), "w", encoding="utf-8") as f:
+                nbformat.write(nb, f)
+
+
+@pytest.mark.skipif(
+    sys.platform != "darwin" or "3.8" not in sys.version,
+    reason="Only run on macOS with Python 3.8",
+)
+def test_automl_classification(save=False):
+    run_notebook("automl_classification.ipynb", save=save)
+
+
+@pytest.mark.skipif(
+    sys.platform != "darwin" or "3.7" not in sys.version,
+    reason="Only run on macOS with Python 3.7",
+)
+def test_zeroshot_lightgbm(save=False):
+    run_notebook("zeroshot_lightgbm.ipynb", save=save)
+
+
+if __name__ == "__main__":
+    # test_automl_classification(save=True)
+    test_zeroshot_lightgbm(save=True)

View File

@@ -25,8 +25,8 @@ def run_notebook(input_nb, output_nb="executed_notebook.ipynb", save=False):
         ep.preprocess(nb, {"metadata": {"path": here}})
     except CellExecutionError:
         raise
-    except Exception as e:
-        print("\nIgnoring below error:\n", e, "\n\n")
+    # except Exception as e:
+    #     print("\nIgnoring below error:\n", e, "\n\n")
     finally:
         if save:
             with open(os.path.join(here, output_nb), "w", encoding="utf-8") as f:

View File

@@ -34,7 +34,7 @@ def test_tune(externally_setup_searcher=False, use_ray=False, use_raytune=False):
         "width": tune.uniform(0, 20),
         "height": tune.uniform(-100, 100),
     }

-    if externally_setup_searcher:
+    if externally_setup_searcher is True:
         searcher = BlendSearch(
             space=search_space,
@@ -84,8 +84,10 @@ def test_tune(externally_setup_searcher=False, use_ray=False, use_raytune=False):
             metric="mean_loss",
             mode="min",
         )
-    else:
+    elif externally_setup_searcher is False:
         searcher = None
+    else:
+        searcher = externally_setup_searcher

     analysis = tune.run(
         easy_objective_custom_tune,
@@ -120,5 +122,19 @@ def test_reproducibility():
     ), "flaml.tune not reproducible when the searcher is set up externally"


+def test_gs_reproducibility():
+    from flaml import BlendSearch, tune
+
+    def f(config):
+        return {"m": 0.35}
+
+    search_space = {"a": tune.randint(1, 100)}
+    bs = BlendSearch(space=search_space, cost_attr=None)
+    analysis1 = tune.run(f, search_alg=bs, num_samples=2, metric="m", mode="max")
+    bs = BlendSearch(space=search_space, cost_attr=None)
+    analysis2 = tune.run(f, search_alg=bs, num_samples=2, metric="m", mode="max")
+    assert analysis1.trials[-1].config == analysis2.trials[-1].config
+
+
 if __name__ == "__main__":
     test_reproducibility()

View File

@@ -66,6 +66,14 @@ def test_define_by_run():
         cfo.suggest(f"t{i}")
     # print(normalize(config, bs._gs.space, config, {}, False))
     print(complete_config({}, cfo._ls.space, cfo._ls))
+    # test hierarchical space with low_cost_partial_config
+    bs = BlendSearch(
+        space={"c": tune.choice([0, choice]), "randn": tune.randn(10, 2)},
+        low_cost_partial_config={"randn": 10},
+        metric="metric",
+        mode="max",
+    )
+    tune.run(lambda config: {"metric": 1}, search_alg=bs)


 def test_grid():
def test_grid():

test/tune/test_stop.py (new file, 25 lines)
View File

@@ -0,0 +1,25 @@
+from flaml import tune
+
+n_trials = 0
+
+
+def evaluate_config(config):
+    global n_trials
+    n_trials += 1
+    if n_trials >= 10:
+        return None
+    metric = (round(config["x"]) - 85000) ** 2 - config["x"] / config["y"]
+    return metric
+
+
+def test_eval_stop():
+    analysis = tune.run(
+        evaluate_config,
+        config={
+            "x": tune.qloguniform(lower=1, upper=100000, q=1),
+            "y": tune.qlograndint(lower=2, upper=100000, q=2),
+        },
+        num_samples=100,
+        mode="max",
+    )
+    assert len(analysis.trials) == 10

View File

@@ -5085,9 +5085,9 @@ json-schema-traverse@^1.0.0:
   integrity sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==

 json5@^2.1.2, json5@^2.2.1:
-  version "2.2.1"
-  resolved "https://registry.npmmirror.com/json5/-/json5-2.2.1.tgz#655d50ed1e6f95ad1a3caababd2b0efda10b395c"
-  integrity sha512-1hqLFMSrGHRHxav9q9gNjJ5EXznIxGVO09xQRrwplcS8qs28pZ8s8hupZAmqDwZUmVZ2Qb2jnyPOWcDH8m8dlA==
+  version "2.2.3"
+  resolved "https://registry.yarnpkg.com/json5/-/json5-2.2.3.tgz#78cd6f1a19bdc12b73db5ad0c61efd66c1e29283"
+  integrity sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==

 jsonfile@^6.0.1:
   version "6.1.0"