notebook test; spark warning message; reproducibility bug; sequential tuning stop condition (#869)

* notebook test

* add ipykernel, remove except

* only create dir if not empty

* Stop sequential tuning when result is None

* fix reproducibility of global search

* save gs seed

* use get to avoid KeyError

* test
Chi Wang 2023-01-07 18:39:29 -08:00 committed by GitHub
parent 9fde27e536
commit 75e3454120
14 changed files with 129 additions and 29 deletions


@@ -213,11 +213,12 @@ class BlendSearch(Searcher):
         else:
             gs_space = space
         gs_seed = seed - 10 if (seed - 10) >= 0 else seed - 11 + (1 << 32)
+        self._gs_seed = gs_seed
         if experimental:
             import optuna as ot

             sampler = ot.samplers.TPESampler(
-                seed=seed, multivariate=True, group=True
+                seed=gs_seed, multivariate=True, group=True
             )
         else:
             sampler = None
@@ -297,7 +298,7 @@ class BlendSearch(Searcher):
                 space=self._gs._space,
                 metric=metric,
                 mode=mode,
-                sampler=self._gs._sampler,
+                seed=self._gs_seed,
             )
             self._gs.space = self._ls.space
         self._init_search()
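
The derived gs_seed decouples the global searcher's RNG from the user seed while keeping it inside the 32-bit range that NumPy and Optuna samplers accept, and caching it as self._gs_seed lets the second hunk above rebuild the global searcher with the same seed instead of reusing the old sampler object. A minimal sketch of the wrap-around, using the diff's expression inside a hypothetical helper derive_gs_seed:

def derive_gs_seed(seed: int) -> int:
    # offset the user seed; wrap negative results into [0, 2**32)
    return seed - 10 if (seed - 10) >= 0 else seed - 11 + (1 << 32)

for s in (0, 5, 9, 10, 12345):
    assert 0 <= derive_gs_seed(s) < (1 << 32)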


@@ -547,8 +547,8 @@ def complete_config(
                     domain.categories[index],
                     flow2,
                     disturb,
-                    lower and lower[key][index],
-                    upper and upper[key][index],
+                    lower and lower.get(key) and lower[key][index],
+                    upper and upper.get(key) and upper[key][index],
                 )
                 assert (
                     "_choice_" not in subspace[key]
@@ -560,8 +560,8 @@ def complete_config(
                     space[key],
                     flow2,
                     disturb,
-                    lower and lower[key],
-                    upper and upper[key],
+                    lower and lower.get(key),
+                    upper and upper.get(key),
                 )
                 continue
             subspace[key] = domain
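
Before this fix, a partial bound dict that lacked a key raised KeyError at lower[key]; chaining in lower.get(key) short-circuits to a falsy value instead. A minimal illustration with hypothetical bound dicts:

lower = {"a": [1, 2]}  # partial lower bounds; "b" is absent
# old: lower and lower["b"][0]  -> KeyError
value = lower and lower.get("b") and lower["b"][0]  # new form
assert value is None  # falls through instead of raising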


@@ -31,23 +31,21 @@ def check_spark():
     Return (True, None) if the check passes, otherwise log the exception message and
     return (False, Exception(msg)). The exception can be raised by the caller.
     """
-    logger.warning("\ncheck Spark installation...This line should appear only once.\n")
+    logger.debug("\ncheck Spark installation...This line should appear only once.\n")
     if not _have_spark:
         msg = """use_spark=True requires installation of PySpark. Please run pip install flaml[spark]
         and check [here](https://spark.apache.org/docs/latest/api/python/getting_started/install.html)
         for more details about installing Spark."""
         logger.warning(msg)
         return False, ImportError(msg)
     if _spark_major_minor_version[0] < 3:
         msg = "Spark version must be >= 3.0 to use flaml[spark]"
         logger.warning(msg)
         return False, ImportError(msg)
     try:
         SparkSession.builder.getOrCreate()
     except RuntimeError as e:
-        logger.warning(f"\nSparkSession is not available: {e}\n")
+        # logger.warning(f"\nSparkSession is not available: {e}\n")
         return False, RuntimeError(e)
     return True, None
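
The docstring's contract is that the caller decides whether the returned exception is fatal. A sketch of that calling pattern, assuming check_spark is importable from flaml.tune.spark.utils:

from flaml.tune.spark.utils import check_spark

spark_available, spark_error = check_spark()
if not spark_available:
    # ImportError (missing/old PySpark) or RuntimeError (no SparkSession)
    raise spark_error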


@@ -30,7 +30,7 @@ class SimpleTrial(Trial):
         self.config = config or {}
         self.status = Trial.PENDING
         self.start_time = None
-        self.last_result = {}
+        self.last_result = None
         self.last_update_time = -float("inf")
         self.custom_trial_name = None
         self.trainable_name = "trainable"
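
Defaulting to None rather than an empty dict lets the runner tell "no result yet, or the objective returned None" apart from an empty result dict; the stop condition added to run() below relies on this. A stand-in mirroring the constructor change (TrialStub is hypothetical, not FLAML API):

class TrialStub:
    def __init__(self, config=None):
        self.config = config or {}
        self.last_result = None  # was {} before this commit

assert TrialStub({"x": 1}).last_result is None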


@@ -448,7 +448,9 @@ def run(
             logger.addHandler(old_handlers[0])
     if verbose > 0:
         if log_file_name:
-            os.makedirs(os.path.dirname(log_file_name), exist_ok=True)
+            dir_name = os.path.dirname(log_file_name)
+            if dir_name:
+                os.makedirs(dir_name, exist_ok=True)
             logger.addHandler(logging.FileHandler(log_file_name))
         elif not logger.hasHandlers():
             # Add the console handler.
@@ -789,6 +791,10 @@ def run(
                     report(_metric=result)
                 _runner.stop_trial(trial_to_run)
                 num_failures = 0
+                if trial_to_run.last_result is None:
+                    # application stops tuning by returning None
+                    # TODO document this feature when it is finalized
+                    break
             else:
                 # break with upperbound_num_failures consecutive failures
                 num_failures += 1
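
Two fixes here. The directory guard matters because a bare file name has an empty dirname and os.makedirs("") raises FileNotFoundError, as this stdlib-only demonstration shows; the new break makes an objective that returns None end sequential tuning, exercised by test/tune/test_stop.py below.

import os

assert os.path.dirname("flaml.log") == ""  # bare file name: nothing to create
assert os.path.dirname("logs/flaml.log") == "logs"

dir_name = os.path.dirname("flaml.log")
if dir_name:  # skip makedirs for an empty path, as in the diff
    os.makedirs(dir_name, exist_ok=True)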


@@ -1 +1 @@
-__version__ = "1.1.0"
+__version__ = "1.1.1"


@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "metadata": {
     "slideshow": {
@@ -38,10 +39,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install flaml[notebook]\n",
-    "# From v0.6.6, catboost is made an optional dependency to build conda package.\n",
-    "# To install catboost, you can run:\n",
-    "# %pip install flaml[catboost]"
+    "# %pip install flaml[notebook]"
    ]
   },
   {
@@ -749,7 +747,8 @@
    "xgb = XGBClassifier()\n",
    "cat_columns = X_train.select_dtypes(include=['category']).columns\n",
    "X = X_train.copy()\n",
-    "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n"
+    "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n",
+    "y_train_xgb = y_train.astype(\"int\")"
    ]
   },
   {
@@ -758,7 +757,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "xgb.fit(X, y_train)"
+    "xgb.fit(X, y_train_xgb)"
    ]
   },
   {
@@ -769,7 +768,8 @@
    "source": [
    "X = X_test.copy()\n",
    "X[cat_columns] = X[cat_columns].apply(lambda x: x.cat.codes)\n",
-    "y_pred_xgb = xgb.predict(X)"
+    "y_pred_xgb = xgb.predict(X)\n",
+    "y_test_xgb = y_test.astype(\"int\")\n"
    ]
   },
   {
@@ -788,7 +788,7 @@
    }
   ],
   "source": [
-    "print('default xgboost accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_xgb, y_test))\n",
+    "print('default xgboost accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_xgb, y_test_xgb))\n",
    "print('default lgbm accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred_lgbm, y_test))\n",
    "print('flaml (10 min) accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))"
   ]
@@ -1283,7 +1283,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.15"
+   "version": "3.9.15 (main, Oct 26 2022, 03:47:43) \n[GCC 10.2.1 20210110]"
   },
   "vscode": {
    "interpreter": {


@@ -28,7 +28,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%pip install -U flaml openml;"
+    "# %pip install -U flaml openml;"
    ]
   },
   {
@@ -520,7 +520,7 @@
    " \"task\": \"regression\",\n",
    " \"starting_points\": \"data\",\n",
    " \"estimator_list\": [\"lgbm\"],\n",
-    " \"time_budget\": 600,\n",
+    " \"time_budget\": 300,\n",
    "}\n",
    "automl.fit(X_train, y_train, **settings)"
   ]
@@ -545,7 +545,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
+   "version": "3.9.15 (main, Oct 26 2022, 03:47:43) \n[GCC 10.2.1 20210110]"
   }
  },
 "nbformat": 4,


@@ -75,6 +75,7 @@ setuptools.setup(
             "joblibspark>=0.5.0",
             "nbconvert",
             "nbformat",
+            "ipykernel",
         ],
         "catboost": ["catboost>=0.26"],
         "blendsearch": ["optuna==2.8.0"],


@@ -0,0 +1,45 @@
+import nbformat
+from nbconvert.preprocessors import ExecutePreprocessor
+from nbconvert.preprocessors import CellExecutionError
+import os
+import sys
+import pytest
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+
+def run_notebook(input_nb, output_nb="executed_notebook.ipynb", save=False):
+    try:
+        file_path = os.path.join(here, os.pardir, os.pardir, "notebook", input_nb)
+        with open(file_path) as f:
+            nb = nbformat.read(f, as_version=4)
+        ep = ExecutePreprocessor(timeout=3600, kernel_name="python3")
+        ep.preprocess(nb, {"metadata": {"path": here}})
+    except CellExecutionError:
+        raise
+    finally:
+        if save:
+            with open(os.path.join(here, output_nb), "w", encoding="utf-8") as f:
+                nbformat.write(nb, f)
+
+
+@pytest.mark.skipif(
+    sys.platform != "darwin" or "3.8" not in sys.version,
+    reason="Only run on macOS with Python 3.8",
+)
+def test_automl_classification(save=False):
+    run_notebook("automl_classification.ipynb", save=save)
+
+
+@pytest.mark.skipif(
+    sys.platform != "darwin" or "3.7" not in sys.version,
+    reason="Only run on macOS with Python 3.7",
+)
+def test_zeroshot_lightgbm(save=False):
+    run_notebook("zeroshot_lightgbm.ipynb", save=save)
+
+
+if __name__ == "__main__":
+    # test_automl_classification(save=True)
+    test_zeroshot_lightgbm(save=True)


@@ -25,8 +25,8 @@ def run_notebook(input_nb, output_nb="executed_notebook.ipynb", save=False):
         ep.preprocess(nb, {"metadata": {"path": here}})
     except CellExecutionError:
         raise
-    except Exception as e:
-        print("\nIgnoring below error:\n", e, "\n\n")
+    # except Exception as e:
+    #     print("\nIgnoring below error:\n", e, "\n\n")
     finally:
         if save:
             with open(os.path.join(here, output_nb), "w", encoding="utf-8") as f:


@@ -34,7 +34,7 @@ def test_tune(externally_setup_searcher=False, use_ray=False, use_raytune=False)
         "width": tune.uniform(0, 20),
         "height": tune.uniform(-100, 100),
     }
-    if externally_setup_searcher:
+    if externally_setup_searcher is True:
         searcher = BlendSearch(
             space=search_space,
@@ -84,8 +84,10 @@ def test_tune(externally_setup_searcher=False, use_ray=False, use_raytune=False)
             metric="mean_loss",
             mode="min",
         )
-    else:
+    elif externally_setup_searcher is False:
         searcher = None
+    else:
+        searcher = externally_setup_searcher
     analysis = tune.run(
         easy_objective_custom_tune,
@@ -120,5 +122,19 @@ def test_reproducibility():
     ), "flaml.tune not reproducible when the searcher is set up externally"
+
+
+def test_gs_reproducibility():
+    from flaml import BlendSearch, tune
+
+    def f(config):
+        return {"m": 0.35}
+
+    search_space = {"a": tune.randint(1, 100)}
+    bs = BlendSearch(space=search_space, cost_attr=None)
+    analysis1 = tune.run(f, search_alg=bs, num_samples=2, metric="m", mode="max")
+    bs = BlendSearch(space=search_space, cost_attr=None)
+    analysis2 = tune.run(f, search_alg=bs, num_samples=2, metric="m", mode="max")
+    assert analysis1.trials[-1].config == analysis2.trials[-1].config


 if __name__ == "__main__":
     test_reproducibility()


@@ -66,6 +66,14 @@ def test_define_by_run():
         cfo.suggest(f"t{i}")
         # print(normalize(config, bs._gs.space, config, {}, False))
         print(complete_config({}, cfo._ls.space, cfo._ls))
+    # test hierarchical space with low_cost_partial_config
+    bs = BlendSearch(
+        space={"c": tune.choice([0, choice]), "randn": tune.randn(10, 2)},
+        low_cost_partial_config={"randn": 10},
+        metric="metric",
+        mode="max",
+    )
+    tune.run(lambda config: {"metric": 1}, search_alg=bs)


 def test_grid():

test/tune/test_stop.py (new file)

@@ -0,0 +1,25 @@
+from flaml import tune
+
+n_trials = 0
+
+
+def evaluate_config(config):
+    global n_trials
+    n_trials += 1
+    if n_trials >= 10:
+        return None
+    metric = (round(config["x"]) - 85000) ** 2 - config["x"] / config["y"]
+    return metric
+
+
+def test_eval_stop():
+    analysis = tune.run(
+        evaluate_config,
+        config={
+            "x": tune.qloguniform(lower=1, upper=100000, q=1),
+            "y": tune.qlograndint(lower=2, upper=100000, q=2),
+        },
+        num_samples=100,
+        mode="max",
+    )
+    assert len(analysis.trials) == 10
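
The count is exactly 10 because the tenth call increments n_trials to 10 and returns None; run() still records that trial before breaking out of the tuning loop, so analysis.trials holds ten entries.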