From fefdb48ef3fcdaa3da8de1bc0fc99da35ca1d9a5 Mon Sep 17 00:00:00 2001 From: Zvi Baratz Date: Thu, 23 Jun 2022 05:44:14 +0300 Subject: [PATCH] Fix automl settings in scikit-learn pipeline integration example (#602) * Added test directory and core file to gitignore. Closes #601. * Fixed pipeline fit parameters. Closes #600. * Reverted changes to gitignore. --- notebook/integrate_sklearn.ipynb | 396 +++++++++--------- .../Integrate - Scikit-learn Pipeline.md | 17 +- 2 files changed, 211 insertions(+), 202 deletions(-) diff --git a/notebook/integrate_sklearn.ipynb b/notebook/integrate_sklearn.ipynb index 4f0e0b98f..5601b791a 100644 --- a/notebook/integrate_sklearn.ipynb +++ b/notebook/integrate_sklearn.ipynb @@ -93,14 +93,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "load dataset from ./openml_ds1169.pkl\n", + "download dataset from openml\n", "Dataset name: airlines\n", "X_train.shape: (404537, 7), y_train.shape: (404537,);\n", "X_test.shape: (134846, 7), y_test.shape: (134846,)\n" @@ -115,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -124,7 +124,7 @@ "array([ 12., 2648., 4., 15., 4., 450., 67.], dtype=float32)" ] }, - "execution_count": 5, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -142,29 +142,74 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
[HTML representation of Pipeline(imputuer, standardizer, automl); markup omitted]"
+       "[HTML representation of Pipeline(imputuer, standardizer, automl) with the AutoML parameters expanded; markup omitted]
" ], "text/plain": [ "Pipeline(steps=[('imputuer', SimpleImputer()),\n", " ('standardizer', StandardScaler()),\n", - " ('automl', )])" + " ('automl',\n", + " AutoML(append_log=False, auto_augment=True, custom_hp={},\n", + " early_stop=False, ensemble=False, estimator_list='auto',\n", + " eval_method='auto', fit_kwargs_by_estimator={},\n", + " hpo_method='auto', keep_search_state=False,\n", + " learner_selector='sample', log_file_name='',\n", + " log_training_metric=False, log_type='better',\n", + " max_iter=None, mem_thres=4294967296, metric='auto',\n", + " metric_constraints=[], min_sample_size=10000,\n", + " model_history=False, n_concurrent_trials=1, n_jobs=-1,\n", + " n_splits=5, pred_time_limit=inf, retrain_full=True,\n", + " sample=True, split_ratio=0.1, split_type='auto',\n", + " starting_points='static', task='classification', ...))])" ] }, - "execution_count": 6, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import sklearn\n", "from sklearn import set_config\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", @@ -195,217 +240,177 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ - "settings = {\n", + "automl_settings = {\n", " \"time_budget\": 60, # total running time in seconds\n", " \"metric\": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'f1','log_loss','mae','mse','r2']\n", " \"task\": 'classification', # task type \n", - " \"estimator_list\":['xgboost','catboost','lgbm'],\n", + " \"estimator_list\": ['xgboost','catboost','lgbm'],\n", " \"log_file_name\": 'airlines_experiment.log', # flaml log file\n", - "}" + "}\n", + "pipeline_settings = {f\"automl__{key}\": value for key, value in automl_settings.items()}" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "[flaml.automl: 08-22 21:32:13] {1130} INFO - Evaluation method: holdout\n", - "[flaml.automl: 08-22 21:32:14] {624} INFO - Using StratifiedKFold\n", - "[flaml.automl: 08-22 21:32:14] {1155} INFO - Minimizing error metric: 1-accuracy\n", - "[flaml.automl: 08-22 21:32:14] {1175} INFO - List of ML learners in AutoML Run: ['xgboost', 'catboost', 'lgbm']\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 0, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:14] {1515} INFO - at 0.5s,\tbest xgboost's error=0.3755,\tbest xgboost's error=0.3755\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 1, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:14] {1515} INFO - at 0.6s,\tbest xgboost's error=0.3755,\tbest xgboost's error=0.3755\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 2, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:14] {1515} INFO - at 0.6s,\tbest xgboost's error=0.3755,\tbest xgboost's error=0.3755\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 3, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:14] {1515} INFO - at 0.7s,\tbest xgboost's error=0.3755,\tbest xgboost's error=0.3755\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 4, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:14] {1515} INFO - at 0.7s,\tbest xgboost's error=0.3679,\tbest xgboost's error=0.3679\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 5, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:14] 
{1515} INFO - at 0.8s,\tbest lgbm's error=0.3811,\tbest xgboost's error=0.3679\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 6, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:14] {1515} INFO - at 0.8s,\tbest xgboost's error=0.3679,\tbest xgboost's error=0.3679\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 7, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:14] {1515} INFO - at 0.9s,\tbest xgboost's error=0.3679,\tbest xgboost's error=0.3679\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 8, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:14] {1515} INFO - at 1.0s,\tbest xgboost's error=0.3679,\tbest xgboost's error=0.3679\n", - "[flaml.automl: 08-22 21:32:14] {1358} INFO - iteration 9, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:15] {1515} INFO - at 1.1s,\tbest lgbm's error=0.3811,\tbest xgboost's error=0.3679\n", - "[flaml.automl: 08-22 21:32:15] {1358} INFO - iteration 10, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:15] {1515} INFO - at 1.1s,\tbest lgbm's error=0.3755,\tbest xgboost's error=0.3679\n", - "[flaml.automl: 08-22 21:32:15] {1358} INFO - iteration 11, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:15] {1515} INFO - at 1.2s,\tbest xgboost's error=0.3637,\tbest xgboost's error=0.3637\n", - "[flaml.automl: 08-22 21:32:15] {1358} INFO - iteration 12, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:15] {1515} INFO - at 1.4s,\tbest xgboost's error=0.3594,\tbest xgboost's error=0.3594\n", - "[flaml.automl: 08-22 21:32:15] {1358} INFO - iteration 13, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:15] {1515} INFO - at 1.5s,\tbest xgboost's error=0.3594,\tbest xgboost's error=0.3594\n", - "[flaml.automl: 08-22 21:32:15] {1358} INFO - iteration 14, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:15] {1515} INFO - at 1.7s,\tbest xgboost's error=0.3591,\tbest xgboost's error=0.3591\n", - "[flaml.automl: 08-22 21:32:15] {1358} INFO - iteration 15, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:15] {1515} INFO - at 1.7s,\tbest lgbm's error=0.3647,\tbest xgboost's error=0.3591\n", - "[flaml.automl: 08-22 21:32:15] {1358} INFO - iteration 16, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:15] {1515} INFO - at 2.0s,\tbest xgboost's error=0.3585,\tbest xgboost's error=0.3585\n", - "[flaml.automl: 08-22 21:32:15] {1358} INFO - iteration 17, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:16] {1515} INFO - at 2.0s,\tbest lgbm's error=0.3647,\tbest xgboost's error=0.3585\n", - "[flaml.automl: 08-22 21:32:16] {1358} INFO - iteration 18, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:16] {1515} INFO - at 2.1s,\tbest lgbm's error=0.3629,\tbest xgboost's error=0.3585\n", - "[flaml.automl: 08-22 21:32:16] {1358} INFO - iteration 19, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:16] {1515} INFO - at 2.3s,\tbest xgboost's error=0.3553,\tbest xgboost's error=0.3553\n", - "[flaml.automl: 08-22 21:32:16] {1358} INFO - iteration 20, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:16] {1515} INFO - at 2.6s,\tbest xgboost's error=0.3553,\tbest xgboost's error=0.3553\n", - "[flaml.automl: 08-22 21:32:16] {1358} INFO - iteration 21, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:16] {1515} INFO - at 2.7s,\tbest xgboost's error=0.3553,\tbest xgboost's error=0.3553\n", - "[flaml.automl: 08-22 21:32:16] {1358} INFO - iteration 22, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:16] {1515} INFO - at 
2.8s,\tbest lgbm's error=0.3629,\tbest xgboost's error=0.3553\n", - "[flaml.automl: 08-22 21:32:16] {1358} INFO - iteration 23, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:16] {1515} INFO - at 2.9s,\tbest lgbm's error=0.3629,\tbest xgboost's error=0.3553\n", - "[flaml.automl: 08-22 21:32:16] {1358} INFO - iteration 24, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:17] {1515} INFO - at 3.1s,\tbest xgboost's error=0.3520,\tbest xgboost's error=0.3520\n", - "[flaml.automl: 08-22 21:32:17] {1358} INFO - iteration 25, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:17] {1515} INFO - at 3.3s,\tbest xgboost's error=0.3520,\tbest xgboost's error=0.3520\n", - "[flaml.automl: 08-22 21:32:17] {1358} INFO - iteration 26, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:17] {1515} INFO - at 3.4s,\tbest lgbm's error=0.3573,\tbest xgboost's error=0.3520\n", - "[flaml.automl: 08-22 21:32:17] {1358} INFO - iteration 27, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:17] {1515} INFO - at 3.5s,\tbest lgbm's error=0.3573,\tbest xgboost's error=0.3520\n", - "[flaml.automl: 08-22 21:32:17] {1358} INFO - iteration 28, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:17] {1515} INFO - at 3.9s,\tbest xgboost's error=0.3520,\tbest xgboost's error=0.3520\n", - "[flaml.automl: 08-22 21:32:17] {1358} INFO - iteration 29, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:18] {1515} INFO - at 4.1s,\tbest xgboost's error=0.3520,\tbest xgboost's error=0.3520\n", - "[flaml.automl: 08-22 21:32:18] {1358} INFO - iteration 30, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:18] {1515} INFO - at 4.8s,\tbest xgboost's error=0.3485,\tbest xgboost's error=0.3485\n", - "[flaml.automl: 08-22 21:32:18] {1358} INFO - iteration 31, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:19] {1515} INFO - at 5.2s,\tbest lgbm's error=0.3573,\tbest xgboost's error=0.3485\n", - "[flaml.automl: 08-22 21:32:19] {1358} INFO - iteration 32, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:19] {1515} INFO - at 5.7s,\tbest xgboost's error=0.3485,\tbest xgboost's error=0.3485\n", - "[flaml.automl: 08-22 21:32:19] {1358} INFO - iteration 33, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:20] {1515} INFO - at 6.6s,\tbest xgboost's error=0.3485,\tbest xgboost's error=0.3485\n", - "[flaml.automl: 08-22 21:32:20] {1358} INFO - iteration 34, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:20] {1515} INFO - at 6.9s,\tbest lgbm's error=0.3481,\tbest lgbm's error=0.3481\n", - "[flaml.automl: 08-22 21:32:20] {1358} INFO - iteration 35, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:21] {1515} INFO - at 7.2s,\tbest lgbm's error=0.3481,\tbest lgbm's error=0.3481\n", - "[flaml.automl: 08-22 21:32:21] {1358} INFO - iteration 36, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:21] {1515} INFO - at 7.4s,\tbest lgbm's error=0.3481,\tbest lgbm's error=0.3481\n", - "[flaml.automl: 08-22 21:32:21] {1358} INFO - iteration 37, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:22] {1515} INFO - at 8.2s,\tbest xgboost's error=0.3485,\tbest lgbm's error=0.3481\n", - "[flaml.automl: 08-22 21:32:22] {1358} INFO - iteration 38, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:22] {1515} INFO - at 8.5s,\tbest lgbm's error=0.3481,\tbest lgbm's error=0.3481\n", - "[flaml.automl: 08-22 21:32:22] {1358} INFO - iteration 39, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:22] {1515} INFO - at 8.8s,\tbest lgbm's error=0.3481,\tbest lgbm's 
error=0.3481\n", - "[flaml.automl: 08-22 21:32:22] {1358} INFO - iteration 40, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:23] {1515} INFO - at 9.7s,\tbest xgboost's error=0.3485,\tbest lgbm's error=0.3481\n", - "[flaml.automl: 08-22 21:32:23] {1358} INFO - iteration 41, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:25] {1515} INFO - at 11.7s,\tbest lgbm's error=0.3481,\tbest lgbm's error=0.3481\n", - "[flaml.automl: 08-22 21:32:25] {1358} INFO - iteration 42, current learner catboost\n", - "[flaml.automl: 08-22 21:32:26] {1515} INFO - at 12.2s,\tbest catboost's error=0.3647,\tbest lgbm's error=0.3481\n", - "[flaml.automl: 08-22 21:32:26] {1358} INFO - iteration 43, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:28] {1515} INFO - at 14.4s,\tbest lgbm's error=0.3427,\tbest lgbm's error=0.3427\n", - "[flaml.automl: 08-22 21:32:28] {1358} INFO - iteration 44, current learner catboost\n", - "[flaml.automl: 08-22 21:32:28] {1515} INFO - at 14.6s,\tbest catboost's error=0.3647,\tbest lgbm's error=0.3427\n", - "[flaml.automl: 08-22 21:32:28] {1358} INFO - iteration 45, current learner catboost\n", - "[flaml.automl: 08-22 21:32:28] {1515} INFO - at 14.8s,\tbest catboost's error=0.3601,\tbest lgbm's error=0.3427\n", - "[flaml.automl: 08-22 21:32:28] {1358} INFO - iteration 46, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:30] {1515} INFO - at 16.9s,\tbest lgbm's error=0.3427,\tbest lgbm's error=0.3427\n", - "[flaml.automl: 08-22 21:32:30] {1358} INFO - iteration 47, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:34] {1515} INFO - at 21.0s,\tbest xgboost's error=0.3332,\tbest xgboost's error=0.3332\n", - "[flaml.automl: 08-22 21:32:34] {1358} INFO - iteration 48, current learner catboost\n", - "[flaml.automl: 08-22 21:32:35] {1515} INFO - at 21.1s,\tbest catboost's error=0.3601,\tbest xgboost's error=0.3332\n", - "[flaml.automl: 08-22 21:32:35] {1358} INFO - iteration 49, current learner lgbm\n", - "[flaml.automl: 08-22 21:32:37] {1515} INFO - at 23.2s,\tbest lgbm's error=0.3409,\tbest xgboost's error=0.3332\n", - "[flaml.automl: 08-22 21:32:37] {1358} INFO - iteration 50, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:38] {1515} INFO - at 24.6s,\tbest xgboost's error=0.3332,\tbest xgboost's error=0.3332\n", - "[flaml.automl: 08-22 21:32:38] {1358} INFO - iteration 51, current learner xgboost\n", - "[flaml.automl: 08-22 21:32:53] {1515} INFO - at 40.0s,\tbest xgboost's error=0.3279,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:32:53] {1358} INFO - iteration 52, current learner xgboost\n", - "[flaml.automl: 08-22 21:33:01] {1515} INFO - at 47.6s,\tbest xgboost's error=0.3279,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:01] {1358} INFO - iteration 53, current learner catboost\n", - "[flaml.automl: 08-22 21:33:01] {1515} INFO - at 47.7s,\tbest catboost's error=0.3601,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:01] {1358} INFO - iteration 54, current learner catboost\n", - "[flaml.automl: 08-22 21:33:02] {1515} INFO - at 48.2s,\tbest catboost's error=0.3601,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:02] {1358} INFO - iteration 55, current learner catboost\n", - "[flaml.automl: 08-22 21:33:02] {1515} INFO - at 48.5s,\tbest catboost's error=0.3552,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:02] {1358} INFO - iteration 56, current learner catboost\n", - "[flaml.automl: 08-22 21:33:02] {1515} INFO - at 48.7s,\tbest catboost's error=0.3552,\tbest 
xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:02] {1358} INFO - iteration 57, current learner catboost\n", - "[flaml.automl: 08-22 21:33:02] {1515} INFO - at 49.0s,\tbest catboost's error=0.3552,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:02] {1358} INFO - iteration 58, current learner catboost\n", - "[flaml.automl: 08-22 21:33:03] {1515} INFO - at 49.1s,\tbest catboost's error=0.3552,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:03] {1358} INFO - iteration 59, current learner catboost\n", - "[flaml.automl: 08-22 21:33:03] {1515} INFO - at 49.4s,\tbest catboost's error=0.3552,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:03] {1358} INFO - iteration 60, current learner catboost\n", - "[flaml.automl: 08-22 21:33:06] {1515} INFO - at 52.2s,\tbest catboost's error=0.3453,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:06] {1358} INFO - iteration 61, current learner catboost\n", - "[flaml.automl: 08-22 21:33:07] {1515} INFO - at 53.9s,\tbest catboost's error=0.3453,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:07] {1358} INFO - iteration 62, current learner catboost\n", - "[flaml.automl: 08-22 21:33:09] {1515} INFO - at 55.3s,\tbest catboost's error=0.3453,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:09] {1358} INFO - iteration 63, current learner catboost\n", - "[flaml.automl: 08-22 21:33:10] {1515} INFO - at 56.4s,\tbest catboost's error=0.3453,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:10] {1358} INFO - iteration 64, current learner catboost\n", - "[flaml.automl: 08-22 21:33:11] {1515} INFO - at 57.5s,\tbest catboost's error=0.3453,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:11] {1358} INFO - iteration 65, current learner lgbm\n", - "[flaml.automl: 08-22 21:33:13] {1515} INFO - at 59.8s,\tbest lgbm's error=0.3409,\tbest xgboost's error=0.3279\n", - "[flaml.automl: 08-22 21:33:13] {1592} INFO - selected model: XGBClassifier(base_score=0.5, booster='gbtree',\n", - " colsample_bylevel=0.810466508891351, colsample_bynode=1,\n", - " colsample_bytree=0.8005378817953572, gamma=0, gpu_id=-1,\n", - " grow_policy='lossguide', importance_type='gain',\n", - " interaction_constraints='', learning_rate=0.06234183309508761,\n", - " max_delta_step=0, max_depth=0, max_leaves=1797,\n", - " min_child_weight=0.07275175679381725, missing=nan,\n", - " monotone_constraints='()', n_estimators=63, n_jobs=-1,\n", - " num_parallel_tree=1, random_state=0, reg_alpha=0.5768305704485758,\n", - " reg_lambda=6.867180836557797, scale_pos_weight=1,\n", - " subsample=0.9814772488195874, tree_method='hist',\n", - " use_label_encoder=False, validate_parameters=1, verbosity=0)\n", - "[flaml.automl: 08-22 21:33:26] {1633} INFO - retrain xgboost for 13.0s\n", - "[flaml.automl: 08-22 21:33:26] {1636} INFO - retrained model: XGBClassifier(base_score=0.5, booster='gbtree',\n", - " colsample_bylevel=0.810466508891351, colsample_bynode=1,\n", - " colsample_bytree=0.8005378817953572, gamma=0, gpu_id=-1,\n", - " grow_policy='lossguide', importance_type='gain',\n", - " interaction_constraints='', learning_rate=0.06234183309508761,\n", - " max_delta_step=0, max_depth=0, max_leaves=1797,\n", - " min_child_weight=0.07275175679381725, missing=nan,\n", - " monotone_constraints='()', n_estimators=63, n_jobs=-1,\n", - " num_parallel_tree=1, random_state=0, reg_alpha=0.5768305704485758,\n", - " reg_lambda=6.867180836557797, scale_pos_weight=1,\n", - " subsample=0.9814772488195874, 
tree_method='hist',\n", - " use_label_encoder=False, validate_parameters=1, verbosity=0)\n", - "[flaml.automl: 08-22 21:33:26] {1199} INFO - fit succeeded\n", - "[flaml.automl: 08-22 21:33:26] {1200} INFO - Time taken to find the best model: 40.023393869400024\n" + "[flaml.automl: 06-22 08:01:43] {2390} INFO - task = classification\n", + "[flaml.automl: 06-22 08:01:43] {2392} INFO - Data split method: stratified\n", + "[flaml.automl: 06-22 08:01:43] {2396} INFO - Evaluation method: holdout\n", + "[flaml.automl: 06-22 08:01:44] {2465} INFO - Minimizing error metric: 1-accuracy\n", + "[flaml.automl: 06-22 08:01:44] {2605} INFO - List of ML learners in AutoML Run: ['xgboost', 'catboost', 'lgbm']\n", + "[flaml.automl: 06-22 08:01:44] {2897} INFO - iteration 0, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:44] {3025} INFO - Estimated sufficient time budget=105341s. Estimated necessary time budget=116s.\n", + "[flaml.automl: 06-22 08:01:44] {3072} INFO - at 0.7s,\testimator xgboost's best error=0.3755,\tbest estimator xgboost's best error=0.3755\n", + "[flaml.automl: 06-22 08:01:44] {2897} INFO - iteration 1, current learner lgbm\n", + "[flaml.automl: 06-22 08:01:44] {3072} INFO - at 0.9s,\testimator lgbm's best error=0.3814,\tbest estimator xgboost's best error=0.3755\n", + "[flaml.automl: 06-22 08:01:44] {2897} INFO - iteration 2, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:45] {3072} INFO - at 1.3s,\testimator xgboost's best error=0.3755,\tbest estimator xgboost's best error=0.3755\n", + "[flaml.automl: 06-22 08:01:45] {2897} INFO - iteration 3, current learner lgbm\n", + "[flaml.automl: 06-22 08:01:45] {3072} INFO - at 1.5s,\testimator lgbm's best error=0.3814,\tbest estimator xgboost's best error=0.3755\n", + "[flaml.automl: 06-22 08:01:45] {2897} INFO - iteration 4, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:45] {3072} INFO - at 1.8s,\testimator xgboost's best error=0.3755,\tbest estimator xgboost's best error=0.3755\n", + "[flaml.automl: 06-22 08:01:45] {2897} INFO - iteration 5, current learner lgbm\n", + "[flaml.automl: 06-22 08:01:45] {3072} INFO - at 2.0s,\testimator lgbm's best error=0.3755,\tbest estimator xgboost's best error=0.3755\n", + "[flaml.automl: 06-22 08:01:45] {2897} INFO - iteration 6, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:46] {3072} INFO - at 2.3s,\testimator xgboost's best error=0.3724,\tbest estimator xgboost's best error=0.3724\n", + "[flaml.automl: 06-22 08:01:46] {2897} INFO - iteration 7, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:46] {3072} INFO - at 2.6s,\testimator xgboost's best error=0.3724,\tbest estimator xgboost's best error=0.3724\n", + "[flaml.automl: 06-22 08:01:46] {2897} INFO - iteration 8, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:47] {3072} INFO - at 3.1s,\testimator xgboost's best error=0.3657,\tbest estimator xgboost's best error=0.3657\n", + "[flaml.automl: 06-22 08:01:47] {2897} INFO - iteration 9, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:47] {3072} INFO - at 3.6s,\testimator xgboost's best error=0.3657,\tbest estimator xgboost's best error=0.3657\n", + "[flaml.automl: 06-22 08:01:47] {2897} INFO - iteration 10, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:48] {3072} INFO - at 4.8s,\testimator xgboost's best error=0.3592,\tbest estimator xgboost's best error=0.3592\n", + "[flaml.automl: 06-22 08:01:48] {2897} INFO - iteration 11, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:50] {3072} INFO - at 6.8s,\testimator 
xgboost's best error=0.3580,\tbest estimator xgboost's best error=0.3580\n", + "[flaml.automl: 06-22 08:01:50] {2897} INFO - iteration 12, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:51] {3072} INFO - at 8.1s,\testimator xgboost's best error=0.3580,\tbest estimator xgboost's best error=0.3580\n", + "[flaml.automl: 06-22 08:01:51] {2897} INFO - iteration 13, current learner lgbm\n", + "[flaml.automl: 06-22 08:01:52] {3072} INFO - at 8.4s,\testimator lgbm's best error=0.3644,\tbest estimator xgboost's best error=0.3580\n", + "[flaml.automl: 06-22 08:01:52] {2897} INFO - iteration 14, current learner lgbm\n", + "[flaml.automl: 06-22 08:01:52] {3072} INFO - at 8.7s,\testimator lgbm's best error=0.3644,\tbest estimator xgboost's best error=0.3580\n", + "[flaml.automl: 06-22 08:01:52] {2897} INFO - iteration 15, current learner lgbm\n", + "[flaml.automl: 06-22 08:01:53] {3072} INFO - at 9.3s,\testimator lgbm's best error=0.3644,\tbest estimator xgboost's best error=0.3580\n", + "[flaml.automl: 06-22 08:01:53] {2897} INFO - iteration 16, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:56] {3072} INFO - at 12.1s,\testimator xgboost's best error=0.3559,\tbest estimator xgboost's best error=0.3559\n", + "[flaml.automl: 06-22 08:01:56] {2897} INFO - iteration 17, current learner lgbm\n", + "[flaml.automl: 06-22 08:01:56] {3072} INFO - at 12.6s,\testimator lgbm's best error=0.3604,\tbest estimator xgboost's best error=0.3559\n", + "[flaml.automl: 06-22 08:01:56] {2897} INFO - iteration 18, current learner catboost\n", + "[flaml.automl: 06-22 08:01:56] {3072} INFO - at 13.0s,\testimator catboost's best error=0.3615,\tbest estimator xgboost's best error=0.3559\n", + "[flaml.automl: 06-22 08:01:56] {2897} INFO - iteration 19, current learner catboost\n", + "[flaml.automl: 06-22 08:01:57] {3072} INFO - at 13.7s,\testimator catboost's best error=0.3615,\tbest estimator xgboost's best error=0.3559\n", + "[flaml.automl: 06-22 08:01:57] {2897} INFO - iteration 20, current learner catboost\n", + "[flaml.automl: 06-22 08:01:57] {3072} INFO - at 13.9s,\testimator catboost's best error=0.3615,\tbest estimator xgboost's best error=0.3559\n", + "[flaml.automl: 06-22 08:01:57] {2897} INFO - iteration 21, current learner xgboost\n", + "[flaml.automl: 06-22 08:01:59] {3072} INFO - at 15.7s,\testimator xgboost's best error=0.3559,\tbest estimator xgboost's best error=0.3559\n", + "[flaml.automl: 06-22 08:01:59] {2897} INFO - iteration 22, current learner catboost\n", + "[flaml.automl: 06-22 08:02:00] {3072} INFO - at 16.5s,\testimator catboost's best error=0.3489,\tbest estimator catboost's best error=0.3489\n", + "[flaml.automl: 06-22 08:02:00] {2897} INFO - iteration 23, current learner catboost\n", + "[flaml.automl: 06-22 08:02:02] {3072} INFO - at 18.9s,\testimator catboost's best error=0.3489,\tbest estimator catboost's best error=0.3489\n", + "[flaml.automl: 06-22 08:02:02] {2897} INFO - iteration 24, current learner lgbm\n", + "[flaml.automl: 06-22 08:02:03] {3072} INFO - at 19.2s,\testimator lgbm's best error=0.3604,\tbest estimator catboost's best error=0.3489\n", + "[flaml.automl: 06-22 08:02:03] {2897} INFO - iteration 25, current learner catboost\n", + "[flaml.automl: 06-22 08:02:03] {3072} INFO - at 20.0s,\testimator catboost's best error=0.3472,\tbest estimator catboost's best error=0.3472\n", + "[flaml.automl: 06-22 08:02:03] {2897} INFO - iteration 26, current learner catboost\n", + "[flaml.automl: 06-22 08:02:06] {3072} INFO - at 22.2s,\testimator catboost's best 
error=0.3472,\tbest estimator catboost's best error=0.3472\n", + "[flaml.automl: 06-22 08:02:06] {2897} INFO - iteration 27, current learner lgbm\n", + "[flaml.automl: 06-22 08:02:06] {3072} INFO - at 22.6s,\testimator lgbm's best error=0.3604,\tbest estimator catboost's best error=0.3472\n", + "[flaml.automl: 06-22 08:02:06] {2897} INFO - iteration 28, current learner lgbm\n", + "[flaml.automl: 06-22 08:02:06] {3072} INFO - at 22.9s,\testimator lgbm's best error=0.3604,\tbest estimator catboost's best error=0.3472\n", + "[flaml.automl: 06-22 08:02:06] {2897} INFO - iteration 29, current learner catboost\n", + "[flaml.automl: 06-22 08:02:07] {3072} INFO - at 23.6s,\testimator catboost's best error=0.3472,\tbest estimator catboost's best error=0.3472\n", + "[flaml.automl: 06-22 08:02:07] {2897} INFO - iteration 30, current learner xgboost\n", + "[flaml.automl: 06-22 08:02:09] {3072} INFO - at 25.4s,\testimator xgboost's best error=0.3548,\tbest estimator catboost's best error=0.3472\n", + "[flaml.automl: 06-22 08:02:09] {2897} INFO - iteration 31, current learner catboost\n", + "[flaml.automl: 06-22 08:02:16] {3072} INFO - at 32.3s,\testimator catboost's best error=0.3388,\tbest estimator catboost's best error=0.3388\n", + "[flaml.automl: 06-22 08:02:16] {2897} INFO - iteration 32, current learner lgbm\n", + "[flaml.automl: 06-22 08:02:16] {3072} INFO - at 32.7s,\testimator lgbm's best error=0.3604,\tbest estimator catboost's best error=0.3388\n", + "[flaml.automl: 06-22 08:02:16] {2897} INFO - iteration 33, current learner catboost\n", + "[flaml.automl: 06-22 08:02:22] {3072} INFO - at 38.5s,\testimator catboost's best error=0.3388,\tbest estimator catboost's best error=0.3388\n", + "[flaml.automl: 06-22 08:02:22] {2897} INFO - iteration 34, current learner catboost\n", + "[flaml.automl: 06-22 08:02:43] {3072} INFO - at 59.6s,\testimator catboost's best error=0.3388,\tbest estimator catboost's best error=0.3388\n", + "[flaml.automl: 06-22 08:02:46] {3336} INFO - retrain catboost for 2.8s\n", + "[flaml.automl: 06-22 08:02:46] {3343} INFO - retrained model: \n", + "[flaml.automl: 06-22 08:02:46] {2636} INFO - fit succeeded\n", + "[flaml.automl: 06-22 08:02:46] {2637} INFO - Time taken to find the best model: 32.311296463012695\n" ] }, { "data": { "text/html": [ - "
[HTML representation of the fitted Pipeline(imputuer, standardizer, automl); markup omitted]"
+       "[HTML representation of the fitted Pipeline(imputuer, standardizer, automl) with the AutoML parameters expanded; markup omitted]
" ], "text/plain": [ "Pipeline(steps=[('imputuer', SimpleImputer()),\n", " ('standardizer', StandardScaler()),\n", - " ('automl', )])" + " ('automl',\n", + " AutoML(append_log=False, auto_augment=True, custom_hp={},\n", + " early_stop=False, ensemble=False, estimator_list='auto',\n", + " eval_method='auto', fit_kwargs_by_estimator={},\n", + " hpo_method='auto', keep_search_state=False,\n", + " learner_selector='sample', log_file_name='',\n", + " log_training_metric=False, log_type='better',\n", + " max_iter=None, mem_thres=4294967296, metric='auto',\n", + " metric_constraints=[], min_sample_size=10000,\n", + " model_history=False, n_concurrent_trials=1, n_jobs=-1,\n", + " n_splits=5, pred_time_limit=inf, retrain_full=True,\n", + " sample=True, split_ratio=0.1, split_type='auto',\n", + " starting_points='static', task='classification', ...))])" ] }, - "execution_count": 8, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "automl_pipeline.fit(X_train, y_train, \n", - " automl__time_budget=settings['time_budget'],\n", - " automl__metric=settings['metric'],\n", - " automl__estimator_list=settings['estimator_list'],\n", - " automl__log_training_metric=True)" + "automl_pipeline.fit(X_train, y_train, **pipeline_settings)" ] }, { @@ -500,11 +505,9 @@ } ], "metadata": { - "interpreter": { - "hash": "0cfea3304185a9579d09e0953576b57c8581e46e6ebc6dfeb681bc5a511f7544" - }, "kernelspec": { - "display_name": "Python 3.8.0 64-bit ('blend': conda)", + "display_name": "Python 3.9.12 64-bit", + "language": "python", "name": "python3" }, "language_info": { @@ -517,7 +520,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.0" + "version": "3.9.12" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } } }, "nbformat": 4, diff --git a/website/docs/Examples/Integrate - Scikit-learn Pipeline.md b/website/docs/Examples/Integrate - Scikit-learn Pipeline.md index 09894321d..9d20d26fb 100644 --- a/website/docs/Examples/Integrate - Scikit-learn Pipeline.md +++ b/website/docs/Examples/Integrate - Scikit-learn Pipeline.md @@ -37,16 +37,17 @@ automl_pipeline ### Run AutoML in the pipeline ```python -settings = { +automl_settings = { "time_budget": 60, # total running time in seconds - "metric": 'accuracy', # primary metrics can be chosen from: ['accuracy','roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'f1','log_loss','mae','mse','r2'] - "task": 'classification', # task type - "estimator_list":['xgboost','catboost','lgbm'], - "log_file_name": 'airlines_experiment.log', # flaml log file + "metric": "accuracy", # primary metrics can be chosen from: ['accuracy','roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'f1','log_loss','mae','mse','r2'] + "task": "classification", # task type + "estimator_list": ["xgboost", "catboost", "lgbm"], + "log_file_name": "airlines_experiment.log", # flaml log file } -automl_pipeline.fit(X_train, y_train, - automl__time_budget=60, - automl__metric="accuracy") +pipeline_settings = { + f"automl__{key}": value for key, value in automl_settings.items() +} +automl_pipeline.fit(X_train, y_train, **pipeline_settings) ``` ### Get the automl object from the pipeline