diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index cff7e9abf..040d7869d 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -17,10 +17,7 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* ENV DEBIAN_FRONTEND=dialog -# -# Install extras for development -# -RUN pip3 --disable-pip-version-check --no-cache-dir install flaml[test,notebook] +RUN pip3 --disable-pip-version-check --no-cache-dir install flaml # For docs RUN npm install --global yarn RUN pip install pydoc-markdown==4.5.0 diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 632f97d72..46517c859 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -8,5 +8,6 @@ } }, "terminal.integrated.defaultProfile.linux": "bash" - } + }, + "updateContentCommand": "pip install -e .[test,notebook] && pre-commit install" } \ No newline at end of file diff --git a/flaml/automl.py b/flaml/automl.py index 2394660f3..4e2772ed5 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -2213,7 +2213,7 @@ class AutoML(BaseEstimator): ``` task: A string of the task type, e.g., 'classification', 'regression', 'ts_forecast_regression', - 'ts_forecast_classification', 'ts_forecast_panel', 'rank', 'seq-classification', + 'ts_forecast_classification', 'rank', 'seq-classification', 'seq-regression', 'summarization'. n_jobs: An integer of the number of threads for training | default=-1. Use all available resources when n_jobs == -1. diff --git a/flaml/model.py b/flaml/model.py index e1941d5f1..c23c894c0 100644 --- a/flaml/model.py +++ b/flaml/model.py @@ -2266,18 +2266,13 @@ class TemporalFusionTransformerEstimator(SKLearnEstimator): return training, train_dataloader, val_dataloader def fit(self, X_train, y_train, budget=None, **kwargs): - import copy - from pathlib import Path import warnings - import numpy as np - import pandas as pd import pytorch_lightning as pl from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor from pytorch_lightning.loggers import TensorBoardLogger import torch from pytorch_forecasting import TemporalFusionTransformer from pytorch_forecasting.metrics import QuantileLoss - import tensorboard as tb warnings.filterwarnings("ignore") current_time = time.time() diff --git a/flaml/tune/analysis.py b/flaml/tune/analysis.py index af20ad748..3350f2bbe 100644 --- a/flaml/tune/analysis.py +++ b/flaml/tune/analysis.py @@ -18,7 +18,6 @@ from typing import Dict, Optional import numpy as np from .trial import Trial -from collections import defaultdict import logging logger = logging.getLogger(__name__) diff --git a/flaml/tune/sample.py b/flaml/tune/sample.py index 0ef50837d..66223ca1c 100644 --- a/flaml/tune/sample.py +++ b/flaml/tune/sample.py @@ -428,7 +428,12 @@ class Categorical(Domain): ): if not isinstance(random_state, _BackwardsCompatibleNumpyRng): random_state = _BackwardsCompatibleNumpyRng(random_state) - items = random_state.choice(domain.categories, size=size).tolist() + # do not use .choice() directly on domain.categories + # as that will coerce them to a single dtype + indices = random_state.choice( + np.arange(0, len(domain.categories)), size=size + ) + items = [domain.categories[index] for index in indices] return items if len(items) > 1 else domain.cast(items[0]) default_sampler_cls = _Uniform @@ -479,8 +484,18 @@ class Quantized(Sampler): ): if not isinstance(random_state, _BackwardsCompatibleNumpyRng): random_state = _BackwardsCompatibleNumpyRng(random_state) - values = self.sampler.sample(domain, spec, size, random_state=random_state) + + if self.q == 1: + return self.sampler.sample(domain, spec, size, random_state=random_state) + + quantized_domain = copy(domain) + quantized_domain.lower = np.ceil(domain.lower / self.q) * self.q + quantized_domain.upper = np.floor(domain.upper / self.q) * self.q + values = self.sampler.sample( + quantized_domain, spec, size, random_state=random_state + ) quantized = np.round(np.divide(values, self.q)) * self.q + if not isinstance(quantized, np.ndarray): return domain.cast(quantized) return list(quantized) @@ -586,7 +601,9 @@ def lograndint(lower: int, upper: int, base: float = 10): def qrandint(lower: int, upper: int, q: int = 1): """Sample an integer value uniformly between ``lower`` and ``upper``. + ``lower`` is inclusive, ``upper`` is also inclusive (!). + The value will be quantized, i.e. rounded to an integer increment of ``q``. Quantization makes the upper bound inclusive. """ @@ -614,12 +631,15 @@ def randn(mean: float = 0.0, sd: float = 1.0): def qrandn(mean: float, sd: float, q: float): """Sample a float value normally with ``mean`` and ``sd``. + The value will be quantized, i.e. rounded to an integer increment of ``q``. + Args: - mean (float): Mean of the normal distribution. - sd (float): SD of the normal distribution. - q (float): Quantization number. The result will be rounded to an + mean: Mean of the normal distribution. + sd: SD of the normal distribution. + q: Quantization number. The result will be rounded to an integer increment of this value. + """ return Float(None, None).normal(mean, sd).quantized(q) diff --git a/notebook/automl_classification.ipynb b/notebook/automl_classification.ipynb index a84c0aa05..378f026c1 100644 --- a/notebook/automl_classification.ipynb +++ b/notebook/automl_classification.ipynb @@ -38,10 +38,10 @@ "metadata": {}, "outputs": [], "source": [ - "%pip install flaml[notebook]\n", - "# from v0.6.6, catboost is made an optional dependency to build conda package.\n", - "# to install catboost without installing the notebook option, you can run:\n", - "# %pip install flaml[catboost]" + "%pip install flaml[notebook] openml==0.10.2\n", + "# From v0.6.6, catboost is made an optional dependency to build conda package.\n", + "# To install catboost, you can run:\n", + "%pip install flaml[catboost]" ] }, { @@ -836,6 +836,15 @@ "In this example, the above information for RGF is wrapped in a python class called *MyRegularizedGreedyForest* that exposes the hyperparameters." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install rgf-python" + ] + }, { "cell_type": "code", "execution_count": 19, @@ -1259,11 +1268,8 @@ } ], "metadata": { - "interpreter": { - "hash": "5432eb6463ddd46aaa76ccf859b1fa421ab98224a755661a6688060ed6e23d59" - }, "kernelspec": { - "display_name": "ds440flaml", + "display_name": "Python 3.9.15 64-bit", "language": "python", "name": "python3" }, @@ -1277,7 +1283,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.9.15" + }, + "vscode": { + "interpreter": { + "hash": "949777d72b0d2535278d3dc13498b2535136f6dfe0678499012e853ee9abcab1" + } } }, "nbformat": 4, diff --git a/setup.py b/setup.py index d2f109802..c86882972 100644 --- a/setup.py +++ b/setup.py @@ -40,11 +40,8 @@ setuptools.setup( install_requires=install_requires, extras_require={ "notebook": [ - "openml==0.10.2", "jupyter", "matplotlib", - "rgf-python", - "catboost>=0.26", ], "test": [ "flake8>=3.8.4", @@ -57,7 +54,7 @@ setuptools.setup( "catboost>=0.26", "rgf-python", "optuna==2.8.0", - "openml", + "openml==0.10.2", "statsmodels>=0.12.2", "psutil==5.8.0", "dataclasses", @@ -67,7 +64,8 @@ setuptools.setup( "rouge_score", "hcrystalball==0.1.10", "seqeval", - "pytorch-forecasting>=0.9.0", + "pytorch-forecasting>=0.9.0,<=0.10.1", + "mlflow", ], "catboost": ["catboost>=0.26"], "blendsearch": ["optuna==2.8.0"], diff --git a/test/automl/test_notebook_example.py b/test/automl/test_notebook_example.py index 5f2382454..818f7af36 100644 --- a/test/automl/test_notebook_example.py +++ b/test/automl/test_notebook_example.py @@ -108,10 +108,7 @@ def _test_nobudget(): def test_mlflow(): - import subprocess - import sys - - subprocess.check_call([sys.executable, "-m", "pip", "install", "mlflow"]) + # subprocess.check_call([sys.executable, "-m", "pip", "install", "mlflow"]) import mlflow from flaml.data import load_openml_task @@ -152,9 +149,12 @@ def test_mlflow(): print(automl.predict_proba(X_test)) except ImportError: pass - # subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"]) + +def test_mlflow_iris(): from sklearn.datasets import load_iris + import mlflow + from flaml import AutoML with mlflow.start_run(): automl = AutoML() @@ -167,6 +167,8 @@ def test_mlflow(): X_train, y_train = load_iris(return_X_y=True) automl.fit(X_train=X_train, y_train=y_train, **automl_settings) + # subprocess.check_call([sys.executable, "-m", "pip", "uninstall", "mlflow"]) + if __name__ == "__main__": test_automl(600) diff --git a/test/nlp/test_autohf.py b/test/nlp/test_autohf.py index 38c82027c..ae7bae5fa 100644 --- a/test/nlp/test_autohf.py +++ b/test/nlp/test_autohf.py @@ -74,7 +74,10 @@ def test_hf_data(): del automl if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") if __name__ == "__main__": diff --git a/test/nlp/test_autohf_classificationhead.py b/test/nlp/test_autohf_classificationhead.py index d2eab332e..4204d49f6 100644 --- a/test/nlp/test_autohf_classificationhead.py +++ b/test/nlp/test_autohf_classificationhead.py @@ -113,7 +113,10 @@ def _test_switch_classificationhead(each_data, each_model_path): return if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") if __name__ == "__main__": diff --git a/test/nlp/test_autohf_custom_metric.py b/test/nlp/test_autohf_custom_metric.py index b28e48d35..c653b74c5 100644 --- a/test/nlp/test_autohf_custom_metric.py +++ b/test/nlp/test_autohf_custom_metric.py @@ -84,7 +84,10 @@ def test_custom_metric(): del automl if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") if __name__ == "__main__": diff --git a/test/nlp/test_autohf_cv.py b/test/nlp/test_autohf_cv.py index 6e9162dd9..a8b40ae9e 100644 --- a/test/nlp/test_autohf_cv.py +++ b/test/nlp/test_autohf_cv.py @@ -5,7 +5,9 @@ import os import shutil -@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") +@pytest.mark.skipif( + sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows" +) def test_cv(): from flaml import AutoML import requests @@ -22,7 +24,10 @@ def test_cv(): return if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") if __name__ == "__main__": diff --git a/test/nlp/test_autohf_multichoice_classification.py b/test/nlp/test_autohf_multichoice_classification.py index 61691141e..918dec8f8 100644 --- a/test/nlp/test_autohf_multichoice_classification.py +++ b/test/nlp/test_autohf_multichoice_classification.py @@ -5,7 +5,9 @@ import os import shutil -@pytest.mark.skipif(sys.platform == "darwin", reason="do not run on mac os") +@pytest.mark.skipif( + sys.platform in ["darwin", "win32"], reason="do not run on mac os or windows" +) def test_mcc(): from flaml import AutoML import requests @@ -49,7 +51,10 @@ def test_mcc(): print("Accuracy: " + str(accuracy)) if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") if __name__ == "__main__": diff --git a/test/nlp/test_autohf_regression.py b/test/nlp/test_autohf_regression.py index cb01a6152..85453ed79 100644 --- a/test/nlp/test_autohf_regression.py +++ b/test/nlp/test_autohf_regression.py @@ -35,7 +35,10 @@ def test_regression(): automl.predict(X_val) if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") if __name__ == "__main__": diff --git a/test/nlp/test_autohf_summarization.py b/test/nlp/test_autohf_summarization.py index 9e21984d6..f1427e752 100644 --- a/test/nlp/test_autohf_summarization.py +++ b/test/nlp/test_autohf_summarization.py @@ -7,8 +7,8 @@ import shutil @pytest.mark.skipif( - sys.platform == "darwin" or sys.version < "3.7", - reason="do not run on mac os or py3.6", + sys.platform in ["darwin", "win32"] or sys.version < "3.7", + reason="do not run on mac os, windows or py3.6", ) def test_summarization(): # TODO: manual test for how effective postprocess_seq2seq_prediction_label is @@ -51,7 +51,10 @@ def test_summarization(): automl.predict(X_test) if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") if __name__ == "__main__": diff --git a/test/nlp/test_autohf_tokenclassification.py b/test/nlp/test_autohf_tokenclassification.py index 051c2bf41..ead6ce8e2 100644 --- a/test/nlp/test_autohf_tokenclassification.py +++ b/test/nlp/test_autohf_tokenclassification.py @@ -11,8 +11,8 @@ from utils import ( @pytest.mark.skipif( - sys.platform == "darwin" or sys.version < "3.7", - reason="do not run on mac os or py<3.7", + sys.platform in ["darwin", "win32"] or sys.version < "3.7", + reason="do not run on mac os, windows or py<3.7", ) def test_tokenclassification_idlabel(): from flaml import AutoML @@ -65,12 +65,15 @@ def test_tokenclassification_idlabel(): assert val_loss == min_inter_result if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") @pytest.mark.skipif( - sys.platform == "darwin" or sys.version < "3.7", - reason="do not run on mac os or py<3.7", + sys.platform in ["darwin", "win32"] or sys.version < "3.7", + reason="do not run on mac os, windows or py<3.7", ) def test_tokenclassification_tokenlabel(): from flaml import AutoML @@ -112,7 +115,10 @@ def test_tokenclassification_tokenlabel(): assert val_loss == min_inter_result if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") if __name__ == "__main__": diff --git a/test/nlp/test_default.py b/test/nlp/test_default.py index b0796b334..d2e991b66 100644 --- a/test/nlp/test_default.py +++ b/test/nlp/test_default.py @@ -3,6 +3,7 @@ import sys from flaml.default import portfolio import os import shutil +import pytest def pop_args(fit_kwargs): @@ -18,6 +19,7 @@ def test_build_portfolio(path="./test/nlp/default", strategy="greedy"): portfolio.main() +@pytest.mark.skipif(sys.platform == "win32", reason="do not run on windows") def test_starting_point_not_in_search_space(): from flaml import AutoML @@ -84,9 +86,13 @@ def test_starting_point_not_in_search_space(): ) if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") +@pytest.mark.skipif(sys.platform == "win32", reason="do not run on windows") def test_points_to_evaluate(): from flaml import AutoML @@ -106,10 +112,14 @@ def test_points_to_evaluate(): automl.fit(X_train, y_train, **automl_settings) if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") # TODO: implement _test_zero_shot_model +@pytest.mark.skipif(sys.platform == "win32", reason="do not run on windows") def test_zero_shot_nomodel(): from flaml.default import preprocess_and_suggest_hyperparams @@ -141,7 +151,10 @@ def test_zero_shot_nomodel(): model.fit(X_train, y_train, **fit_kwargs) if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") def test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"): @@ -176,4 +189,7 @@ def test_build_error_portfolio(path="./test/nlp/default", strategy="greedy"): import shutil if os.path.exists("test/data/output/"): - shutil.rmtree("test/data/output/") + try: + shutil.rmtree("test/data/output/") + except PermissionError: + print("PermissionError when deleting test/data/output/") diff --git a/website/docs/Use-Cases/Task-Oriented-AutoML.md b/website/docs/Use-Cases/Task-Oriented-AutoML.md index d47b6be7a..05ec64cbd 100644 --- a/website/docs/Use-Cases/Task-Oriented-AutoML.md +++ b/website/docs/Use-Cases/Task-Oriented-AutoML.md @@ -12,7 +12,7 @@ - 'regression': regression with tabular data. - 'ts_forecast': time series forecasting. - 'ts_forecast_classification': time series forecasting for classification. - - 'ts_forecast_panel': time series forecasting for panel datasets (multiple time series). + - 'rank': learning to rank. - 'seq-classification': sequence classification. - 'seq-regression': sequence regression. @@ -120,7 +120,7 @@ The estimator list can contain one or more estimator names, each corresponding t - 'arima': ARIMA for task "ts_forecast". Hyperparameters: p, d, q. - 'sarimax': SARIMAX for task "ts_forecast". Hyperparameters: p, d, q, P, D, Q, s. - 'transformer': Huggingface transformer models for task "seq-classification", "seq-regression", "multichoice-classification", "token-classification" and "summarization". Hyperparameters: learning_rate, num_train_epochs, per_device_train_batch_size, warmup_ratio, weight_decay, adam_epsilon, seed. - - 'temporal_fusion_transform': TemporalFusionTransformerEstimator for task "ts_forecast_panel". Hyperparameters: gradient_clip_val, hidden_size, hidden_continuous_size, attention_head_size, dropout, learning_rate. + * Custom estimator. Use custom estimator for: - tuning an estimator that is not built-in; - customizing search space for a built-in estimator.