diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 813197a9e..ef1fcdd56 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -19,41 +19,66 @@ jobs: python-version: [3.6, 3.7, 3.8] steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: If mac, install libomp to facilitate lgbm install - if: matrix.os == 'macOS-latest' - run: | - brew install libomp - export CC=/usr/bin/clang - export CXX=/usr/bin/clang++ - export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp" - export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include" - export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include" - export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp" - - name: Install packages and dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 pytest coverage - pip install -e . - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pytest test - - name: Coverage - run: | - coverage run -a -m pytest test - coverage xml - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - file: ./coverage.xml - flags: unittests \ No newline at end of file + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: If mac, install libomp to facilitate lgbm install + if: matrix.os == 'macOS-latest' + run: | + brew install libomp + export CC=/usr/bin/clang + export CXX=/usr/bin/clang++ + export CPPFLAGS="$CPPFLAGS -Xpreprocessor -fopenmp" + export CFLAGS="$CFLAGS -I/usr/local/opt/libomp/include" + export CXXFLAGS="$CXXFLAGS -I/usr/local/opt/libomp/include" + export LDFLAGS="$LDFLAGS -Wl,-rpath,/usr/local/opt/libomp/lib -L/usr/local/opt/libomp/lib -lomp" + - name: Install packages and dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest coverage + pip install -e . + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest test + - name: Coverage + run: | + coverage run -a -m pytest test + coverage xml + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + file: ./coverage.xml + flags: unittests + + docs: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + - name: Compile documentation + run: | + pip install -e . 
+ python -m pip install sphinx sphinx_rtd_theme + cd docs + make html + - name: Deploy to GitHub pages + if: ${{ github.ref == 'refs/heads/main' }} + uses: JamesIves/github-pages-deploy-action@3.6.2 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + BRANCH: gh-pages + FOLDER: docs/_build/html + CLEAN: true \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 000000000..d4bb2cbb9 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..e6e6a823e --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,60 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. 
+# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'FLAML' +copyright = '2020, FLAML Team' +author = 'FLAML Team' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.doctest', + 'sphinx.ext.coverage', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.githubpages', + 'sphinx_rtd_theme', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..e05f4fdf7 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,29 @@ +.. FLAML documentation master file, created by + sphinx-quickstart on Mon Dec 14 23:33:24 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. Welcome to FLAML's documentation! +.. ================================= + +.. .. 
toctree:: +.. :maxdepth: 2 +.. :caption: Contents: + + +FLAML API Documentation +======================= + +AutoML +------ + +.. autoclass:: flaml.AutoML + :members: + + +.. Indices and tables +.. ================== + +.. * :ref:`genindex` +.. * :ref:`modindex` +.. * :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 000000000..2119f5109 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/flaml/automl.py b/flaml/automl.py index baa77ef98..b271608f8 100644 --- a/flaml/automl.py +++ b/flaml/automl.py @@ -28,33 +28,20 @@ logger = logging.getLogger(__name__) class AutoML: '''The AutoML class - Attributes: - model: An object with predict() and predict_proba() method (for - classification), storing the best trained model. 
- model_history: A dictionary of iter->model, storing the models when - the best model is updated each time - config_history: A dictionary of iter->(estimator, config, time), - storing the best estimator, config, and the time when the best - model is updated each time - classes_: A list of n_classes elements for class labels - best_iteration: An integer of the iteration number where the best - config is found - best_estimator: A string indicating the best estimator found. - best_config: A dictionary of the best configuration. - best_config_train_time: A float of the seconds taken by training the - best config + Example: - Typical usage example: + .. code-block:: python + + automl = AutoML() + automl_settings = { + "time_budget": 60, + "metric": 'accuracy', + "task": 'classification', + "log_file_name": 'test/mylog.log', + } + automl.fit(X_train = X_train, y_train = y_train, + **automl_settings) - automl = AutoML() - automl_settings = { - "time_budget": 60, - "metric": 'accuracy', - "task": 'classification', - "log_file_name": 'test/mylog.log', - } - automl.fit(X_train = X_train, y_train = y_train, - **automl_settings) ''' def __init__(self): @@ -66,14 +53,24 @@ class AutoML: @property def model_history(self): + '''A dictionary of iter->model, storing the models when + the best model is updated each time. + ''' return self._model_history @property def config_history(self): + '''A dictionary of iter->(estimator, config, time), + storing the best estimator, config, and the time when the best + model is updated each time. + ''' return self._config_history @property def model(self): + '''An object with `predict()` and `predict_proba()` method (for + classification), storing the best trained model. 
+ ''' if self._trained_estimator: return self._trained_estimator.model else: @@ -81,14 +78,18 @@ class AutoML: @property def best_estimator(self): + '''A string indicating the best estimator found.''' return self._best_estimator @property def best_iteration(self): + '''An integer of the iteration number where the best + config is found.''' return self._best_iteration @property def best_config(self): + '''A dictionary of the best configuration.''' return self._selected.best_config[0] @property @@ -97,10 +98,13 @@ class AutoML: @property def best_config_train_time(self): + '''A float of the seconds taken by training the + best config.''' return self.best_train_time @property def classes_(self): + '''A list of n_classes elements for class labels.''' if self.label_transformer: return self.label_transformer.classes_.tolist() if self._trained_estimator: @@ -111,10 +115,10 @@ class AutoML: '''Predict label from features. Args: - X_test: A numpy array of featurized instances, shape n*m. + X_test: A numpy array of featurized instances, shape n * m. Returns: - A numpy array of shape n*1 -- each element is a predicted class + A numpy array of shape n * 1 -- each element is a predicted class label for an instance. ''' X_test = self.preprocess(X_test) @@ -132,11 +136,11 @@ class AutoML: classification problems. Args: - X_test: A numpy array of featurized instances, shape n*m. + X_test: A numpy array of featurized instances, shape n * m. Returns: - A numpy array of shape n*c. c is the # classes. Each element at - (i,j) is the probability for instance i to be in class j. + A numpy array of shape n * c. c is the # classes. Each element at + (i, j) is the probability for instance i to be in class j.
''' X_test = self.preprocess(X_test) proba = self._trained_estimator.predict_proba(X_test) @@ -298,14 +302,14 @@ class AutoML: random_state=1) X_train = concat(X_first, X_train) y_train = concat(label_set, - y_train) if self.df else np.concatenate([label_set, y_train]) + y_train) if self.df else np.concatenate([label_set, y_train]) X_val = concat(X_first, X_val) y_val = concat(label_set, - y_val) if self.df else np.concatenate([label_set, y_val]) + y_val) if self.df else np.concatenate([label_set, y_val]) _, y_train_counts_elements = np.unique(y_train, - return_counts=True) + return_counts=True) _, y_val_counts_elements = np.unique(y_val, - return_counts=True) + return_counts=True) logger.debug( f"""{self.split_type} split for y_train \ {y_train_counts_elements}, \ @@ -396,7 +400,7 @@ class AutoML: learner_class: A subclass of BaseEstimator size_estimate: A function from a config to its memory size in float cost_relative2lgbm: A float number for the training cost ratio with - respect to lightgbm (when both use the initial config) + respect to lightgbm (when both use the initial config) ''' self._custom_learners[learner_name] = learner_class self._eti_ini[learner_name] = cost_relative2lgbm @@ -450,14 +454,14 @@ class AutoML: Args: time_budget: A float number of the time budget in seconds log_file_name: A string of the log file name - X_train: A numpy array of training data in shape n*m - y_train: A numpy array of labels in shape n*1 + X_train: A numpy array of training data in shape n * m + y_train: A numpy array of labels in shape n * 1 task: A string of the task type, e.g., 'classification', 'regression' eval_method: A string of resampling strategy, one of ['auto', 'cv', 'holdout'] split_ratio: A float of the validation data percentage for holdout - n_splits: An integer of the number of folds for cross-validation + n_splits: An integer of the number of folds for cross-validation n_jobs: An integer of the number of threads for training train_best: A boolean of
whether to train the best config in the time budget; if false, train the last config in the budget @@ -507,7 +511,8 @@ class AutoML: self._trained_estimator = BaseEstimator() self._trained_estimator.model = None return training_duration - if not best: return + if not best: + return best_estimator = best.learner best_config = best.config sample_size = len(self.y_train_all) if train_full \ @@ -581,29 +586,36 @@ class AutoML: Args: X_train: A numpy array or a pandas dataframe of training data in - shape n*m - y_train: A numpy array or a pandas series of labels in shape n*1 + shape n * m + y_train: A numpy array or a pandas series of labels in shape n * 1 dataframe: A dataframe of training data including label column label: A str of the label column name - Note: If X_train and y_train are provided, + Note: If X_train and y_train are provided, dataframe and label are ignored; If not, dataframe and label must be provided. metric: A string of the metric name or a function, - e.g., 'accuracy','roc_auc','f1','log_loss','mae','mse','r2' + e.g., 'accuracy', 'roc_auc', 'f1', 'log_loss', 'mae', 'mse', 'r2' if passing a customized metric function, the function needs to - have the follwing signature + have the following signature: - def metric(X_test, y_test, estimator, labels, X_train, y_train): - return metric_to_minimize, metrics_to_log + .. code-block:: python - which returns a float number as the minimization objective, + def metric(X_test, y_test, estimator, labels, X_train, y_train): + return metric_to_minimize, metrics_to_log + + which returns a float number as the minimization objective, and a tuple of floats as the metrics to log task: A string of the task type, e.g., 'classification', 'regression' n_jobs: An integer of the number of threads for training log_file_name: A string of the log file name estimator_list: A list of strings for estimator names, or 'auto' - e.g., ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree'] + e.g., + + ..
code-block:: python + + ['lgbm', 'xgboost', 'catboost', 'rf', 'extra_tree'] + time_budget: A float number of the time budget in seconds max_iter: An integer of the maximal number of iterations sample: A boolean of whether to sample the training data during @@ -611,16 +623,17 @@ class AutoML: eval_method: A string of resampling strategy, one of ['auto', 'cv', 'holdout'] split_ratio: A float of the valiation data percentage for holdout - n_splits: An integer of the number of folds for cross-validation - log_type: A string of the log type, one of ['better', 'all', 'new'] + n_splits: An integer of the number of folds for cross-validation + log_type: A string of the log type, one of + ['better', 'all', 'new'] 'better' only logs configs with better loss than previos iters 'all' logs all the tried configs - 'new' only logs non-redundant configs + 'new' only logs non-redundant configs model_history: A boolean of whether to keep the history of best models in the history property. Make sure memory is large enough if setting to True. - log_training_metric: A boolean of whether to log the training - metric for each model. + log_training_metric: A boolean of whether to log the training + metric for each model. mem_thres: A float of the memory size constraint in bytes X_val: None | a numpy array or a pandas dataframe of validation data y_val: None | a numpy array or a pandas series of validation labels diff --git a/flaml/version.py b/flaml/version.py index b81b15805..b3f475621 100644 --- a/flaml/version.py +++ b/flaml/version.py @@ -1 +1 @@ -__version__="0.1.1" +__version__ = "0.1.2"