azureml + ray (#344)

* examples and documentation about how to use azureml + ray
Chi Wang 2021-12-23 13:37:07 -08:00 committed by GitHub
parent baa0359324
commit 300f286667
5 changed files with 171 additions and 6 deletions

View File

@@ -0,0 +1,49 @@
import ray
import lightgbm as lgb
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
from flaml import tune
from flaml.model import LGBMEstimator

# Load the breast cancer dataset and hold out 25% for testing.
data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.25)


def train_breast_cancer(config):
    # Convert the FLAML search-space config into LightGBM parameters.
    params = LGBMEstimator(**config).params
    num_boost_round = params.pop("n_estimators")
    train_set = lgb.Dataset(train_x, label=train_y)
    gbm = lgb.train(params, train_set, num_boost_round)
    preds = gbm.predict(test_x)
    pred_labels = np.rint(preds)
    tune.report(
        mean_accuracy=sklearn.metrics.accuracy_score(test_y, pred_labels), done=True
    )


if __name__ == "__main__":
    ray.init(address="auto")  # connect to the existing ray cluster
    flaml_lgbm_search_space = LGBMEstimator.search_space(train_x.shape)
    config_search_space = {
        hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()
    }
    low_cost_partial_config = {
        hp: space["low_cost_init_value"]
        for hp, space in flaml_lgbm_search_space.items()
        if "low_cost_init_value" in space
    }
    analysis = tune.run(
        train_breast_cancer,
        metric="mean_accuracy",
        mode="max",
        config=config_search_space,
        low_cost_partial_config=low_cost_partial_config,  # seed the search with low-cost initial values
        num_samples=-1,
        time_budget_s=60,
        use_ray=True,
    )
    # print("Best hyperparameters found were: ", analysis.best_config)
    print("The best trial's result: ", analysis.best_trial.last_result)

View File

@@ -71,7 +71,11 @@ If all the tests are passed, please also test run [notebook/automl_classificatio
### Documentation
To build and test documentation locally, install [Node.js](https://nodejs.org/en/download/). For example,
```bash
nvm install --lts
```
Then:
@@ -79,7 +83,7 @@ Then:
npm install --global yarn
pip install pydoc-markdown
cd website
yarn install --frozen-lockfile
pydoc-markdown
yarn start
```

View File

@@ -1,4 +1,4 @@
FLAML can be used together with AzureML. On top of that, it is easy to use mlflow and ray as well.
### Prerequisites
@@ -48,4 +48,116 @@ with mlflow.start_run() as run: # create a mlflow run
The metrics in the run will be automatically logged in an experiment named "flaml" in your AzureML workspace.
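The logged runs can then be inspected programmatically. A minimal sketch, assuming mlflow is already pointed at the same tracking server/workspace as in the snippet above:
```python
import mlflow

# Look up the "flaml" experiment and list its runs with their logged metrics.
exp = mlflow.get_experiment_by_name("flaml")
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])
print(runs.head())  # metric columns are prefixed with "metrics."
```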
[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb)
### Use ray to distribute across a cluster
When you have a compute cluster in AzureML, you can distribute `flaml.AutoML` or `flaml.tune` with ray.
#### Build a ray environment in AzureML
Create a Dockerfile such as [.Docker/Dockerfile-cpu](https://github.com/microsoft/FLAML/blob/main/test/.Docker/Dockerfile-cpu). Make sure `RUN pip install flaml[blendsearch,ray]` is included in the Dockerfile.
Then build an AzureML environment in the workspace `ws`.
```python
from azureml.core import Environment
import time

ray_environment_name = "aml-ray-cpu"
ray_environment_dockerfile_path = "./Docker/Dockerfile-cpu"

# Build the CPU image for ray from the Dockerfile and register it in the workspace.
ray_cpu_env = Environment.from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path)
ray_cpu_env.register(workspace=ws)
ray_cpu_build_details = ray_cpu_env.build(workspace=ws)

# Poll until the image build finishes.
while ray_cpu_build_details.status not in ["Succeeded", "Failed"]:
    print(f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}")
    time.sleep(10)
```
You only need to do this step once per workspace.
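On later runs, the registered environment can simply be retrieved instead of rebuilt, for example:
```python
from azureml.core import Environment

# Reuse the environment that was registered and built earlier.
ray_cpu_env = Environment.get(workspace=ws, name="aml-ray-cpu")
```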
#### Create a compute cluster with multiple nodes
```python
from azureml.core.compute import AmlCompute, ComputeTarget

compute_target_name = "cpucluster"
node_count = 2

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
compute_target_size = "STANDARD_D2_V2"

if compute_target_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_target_name]
    if compute_target and type(compute_target) is AmlCompute:
        if compute_target.provisioning_state == "Succeeded":
            print("Found compute target; using it:", compute_target_name)
        else:
            raise Exception(
                "Found compute target but it is in state", compute_target.provisioning_state)
else:
    print("creating a new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=compute_target_size,
        min_nodes=0,
        max_nodes=node_count)

    # Create the cluster
    compute_target = ComputeTarget.create(ws, compute_target_name, provisioning_config)

    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())
```
If the compute target "cpucluster" already exists, it will not be recreated.
#### Run distributed AutoML job
Assume you have an AutoML script like [ray/distribute_automl.py](https://github.com/microsoft/FLAML/blob/main/test/ray/distribute_automl.py). It uses `ray.init(address="auto")` to initialize the ray cluster, and `n_concurrent_trials=k` to tell `AutoML.fit()` to perform k concurrent trials in parallel.
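For reference, here is a minimal sketch of what such a script might contain; the dataset, time budget, and concurrency level are illustrative placeholders rather than the contents of the actual file:
```python
import ray
from sklearn.datasets import load_breast_cancer
from flaml import AutoML

if __name__ == "__main__":
    ray.init(address="auto")  # connect to the ray cluster AzureML started on the nodes
    X, y = load_breast_cancer(return_X_y=True)
    automl = AutoML()
    automl.fit(
        X_train=X,
        y_train=y,
        task="classification",
        time_budget=60,         # seconds
        n_concurrent_trials=2,  # run 2 trials in parallel across the cluster
    )
    print(automl.best_estimator, automl.best_config)
```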
Submit an AzureML job as follows:
```python
from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment
command = ["python distribute_automl.py"]
ray_environment_name = 'aml-ray-cpu'
env = Environment.get(workspace=ws, name=ray_environment_name)
config = ScriptRunConfig(
    source_directory='ray/',
    command=command,
    compute_target=compute_target,
    environment=env,
)
config.run_config.node_count = 2
config.run_config.environment_variables["_AZUREML_CR_START_RAY"] = "true"
config.run_config.environment_variables["AZUREML_COMPUTE_USE_COMMON_RUNTIME"] = "true"
exp = Experiment(ws, 'distribute-automl')
run = exp.submit(config)
print(run.get_portal_url()) # link to ml.azure.com
run.wait_for_completion(show_output=True)
```
The line `config.run_config.environment_variables["_AZUREML_CR_START_RAY"] = "true"` tells AzureML to start ray on each node of the cluster.
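As an optional sanity check (not part of the original scripts), the submitted script can print the size of the ray cluster it joined; the reported number of nodes should match `config.run_config.node_count`:
```python
import ray

ray.init(address="auto")  # join the cluster started by AzureML
print("nodes in the ray cluster:", len(ray.nodes()))  # expect this to equal node_count
```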
#### Run distributed tune job
Prepare a script like [ray/distribute_tune.py](https://github.com/microsoft/FLAML/blob/main/test/ray/distribute_tune.py). Replace the command in the above example with:
```python
command = ["python distribute_tune.py"]
```
Everything else is the same.

View File

@@ -467,9 +467,9 @@ The curve suggests that increasing the time budget may further improve the accuracy.
1. set t1 as the time budget, and check the message at the end of the console log. If the budget is too small, you will see a warning like
> WARNING - Time taken to find the best model is 91% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget.
2. set t2 as the time budget, and also set `early_stop=True` (see the sketch after this list). If early stopping is triggered, you will see warnings like
> WARNING - All estimator hyperparameters local search has converged at least once, and the total search time exceeds 10 times the time taken to find the best model.
> WARNING - Stopping search as early_stop is set to True.
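A minimal sketch of step 2 follows; the dataset and the concrete value of t2 are hypothetical, and the point is simply passing both `time_budget` and `early_stop=True` to `AutoML.fit()`:
```python
from flaml import AutoML
from sklearn.datasets import load_breast_cancer

X_train, y_train = load_breast_cancer(return_X_y=True)
t2 = 600  # seconds; a budget comfortably larger than t1

automl = AutoML()
automl.fit(
    X_train=X_train,
    y_train=y_train,
    task="classification",
    time_budget=t2,
    early_stop=True,  # stop once the search is considered to have converged
)
```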
### How much time is needed to find the best model