mirror of
https://github.com/microsoft/autogen.git
synced 2025-12-27 15:09:41 +00:00
azureml + ray (#344)
* examples and documentation about how to use azureml + ray
This commit is contained in:
parent
baa0359324
commit
300f286667
49
test/ray/distribute_tune.py
Normal file
49
test/ray/distribute_tune.py
Normal file
@ -0,0 +1,49 @@
|
||||
import ray
|
||||
import lightgbm as lgb
|
||||
import numpy as np
|
||||
import sklearn.datasets
|
||||
import sklearn.metrics
|
||||
from sklearn.model_selection import train_test_split
|
||||
from flaml import tune
|
||||
from flaml.model import LGBMEstimator
|
||||
|
||||
# Load the breast-cancer dataset as (features, labels) arrays and hold out
# 25% of the samples for evaluating each trial.
data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
train_x, test_x, train_y, test_y = train_test_split(
    data,
    target,
    test_size=0.25,
)
|
||||
|
||||
|
||||
def train_breast_cancer(config):
    """Train one LightGBM model for a sampled ``config`` and report its accuracy.

    The hyperparameters in ``config`` are converted to LightGBM parameters
    through ``LGBMEstimator``. The model is trained on the module-level
    ``train_x`` / ``train_y`` split and scored on ``test_x`` / ``test_y``.
    The result is sent back to the tuner with ``tune.report`` under the
    metric name ``mean_accuracy``.

    Args:
        config: Mapping of hyperparameter names to sampled values, passed
            as keyword arguments to ``LGBMEstimator``.
    """
    lgbm_params = LGBMEstimator(**config).params
    # "n_estimators" is taken out of the parameter dict and handed to
    # lgb.train separately as the number of boosting rounds.
    n_rounds = lgbm_params.pop("n_estimators")
    booster = lgb.train(
        lgbm_params,
        lgb.Dataset(train_x, label=train_y),
        n_rounds,
    )
    # Round the raw predictions to the nearest integer to obtain labels.
    predicted_labels = np.rint(booster.predict(test_x))
    accuracy = sklearn.metrics.accuracy_score(test_y, predicted_labels)
    tune.report(mean_accuracy=accuracy, done=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Attach to the ray cluster that is already running instead of starting
    # a new local one.
    ray.init(address="auto")

    # Split FLAML's LightGBM search space into the two pieces tune.run
    # expects: the sampling domain of every hyperparameter, and the low-cost
    # starting value for those hyperparameters that define one.
    flaml_lgbm_search_space = LGBMEstimator.search_space(train_x.shape)
    config_search_space = {
        hp: space["domain"] for hp, space in flaml_lgbm_search_space.items()
    }
    low_cost_partial_config = {
        hp: space["low_cost_init_value"]
        for hp, space in flaml_lgbm_search_space.items()
        if "low_cost_init_value" in space
    }

    analysis = tune.run(
        train_breast_cancer,
        metric="mean_accuracy",
        mode="max",
        config=config_search_space,
        # Previously this dict was built but never used; pass it on so the
        # search can start from the low-cost configuration.
        low_cost_partial_config=low_cost_partial_config,
        num_samples=-1,
        time_budget_s=60,
        use_ray=True,
    )

    # print("Best hyperparameters found were: ", analysis.best_config)
    print("The best trial's result: ", analysis.best_trial.last_result)
|
||||
@ -71,7 +71,11 @@ If all the tests are passed, please also test run [notebook/automl_classificatio
|
||||
|
||||
### Documentation
|
||||
|
||||
To build and test documentation locally, install [Node.js](https://nodejs.org/en/download/).
|
||||
To build and test documentation locally, install [Node.js](https://nodejs.org/en/download/). For example,
|
||||
|
||||
```bash
|
||||
nvm install --lts
|
||||
```
|
||||
|
||||
Then:
|
||||
|
||||
@ -79,7 +83,7 @@ Then:
|
||||
npm install --global yarn
|
||||
pip install pydoc-markdown
|
||||
cd website
|
||||
yarn install
|
||||
yarn install --frozen-lockfile
|
||||
pydoc-markdown
|
||||
yarn start
|
||||
```
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
FLAML can be used together with AzureML and mlflow.
|
||||
FLAML can be used together with AzureML. On top of that, using mlflow and ray is easy too.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
@ -48,4 +48,116 @@ with mlflow.start_run() as run: # create a mlflow run
|
||||
|
||||
The metrics in the run will be automatically logged in an experiment named "flaml" in your AzureML workspace.
|
||||
|
||||
[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb)
|
||||
[Link to notebook](https://github.com/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb) | [Open in colab](https://colab.research.google.com/github/microsoft/FLAML/blob/main/notebook/integrate_azureml.ipynb)
|
||||
|
||||
### Use ray to distribute across a cluster
|
||||
|
||||
When you have a compute cluster in AzureML, you can distribute `flaml.AutoML` or `flaml.tune` with ray.
|
||||
|
||||
#### Build a ray environment in AzureML
|
||||
|
||||
Create a docker file such as [.Docker/Dockerfile-cpu](https://github.com/microsoft/FLAML/blob/main/test/.Docker/Dockerfile-cpu). Make sure `RUN pip install flaml[blendsearch,ray]` is included in the docker file.
|
||||
|
||||
Then build an AzureML environment in the workspace `ws`.
|
||||
|
||||
```python
|
||||
ray_environment_name = "aml-ray-cpu"
|
||||
ray_environment_dockerfile_path = "./Docker/Dockerfile-cpu"
|
||||
|
||||
# Build CPU image for Ray
|
||||
ray_cpu_env = Environment.from_dockerfile(name=ray_environment_name, dockerfile=ray_environment_dockerfile_path)
|
||||
ray_cpu_env.register(workspace=ws)
|
||||
ray_cpu_build_details = ray_cpu_env.build(workspace=ws)
|
||||
|
||||
import time
|
||||
while ray_cpu_build_details.status not in ["Succeeded", "Failed"]:
|
||||
print(f"Awaiting completion of ray CPU environment build. Current status is: {ray_cpu_build_details.status}")
|
||||
time.sleep(10)
|
||||
```
|
||||
|
||||
You only need to do this step once for one workspace.
|
||||
|
||||
#### Create a compute cluster with multiple nodes
|
||||
|
||||
```python
|
||||
from azureml.core.compute import AmlCompute, ComputeTarget
|
||||
|
||||
compute_target_name = "cpucluster"
|
||||
node_count = 2
|
||||
|
||||
# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
|
||||
compute_target_size = "STANDARD_D2_V2"
|
||||
|
||||
if compute_target_name in ws.compute_targets:
|
||||
compute_target = ws.compute_targets[compute_target_name]
|
||||
if compute_target and type(compute_target) is AmlCompute:
|
||||
if compute_target.provisioning_state == "Succeeded":
|
||||
print("Found compute target; using it:", compute_target_name)
|
||||
else:
|
||||
raise Exception(
|
||||
"Found compute target but it is in state", compute_target.provisioning_state)
|
||||
else:
|
||||
print("creating a new compute target...")
|
||||
provisioning_config = AmlCompute.provisioning_configuration(
|
||||
vm_size=compute_target_size,
|
||||
min_nodes=0,
|
||||
max_nodes=node_count)
|
||||
|
||||
# Create the cluster
|
||||
compute_target = ComputeTarget.create(ws, compute_target_name, provisioning_config)
|
||||
|
||||
# Can poll for a minimum number of nodes and for a specific timeout.
|
||||
# If no min node count is provided it will use the scale settings for the cluster
|
||||
compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
|
||||
|
||||
# For a more detailed view of current AmlCompute status, use get_status()
|
||||
print(compute_target.get_status().serialize())
|
||||
```
|
||||
|
||||
If the compute target "cpucluster" already exists, it will not be recreated.
|
||||
|
||||
#### Run distributed AutoML job
|
||||
|
||||
Assume you have an automl script like [ray/distribute_automl.py](https://github.com/microsoft/FLAML/blob/main/test/ray/distribute_automl.py). It uses `ray.init(address="auto")` to initialize the cluster, and uses `n_concurrent_trials=k` to tell `AutoML.fit()` to run k trials concurrently.
|
||||
|
||||
Submit an AzureML job as follows:
|
||||
|
||||
```python
|
||||
from azureml.core import Workspace, Experiment, ScriptRunConfig, Environment
|
||||
|
||||
command = ["python distribute_automl.py"]
|
||||
ray_environment_name = 'aml-ray-cpu'
|
||||
env = Environment.get(workspace=ws, name=ray_environment_name)
|
||||
config = ScriptRunConfig(
|
||||
source_directory='ray/',
|
||||
command=command,
|
||||
compute_target=compute_target,
|
||||
environment=env,
|
||||
)
|
||||
|
||||
config.run_config.node_count = 2
|
||||
config.run_config.environment_variables["_AZUREML_CR_START_RAY"] = "true"
|
||||
config.run_config.environment_variables["AZUREML_COMPUTE_USE_COMMON_RUNTIME"] = "true"
|
||||
|
||||
exp = Experiment(ws, 'distribute-automl')
|
||||
run = exp.submit(config)
|
||||
|
||||
print(run.get_portal_url()) # link to ml.azure.com
|
||||
run.wait_for_completion(show_output=True)
|
||||
```
|
||||
|
||||
The line
|
||||
`
|
||||
config.run_config.environment_variables["_AZUREML_CR_START_RAY"] = "true"
|
||||
`
|
||||
tells AzureML to start ray on each node of the cluster.
|
||||
|
||||
#### Run distributed tune job
|
||||
|
||||
Prepare a script like [ray/distribute_tune.py](https://github.com/microsoft/FLAML/blob/main/test/ray/distribute_tune.py). Replace the command in the above example with:
|
||||
|
||||
```python
|
||||
command = ["python distribute_tune.py"]
|
||||
```
|
||||
|
||||
Everything else is the same.
|
||||
|
||||
@ -467,9 +467,9 @@ The curve suggests that increasing the time budget may further improve the accur
|
||||
1. set t1 as the time budget, and check the message in the console log in the end. If the budget is too small, you will see a warning like
|
||||
> WARNING - Time taken to find the best model is 91% of the provided time budget and not all estimators' hyperparameter search converged. Consider increasing the time budget.
|
||||
2. set t2 as the time budget, and also set `early_stop=True`. If the early stopping is triggered, you will see a warning like
|
||||
> WARNING - All estimator hyperparameters local search has converged at least once, and the total search time exceeds 10 times the time taken to find the best model.
|
||||
> WARNING - All estimator hyperparameters local search has converged at least once, and the total search time exceeds 10 times the time taken to find the best model.
|
||||
|
||||
> WARNING - Stopping search as early_stop is set to True.
|
||||
> WARNING - Stopping search as early_stop is set to True.
|
||||
|
||||
### How much time is needed to find the best model
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user