import os

import numpy as np
import pytest
import scipy.sparse

from flaml import AutoML
from flaml.tune.spark.utils import check_spark

# For Spark, we need to put the customized learner in a separate file,
# so that it can be pickled and imported by module path on the workers.
if os.path.exists(os.path.join(os.getcwd(), "test", "spark", "mylearner.py")):
    try:
        from test.spark.mylearner import MyLargeLGBM

        skip_my_learner = False
    except ImportError:
        skip_my_learner = True
        MyLargeLGBM = None
else:
    MyLargeLGBM = None
    skip_my_learner = True

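# For reference, a custom learner for FLAML subclasses one of its estimators and
# overrides the `search_space` classmethod. A minimal sketch of what
# test/spark/mylearner.py might contain (hypothetical; the real MyLargeLGBM
# lives in that file):
#
#     from flaml import tune
#     from flaml.automl.model import LGBMEstimator
#
#     class MyLargeLGBM(LGBMEstimator):
#         @classmethod
#         def search_space(cls, **params):
#             # A deliberately large n_estimators range, e.g. to stress memory.
#             return {
#                 "n_estimators": {
#                     "domain": tune.lograndint(lower=4, upper=32768),
#                     "init_value": 32768,
#                     "low_cost_init_value": 4,
#                 },
#             }
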
# Cap the number of concurrent trials FLAML will run on Spark.
os.environ["FLAML_MAX_CONCURRENT"] = "2"

spark_available, _ = check_spark()
skip_spark = not spark_available

# Module-level mark: pytest skips every test in this file when Spark is unavailable.
pytestmark = pytest.mark.skipif(skip_spark, reason="Spark is not installed. Skip all spark tests.")


def test_parallel_xgboost(hpo_method=None, data_size=1000):
    """Tune XGBoost on sparse data with two concurrent trials running on Spark."""
    automl_experiment = AutoML()
    automl_settings = {
        "time_budget": 10,
        "metric": "ap",
        "task": "classification",
        "log_file_name": "test/sparse_classification.log",
        "estimator_list": ["xgboost"],
        "log_type": "all",
        "n_jobs": 1,
        "n_concurrent_trials": 2,
        "hpo_method": hpo_method,
        "use_spark": True,
    }
    X_train = scipy.sparse.eye(data_size)
    y_train = np.random.randint(2, size=data_size)

    automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
    print(automl_experiment.predict(X_train))
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.best_model_for_estimator("xgboost"))
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)


def test_parallel_xgboost_others():
    # Use random search as the hpo_method.
    test_parallel_xgboost(hpo_method="random")


@pytest.mark.skip(reason="currently not supporting too large data, will support spark dataframe in the future")
def test_large_dataset():
    test_parallel_xgboost(data_size=90000000)


@pytest.mark.skipif(
    skip_my_learner,
    reason="please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file",
)
def test_custom_learner(data_size=1000):
    """Tune a user-defined learner (MyLargeLGBM) with Spark-parallel trials."""
    automl_experiment = AutoML()
    automl_experiment.add_learner(learner_name="large_lgbm", learner_class=MyLargeLGBM)
    automl_settings = {
        "time_budget": 2,
        "task": "classification",
        "log_file_name": "test/sparse_classification_oom.log",
        "estimator_list": ["large_lgbm"],
        "log_type": "all",
        "n_jobs": 1,
        "hpo_method": "random",
        "n_concurrent_trials": 2,
        "use_spark": True,
    }
    X_train = scipy.sparse.eye(data_size)
    y_train = np.random.randint(2, size=data_size)

    automl_experiment.fit(X_train=X_train, y_train=y_train, **automl_settings)
    print(automl_experiment.predict(X_train))
    print(automl_experiment.model)
    print(automl_experiment.config_history)
    print(automl_experiment.best_model_for_estimator("large_lgbm"))
    print(automl_experiment.best_iteration)
    print(automl_experiment.best_estimator)


if __name__ == "__main__":
    test_parallel_xgboost()
    test_parallel_xgboost_others()
    # test_large_dataset()
    if skip_my_learner:
        print("please run pytest in the root directory of FLAML, i.e., the directory that contains the setup.py file")
    else:
        test_custom_learner()
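

# Assumed layout: this file lives under test/spark/ in the FLAML repository.
# Run pytest from the repo root so that `test.spark.mylearner` is importable, e.g.:
#     pytest test/spark -v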