autogen/test/test_autovw.py
Xueqing Liu 42de3075e9
Make NLP tasks available from AutoML.fit() (#210)
Sequence classification and regression: "seq-classification" and "seq-regression"

Co-authored-by: Chi Wang <wang.chi@microsoft.com>
2021-11-16 11:06:20 -08:00

440 lines
15 KiB
Python

import unittest
import numpy as np
import scipy.sparse
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
import logging
from flaml.tune import loguniform, polynomial_expansion_set
from vowpalwabbit import pyvw
from flaml import AutoVW
import string
import os
import openml
# Directory where converted VW-format datasets are cached on disk.
VW_DS_DIR = "test/data/"
# Candidate namespace names: one single-letter namespace per feature group
# (a-z, then A-Z), so at most 52 namespaces are supported.
NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
logger = logging.getLogger(__name__)
def oml_to_vw_w_grouping(
    X, y, ds_dir, fname, orginal_dim, group_num, grouping_method="sequential"
):
    """Convert an (X, y) dataset to VW text format and save it to ds_dir/fname.

    The `orginal_dim` features are partitioned into at most `group_num`
    groups; each group becomes one VW namespace (named from NS_LIST).

    Args:
        X: feature matrix (only np.ndarray is supported).
        y: label array aligned with the rows of X.
        ds_dir (str): output directory (created if missing).
        fname (str): output file name.
        orginal_dim (int): number of feature columns in X.
        group_num (int): maximum number of namespaces to create.
        grouping_method (str): only "sequential" is implemented.

    Raises:
        NotImplementedError: for unsupported grouping methods or input types
            (pandas DataFrame, scipy sparse matrix).
    """
    # ceiling division: largest number of features per namespace
    max_size_per_group = int(np.ceil(orginal_dim / float(group_num)))
    if grouping_method == "sequential":
        # sequential grouping: consecutive feature indexes share a namespace
        group_indexes = []  # list of lists of feature indexes
        for i in range(group_num):
            indexes = list(
                range(
                    i * max_size_per_group,
                    min((i + 1) * max_size_per_group, orginal_dim),
                )
            )
            if indexes:
                group_indexes.append(indexes)
        print(group_indexes)
    else:
        # BUG FIX: the exception class was previously evaluated but never
        # raised, so unsupported methods fell through to a NameError below.
        raise NotImplementedError
    if group_indexes:
        if not os.path.exists(ds_dir):
            os.makedirs(ds_dir)
        with open(os.path.join(ds_dir, fname), "w") as f:
            if isinstance(X, pd.DataFrame):
                raise NotImplementedError
            elif isinstance(X, np.ndarray):
                for i in range(len(X)):
                    # one "name feature:value ..." chunk per namespace
                    NS_content = [
                        " ".join(
                            "{}:{:.6f}".format(ind, X[i][ind])
                            for ind in group_indexes[zz]
                        )
                        for zz in range(len(group_indexes))
                    ]
                    ns_line = "{} |{}".format(
                        str(y[i]),
                        "|".join(
                            "{} {}".format(NS_LIST[j], NS_content[j])
                            for j in range(len(group_indexes))
                        ),
                    )
                    f.write(ns_line)
                    f.write("\n")
            elif isinstance(X, scipy.sparse.csr_matrix):
                print("NotImplementedError for sparse data")
                # BUG FIX: raise instead of evaluating the class as a no-op
                raise NotImplementedError
def save_vw_dataset_w_ns(X, y, did, ds_dir, max_ns_num, is_regression):
    """Convert an OpenML dataset to VW examples and save them to a file.

    Args:
        X: feature matrix (np.ndarray).
        y: label array.
        did: OpenML dataset id (used in the cache file name).
        ds_dir (str): directory to write the file into.
        max_ns_num (int): maximum number of VW namespaces to generate.
        is_regression (bool): only regression datasets are supported.

    Raises:
        NotImplementedError: when is_regression is False.
    """
    print("is_regression", is_regression)
    if is_regression:
        # file name encodes dataset id, namespace cap, and a version suffix
        fname = "ds_{}_{}_{}.vw".format(did, max_ns_num, 0)
        print("dataset size", X.shape[0], X.shape[1])
        print("saving data", did, ds_dir, fname)
        dim = X.shape[1]
        oml_to_vw_w_grouping(X, y, ds_dir, fname, dim, group_num=max_ns_num)
    else:
        # BUG FIX: the exception was previously referenced but never raised
        raise NotImplementedError
def shuffle_data(X, y, seed):
    """Return (X, y) row-shuffled in unison with a reproducible permutation.

    Args:
        X: feature matrix (np.ndarray or scipy sparse, row-indexable).
        y: label array aligned with the rows of X.
        seed (int): seed for the permutation RNG.

    Returns:
        Tuple of (shuffled X, shuffled y).
    """
    try:
        n = len(X)
    except (TypeError, ValueError):
        # BUG FIX: len() on a scipy sparse matrix raises TypeError (not
        # ValueError), and getnnz() is the number of stored values, not the
        # number of rows -- use the row count so the permutation fits X.
        n = X.shape[0]
    perm = np.random.RandomState(seed=seed).permutation(n)
    X_shuf = X[perm, :]
    y_shuf = y[perm]
    return X_shuf, y_shuf
def get_oml_to_vw(did, max_ns_num, ds_dir=VW_DS_DIR):
    """Download OpenML dataset `did` and save it in VW format under ds_dir.

    Args:
        did (int): OpenML dataset id.
        max_ns_num (int): maximum number of VW namespaces to generate.
        ds_dir (str): output directory for the converted file.

    Returns:
        bool: True when the dataset was fetched and converted successfully.
    """
    success = False
    print("-----getting oml dataset-------", did)
    try:
        ds = openml.datasets.get_dataset(did)
        target_attribute = ds.default_target_attribute
        print("target=ds.default_target_attribute", target_attribute)
        data = ds.get_data(target=target_attribute, dataset_format="array")
        X, y = data[0], data[1]  # X: ndarray or sparse matrix, y: array
        if scipy.sparse.issparse(X):
            # densify so the row-wise VW conversion can index X[i][ind]
            X = scipy.sparse.csr_matrix.toarray(X)
            print("is sparse matrix")
        if data and isinstance(X, np.ndarray):
            print("-----converting oml to vw and and saving oml dataset-------")
            save_vw_dataset_w_ns(X, y, did, ds_dir, max_ns_num, is_regression=True)
            success = True
        else:
            print("---failed to convert/save oml dataset to vw!!!----")
    except ValueError:
        print("-------------failed to get oml dataset!!!", did)
    # BUG FIX: the fetch/convert logic was duplicated, and the first copy ran
    # outside the try block so download errors escaped uncaught; the whole
    # flow now runs once inside the try.
    return success
def load_vw_dataset(did, ds_dir, is_regression, max_ns_num):
    """Load a cached VW-format dataset, generating it from OpenML if absent.

    Args:
        did (int): OpenML dataset id.
        ds_dir (str): directory where the cache file lives.
        is_regression (bool): only regression datasets are supported.
        max_ns_num (int): largest number of namespaces used (part of the
            cache file name).

    Returns:
        list[str]: the VW examples, one example per line.

    Raises:
        NotImplementedError: when is_regression is False.
    """
    if not is_regression:
        # BUG FIX: previously fell through and hit a NameError on `fname`
        raise NotImplementedError
    # the second field in the name is the largest number of namespaces used
    fname = "ds_{}_{}_{}.vw".format(did, max_ns_num, 0)
    vw_dataset_file = os.path.join(ds_dir, fname)
    # regenerate when the cache file is missing or suspiciously small
    if not os.path.exists(vw_dataset_file) or os.stat(vw_dataset_file).st_size < 1000:
        # BUG FIX: pass ds_dir through so the regenerated file is written to
        # the same directory it is read from below
        get_oml_to_vw(did, max_ns_num, ds_dir)
    print(ds_dir, vw_dataset_file)
    if not os.path.exists(ds_dir):
        os.makedirs(ds_dir)
    with open(os.path.join(ds_dir, fname), "r") as f:
        vw_content = f.read().splitlines()
    print(type(vw_content), len(vw_content))
    return vw_content
def get_data(
    iter_num=None,
    dataset_id=None,
    vw_format=True,
    max_ns_num=10,
    shuffle=False,
    use_log=True,
    dataset_type="regression",
):
    """Load a VW-format regression dataset and its labels.

    Args:
        iter_num (int): unused here; kept for caller compatibility.
        dataset_id: OpenML dataset id (convertible to int).
        vw_format (bool): load examples in VW text format.
        max_ns_num (int): maximum number of VW namespaces.
        shuffle (bool): shuffle the examples with a fixed seed.
        use_log (bool): log-transform labels when their range is large.
        dataset_type (str): unused here; kept for caller compatibility.

    Returns:
        Tuple (vw_examples, Y): the examples and their float labels,
        aligned index-by-index.
    """
    logging.info("generating data")
    LOG_TRANSFORMATION_THRESHOLD = 100
    import random

    vw_examples = None
    data_id = int(dataset_id)
    Y = []
    if vw_format:
        vw_examples = load_vw_dataset(
            did=data_id, ds_dir=VW_DS_DIR, is_regression=True, max_ns_num=max_ns_num
        )
        # the label is the text before the first "|" of each example
        Y = [float(e.split("|")[0]) for e in vw_examples]
    logger.debug("first data %s", vw_examples[0])
    if shuffle:
        random.seed(54321)
        random.shuffle(vw_examples)
        # BUG FIX: re-derive the labels so Y stays aligned with the
        # shuffled examples
        Y = [float(e.split("|")[0]) for e in vw_examples]
    # log-transform labels when their range or magnitude is large
    unique_y = set(Y)
    min_y = min(unique_y)
    max_y = max(unique_y)
    if use_log and max((max_y - min_y), max_y) >= LOG_TRANSFORMATION_THRESHOLD:
        log_vw_examples = []
        for v in vw_examples:
            org_y = v.split("|")[0]
            y = float(org_y)
            # shift y to ensure all y are positive before taking the log
            if min_y <= 0:
                y = y + abs(min_y) + 1
            log_y = np.log(y)
            log_vw = v.replace(org_y + "|", str(log_y) + " |")
            log_vw_examples.append(log_vw)
        logger.info("log_vw_examples %s", log_vw_examples[0:2])
        if log_vw_examples:
            # BUG FIX: previously returned only the example list, breaking
            # the `examples, Y = get_data(...)` unpacking in every caller;
            # return the transformed labels alongside the examples.
            return log_vw_examples, [
                float(e.split("|")[0]) for e in log_vw_examples
            ]
    return vw_examples, Y
class VowpalWabbitNamesspaceTuningProblem:
    """An online VW tuning problem over namespace interactions.

    Loads a dataset via get_data and builds a search space whose single
    hyperparameter is the set of namespace interactions.
    """

    def __init__(self, max_iter_num, dataset_id, ns_num, **kwargs):
        """Set up the dataset and the search space.

        Args:
            max_iter_num (int): cap on the number of online iterations.
            dataset_id: OpenML dataset id.
            ns_num (int): maximum number of namespaces.
            **kwargs: use_log, shuffle, vw_format, fixed_hp_config.
        """
        # BUG FIX: a stray trailing comma previously made use_log a 1-tuple,
        # so the flag was always truthy regardless of the kwarg value
        use_log = kwargs.get("use_log", True)
        shuffle = kwargs.get("shuffle", False)
        vw_format = kwargs.get("vw_format", True)
        print("dataset_id", dataset_id)
        self.vw_examples, self.Y = get_data(
            max_iter_num,
            dataset_id=dataset_id,
            vw_format=vw_format,
            max_ns_num=ns_num,
            shuffle=shuffle,
            use_log=use_log,
        )
        # never iterate past the end of the dataset
        self.max_iter_num = min(max_iter_num, len(self.Y))
        self._problem_info = {
            "max_iter_num": self.max_iter_num,
            "dataset_id": dataset_id,
            "ns_num": ns_num,
        }
        self._problem_info.update(kwargs)
        self._fixed_hp_config = kwargs.get("fixed_hp_config", {})
        # namespace names and dimensions inferred from the first example
        self.namespace_feature_dim = AutoVW.get_ns_feature_dim_from_vw_example(
            self.vw_examples[0]
        )
        self._raw_namespaces = list(self.namespace_feature_dim.keys())
        self._setup_search()

    def _setup_search(self):
        """Build the search space and initial config (interactions only)."""
        self._search_space = self._fixed_hp_config.copy()
        self._init_config = self._fixed_hp_config.copy()
        search_space = {
            "interactions": polynomial_expansion_set(
                init_monomials=set(self._raw_namespaces),
                highest_poly_order=len(self._raw_namespaces),
                allow_self_inter=False,
            ),
        }
        # start from no interactions
        init_config = {"interactions": set()}
        self._search_space.update(search_space)
        self._init_config.update(init_config)
        logger.info(
            "search space %s %s %s",
            self._search_space,
            self._init_config,
            self._fixed_hp_config,
        )

    @property
    def init_config(self):
        # initial hyperparameter configuration for the tuner
        return self._init_config

    @property
    def search_space(self):
        # full hyperparameter search space for the tuner
        return self._search_space
class VowpalWabbitNamesspaceLRTuningProblem(VowpalWabbitNamesspaceTuningProblem):
    """A VW tuning problem over namespace interactions AND learning rate."""

    def __init__(self, max_iter_num, dataset_id, ns_num, **kwargs):
        # The parent __init__ already dispatches to this class's overridden
        # _setup_search(); the redundant second call was removed.
        super().__init__(max_iter_num, dataset_id, ns_num, **kwargs)

    def _setup_search(self):
        """Build the search space: interactions plus a log-uniform
        learning-rate range."""
        self._search_space = self._fixed_hp_config.copy()
        self._init_config = self._fixed_hp_config.copy()
        search_space = {
            "interactions": polynomial_expansion_set(
                init_monomials=set(self._raw_namespaces),
                highest_poly_order=len(self._raw_namespaces),
                allow_self_inter=False,
            ),
            "learning_rate": loguniform(lower=2e-10, upper=1.0),
        }
        # start from no interactions and VW's default learning rate
        init_config = {"interactions": set(), "learning_rate": 0.5}
        self._search_space.update(search_space)
        self._init_config.update(init_config)
        logger.info(
            "search space %s %s %s",
            self._search_space,
            self._init_config,
            self._fixed_hp_config,
        )
def get_y_from_vw_example(vw_example):
    """Return the label of a VW example, i.e. the text before the first '|'.

    Works for regression datasets, where the label is a plain float.
    """
    label_text, _, _ = vw_example.partition("|")
    return float(label_text)
def get_loss(y_pred, y_true, loss_func="squared"):
    """Return the one-step loss between a prediction and the true label.

    Args:
        y_pred (float): predicted value.
        y_true (float): true value.
        loss_func (str): loss name containing "squared" or "absolute".

    Returns:
        float: the loss for this single sample.

    Raises:
        NotImplementedError: for unsupported loss names.
    """
    if "squared" in loss_func:
        # identical to mean_squared_error([y_pred], [y_true]) for one
        # sample, without the per-call sklearn overhead in the online loop
        return (y_pred - y_true) ** 2
    if "absolute" in loss_func:
        # identical to mean_absolute_error([y_pred], [y_true]) for one sample
        return abs(y_pred - y_true)
    # the dead `loss = None` assignment before the raise was removed
    raise NotImplementedError
def online_learning_loop(iter_num, vw_examples, vw_alg, loss_func, method_name=""):
    """Run the online predict-then-learn loop and collect per-step losses.

    Args:
        iter_num (int): total number of iterations to run.
        vw_examples (list): VW examples, one string per example.
        vw_alg: algorithm instance exposing predict(example) and
            learn(example).
        loss_func (str): loss function name ("squared" or "absolute").
        method_name (str): unused label kept for caller compatibility.

    Returns:
        list: the loss incurred at each iteration, useful for plotting
        cumulative performance.
    """
    print("rerunning exp....", len(vw_examples), iter_num)
    losses = []
    predictions = []
    for step in range(iter_num):
        example = vw_examples[step]
        truth = get_y_from_vw_example(example)
        # predict on the example before learning from it
        guess = vw_alg.predict(example)
        vw_alg.learn(example)
        # record this step's loss and the (prediction, truth) pair
        losses.append(get_loss(guess, truth, loss_func))
        predictions.append([guess, truth])
    return losses
def get_vw_tuning_problem(tuning_hp="NamesapceInteraction"):
    """Build the experiment arguments and the VW tuning problem instance.

    Args:
        tuning_hp (str): which hyperparameters to tune --
            "NamesapceInteraction" or "NamesapceInteraction+LearningRate".

    Returns:
        Tuple (vw_oml_problem_args, vw_online_aml_problem).

    Raises:
        NotImplementedError: for unknown tuning_hp values.
    """
    online_vw_exp_setting = {
        "max_live_model_num": 5,
        "fixed_hp_config": {"alg": "supervised", "loss_function": "squared"},
        "ns_num": 10,
        "max_iter_num": 10000,
    }
    # construct openml problem setting based on basic experiment setting
    vw_oml_problem_args = {
        "max_iter_num": online_vw_exp_setting["max_iter_num"],
        "dataset_id": "42183",
        "ns_num": online_vw_exp_setting["ns_num"],
        "fixed_hp_config": online_vw_exp_setting["fixed_hp_config"],
    }
    if tuning_hp == "NamesapceInteraction":
        vw_online_aml_problem = VowpalWabbitNamesspaceTuningProblem(
            **vw_oml_problem_args
        )
    elif tuning_hp == "NamesapceInteraction+LearningRate":
        vw_online_aml_problem = VowpalWabbitNamesspaceLRTuningProblem(
            **vw_oml_problem_args
        )
    else:
        # BUG FIX: the exception class was previously evaluated but never
        # raised, so unknown values fell through to an UnboundLocalError
        raise NotImplementedError
    return vw_oml_problem_args, vw_online_aml_problem
class TestAutoVW(unittest.TestCase):
    """End-to-end tests for online VW tuning on an OpenML dataset."""

    @staticmethod
    def _run_and_report(problem_args, problem, alg):
        """Run the online loop with `alg` and print the final average loss."""
        losses = online_learning_loop(
            problem.max_iter_num,
            problem.vw_examples,
            alg,
            loss_func=problem_args["fixed_hp_config"].get(
                "loss_function", "squared"
            ),
        )
        print(
            "final average loss:", sum(losses) / len(losses)
        )

    def test_vw_oml_problem_and_vanilla_vw(self):
        # baseline: a plain VW learner with the fixed hyperparameters
        args, problem = get_vw_tuning_problem()
        baseline = pyvw.vw(**args["fixed_hp_config"])
        self._run_and_report(args, problem, baseline)

    def test_supervised_vw_tune_namespace(self):
        # tune namespace interactions only
        args, problem = get_vw_tuning_problem()
        tuner = AutoVW(
            max_live_model_num=5,
            search_space=problem.search_space,
            init_config=problem.init_config,
            min_resource_lease="auto",
            random_seed=2345,
        )
        self._run_and_report(args, problem, tuner)

    def test_supervised_vw_tune_namespace_learningrate(self):
        # tune namespace interactions together with the learning rate
        args, problem = get_vw_tuning_problem(
            tuning_hp="NamesapceInteraction+LearningRate"
        )
        tuner = AutoVW(
            max_live_model_num=5,
            search_space=problem.search_space,
            init_config=problem.init_config,
            min_resource_lease="auto",
            random_seed=2345,
        )
        self._run_and_report(args, problem, tuner)

    def test_bandit_vw_tune_namespace(self):
        # placeholder: bandit-feedback tuning is not exercised yet
        pass

    def test_bandit_vw_tune_namespace_learningrate(self):
        # placeholder: bandit-feedback tuning is not exercised yet
        pass
# Allow running this test module directly with `python test_autovw.py`.
if __name__ == "__main__":
    unittest.main()