"""Tests for AutoVW on an OpenML regression dataset converted to VW format.

The helpers in this module download an OpenML dataset, write it to disk as
Vowpal Wabbit (VW) text examples whose features are split into namespaces,
and run a predict-then-learn online loop over the examples.
"""
import logging
import os
import random
import string
import time  # noqa: F401  (kept: other parts of the project may rely on it)
import unittest

import numpy as np
import openml
import pandas as pd
import scipy.sparse
from sklearn.metrics import mean_squared_error, mean_absolute_error
from vowpalwabbit import pyvw

from flaml import AutoVW
from flaml.tune import loguniform, polynomial_expansion_set

VW_DS_DIR = 'test/data/'
# One single-letter namespace name per feature group (52 names at most).
NS_LIST = list(string.ascii_lowercase) + list(string.ascii_uppercase)
logger = logging.getLogger(__name__)


def _vw_dataset_fname(did, max_ns_num):
    """Return the file name used to cache dataset ``did`` in VW format."""
    return 'ds_{}_{}_{}.vw'.format(did, max_ns_num, 0)


def oml_to_vw_w_grouping(X, y, ds_dir, fname, orginal_dim, group_num,
                         grouping_method='sequential'):
    """Write ``X``/``y`` to ``ds_dir/fname`` as VW examples with namespaces.

    The ``orginal_dim`` feature indexes are split sequentially into at most
    ``group_num`` groups of equal maximum size; each non-empty group becomes
    one namespace named after the corresponding entry of ``NS_LIST``.
    Each output line has the form ``<label> |a 0:<v> 1:<v>|b 2:<v> ...``.

    Args:
        X (np.ndarray): 2-D feature array, indexed as ``X[row][col]``.
        y: Labels, indexed as ``y[row]``.
        ds_dir (str): Output directory (created when missing).
        fname (str): Output file name.
        orginal_dim (int): Number of feature columns to export.
        group_num (int): Maximum number of namespaces.
        grouping_method (str): Only ``'sequential'`` is supported.

    Raises:
        NotImplementedError: For an unsupported grouping method or when
            ``X`` is not a dense ``np.ndarray`` (DataFrame / sparse input).
        ValueError: If ``group_num`` is not positive or more namespaces are
            required than ``NS_LIST`` provides.
    """
    if grouping_method != 'sequential':
        raise NotImplementedError(
            'unsupported grouping method: {}'.format(grouping_method))
    if group_num < 1:
        raise ValueError('group_num must be a positive integer')

    max_size_per_group = int(np.ceil(orginal_dim / float(group_num)))
    group_indexes = []  # list of lists of feature indexes
    for i in range(group_num):
        indexes = list(range(i * max_size_per_group,
                             min((i + 1) * max_size_per_group, orginal_dim)))
        if indexes:
            group_indexes.append(indexes)
    logger.debug('namespace groups: %s', group_indexes)

    if not group_indexes:
        # Nothing to export (e.g. zero feature columns).
        return
    # Validate before opening the file so a failure leaves no empty file.
    if not isinstance(X, np.ndarray):
        if isinstance(X, pd.DataFrame) or scipy.sparse.issparse(X):
            raise NotImplementedError(
                'only dense numpy arrays can be converted to VW format')
        raise NotImplementedError(
            'unsupported data type: {}'.format(type(X).__name__))
    if len(group_indexes) > len(NS_LIST):
        raise ValueError('at most {} namespaces are supported, got {}'.format(
            len(NS_LIST), len(group_indexes)))

    os.makedirs(ds_dir, exist_ok=True)
    with open(os.path.join(ds_dir, fname), 'w') as f:
        for i, row in enumerate(X):
            namespaces = '|'.join(
                '{} {}'.format(
                    ns_name,
                    ' '.join('{}:{:.6f}'.format(ind, row[ind])
                             for ind in indexes))
                for ns_name, indexes in zip(NS_LIST, group_indexes))
            f.write('{} |{}\n'.format(str(y[i]), namespaces))


def save_vw_dataset_w_ns(X, y, did, ds_dir, max_ns_num, is_regression):
    """Convert an OpenML dataset to VW examples and save them to a file.

    Args:
        X: Feature matrix with a ``shape`` attribute (rows, columns).
        y: Labels aligned with the rows of ``X``.
        did: OpenML dataset id, used in the output file name.
        ds_dir (str): Output directory.
        max_ns_num (int): Maximum number of namespaces to create.
        is_regression (bool): Must be true; other tasks are not supported.

    Raises:
        NotImplementedError: If ``is_regression`` is false.
    """
    print('is_regression', is_regression)
    if not is_regression:
        raise NotImplementedError('only regression datasets are supported')
    fname = _vw_dataset_fname(did, max_ns_num)
    print('dataset size', X.shape[0], X.shape[1])
    print('saving data', did, ds_dir, fname)
    oml_to_vw_w_grouping(X, y, ds_dir, fname, X.shape[1],
                         group_num=max_ns_num)


def shuffle_data(X, y, seed):
    """Return ``X`` and ``y`` with their rows permuted identically.

    Args:
        X: 2-D array or scipy sparse matrix supporting ``X[perm, :]``.
        y: 1-D array supporting fancy indexing, aligned with ``X``.
        seed (int): Seed of the permutation.

    Returns:
        tuple: ``(X_shuf, y_shuf)``.
    """
    # Use the row count from ``shape``: ``len()`` raises TypeError on scipy
    # sparse matrices, and ``getnnz()`` counts non-zeros rather than rows.
    n = X.shape[0]
    perm = np.random.RandomState(seed=seed).permutation(n)
    return X[perm, :], y[perm]


def get_oml_to_vw(did, max_ns_num, ds_dir=VW_DS_DIR):
    """Download OpenML dataset ``did`` and cache it in VW format.

    Args:
        did: OpenML dataset id.
        max_ns_num (int): Maximum number of namespaces to create.
        ds_dir (str): Directory in which the VW file is written.

    Returns:
        bool: True if the dataset was converted and saved, False otherwise.
    """
    print('-----getting oml dataset-------', did)
    ds = openml.datasets.get_dataset(did)
    target_attribute = ds.default_target_attribute
    print('target=ds.default_target_attribute', target_attribute)
    data = ds.get_data(target=target_attribute, dataset_format='array')
    X, y = data[0], data[1]
    if scipy.sparse.issparse(X):
        # The VW writer only handles dense arrays.
        X = X.toarray()
        print('is sparse matrix')
    if not isinstance(X, np.ndarray):
        print('---failed to convert/save oml dataset to vw!!!----')
        return False
    print('-----converting oml to vw and and saving oml dataset-------')
    save_vw_dataset_w_ns(X, y, did, ds_dir, max_ns_num, is_regression=True)
    return True


def load_vw_dataset(did, ds_dir, is_regression, max_ns_num):
    """Load the cached VW examples of dataset ``did``, creating them if needed.

    The cache file is regenerated from OpenML when it is missing or smaller
    than 1000 bytes.

    Args:
        did: OpenML dataset id.
        ds_dir (str): Directory holding the cached VW files.
        is_regression (bool): Must be true; other tasks are not supported.
        max_ns_num (int): Largest number of namespaces used in the file.

    Returns:
        list: One VW example string per line of the cache file.

    Raises:
        NotImplementedError: If ``is_regression`` is false.
    """
    if not is_regression:
        raise NotImplementedError('only regression datasets are supported')
    vw_dataset_file = os.path.join(ds_dir, _vw_dataset_fname(did, max_ns_num))
    # If the file does not exist (or looks truncated), generate and save it
    # into the same directory it is read from.
    if (not os.path.exists(vw_dataset_file)
            or os.stat(vw_dataset_file).st_size < 1000):
        get_oml_to_vw(did, max_ns_num, ds_dir=ds_dir)
    print(ds_dir, vw_dataset_file)
    with open(vw_dataset_file, 'r') as f:
        vw_content = f.read().splitlines()
    print(type(vw_content), len(vw_content))
    return vw_content


def get_data(iter_num=None, dataset_id=None, vw_format=True, max_ns_num=10,
             shuffle=False, use_log=True, dataset_type='regression'):
    """Load a dataset as VW examples, optionally shuffled / log-transformed.

    Args:
        iter_num: Unused; kept for interface compatibility.
        dataset_id: OpenML dataset id (anything ``int()`` accepts).
        vw_format (bool): Must be true; other formats are not supported.
        max_ns_num (int): Maximum number of namespaces per example.
        shuffle (bool): Shuffle the examples with a fixed seed.
        use_log (bool): Replace each label by its natural log when the label
            range (or maximum) reaches the transformation threshold. Labels
            are first shifted to be >= 1 if the minimum label is <= 0.
        dataset_type: Unused; kept for interface compatibility.

    Returns:
        tuple: ``(vw_examples, Y)`` where ``Y[i]`` is the label of
        ``vw_examples[i]`` (log-transformed when the transformation applied).

    Raises:
        NotImplementedError: If ``vw_format`` is false.
        ValueError: If the dataset contains no examples.
    """
    logging.info('generating data')
    LOG_TRANSFORMATION_THRESHOLD = 100
    if not vw_format:
        raise NotImplementedError('only vw_format=True is supported')

    vw_examples = load_vw_dataset(did=int(dataset_id), ds_dir=VW_DS_DIR,
                                  is_regression=True, max_ns_num=max_ns_num)
    if not vw_examples:
        raise ValueError('dataset {} has no examples'.format(dataset_id))
    logger.debug('first data %s', vw_examples[0])

    if shuffle:
        # A private generator yields the same order as seeding the global
        # one, without disturbing other users of the ``random`` module.
        random.Random(54321).shuffle(vw_examples)

    # Extract labels after shuffling so Y stays aligned with the examples.
    Y = [get_y_from_vw_example(example) for example in vw_examples]
    min_y, max_y = min(Y), max(Y)

    if use_log and max(max_y - min_y, max_y) >= LOG_TRANSFORMATION_THRESHOLD:
        # Shift y to ensure all y are positive before taking the log.
        shift = abs(min_y) + 1 if min_y <= 0 else 0
        log_vw_examples, log_Y = [], []
        for example, y in zip(vw_examples, Y):
            log_y = float(np.log(y + shift))
            _, sep, features = example.partition('|')
            log_vw_examples.append(
                str(log_y) + ' |' + features if sep else example)
            log_Y.append(log_y)
        logger.info('log_vw_examples %s', log_vw_examples[0:2])
        return log_vw_examples, log_Y
    return vw_examples, Y


class VowpalWabbitNamesspaceTuningProblem:
    """Tuning problem over namespace interactions of a VW dataset.

    Loads the dataset given by ``dataset_id`` and exposes ``search_space``
    and ``init_config`` dictionaries built from ``fixed_hp_config`` plus an
    ``'interactions'`` hyperparameter.
    """

    def __init__(self, max_iter_num, dataset_id, ns_num, **kwargs):
        """Load the data and set up the search space.

        Args:
            max_iter_num (int): Upper bound on the number of examples used.
            dataset_id: OpenML dataset id.
            ns_num (int): Maximum number of namespaces per example.
            **kwargs: Optional ``use_log``, ``shuffle``, ``vw_format`` (passed
                to ``get_data``) and ``fixed_hp_config`` (dict of fixed
                hyperparameters); all are also recorded in the problem info.
        """
        # NOTE: no trailing comma here, otherwise ``use_log`` becomes a
        # one-element tuple that is always truthy.
        use_log = kwargs.get('use_log', True)
        shuffle = kwargs.get('shuffle', False)
        vw_format = kwargs.get('vw_format', True)
        print('dataset_id', dataset_id)
        self.vw_examples, self.Y = get_data(max_iter_num,
                                            dataset_id=dataset_id,
                                            vw_format=vw_format,
                                            max_ns_num=ns_num,
                                            shuffle=shuffle,
                                            use_log=use_log)
        self.max_iter_num = min(max_iter_num, len(self.Y))
        self._problem_info = {'max_iter_num': self.max_iter_num,
                              'dataset_id': dataset_id,
                              'ns_num': ns_num,
                              }
        self._problem_info.update(kwargs)
        self._fixed_hp_config = kwargs.get('fixed_hp_config', {})
        self.namespace_feature_dim = \
            AutoVW.get_ns_feature_dim_from_vw_example(self.vw_examples[0])
        self._raw_namespaces = list(self.namespace_feature_dim.keys())
        self._setup_search()

    def _tunable_search_space(self):
        """Return the search-space entries of the tuned hyperparameters."""
        return {'interactions': polynomial_expansion_set(
            init_monomials=set(self._raw_namespaces),
            highest_poly_order=len(self._raw_namespaces),
            allow_self_inter=False),
        }

    def _tunable_init_config(self):
        """Return the initial values of the tuned hyperparameters."""
        return {'interactions': set()}

    def _setup_search(self):
        """Build the search space and initial config on top of the fixed one."""
        self._search_space = self._fixed_hp_config.copy()
        self._init_config = self._fixed_hp_config.copy()
        self._search_space.update(self._tunable_search_space())
        self._init_config.update(self._tunable_init_config())
        logger.info('search space %s %s %s', self._search_space,
                    self._init_config, self._fixed_hp_config)

    @property
    def init_config(self):
        return self._init_config

    @property
    def search_space(self):
        return self._search_space


class VowpalWabbitNamesspaceLRTuningProblem(VowpalWabbitNamesspaceTuningProblem):
    """Tuning problem over namespace interactions and the learning rate.

    The base-class constructor already calls ``_setup_search``, which picks
    up the overrides below, so no separate ``__init__`` is needed.
    """

    def _tunable_search_space(self):
        """Add a log-uniform ``learning_rate`` to the base search space."""
        search_space = super()._tunable_search_space()
        search_space['learning_rate'] = loguniform(lower=2e-10, upper=1.0)
        return search_space

    def _tunable_init_config(self):
        """Add the initial ``learning_rate`` to the base initial config."""
        init_config = super()._tunable_init_config()
        init_config['learning_rate'] = 0.5
        return init_config


def get_y_from_vw_example(vw_example):
    """Return the label of a VW example: the number before the first ``|``.

    This works for regression datasets.
    """
    return float(vw_example.split('|')[0])


def get_loss(y_pred, y_true, loss_func='squared'):
    """Return the loss of a single prediction.

    Args:
        y_pred (float): Predicted value.
        y_true (float): Ground-truth value.
        loss_func (str): Name containing ``'squared'`` or ``'absolute'``.

    Raises:
        NotImplementedError: For any other loss name.
    """
    if 'squared' in loss_func:
        return mean_squared_error([y_true], [y_pred])
    if 'absolute' in loss_func:
        return mean_absolute_error([y_true], [y_pred])
    raise NotImplementedError(
        'unsupported loss function: {}'.format(loss_func))


def online_learning_loop(iter_num, vw_examples, vw_alg, loss_func,
                         method_name=''):
    """Implements the online learning loop.

    For each example the learner first predicts, then learns from it.

    Args:
        iter_num (int): The maximum number of iterations; the loop stops
            early if there are fewer examples.
        vw_examples (list): A list of vw examples.
        vw_alg (alg instance): An algorithm instance that has the following
            functions:
            - vw_alg.learn(example)
            - vw_alg.predict(example)
        loss_func (str): Loss function name, see ``get_loss``.
        method_name (str): Unused; kept for interface compatibility.

    Returns:
        list: The loss of each iteration (per-step, not accumulated).
    """
    print('rerunning exp....', len(vw_examples), iter_num)
    loss_list = []
    for vw_x in vw_examples[:iter_num]:
        y_true = get_y_from_vw_example(vw_x)
        # predict step (before learning, so the loss is progressive)
        y_pred = vw_alg.predict(vw_x)
        # learn step
        vw_alg.learn(vw_x)
        # calculate one step loss
        loss_list.append(get_loss(y_pred, y_true, loss_func))
    return loss_list


def get_vw_tuning_problem(tuning_hp='NamesapceInteraction'):
    """Build the tuning problem used by the tests.

    Args:
        tuning_hp (str): ``'NamesapceInteraction'`` or
            ``'NamesapceInteraction+LearningRate'``.

    Returns:
        tuple: ``(problem_args, problem)`` - the keyword arguments used to
        construct the problem and the problem instance itself.

    Raises:
        NotImplementedError: For any other ``tuning_hp`` value.
    """
    problem_classes = {
        'NamesapceInteraction': VowpalWabbitNamesspaceTuningProblem,
        'NamesapceInteraction+LearningRate':
            VowpalWabbitNamesspaceLRTuningProblem,
    }
    if tuning_hp not in problem_classes:
        raise NotImplementedError(
            'unsupported tuning_hp: {}'.format(tuning_hp))

    online_vw_exp_setting = {"max_live_model_num": 5,
                             "fixed_hp_config": {'alg': 'supervised',
                                                 'loss_function': 'squared'},
                             "ns_num": 10,
                             "max_iter_num": 10000,
                             }
    # construct openml problem setting based on basic experiment setting
    vw_oml_problem_args = {
        "max_iter_num": online_vw_exp_setting['max_iter_num'],
        "dataset_id": '42183',
        "ns_num": online_vw_exp_setting['ns_num'],
        "fixed_hp_config": online_vw_exp_setting['fixed_hp_config'],
    }
    vw_online_aml_problem = problem_classes[tuning_hp](**vw_oml_problem_args)
    return vw_oml_problem_args, vw_online_aml_problem


class TestAutoVW(unittest.TestCase):
    """Runs the online learning loop with vanilla VW and with AutoVW."""

    @staticmethod
    def _make_autovw(problem):
        """Create the AutoVW learner shared by the tuning tests."""
        return AutoVW(max_live_model_num=5,
                      search_space=problem.search_space,
                      init_config=problem.init_config,
                      min_resource_lease='auto',
                      random_seed=2345)

    def _run_and_report(self, problem_args, problem, learner):
        """Run the online loop with ``learner`` and print the average loss."""
        loss_func = problem_args["fixed_hp_config"].get("loss_function",
                                                        "squared")
        loss_list = online_learning_loop(problem.max_iter_num,
                                         problem.vw_examples,
                                         learner,
                                         loss_func=loss_func)
        self.assertGreater(len(loss_list), 0)
        print('final average loss:', sum(loss_list) / len(loss_list))

    def test_vw_oml_problem_and_vanilla_vw(self):
        vw_oml_problem_args, vw_online_aml_problem = get_vw_tuning_problem()
        vanilla_vw = pyvw.vw(**vw_oml_problem_args["fixed_hp_config"])
        self._run_and_report(vw_oml_problem_args, vw_online_aml_problem,
                             vanilla_vw)

    def test_supervised_vw_tune_namespace(self):
        # basic experiment setting
        vw_oml_problem_args, vw_online_aml_problem = get_vw_tuning_problem()
        autovw = self._make_autovw(vw_online_aml_problem)
        self._run_and_report(vw_oml_problem_args, vw_online_aml_problem,
                             autovw)

    def test_supervised_vw_tune_namespace_learningrate(self):
        # basic experiment setting
        vw_oml_problem_args, vw_online_aml_problem = get_vw_tuning_problem(
            tuning_hp='NamesapceInteraction+LearningRate')
        autovw = self._make_autovw(vw_online_aml_problem)
        self._run_and_report(vw_oml_problem_args, vw_online_aml_problem,
                             autovw)

    def test_bandit_vw_tune_namespace(self):
        pass

    def test_bandit_vw_tune_namespace_learningrate(self):
        pass


if __name__ == "__main__":
    unittest.main()