import pandas as pd import numpy as np import argparse from pathlib import Path import json from sklearn.preprocessing import RobustScaler from flaml.default import greedy from flaml.default.regret import load_result, build_regret regret_bound = 0.01 def config_predictor_tuple(tasks, configs, meta_features, regret_matrix): """Config predictor represented in tuple. The returned tuple consists of (meta_features, preferences, proc). Returns: meta_features_norm: A dataframe of normalized meta features, each column for a task. preferences: A dataframe of sorted configuration indicies by their performance per task (column). regret_matrix: A dataframe of the configuration(row)-task(column) regret matrix. """ # pre-processing scaler = RobustScaler() meta_features_norm = meta_features.loc[tasks] # this makes a copy meta_features_norm.loc[:, :] = scaler.fit_transform(meta_features_norm) proc = { "center": scaler.center_.tolist(), "scale": scaler.scale_.tolist(), } # best model for each dataset in training # choices = regret_matrix[tasks].loc[configs].reset_index(drop=True).idxmin() # break ties using the order in configs regret = ( regret_matrix[tasks] .loc[configs] .reset_index(drop=True) .apply(lambda row: row.apply(lambda x: (x, row.name)), axis=1) ) print(regret) preferences = np.argsort(regret, axis=0) print(preferences) return (meta_features_norm, preferences, proc) def build_portfolio(meta_features, regret, strategy): """Build a portfolio from meta features and regret matrix. Args: meta_features: A dataframe of metafeatures matrix. regret: A dataframe of regret matrix. strategy: A str of the strategy, one of ("greedy", "greedy-feedback"). """ assert strategy in ("greedy", "greedy-feedback") if strategy == "greedy": portfolio = greedy.construct_portfolio(regret, None, regret_bound) elif strategy == "greedy-feedback": portfolio = greedy.construct_portfolio(regret, meta_features, regret_bound) if "default" not in portfolio and "default" in regret.index: portfolio += ["default"] return portfolio def load_json(filename): """Returns the contents of json file filename.""" with open(filename, "r") as f: return json.load(f) def _filter(preference, regret): """Remove choices after default or have NaN regret.""" try: last = regret.index.get_loc("default") # len(preference) - 1 preference = preference[: preference[preference == last].index[0] + 1] except KeyError: # no "default" pass finally: regret = regret.reset_index(drop=True) preference = preference[regret[preference].notna().to_numpy()] # regret = regret[preference].reset_index(drop=True) # dup = regret[regret.duplicated()] # if not dup.empty: # # break ties using the order in configs # unique = dup.drop_duplicates() # for u in unique: # subset = regret == u # preference[subset].sort_values(inplace=True) # # raise ValueError(preference) return preference.tolist() def serialize(configs, regret, meta_features, output_file, config_path): """Store to disk all information FLAML-metalearn needs at runtime. configs: names of model configs regret: regret matrix meta_features: task metafeatures output_file: filename config_path: path containing config json files """ output_file = Path(output_file) # delete if exists try: output_file.unlink() except FileNotFoundError: pass meta_features_norm, preferences, proc = config_predictor_tuple( regret.columns, configs, meta_features, regret ) portfolio = [load_json(config_path.joinpath(m + ".json")) for m in configs] regret = regret.loc[configs] meta_predictor = { "version": "default", "portfolio": portfolio, "preprocessing": proc, "neighbors": [ {"features": tuple(x), "choice": _filter(preferences[y], regret[y])} for x, y in zip( meta_features_norm.to_records(index=False), preferences.columns ) ], "configsource": list(configs), } with open(output_file, "w+") as f: json.dump(meta_predictor, f, indent=4) return meta_predictor # def analyze(regret_matrix, meta_predictor): # tasks = regret_matrix.columns # neighbors = meta_predictor["neighbors"] # from sklearn.neighbors import NearestNeighbors # nn = NearestNeighbors(n_neighbors=1) # for i, task in enumerate(neighbors): # other_tasks = [j for j in range(len(neighbors)) if j != i] # # find the nn and the regret # nn.fit([neighbors[j]["features"] for j in other_tasks]) # dist, ind = nn.kneighbors( # np.array(task["features"]).reshape(1, -1), return_distance=True # ) # ind = other_tasks[int(ind.item())] # choice = int(neighbors[ind]["choice"][0]) # r = regret_matrix.iloc[choice, i] # if r > regret_bound: # label = "outlier" # else: # label = "normal" # print(tasks[i], label, tasks[ind], "dist", dist, "regret", r) # # find the best model and the regret # regrets = regret_matrix.iloc[other_tasks, i] # best = regrets.min() # if best > regret_bound: # print(tasks[i], "best_regret", best, "task", regrets.idxmin()) def main(): parser = argparse.ArgumentParser(description="Build a portfolio.") parser.add_argument( "--strategy", help="One of {greedy, greedy-feedback}", default="greedy" ) parser.add_argument("--input", help="Input path") parser.add_argument("--metafeatures", help="CSV of task metafeatures") parser.add_argument("--exclude", help="One task name to exclude (for LOO purposes)") parser.add_argument("--output", help="Location to write portfolio JSON") parser.add_argument("--task", help="Task to merge portfolios", default="binary") parser.add_argument( "--estimator", help="Estimators to merge portfolios", default=["lgbm", "xgboost"], nargs="+", ) args = parser.parse_args() meta_features = pd.read_csv(args.metafeatures, index_col=0).groupby(level=0).first() if args.exclude: meta_features.drop(args.exclude, inplace=True) baseline_best = None all_results = None for estimator in args.estimator: # produce regret all, baseline = load_result( f"{args.input}/{estimator}/results.csv", args.task, "result" ) regret = build_regret(all, baseline) regret = regret.replace(np.inf, np.nan).dropna(axis=1, how="all") if args.exclude: regret = regret.loc[[i for i in regret.index if args.exclude not in i]] regret = regret[[c for c in regret.columns if args.exclude not in c]] print( f"Regret matrix complete: {100 * regret.count().sum() / regret.shape[0] / regret.shape[1]}%" ) print(f"Num models considered: {regret.shape[0]}") configs = build_portfolio(meta_features, regret, args.strategy) meta_predictor = serialize( configs, regret, meta_features, f"{args.output}/{estimator}/{args.task}.json", Path(f"{args.input}/{estimator}"), ) configsource = meta_predictor["configsource"] all = all.loc[configsource] all.rename({x: f"{estimator}/{x}" for x in regret.index.values}, inplace=True) baseline_best = ( baseline if baseline_best is None else pd.DataFrame({0: baseline_best, 1: baseline}).max(1) ) all_results = all if all_results is None else pd.concat([all_results, all]) # analyze(regret, meta_predictor) regrets = build_regret(all_results, baseline_best) if len(args.estimator) > 1: meta_predictor = serialize( regrets.index, regrets, meta_features, f"{args.output}/all/{args.task}.json", Path(args.input), ) if __name__ == "__main__": # execute only if run as a script main()