mirror of
				https://github.com/microsoft/autogen.git
				synced 2025-10-31 01:40:58 +00:00 
			
		
		
		
	 ca9f9054e7
			
		
	
	
		ca9f9054e7
		
			
		
	
	
	
	
		
			
			* categorical choice can be ordered or unordered * ordered -> order * move choice into utils * version comparison * packaging -> setuptools * import version * version_parse * test order for choice
		
			
				
	
	
		
			238 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			238 lines
		
	
	
		
			8.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pandas as pd
 | |
| import numpy as np
 | |
| import argparse
 | |
| from pathlib import Path
 | |
| import json
 | |
| from sklearn.preprocessing import RobustScaler
 | |
| from flaml.default import greedy
 | |
| from flaml.default.regret import load_result, build_regret
 | |
| from flaml.version import __version__
 | |
| 
 | |
# Regret threshold handed to greedy.construct_portfolio when a portfolio is built.
regret_bound = 0.01
 | |
| 
 | |
| 
 | |
def config_predictor_tuple(tasks, configs, meta_features, regret_matrix):
    """Config predictor represented in tuple.

    The returned tuple consists of (meta_features_norm, preferences, proc).

    Args:
        tasks: Labels of the tasks to use; they select rows of `meta_features`
            and columns of `regret_matrix`.
        configs: Labels of the configurations to rank; they select rows of
            `regret_matrix`, and their order is used to break ties.
        meta_features: A dataframe of meta features, one row per task.
        regret_matrix: A dataframe of the configuration(row)-task(column) regret matrix.

    Returns:
        meta_features_norm: A dataframe of the meta features of `tasks` scaled
            with RobustScaler, one row per task.
        preferences: A dataframe with one column per task; each column holds
            positions into `configs` sorted by increasing regret.
        proc: A dict with the "center" and "scale" lists of the fitted scaler.
    """
    # pre-processing
    scaler = RobustScaler()
    selected = meta_features.loc[tasks]
    # Build a fresh frame instead of assigning through `.loc[:, :]`: writing the
    # scaled floats into a slice of the caller's frame depends on implicit
    # dtype upcasting when a column is integer-typed.
    meta_features_norm = pd.DataFrame(
        scaler.fit_transform(selected),
        index=selected.index,
        columns=selected.columns,
    )

    proc = {
        "center": scaler.center_.tolist(),
        "scale": scaler.scale_.tolist(),
    }

    # break ties using the order in configs: every cell becomes
    # (regret, position in configs), so equal regrets sort by position
    regret = (
        regret_matrix[tasks]
        .loc[configs]
        .reset_index(drop=True)
        .apply(lambda row: row.apply(lambda x: (x, row.name)), axis=1)
    )
    print(regret)
    # Wrap the argsort result explicitly so that `preferences` is always a
    # dataframe with the task labels as columns, instead of relying on numpy
    # to re-wrap a dataframe argument.
    preferences = pd.DataFrame(
        np.argsort(regret.to_numpy(), axis=0), columns=regret.columns
    )
    print(preferences)
    return (meta_features_norm, preferences, proc)
 | |
| 
 | |
| 
 | |
def build_portfolio(meta_features, regret, strategy):
    """Select a portfolio of configurations with the greedy constructor.

    Args:
        meta_features: A dataframe of metafeatures matrix; it is only forwarded
            to the constructor for the "greedy-feedback" strategy.
        regret: A dataframe of regret matrix.
        strategy: A str of the strategy, one of ("greedy", "greedy-feedback").

    Returns:
        The portfolio produced by `greedy.construct_portfolio`, with "default"
        appended when it is a row of `regret` but was not selected.
    """
    assert strategy in ("greedy", "greedy-feedback")
    # plain "greedy" passes None in place of the meta features
    feedback = meta_features if strategy == "greedy-feedback" else None
    portfolio = greedy.construct_portfolio(regret, feedback, regret_bound)
    needs_default = "default" not in portfolio and "default" in regret.index
    if needs_default:
        portfolio += ["default"]
    return portfolio
 | |
| 
 | |
| 
 | |
def load_json(filename):
    """Return the parsed contents of the JSON file `filename`.

    The file is read as UTF-8 so that the result does not depend on the
    platform's default text encoding.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)
 | |
| 
 | |
| 
 | |
| def _filter(preference, regret):
 | |
|     """Remove choices after default or have NaN regret."""
 | |
|     try:
 | |
|         last = regret.index.get_loc("default")  # len(preference) - 1
 | |
|         preference = preference[: preference[preference == last].index[0] + 1]
 | |
|     except KeyError:  # no "default"
 | |
|         pass
 | |
|     finally:
 | |
|         regret = regret.reset_index(drop=True)
 | |
|     preference = preference[regret[preference].notna().to_numpy()]
 | |
|     # regret = regret[preference].reset_index(drop=True)
 | |
|     # dup = regret[regret.duplicated()]
 | |
|     # if not dup.empty:
 | |
|     #     # break ties using the order in configs
 | |
|     #     unique = dup.drop_duplicates()
 | |
|     #     for u in unique:
 | |
|     #         subset = regret == u
 | |
|     #         preference[subset].sort_values(inplace=True)
 | |
|     #     # raise ValueError(preference)
 | |
|     return preference.tolist()
 | |
| 
 | |
| 
 | |
def serialize(configs, regret, meta_features, output_file, config_path):
    """Store to disk all information FLAML-metalearn needs at runtime.

    Args:
        configs: names of model configs
        regret: regret matrix (configs as rows, tasks as columns)
        meta_features: task metafeatures
        output_file: filename of the JSON file to write; an existing file is
            replaced and missing parent directories are created
        config_path: path (a `pathlib.Path`) containing config json files,
            one `<config name>.json` per entry of `configs`

    Returns:
        The dict that was written to `output_file`, with the keys "version",
        "meta_feature_names", "portfolio", "preprocessing", "neighbors" and
        "configsource".
    """
    output_file = Path(output_file)
    # the destination directory may not exist yet (e.g. a new estimator folder)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # delete if exists
    try:
        output_file.unlink()
    except FileNotFoundError:
        pass

    meta_features_norm, preferences, proc = config_predictor_tuple(
        regret.columns, configs, meta_features, regret
    )
    portfolio = [load_json(config_path.joinpath(m + ".json")) for m in configs]
    # restrict and order the rows to match the positions stored in preferences
    regret = regret.loc[configs]

    meta_predictor = {
        "version": __version__,
        "meta_feature_names": list(meta_features.columns),
        "portfolio": portfolio,
        "preprocessing": proc,
        # one entry per task: its normalized meta features and ranked choices
        "neighbors": [
            {"features": tuple(x), "choice": _filter(preferences[y], regret[y])}
            for x, y in zip(
                meta_features_norm.to_records(index=False), preferences.columns
            )
        ],
        "configsource": list(configs),
    }

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(meta_predictor, f, indent=4)
    return meta_predictor
 | |
| 
 | |
| 
 | |
| # def analyze(regret_matrix, meta_predictor):
 | |
| # tasks = regret_matrix.columns
 | |
| # neighbors = meta_predictor["neighbors"]
 | |
| # from sklearn.neighbors import NearestNeighbors
 | |
| 
 | |
| # nn = NearestNeighbors(n_neighbors=1)
 | |
| # for i, task in enumerate(neighbors):
 | |
| #     other_tasks = [j for j in range(len(neighbors)) if j != i]
 | |
| #     # find the nn and the regret
 | |
| #     nn.fit([neighbors[j]["features"] for j in other_tasks])
 | |
| #     dist, ind = nn.kneighbors(
 | |
| #         np.array(task["features"]).reshape(1, -1), return_distance=True
 | |
| #     )
 | |
| #     ind = other_tasks[int(ind.item())]
 | |
| #     choice = int(neighbors[ind]["choice"][0])
 | |
| #     r = regret_matrix.iloc[choice, i]
 | |
| #     if r > regret_bound:
 | |
| #         label = "outlier"
 | |
| #     else:
 | |
| #         label = "normal"
 | |
| #     print(tasks[i], label, tasks[ind], "dist", dist, "regret", r)
 | |
| #     # find the best model and the regret
 | |
| #     regrets = regret_matrix.iloc[other_tasks, i]
 | |
| #     best = regrets.min()
 | |
| #     if best > regret_bound:
 | |
| #         print(tasks[i], "best_regret", best, "task", regrets.idxmin())
 | |
| 
 | |
| 
 | |
def main():
    """Build one portfolio per estimator, plus a merged one, from the command line.

    For every estimator the results are read from
    `<input>/<estimator>/results.csv`, turned into a regret matrix, and a
    portfolio is serialized to `<output>/<estimator>/<task>.json`. When more
    than one estimator is given, the selected configurations of all estimators
    are merged and serialized to `<output>/all/<task>.json`.
    """
    parser = argparse.ArgumentParser(description="Build a portfolio.")
    parser.add_argument(
        "--strategy",
        help="One of {greedy, greedy-feedback}",
        default="greedy",
        choices=("greedy", "greedy-feedback"),
    )
    # The three paths below are used unconditionally, so they must be supplied.
    parser.add_argument("--input", help="Input path", required=True)
    parser.add_argument(
        "--metafeatures", help="CSV of task metafeatures", required=True
    )
    parser.add_argument("--exclude", help="One task name to exclude (for LOO purposes)")
    parser.add_argument(
        "--output", help="Location to write portfolio JSON", required=True
    )
    parser.add_argument("--task", help="Task to merge portfolios", default="binary")
    parser.add_argument(
        "--estimator",
        help="Estimators to merge portfolios",
        default=["lgbm", "xgboost"],
        nargs="+",
    )
    args = parser.parse_args()

    # keep the first row when a task appears more than once in the CSV
    meta_features = pd.read_csv(args.metafeatures, index_col=0).groupby(level=0).first()
    if args.exclude:
        meta_features.drop(args.exclude, inplace=True)

    baseline_best = None
    all_results = None
    for estimator in args.estimator:
        # produce regret
        results, baseline = load_result(
            f"{args.input}/{estimator}/results.csv", args.task, "result"
        )
        regret = build_regret(results, baseline)
        # drop the tasks (columns) that have no finite regret at all
        regret = regret.replace(np.inf, np.nan).dropna(axis=1, how="all")

        if args.exclude:
            regret = regret.loc[[i for i in regret.index if args.exclude not in i]]
            regret = regret[[c for c in regret.columns if args.exclude not in c]]

        print(
            f"Regret matrix complete: {100 * regret.count().sum() / regret.shape[0] / regret.shape[1]}%"
        )
        print(f"Num models considered: {regret.shape[0]}")

        configs = build_portfolio(meta_features, regret, args.strategy)
        meta_predictor = serialize(
            configs,
            regret,
            meta_features,
            f"{args.output}/{estimator}/{args.task}.json",
            Path(f"{args.input}/{estimator}"),
        )
        configsource = meta_predictor["configsource"]
        # keep only the selected configs and prefix them with the estimator
        # name so that they stay distinguishable after merging
        results = results.loc[configsource].rename(
            index={x: f"{estimator}/{x}" for x in regret.index.values}
        )
        baseline_best = (
            baseline
            if baseline_best is None
            else pd.DataFrame({0: baseline_best, 1: baseline}).max(axis=1)
        )
        all_results = (
            results if all_results is None else pd.concat([all_results, results])
        )
    regrets = build_regret(all_results, baseline_best)
    if len(args.estimator) > 1:
        serialize(
            regrets.index,
            regrets,
            meta_features,
            f"{args.output}/all/{args.task}.json",
            Path(args.input),
        )
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # run the command-line entry point only when executed as a script
    main()
 |