From 548af1546647c8ec69cd50f146abbf6fd253f29b Mon Sep 17 00:00:00 2001
From: Johannes Freischuetz
Date: Tue, 2 Jul 2024 22:19:22 +0000
Subject: [PATCH 01/22] Implement metadata for multifidelity in SMAC

---
 .cspell.json                                  |   2 +
 .../optimizers/mlos_core_optimizer.py         |   2 +-
 mlos_core/mlos_core/optimizers/README         |  27 ++
 .../bayesian_optimizers/smac_optimizer.py     | 250 +++++++++++++++---
 mlos_core/mlos_core/optimizers/optimizer.py   |  59 +++--
 mlos_core/mlos_core/optimizers/utils.py       |  57 ++++
 .../optimizers/optimizer_metadata_test.py     |  99 +++++++
 .../optimizers/optimizer_multiobj_test.py     |   6 +-
 .../tests/optimizers/optimizer_test.py        |  22 +-
 9 files changed, 465 insertions(+), 59 deletions(-)
 create mode 100644 mlos_core/mlos_core/optimizers/README
 create mode 100644 mlos_core/mlos_core/optimizers/utils.py
 create mode 100644 mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py

diff --git a/.cspell.json b/.cspell.json
index 2cd9280fc8..19aa2b07f7 100644
--- a/.cspell.json
+++ b/.cspell.json
@@ -43,6 +43,7 @@
     "linalg",
     "llamatune",
     "matplotlib",
+    "metadatas",
     "mlos",
     "mloscore",
     "mwait",
@@ -72,6 +73,7 @@
     "sklearn",
     "skopt",
     "smac",
+    "Sobol",
     "sqlalchemy",
     "srcpaths",
     "subcmd",
diff --git a/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py b/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py
index e0235f76b9..cdead2528f 100644
--- a/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py
+++ b/mlos_bench/mlos_bench/optimizers/mlos_core_optimizer.py
@@ -199,7 +199,7 @@ def register(self, tunables: TunableGroups, status: Status,
         return registered_score
 
     def get_best_observation(self) -> Union[Tuple[Dict[str, float], TunableGroups], Tuple[None, None]]:
-        (df_config, df_score, _df_context) = self._opt.get_best_observations()
+        (df_config, df_score, _df_context, _df_metadata) = self._opt.get_best_observations()
        if len(df_config) == 0:
             return (None, None)
         params = configspace_data_to_tunable_values(df_config.iloc[0].to_dict())
diff --git a/mlos_core/mlos_core/optimizers/README b/mlos_core/mlos_core/optimizers/README
new file mode 100644
index 0000000000..5d597ba076
--- /dev/null
+++ b/mlos_core/mlos_core/optimizers/README
@@ -0,0 +1,27 @@
+# Optimizers
+
+This is a directory that contains wrappers for different optimizers to integrate into MLOS.
+This is implemented through child classes of the `BaseOptimizer` class defined in `optimizer.py`.
+
+The main goal of these optimizers is to `suggest` configurations, possibly based on prior trial data, in order to find an optimum for some objective(s).
+This process is driven through the `register` and `suggest` interfaces.
+
+The following definitions are useful for understanding the implementation
+
+- `configuration`: a vector representation of a configuration of a system to be evaluated.
+- `score`: the objective(s) associated with a configuration
+- `metadata`: additional information about the evaluation, such as the runtime budget used during evaluation.
+- `context`: additional (static) information about the evaluation used to extend the internal model used for suggesting samples.
+  For instance, a descriptor of the VM size (vCore count and # of GB of RAM), and some descriptor of the workload.
+  The intent being to allow either sharing or indexing of trial info between "similar" experiments in order to help make the optimization process more efficient for new scenarios.
+  > Note: This is not yet implemented.
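+
+As a rough sketch of how the definitions above and the interface described below fit together (`opt` stands in for any concrete `BaseOptimizer` child, and `run_trial` for a hypothetical user-supplied evaluation function):
+
+```python
+for _ in range(10):
+    config, metadata = opt.suggest()
+    score = run_trial(config)  # pd.DataFrame with one column per objective
+    opt.register(configs=config, scores=score, metadata=metadata)
+```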
+The interface for these classes can be described as follows: + +- `register`: this is a function that takes a configuration, a score, and, optionally, metadata about the evaluation to update the model for future evaluations. +- `suggest`: this function returns a new configuration for evaluation. + + Some optimizers will return additional metadata for evaluation, that should be used during the register phase. + This function can also optionally take context (not yet implemented), and an argument to force the function to return the default configuration. +- `register_pending`: registers a configuration and metadata pair as pending to the optimizer. +- `get_observations`: returns all observations reported to the optimizer as a triplet of DataFrames (config, score, context, metadata). +- `get_best_observations`: returns the best observation as a triplet of best (config, score, context, metadata) DataFrames. diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py index aa948b8125..d7886ffcf6 100644 --- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py +++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py @@ -9,14 +9,22 @@ from logging import warning from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union, TYPE_CHECKING +import threading +from typing import Any, Dict, List, Optional, Tuple, Type, Union from tempfile import TemporaryDirectory from warnings import warn import ConfigSpace +import numpy as np import numpy.typing as npt import pandas as pd +from smac import HyperparameterOptimizationFacade as Optimizer_Smac +from smac.facade import AbstractFacade +from smac.initial_design import AbstractInitialDesign, SobolInitialDesign +from smac.intensifier.abstract_intensifier import AbstractIntensifier +from smac.runhistory import TrialInfo +from mlos_core.optimizers.utils import filter_kwargs, to_metadata from mlos_core.optimizers.bayesian_optimizers.bayesian_optimizer import BaseBayesianOptimizer from mlos_core.spaces.adapters.adapter import BaseSpaceAdapter from mlos_core.spaces.adapters.identity_adapter import IdentityAdapter @@ -39,7 +47,11 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments n_random_init: Optional[int] = None, max_ratio: Optional[float] = None, use_default_config: bool = False, - n_random_probability: float = 0.1): + n_random_probability: float = 0.1, + facade: Type[AbstractFacade] = Optimizer_Smac, + intensifier: Optional[Type[AbstractIntensifier]] = None, + initial_design_class: Type[AbstractInitialDesign] = SobolInitialDesign, + **kwargs: Any): """ Instantiate a new SMAC optimizer wrapper. @@ -91,6 +103,22 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments n_random_probability: float Probability of choosing to evaluate a random configuration during optimization. Defaults to `0.1`. Setting this to a higher value favors exploration over exploitation. + + facade: AbstractFacade + sets the facade to use for SMAC + + intensifier: Optional[Type[AbstractIntensifier]] + Sets the intensifier type to use in the optimizer. If not set, the + default intensifier + from the facade will be used + + initial_design_class: AbstractInitialDesign + Sets the initial design class to be used in the optimizer. 
+ Defaults to SobolInitialDesign + + **kwargs: + Additional arguments to be passed to the + facade, scenario, and intensifier """ super().__init__( parameter_space=parameter_space, @@ -103,15 +131,14 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments self._temp_output_directory: Optional[TemporaryDirectory] = None # pylint: disable=import-outside-toplevel - from smac import HyperparameterOptimizationFacade as Optimizer_Smac from smac import Scenario - from smac.intensifier.abstract_intensifier import AbstractIntensifier from smac.main.config_selector import ConfigSelector from smac.random_design.probability_design import ProbabilityRandomDesign - from smac.runhistory import TrialInfo # Store for TrialInfo instances returned by .ask() - self.trial_info_map: Dict[ConfigSpace.Configuration, TrialInfo] = {} + self.trial_info_df: pd.DataFrame = pd.DataFrame( + columns=["Configuration", "Metadata", "TrialInfo", "TrialValue"] + ) # The default when not specified is to use a known seed (0) to keep results reproducible. # However, if a `None` seed is explicitly provided, we let a random seed be produced by SMAC. @@ -143,8 +170,16 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments n_trials=max_trials, seed=seed or -1, # if -1, SMAC will generate a random seed internally n_workers=1, # Use a single thread for evaluating trials + **filter_kwargs(Scenario, **kwargs), ) - intensifier: AbstractIntensifier = Optimizer_Smac.get_intensifier(scenario, max_config_calls=1) + + if intensifier is None: + intensifier_instance = facade.get_intensifier(scenario) + else: + intensifier_instance = intensifier( + scenario, **filter_kwargs(intensifier, **kwargs) + ) + config_selector: ConfigSelector = Optimizer_Smac.get_config_selector(scenario, retrain_after=1) # TODO: When bulk registering prior configs to rewarm the optimizer, @@ -178,11 +213,10 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments assert isinstance(max_ratio, float) and 0.0 <= max_ratio <= 1.0 initial_design_args['max_ratio'] = max_ratio - # Use the default InitialDesign from SMAC. - # (currently SBOL instead of LatinHypercube due to better uniformity + # build the initial design for SMAC. 
+ # (currently defaults SOBOL instead of LatinHypercube due to better uniformity # for initial sampling which results in lower overall samples required) - initial_design = Optimizer_Smac.get_initial_design(**initial_design_args) # type: ignore[arg-type] - # initial_design = LatinHypercubeInitialDesign(**initial_design_args) # type: ignore[arg-type] + initial_design = initial_design_class(**initial_design_args) # type: ignore[arg-type] # Workaround a bug in SMAC that doesn't pass the seed to the random # design when generated a random_design for itself via the @@ -190,19 +224,22 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments assert isinstance(n_random_probability, float) and n_random_probability >= 0 random_design = ProbabilityRandomDesign(probability=n_random_probability, seed=scenario.seed) - self.base_optimizer = Optimizer_Smac( + self.base_optimizer = facade( scenario, SmacOptimizer._dummy_target_func, initial_design=initial_design, - intensifier=intensifier, + intensifier=intensifier_instance, random_design=random_design, config_selector=config_selector, multi_objective_algorithm=Optimizer_Smac.get_multi_objective_algorithm( scenario, objective_weights=self._objective_weights), overwrite=True, logging_level=False, # Use the existing logger + **filter_kwargs(facade, **kwargs), ) + self.lock = threading.Lock() + def __del__(self) -> None: # Best-effort attempt to clean up, in case the user forgets to call .cleanup() self.cleanup() @@ -224,7 +261,12 @@ def n_random_init(self) -> int: return self.base_optimizer._initial_design._n_configs @staticmethod - def _dummy_target_func(config: ConfigSpace.Configuration, seed: int = 0) -> None: + def _dummy_target_func( + config: ConfigSpace.Configuration, + seed: int = 0, + budget: float = 1, + instance: object = None, + ) -> None: """Dummy target function for SMAC optimizer. Since we only use the ask-and-tell interface, this is never called. @@ -236,6 +278,12 @@ def _dummy_target_func(config: ConfigSpace.Configuration, seed: int = 0) -> None seed : int Random seed to use for the target function. Not actually used. + + budget : int + The budget that was used for evaluating the configuration. + + instance : object + The instance that the configuration was evaluated on. """ # NOTE: Providing a target function when using the ask-and-tell interface is an imperfection of the API # -- this planned to be fixed in some future release: https://github.com/automl/SMAC3/issues/946 @@ -257,20 +305,68 @@ def _register(self, *, configs: pd.DataFrame, Not Yet Implemented. metadata: pd.DataFrame - Not Yet Implemented. + The metadata for the config to register. 
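+            A sketch of the expected shape for the SMAC backend (column names
+            follow `_extract_metadata` below; the values shown are purely
+            illustrative)::
+
+                pd.DataFrame([[None, 0, 9.0]], columns=["instance", "seed", "budget"])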
""" - from smac.runhistory import StatusType, TrialInfo, TrialValue # pylint: disable=import-outside-toplevel + from smac.runhistory import StatusType, TrialValue # pylint: disable=import-outside-toplevel if context is not None: warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) - # Register each trial (one-by-one) - for (config, (_i, score)) in zip(self._to_configspace_configs(configs=configs), scores.iterrows()): - # Retrieve previously generated TrialInfo (returned by .ask()) or create new TrialInfo instance - info: TrialInfo = self.trial_info_map.get( - config, TrialInfo(config=config, seed=self.base_optimizer.scenario.seed)) - value = TrialValue(cost=list(score.astype(float)), time=0.0, status=StatusType.SUCCESS) - self.base_optimizer.tell(info, value, save=False) + with self.lock: + # Register each trial (one-by-one) + metadatas: Union[List[pd.Series], List[None]] = to_metadata(metadata) or [ + None for _ in scores # type: ignore[misc] + ] + for config, score, ctx in zip( + self._to_configspace_configs(configs=configs), + scores.values.tolist(), + metadatas, + ): + value: TrialValue = TrialValue( + cost=score, time=0.0, status=StatusType.SUCCESS + ) + + matching: pd.Series[bool] + if ctx is None: + matching = self.trial_info_df["Configuration"] == config + else: + matching = ( + self.trial_info_df["Configuration"] == config + ) & pd.Series( + [df_ctx.equals(ctx) for df_ctx in self.trial_info_df["Metadata"]] + ) + + # make a new entry + if sum(matching) > 0: + info = self.trial_info_df[matching]["TrialInfo"].iloc[-1] + self.trial_info_df.at[list(matching).index(True), "TrialValue"] = ( + value + ) + else: + if ctx is None or "budget" not in ctx or "instance" not in ctx: + info = TrialInfo( + config=config, seed=self.base_optimizer.scenario.seed + ) + self.trial_info_df.loc[len(self.trial_info_df.index)] = [ + config, + ctx, + info, + value, + ] + else: + info = TrialInfo( + config=config, + seed=self.base_optimizer.scenario.seed, + budget=ctx["budget"], + instance=ctx["instance"], + ) + self.trial_info_df.loc[len(self.trial_info_df.index)] = [ + config, + ctx, + info, + value, + ] + self.base_optimizer.tell(info, value, save=False) # Save optimizer once we register all configs self.base_optimizer.optimizer.save() @@ -289,11 +385,8 @@ def _suggest(self, *, context: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFr Pandas dataframe with a single row. Column names are the parameter names. metadata : Optional[pd.DataFrame] - Not yet implemented. + The metadata for the config being suggested. 
""" - if TYPE_CHECKING: - from smac.runhistory import TrialInfo # pylint: disable=import-outside-toplevel,unused-import - if context is not None: warn(f"Not Implemented: Ignoring context {list(context.columns)}", UserWarning) @@ -301,9 +394,18 @@ def _suggest(self, *, context: Optional[pd.DataFrame] = None) -> Tuple[pd.DataFr trial.config.is_valid_configuration() self.optimizer_parameter_space.check_configuration(trial.config) assert trial.config.config_space == self.optimizer_parameter_space - self.trial_info_map[trial.config] = trial - config_df = pd.DataFrame([trial.config], columns=list(self.optimizer_parameter_space.keys())) - return config_df, None + + config_df = self._extract_config(trial) + metadata_df = _extract_metadata(trial) + + with self.lock: + self.trial_info_df.loc[len(self.trial_info_df.index)] = [ + trial.config, + metadata_df.iloc[0], + trial, + None, + ] + return config_df, metadata_df def register_pending(self, *, configs: pd.DataFrame, context: Optional[pd.DataFrame] = None, @@ -348,6 +450,56 @@ def cleanup(self) -> None: self._temp_output_directory.cleanup() self._temp_output_directory = None + def get_observations_full(self) -> pd.DataFrame: + """Returns the observations as a dataframe with additional info. + + Returns + ------- + observations : pd.DataFrame + Dataframe of observations. The columns are parameter names and "score" for the score, each row is an observation. + """ + if len(self.trial_info_df) == 0: + raise ValueError("No observations registered yet.") + + return self.trial_info_df + + def get_best_observations(self, *, n_max: int = 1 + ) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]: + """Returns the best observation so far as a dataframe. + + Returns + ------- + best_observation : pd.DataFrame + Dataframe with a single row containing the best observation. The columns are parameter names and "score" for the score. + """ + if len(self._observations) == 0: + raise ValueError("No observations registered yet.") + + observations = self._observations + + max_budget = np.nan + budgets = [ + metadata["budget"].max() + for _, _, _, metadata in self._observations + if metadata is not None + ] + if len(budgets) > 0: + max_budget = max(budgets) + + if max_budget is not np.nan: + observations = [ + (config, score, context, metadata) + for config, score, context, metadata in self._observations + if metadata is not None and metadata["budget"].max() == max_budget + ] + + configs, scores, contexts, metadatas = self._get_observations(observations) + + idx = scores.nsmallest(n_max, columns=self._optimization_targets, keep="first").index + return (configs.loc[idx], scores.loc[idx], + None if contexts is None else contexts.loc[idx], + None if metadatas is None else metadatas.loc[idx]) + def _to_configspace_configs(self, *, configs: pd.DataFrame) -> List[ConfigSpace.Configuration]: """Convert a dataframe of configs to a list of ConfigSpace configs. @@ -365,3 +517,41 @@ def _to_configspace_configs(self, *, configs: pd.DataFrame) -> List[ConfigSpace. ConfigSpace.Configuration(self.optimizer_parameter_space, values=config.to_dict()) for (_, config) in configs.astype('O').iterrows() ] + + def _extract_config(self, trial: TrialInfo) -> pd.DataFrame: + """Convert TrialInfo to a config DataFrame. + + Parameters + ---------- + trial : TrialInfo + The trial to extract. + + Returns + ------- + config : pd.DataFrame + Pandas dataframe with a single row containing the config. 
+            Column names are config parameters.
+        """
+        return pd.DataFrame(
+            [trial.config], columns=list(self.optimizer_parameter_space.keys())
+        )
+
+
+def _extract_metadata(trial: TrialInfo) -> pd.DataFrame:
+    """Convert TrialInfo to a metadata DataFrame.
+
+    Parameters
+    ----------
+    trial : TrialInfo
+        The trial to extract.
+
+    Returns
+    -------
+    metadata : pd.DataFrame
+        Pandas dataframe with a single row containing the metadata.
+        Column names are the instance, seed, and budget of the evaluation, if valid.
+    """
+    return pd.DataFrame(
+        [[trial.instance, trial.seed, trial.budget]],
+        columns=["instance", "seed", "budget"],
+    )
diff --git a/mlos_core/mlos_core/optimizers/optimizer.py b/mlos_core/mlos_core/optimizers/optimizer.py
index 8fcf592a6c..670216b364 100644
--- a/mlos_core/mlos_core/optimizers/optimizer.py
+++ b/mlos_core/mlos_core/optimizers/optimizer.py
@@ -56,9 +56,9 @@ def __init__(self, *,
             raise ValueError("Number of weights must match the number of optimization targets")
 
         self._space_adapter: Optional[BaseSpaceAdapter] = space_adapter
-        self._observations: List[Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]] = []
+        self._observations: List[Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]] = []
         self._has_context: Optional[bool] = None
-        self._pending_observations: List[Tuple[pd.DataFrame, Optional[pd.DataFrame]]] = []
+        self._pending_observations: List[Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]] = []
 
     def __repr__(self) -> str:
         return f"{self.__class__.__name__}(space_adapter={self.space_adapter})"
@@ -98,7 +98,7 @@ def register(self, *, configs: pd.DataFrame, scores: pd.DataFrame,
             "Mismatched number of configs and context."
         assert configs.shape[1] == len(self.parameter_space.values()), \
             "Mismatched configuration shape."
-        self._observations.append((configs, scores, context))
+        self._observations.append((configs, scores, context, metadata))
         self._has_context = context is not None
 
         if self._space_adapter:
@@ -197,26 +197,48 @@ def register_pending(self, *, configs: pd.DataFrame,
         """
         pass # pylint: disable=unnecessary-pass # pragma: no cover
 
-    def get_observations(self) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]:
+    def _get_observations(self, observations:
+                          List[Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]]
+                          ) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]:
         """
-        Returns the observations as a triplet of DataFrames (config, score, context).
+        Returns the observations as a quad of DataFrames (config, score, context, metadata)
+        for a specific set of observations.
+
+        Parameters
+        ----------
+        observations: List[Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]]
+            Observations to run the transformation on.
 
         Returns
         -------
-        observations : Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]
-            A triplet of (config, score, context) DataFrames of observations.
+        observations: Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]
+            A quad of (config, score, context, metadata) DataFrames of observations.
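+            For example (a sketch, mirroring the delegation in `get_observations`
+            below)::
+
+                configs, scores, contexts, metadata = self._get_observations(self._observations)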
""" - if len(self._observations) == 0: + if len(observations) == 0: raise ValueError("No observations registered yet.") - configs = pd.concat([config for config, _, _ in self._observations]).reset_index(drop=True) - scores = pd.concat([score for _, score, _ in self._observations]).reset_index(drop=True) + configs = pd.concat([config for config, _, _, _ in observations]).reset_index(drop=True) + scores = pd.concat([score for _, score, _, _ in observations]).reset_index(drop=True) contexts = pd.concat([pd.DataFrame() if context is None else context - for _, _, context in self._observations]).reset_index(drop=True) - return (configs, scores, contexts if len(contexts.columns) > 0 else None) + for _, _, context, _ in observations]).reset_index(drop=True) + metadatas = pd.concat([pd.DataFrame() if metadata is None else metadata + for _, _, _, metadata in observations]).reset_index(drop=True) + return (configs, scores, contexts if len(contexts.columns) > 0 else None, metadatas if len(metadatas.columns) > 0 else None) + + def get_observations(self) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]: + """ + Returns the observations as a quad of DataFrames(config, score, context, metadata). + + Returns + ------- + observations: Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]], Optional[pd.DataFrame]] + A quad of(config, score, context, metadata) DataFrames of observations. + """ + return self._get_observations(self._observations) - def get_best_observations(self, *, n_max: int = 1) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]]: + def get_best_observations(self, *, n_max: int = 1) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], + Optional[pd.DataFrame]]: """ - Get the N best observations so far as a triplet of DataFrames (config, score, context). + Get the N best observations so far as a quad of DataFrames (config, score, context, metadata). Default is N=1. The columns are ordered in ASCENDING order of the optimization targets. The function uses `pandas.DataFrame.nsmallest(..., keep="first")` method under the hood. @@ -227,15 +249,16 @@ def get_best_observations(self, *, n_max: int = 1) -> Tuple[pd.DataFrame, pd.Dat Returns ------- - observations : Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]] - A triplet of best (config, score, context) DataFrames of best observations. + observations : Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame]], Optional[pd.DataFrame]] + A quad of best (config, score, context, metadata) DataFrames of best observations. """ if len(self._observations) == 0: raise ValueError("No observations registered yet.") - (configs, scores, contexts) = self.get_observations() + (configs, scores, contexts, metadatas) = self.get_observations() idx = scores.nsmallest(n_max, columns=self._optimization_targets, keep="first").index return (configs.loc[idx], scores.loc[idx], - None if contexts is None else contexts.loc[idx]) + None if contexts is None else contexts.loc[idx], + None if metadatas is None else metadatas.loc[idx]) def cleanup(self) -> None: """ diff --git a/mlos_core/mlos_core/optimizers/utils.py b/mlos_core/mlos_core/optimizers/utils.py new file mode 100644 index 0000000000..1e59c6ab0d --- /dev/null +++ b/mlos_core/mlos_core/optimizers/utils.py @@ -0,0 +1,57 @@ +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# +""" +Contains utils used for implementing the mlos_core optimizers +""" +import inspect +from typing import Any, Callable, Dict, List, Optional +import pandas as pd + + +def to_metadata(metadata: Optional[pd.DataFrame]) -> Optional[List[pd.Series]]: + """ + Converts a list of metadata dataframe objects to a list of metadata objects + Parameters + ---------- + metadata : Optional[pd.DataFrame] + The dataframe to convert to metadata + + Returns + ------- + Optional[List[pd.Series]] + The created metadata object + """ + if metadata is None: + return None + return [idx_series[1] for idx_series in metadata.iterrows()] + + +def filter_kwargs(function: Callable, **kwargs: Any) -> Dict[str, Any]: + """ + Filters arguments provided in the kwargs dictionary to be restricted to the arguments legal for + the called function. + + Parameters + ---------- + function : Callable + function over which we filter kwargs for. + kwargs: + kwargs that we are filtering for the target function + + Returns + ------- + dict + kwargs with the non-legal argument filtered out + """ + sig = inspect.signature(function) + filter_keys = [ + param.name + for param in sig.parameters.values() + if param.kind == param.POSITIONAL_OR_KEYWORD + ] + filtered_dict = { + filter_key: kwargs[filter_key] for filter_key in filter_keys & kwargs.keys() + } + return filtered_dict diff --git a/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py b/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py new file mode 100644 index 0000000000..60f25025ea --- /dev/null +++ b/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py @@ -0,0 +1,99 @@ +# +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +""" +Tests for Optimizers using Metadata. +""" + +from typing import Callable + +import logging +import pytest + +import pandas as pd +import ConfigSpace as CS + +from smac import MultiFidelityFacade as MFFacade +from smac.intensifier.successive_halving import SuccessiveHalving + +from mlos_core.optimizers import ( + OptimizerType, OptimizerFactory, BaseOptimizer) +from mlos_core.tests import SEED + +_LOG = logging.getLogger(__name__) +_LOG.setLevel(logging.DEBUG) + + +def smac_verify_best(metadata: pd.DataFrame) -> bool: + """ + Function to verify if the metadata used by SMAC is in a legal state + + Parameters + ---------- + metadata : pd.DataFrame + metadata returned by SMAC + + Returns + ------- + bool + if the metadata that is returned is valid + """ + max_budget = metadata["budget"].max() + if isinstance(max_budget, float): + return max_budget == 9 + return False + + +@pytest.mark.parametrize(('optimizer_type', 'verify', 'kwargs'), [ + # Enumerate all supported Optimizers + *[(member, verify, {"seed": SEED, "facade": MFFacade, "intensifier": SuccessiveHalving, "min_budget": 1, "max_budget": 9}) + for member, verify in [(OptimizerType.SMAC, smac_verify_best)]], +]) +def test_optimizer_metadata(optimizer_type: OptimizerType, verify: Callable[[pd.DataFrame], bool], kwargs: dict) -> None: + """ + Toy problem to test if metadata is properly being handled for each supporting optimizer + """ + max_iterations = 100 + + def objective(point: pd.DataFrame) -> pd.DataFrame: + # mix of hyperparameters, optimal is to select the highest possible + return pd.DataFrame({"score": point["x"] + point["y"]}) + + input_space = CS.ConfigurationSpace(seed=SEED) + # add a mix of numeric datatypes + input_space.add_hyperparameter(CS.UniformIntegerHyperparameter(name='x', lower=0, upper=5)) + 
input_space.add_hyperparameter(CS.UniformFloatHyperparameter(name='y', lower=0.0, upper=5.0)) + + optimizer: BaseOptimizer = OptimizerFactory.create( + parameter_space=input_space, + optimization_targets=['score'], + optimizer_type=optimizer_type, + optimizer_kwargs=kwargs, + ) + + with pytest.raises(ValueError, match="No observations"): + optimizer.get_best_observations() + + with pytest.raises(ValueError, match="No observations"): + optimizer.get_observations() + + for _ in range(max_iterations): + config, metadata = optimizer.suggest() + assert isinstance(metadata, pd.DataFrame) + + optimizer.register(configs=config, scores=objective(config), metadata=metadata) + + (all_configs, all_scores, all_contexts, all_metadata) = optimizer.get_observations() + assert isinstance(all_configs, pd.DataFrame) + assert isinstance(all_scores, pd.DataFrame) + assert all_contexts is None + assert isinstance(all_metadata, pd.DataFrame) + assert smac_verify_best(all_metadata) + + (best_configs, best_scores, best_contexts, best_metadata) = optimizer.get_best_observations() + assert isinstance(best_configs, pd.DataFrame) + assert isinstance(best_scores, pd.DataFrame) + assert best_contexts is None + assert isinstance(best_metadata, pd.DataFrame) + assert smac_verify_best(best_metadata) diff --git a/mlos_core/mlos_core/tests/optimizers/optimizer_multiobj_test.py b/mlos_core/mlos_core/tests/optimizers/optimizer_multiobj_test.py index 22263b4c1d..06054dd2c2 100644 --- a/mlos_core/mlos_core/tests/optimizers/optimizer_multiobj_test.py +++ b/mlos_core/mlos_core/tests/optimizers/optimizer_multiobj_test.py @@ -102,19 +102,21 @@ def objective(point: pd.DataFrame) -> pd.DataFrame: assert set(observation.columns) == {'main_score', 'other_score'} optimizer.register(configs=suggestion, scores=observation) - (best_config, best_score, best_context) = optimizer.get_best_observations() + (best_config, best_score, best_context, best_metadata) = optimizer.get_best_observations() assert isinstance(best_config, pd.DataFrame) assert isinstance(best_score, pd.DataFrame) assert best_context is None + assert best_metadata is None assert set(best_config.columns) == {'x', 'y'} assert set(best_score.columns) == {'main_score', 'other_score'} assert best_config.shape == (1, 2) assert best_score.shape == (1, 2) - (all_configs, all_scores, all_contexts) = optimizer.get_observations() + (all_configs, all_scores, all_contexts, all_metadata) = optimizer.get_observations() assert isinstance(all_configs, pd.DataFrame) assert isinstance(all_scores, pd.DataFrame) assert all_contexts is None + assert all_metadata is None assert set(all_configs.columns) == {'x', 'y'} assert set(all_scores.columns) == {'main_score', 'other_score'} assert all_configs.shape == (max_iterations, 2) diff --git a/mlos_core/mlos_core/tests/optimizers/optimizer_test.py b/mlos_core/mlos_core/tests/optimizers/optimizer_test.py index 8231e59feb..f40d8ccaeb 100644 --- a/mlos_core/mlos_core/tests/optimizers/optimizer_test.py +++ b/mlos_core/mlos_core/tests/optimizers/optimizer_test.py @@ -106,20 +106,22 @@ def objective(x: pd.Series) -> pd.DataFrame: assert isinstance(observation, pd.DataFrame) optimizer.register(configs=suggestion, scores=observation, metadata=metadata) - (best_config, best_score, best_context) = optimizer.get_best_observations() + (best_config, best_score, best_context, best_metadata) = optimizer.get_best_observations() assert isinstance(best_config, pd.DataFrame) assert isinstance(best_score, pd.DataFrame) assert best_context is None + assert best_metadata is 
None or isinstance(best_metadata, pd.DataFrame) assert set(best_config.columns) == {'x', 'y', 'z'} assert set(best_score.columns) == {'score'} assert best_config.shape == (1, 3) assert best_score.shape == (1, 1) assert best_score.score.iloc[0] < -5 - (all_configs, all_scores, all_contexts) = optimizer.get_observations() + (all_configs, all_scores, all_contexts, all_metadata) = optimizer.get_observations() assert isinstance(all_configs, pd.DataFrame) assert isinstance(all_scores, pd.DataFrame) assert all_contexts is None + assert all_metadata is None or isinstance(all_metadata, pd.DataFrame) assert set(all_configs.columns) == {'x', 'y', 'z'} assert set(all_scores.columns) == {'score'} assert all_configs.shape == (20, 3) @@ -284,26 +286,28 @@ def objective(point: pd.DataFrame) -> pd.DataFrame: best_observation = optimizer.get_best_observations() llamatune_best_observation = llamatune_optimizer.get_best_observations() - for (best_config, best_score, best_context) in (best_observation, llamatune_best_observation): + for (best_config, best_score, best_context, best_metadata) in (best_observation, llamatune_best_observation): assert isinstance(best_config, pd.DataFrame) assert isinstance(best_score, pd.DataFrame) assert best_context is None + assert best_metadata is None or isinstance(best_metadata, pd.DataFrame) assert set(best_config.columns) == {'x', 'y'} assert set(best_score.columns) == {'score'} - (best_config, best_score, _context) = best_observation - (llamatune_best_config, llamatune_best_score, _context) = llamatune_best_observation + (best_config, best_score, _context, _metadata) = best_observation + (llamatune_best_config, llamatune_best_score, _context, _metadata) = llamatune_best_observation # LlamaTune's optimizer score should better (i.e., lower) than plain optimizer's one, or close to that assert best_score.score.iloc[0] > llamatune_best_score.score.iloc[0] or \ best_score.score.iloc[0] + 1e-3 > llamatune_best_score.score.iloc[0] # Retrieve and check all observations - for (all_configs, all_scores, all_contexts) in ( + for (all_configs, all_scores, all_contexts, all_metadata) in ( optimizer.get_observations(), llamatune_optimizer.get_observations()): assert isinstance(all_configs, pd.DataFrame) assert isinstance(all_scores, pd.DataFrame) assert all_contexts is None + assert all_metadata is None or isinstance(all_metadata, pd.DataFrame) assert set(all_configs.columns) == {'x', 'y'} assert set(all_scores.columns) == {'score'} assert len(all_configs) == num_iters @@ -391,12 +395,14 @@ def objective(point: pd.DataFrame) -> pd.DataFrame: assert isinstance(observation, pd.DataFrame) optimizer.register(configs=suggestion, scores=observation, metadata=metadata) - (best_config, best_score, best_context) = optimizer.get_best_observations() + (best_config, best_score, best_context, best_metadata) = optimizer.get_best_observations() assert isinstance(best_config, pd.DataFrame) assert isinstance(best_score, pd.DataFrame) assert best_context is None + assert best_metadata is None or isinstance(best_metadata, pd.DataFrame) - (all_configs, all_scores, all_contexts) = optimizer.get_observations() + (all_configs, all_scores, all_contexts, all_metadata) = optimizer.get_observations() assert isinstance(all_configs, pd.DataFrame) assert isinstance(all_scores, pd.DataFrame) assert all_contexts is None + assert all_metadata is None or isinstance(all_metadata, pd.DataFrame) From bfd2a42a316c3dfe6dc2f15785cb058e97fa0827 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 17:59:39 
-0500 Subject: [PATCH 02/22] Update mlos_core/mlos_core/optimizers/README Co-authored-by: Brian Kroth --- mlos_core/mlos_core/optimizers/README | 1 + 1 file changed, 1 insertion(+) diff --git a/mlos_core/mlos_core/optimizers/README b/mlos_core/mlos_core/optimizers/README index 5d597ba076..5e33bfc731 100644 --- a/mlos_core/mlos_core/optimizers/README +++ b/mlos_core/mlos_core/optimizers/README @@ -11,6 +11,7 @@ The following definitions are useful for understanding the implementation - `configuration`: a vector representation of a configuration of a system to be evaluated. - `score`: the objective(s) associated with a configuration - `metadata`: additional information about the evaluation, such as the runtime budget used during evaluation. + This data is typically specific to the given optimizer backend and may be returned during a `suggest` call and expected to be provided again during the subsequent `register` call. - `context`: additional (static) information about the evaluation used to extend the internal model used for suggesting samples. For instance, a descriptor of the VM size (vCore count and # of GB of RAM), and some descriptor of the workload. The intent being to allow either sharing or indexing of trial info between "similar" experiments in order to help make the optimization process more efficient for new scenarios. From 16208f4689de6798c6a11358dc0ca865fa9ad15c Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 17:59:56 -0500 Subject: [PATCH 03/22] Update mlos_core/mlos_core/optimizers/README Co-authored-by: Brian Kroth --- mlos_core/mlos_core/optimizers/README | 1 + 1 file changed, 1 insertion(+) diff --git a/mlos_core/mlos_core/optimizers/README b/mlos_core/mlos_core/optimizers/README index 5e33bfc731..27e3c5d02a 100644 --- a/mlos_core/mlos_core/optimizers/README +++ b/mlos_core/mlos_core/optimizers/README @@ -16,6 +16,7 @@ The following definitions are useful for understanding the implementation For instance, a descriptor of the VM size (vCore count and # of GB of RAM), and some descriptor of the workload. The intent being to allow either sharing or indexing of trial info between "similar" experiments in order to help make the optimization process more efficient for new scenarios. > Note: This is not yet implemented. + The interface for these classes can be described as follows: - `register`: this is a function that takes a configuration, a score, and, optionally, metadata about the evaluation to update the model for future evaluations. From 938f8f0b6aef98acda84221c1145b24c84a2c900 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 18:00:04 -0500 Subject: [PATCH 04/22] Update mlos_core/mlos_core/optimizers/README Co-authored-by: Brian Kroth --- mlos_core/mlos_core/optimizers/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_core/mlos_core/optimizers/README b/mlos_core/mlos_core/optimizers/README index 27e3c5d02a..c180ad9f94 100644 --- a/mlos_core/mlos_core/optimizers/README +++ b/mlos_core/mlos_core/optimizers/README @@ -19,7 +19,7 @@ The following definitions are useful for understanding the implementation The interface for these classes can be described as follows: -- `register`: this is a function that takes a configuration, a score, and, optionally, metadata about the evaluation to update the model for future evaluations. 
+- `register`: this is a function that takes a `configuration`, `score`, and, optionally, `metadata` and `context` about the evaluation to update the model for future evaluations. - `suggest`: this function returns a new configuration for evaluation. Some optimizers will return additional metadata for evaluation, that should be used during the register phase. From 81d6d56253da4ddfd14086b43f0d257046b2930a Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 18:00:10 -0500 Subject: [PATCH 05/22] Update mlos_core/mlos_core/optimizers/README Co-authored-by: Brian Kroth --- mlos_core/mlos_core/optimizers/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_core/mlos_core/optimizers/README b/mlos_core/mlos_core/optimizers/README index c180ad9f94..43f58a5399 100644 --- a/mlos_core/mlos_core/optimizers/README +++ b/mlos_core/mlos_core/optimizers/README @@ -26,4 +26,4 @@ The interface for these classes can be described as follows: This function can also optionally take context (not yet implemented), and an argument to force the function to return the default configuration. - `register_pending`: registers a configuration and metadata pair as pending to the optimizer. - `get_observations`: returns all observations reported to the optimizer as a triplet of DataFrames (config, score, context, metadata). -- `get_best_observations`: returns the best observation as a triplet of best (config, score, context, metadata) DataFrames. +- `get_best_observations`: returns the best observation as a triplet of best `(config, score, context, metadata)` DataFrames. From bf2f3ccf0028e0155c160dc00dc42078855ad14e Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 18:00:16 -0500 Subject: [PATCH 06/22] Update mlos_core/mlos_core/optimizers/README Co-authored-by: Brian Kroth --- mlos_core/mlos_core/optimizers/README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_core/mlos_core/optimizers/README b/mlos_core/mlos_core/optimizers/README index 43f58a5399..92d6c88ef2 100644 --- a/mlos_core/mlos_core/optimizers/README +++ b/mlos_core/mlos_core/optimizers/README @@ -25,5 +25,5 @@ The interface for these classes can be described as follows: Some optimizers will return additional metadata for evaluation, that should be used during the register phase. This function can also optionally take context (not yet implemented), and an argument to force the function to return the default configuration. - `register_pending`: registers a configuration and metadata pair as pending to the optimizer. -- `get_observations`: returns all observations reported to the optimizer as a triplet of DataFrames (config, score, context, metadata). +- `get_observations`: returns all observations reported to the optimizer as a triplet of DataFrames `(config, score, context, metadata)`. - `get_best_observations`: returns the best observation as a triplet of best `(config, score, context, metadata)` DataFrames. 
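+
+For example, a sketch of consuming the returned DataFrames (`opt` is a placeholder for an optimizer instance that already has registered observations):
+
+```python
+configs, scores, contexts, metadata = opt.get_best_observations(n_max=3)
+```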
From 1686c7c1e68582db6ad9392b9f595c371ba7a17f Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 23:01:53 +0000 Subject: [PATCH 07/22] some comments --- .cspell.json | 2 +- mlos_core/mlos_core/optimizers/README | 4 ++-- .../optimizers/bayesian_optimizers/smac_optimizer.py | 8 ++++---- mlos_core/mlos_core/optimizers/optimizer.py | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.cspell.json b/.cspell.json index 19aa2b07f7..3a2ee1ae7c 100644 --- a/.cspell.json +++ b/.cspell.json @@ -43,7 +43,7 @@ "linalg", "llamatune", "matplotlib", - "metadatas", + "metadatum", "mlos", "mloscore", "mwait", diff --git a/mlos_core/mlos_core/optimizers/README b/mlos_core/mlos_core/optimizers/README index 92d6c88ef2..529f898b94 100644 --- a/mlos_core/mlos_core/optimizers/README +++ b/mlos_core/mlos_core/optimizers/README @@ -10,12 +10,12 @@ The following definitions are useful for understanding the implementation - `configuration`: a vector representation of a configuration of a system to be evaluated. - `score`: the objective(s) associated with a configuration -- `metadata`: additional information about the evaluation, such as the runtime budget used during evaluation. - This data is typically specific to the given optimizer backend and may be returned during a `suggest` call and expected to be provided again during the subsequent `register` call. - `context`: additional (static) information about the evaluation used to extend the internal model used for suggesting samples. For instance, a descriptor of the VM size (vCore count and # of GB of RAM), and some descriptor of the workload. The intent being to allow either sharing or indexing of trial info between "similar" experiments in order to help make the optimization process more efficient for new scenarios. > Note: This is not yet implemented. +- `metadata`: additional information about the evaluation, such as the runtime budget used during evaluation. + This data is typically specific to the given optimizer backend and may be returned during a `suggest` call and expected to be provided again during the subsequent `register` call. 
The interface for these classes can be described as follows: diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py index d7886ffcf6..7b53a72891 100644 --- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py +++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py @@ -314,13 +314,13 @@ def _register(self, *, configs: pd.DataFrame, with self.lock: # Register each trial (one-by-one) - metadatas: Union[List[pd.Series], List[None]] = to_metadata(metadata) or [ + metadatum: Union[List[pd.Series], List[None]] = to_metadata(metadata) or [ None for _ in scores # type: ignore[misc] ] for config, score, ctx in zip( self._to_configspace_configs(configs=configs), scores.values.tolist(), - metadatas, + metadatum, ): value: TrialValue = TrialValue( cost=score, time=0.0, status=StatusType.SUCCESS @@ -493,12 +493,12 @@ def get_best_observations(self, *, n_max: int = 1 if metadata is not None and metadata["budget"].max() == max_budget ] - configs, scores, contexts, metadatas = self._get_observations(observations) + configs, scores, contexts, metadatum = self._get_observations(observations) idx = scores.nsmallest(n_max, columns=self._optimization_targets, keep="first").index return (configs.loc[idx], scores.loc[idx], None if contexts is None else contexts.loc[idx], - None if metadatas is None else metadatas.loc[idx]) + None if metadatum is None else metadatum.loc[idx]) def _to_configspace_configs(self, *, configs: pd.DataFrame) -> List[ConfigSpace.Configuration]: """Convert a dataframe of configs to a list of ConfigSpace configs. diff --git a/mlos_core/mlos_core/optimizers/optimizer.py b/mlos_core/mlos_core/optimizers/optimizer.py index 670216b364..807d5d84e0 100644 --- a/mlos_core/mlos_core/optimizers/optimizer.py +++ b/mlos_core/mlos_core/optimizers/optimizer.py @@ -220,9 +220,9 @@ def _get_observations(self, observations: scores = pd.concat([score for _, score, _, _ in observations]).reset_index(drop=True) contexts = pd.concat([pd.DataFrame() if context is None else context for _, _, context, _ in observations]).reset_index(drop=True) - metadatas = pd.concat([pd.DataFrame() if metadata is None else metadata + metadatum = pd.concat([pd.DataFrame() if metadata is None else metadata for _, _, _, metadata in observations]).reset_index(drop=True) - return (configs, scores, contexts if len(contexts.columns) > 0 else None, metadatas if len(metadatas.columns) > 0 else None) + return (configs, scores, contexts if len(contexts.columns) > 0 else None, metadatum if len(metadatum.columns) > 0 else None) def get_observations(self) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]: """ @@ -254,11 +254,11 @@ def get_best_observations(self, *, n_max: int = 1) -> Tuple[pd.DataFrame, pd.Dat """ if len(self._observations) == 0: raise ValueError("No observations registered yet.") - (configs, scores, contexts, metadatas) = self.get_observations() + (configs, scores, contexts, metadatum) = self.get_observations() idx = scores.nsmallest(n_max, columns=self._optimization_targets, keep="first").index return (configs.loc[idx], scores.loc[idx], None if contexts is None else contexts.loc[idx], - None if metadatas is None else metadatas.loc[idx]) + None if metadatum is None else metadatum.loc[idx]) def cleanup(self) -> None: """ From d2636137df368921b335ffecb34ff64d9228b9e3 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 23:03:48 +0000 
Subject: [PATCH 08/22] more comments for README --- mlos_core/mlos_core/optimizers/README | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mlos_core/mlos_core/optimizers/README b/mlos_core/mlos_core/optimizers/README index 529f898b94..46c90a525f 100644 --- a/mlos_core/mlos_core/optimizers/README +++ b/mlos_core/mlos_core/optimizers/README @@ -17,13 +17,13 @@ The following definitions are useful for understanding the implementation - `metadata`: additional information about the evaluation, such as the runtime budget used during evaluation. This data is typically specific to the given optimizer backend and may be returned during a `suggest` call and expected to be provided again during the subsequent `register` call. -The interface for these classes can be described as follows: +- `register`: this is a function that takes a `configuration`, `score`, and, optionally, `metadata` and `context` about the evaluation to update the model for future evaluations. - `register`: this is a function that takes a `configuration`, `score`, and, optionally, `metadata` and `context` about the evaluation to update the model for future evaluations. - `suggest`: this function returns a new configuration for evaluation. Some optimizers will return additional metadata for evaluation, that should be used during the register phase. - This function can also optionally take context (not yet implemented), and an argument to force the function to return the default configuration. -- `register_pending`: registers a configuration and metadata pair as pending to the optimizer. +- `get_observations`: returns all observations reported to the optimizer as a triplet of DataFrames `(config, score, context, metadata)`. +- `get_best_observations`: returns the best observation as a triplet of best `(config, score, context, metadata)` DataFrames. - `get_observations`: returns all observations reported to the optimizer as a triplet of DataFrames `(config, score, context, metadata)`. - `get_best_observations`: returns the best observation as a triplet of best `(config, score, context, metadata)` DataFrames. From 6766b8d572e4b75837c15960f3da74aa946d0aa4 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 18:04:38 -0500 Subject: [PATCH 09/22] Update mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py Co-authored-by: Brian Kroth --- .../mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py index 7b53a72891..e29bda74e9 100644 --- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py +++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py @@ -105,7 +105,7 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments Defaults to `0.1`. Setting this to a higher value favors exploration over exploitation. facade: AbstractFacade - sets the facade to use for SMAC + Sets the facade to use for SMAC. intensifier: Optional[Type[AbstractIntensifier]] Sets the intensifier type to use in the optimizer. 
If not set, the From bae17632ff5953532379be9ad9471a76c8994780 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 18:05:48 -0500 Subject: [PATCH 10/22] Update mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py Co-authored-by: Brian Kroth --- .../mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py index e29bda74e9..c0989a97b9 100644 --- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py +++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py @@ -213,7 +213,7 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments assert isinstance(max_ratio, float) and 0.0 <= max_ratio <= 1.0 initial_design_args['max_ratio'] = max_ratio - # build the initial design for SMAC. + # Build the initial design for SMAC. # (currently defaults SOBOL instead of LatinHypercube due to better uniformity # for initial sampling which results in lower overall samples required) initial_design = initial_design_class(**initial_design_args) # type: ignore[arg-type] From 53af62b30b30ca0df9dabdeb07a0fd064c940932 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 23:06:03 +0000 Subject: [PATCH 11/22] mergE --- .../mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py index e29bda74e9..37e01dada8 100644 --- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py +++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py @@ -105,7 +105,7 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments Defaults to `0.1`. Setting this to a higher value favors exploration over exploitation. facade: AbstractFacade - Sets the facade to use for SMAC. + Sets the facade to use for SMAC. More information about the facade can be found here: https://automl.github.io/SMAC3/main/api/smac.facade.html intensifier: Optional[Type[AbstractIntensifier]] Sets the intensifier type to use in the optimizer. If not set, the From dcff9cc96b0e41b620f62c7905a277a158ddf448 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 18:07:29 -0500 Subject: [PATCH 12/22] Update mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py Co-authored-by: Brian Kroth --- .../mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py index e5773989b6..ef1fa928fc 100644 --- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py +++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py @@ -214,7 +214,7 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments initial_design_args['max_ratio'] = max_ratio # Build the initial design for SMAC. 
-        # (currently defaults SOBOL instead of LatinHypercube due to better uniformity
+        # (currently defaults to SOBOL instead of LatinHypercube due to better uniformity
         # for initial sampling which results in lower overall samples required)
         initial_design = initial_design_class(**initial_design_args)  # type: ignore[arg-type]
 
From 50ef16caefc6773d7b3492f0961a7462790c069f Mon Sep 17 00:00:00 2001
From: Johannes Freischuetz
Date: Mon, 8 Jul 2024 18:07:36 -0500
Subject: [PATCH 13/22] Update
 mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py

Co-authored-by: Brian Kroth
---
 .../mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py
index ef1fa928fc..3e353fe86e 100644
--- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py
+++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py
@@ -451,7 +451,8 @@ def cleanup(self) -> None:
             self._temp_output_directory = None
 
     def get_observations_full(self) -> pd.DataFrame:
-        """Returns the observations as a dataframe with additional info.
+        """
+        Returns the observations as a dataframe with additional info.
 
         Returns
         -------
From c32bd679cf25ccb7308114e6c730b181f2793452 Mon Sep 17 00:00:00 2001
From: Johannes Freischuetz
Date: Mon, 8 Jul 2024 18:07:44 -0500
Subject: [PATCH 14/22] Update mlos_core/mlos_core/optimizers/utils.py

Co-authored-by: Brian Kroth
---
 mlos_core/mlos_core/optimizers/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlos_core/mlos_core/optimizers/utils.py b/mlos_core/mlos_core/optimizers/utils.py
index 1e59c6ab0d..edcc65598d 100644
--- a/mlos_core/mlos_core/optimizers/utils.py
+++ b/mlos_core/mlos_core/optimizers/utils.py
@@ -21,7 +21,7 @@ def to_metadata(metadata: Optional[pd.DataFrame]) -> Optional[List[pd.Series]]:
     Returns
     -------
     Optional[List[pd.Series]]
-        The created metadata object
+        The list of metadata series objects
     """
     if metadata is None:
         return None
From 2b15694edbc8644f895ec4f02c5f21e72d17fb2a Mon Sep 17 00:00:00 2001
From: Johannes Freischuetz
Date: Mon, 8 Jul 2024 18:07:51 -0500
Subject: [PATCH 15/22] Update
 mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py

Co-authored-by: Brian Kroth
---
 mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py b/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py
index 60f25025ea..6c21c879a3 100644
--- a/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py
+++ b/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py
@@ -96,4 +96,4 @@ def objective(point: pd.DataFrame) -> pd.DataFrame:
     assert isinstance(best_scores, pd.DataFrame)
     assert best_contexts is None
     assert isinstance(best_metadata, pd.DataFrame)
-    assert smac_verify_best(best_metadata)
+    assert verify(best_metadata)
From 574b8cc0d3d1faf3d9be5a6b4a675a57d0186f0f Mon Sep 17 00:00:00 2001
From: Johannes Freischuetz
Date: Mon, 8 Jul 2024 18:08:01 -0500
Subject: [PATCH 16/22] Update mlos_core/mlos_core/optimizers/utils.py

Co-authored-by: Brian Kroth
---
 mlos_core/mlos_core/optimizers/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlos_core/mlos_core/optimizers/utils.py b/mlos_core/mlos_core/optimizers/utils.py
index edcc65598d..c971c6a81c 100644
---
a/mlos_core/mlos_core/optimizers/utils.py
+++ b/mlos_core/mlos_core/optimizers/utils.py
@@ -12,7 +12,7 @@

 def to_metadata(metadata: Optional[pd.DataFrame]) -> Optional[List[pd.Series]]:
     """
-    Converts a list of metadata dataframe objects to a list of metadata objects
+    Converts a metadata DataFrame to a list of metadata Series objects.
     Parameters
     ----------
     metadata : Optional[pd.DataFrame]

From 3d4c055b5ff8c529266f41c2b84a3160baf18e4b Mon Sep 17 00:00:00 2001
From: Johannes Freischuetz
Date: Mon, 8 Jul 2024 18:08:08 -0500
Subject: [PATCH 17/22] Update mlos_core/mlos_core/optimizers/utils.py

Co-authored-by: Brian Kroth
---
 mlos_core/mlos_core/optimizers/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlos_core/mlos_core/optimizers/utils.py b/mlos_core/mlos_core/optimizers/utils.py
index c971c6a81c..67c047cfc3 100644
--- a/mlos_core/mlos_core/optimizers/utils.py
+++ b/mlos_core/mlos_core/optimizers/utils.py
@@ -13,6 +13,7 @@ def to_metadata(metadata: Optional[pd.DataFrame]) -> Optional[List[pd.Series]]:
     """
     Converts a metadata DataFrame to a list of metadata Series objects.
+
     Parameters
     ----------
     metadata : Optional[pd.DataFrame]

From 41ee53398f6461fe4a2b43edab446a3d6777401c Mon Sep 17 00:00:00 2001
From: Johannes Freischuetz
Date: Mon, 8 Jul 2024 18:08:16 -0500
Subject: [PATCH 18/22] Update
 mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py

Co-authored-by: Brian Kroth
---
 .../mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py
index 3e353fe86e..5f660f976f 100644
--- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py
+++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py
@@ -466,7 +466,8 @@ def get_observations_full(self) -> pd.DataFrame:

     def get_best_observations(self, *, n_max: int = 1
                               ) -> Tuple[pd.DataFrame, pd.DataFrame, Optional[pd.DataFrame], Optional[pd.DataFrame]]:
-        """Returns the best observation so far as a dataframe.
+        """
+        Returns the best observation so far as a dataframe.

         Returns
         -------

From cfa936ac7ec1908505a576e55a8dc77168e9b24a Mon Sep 17 00:00:00 2001
From: Johannes Freischuetz
Date: Mon, 8 Jul 2024 18:08:23 -0500
Subject: [PATCH 19/22] Update
 mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py

Co-authored-by: Brian Kroth
---
 .../mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py
index 5f660f976f..e630a2d42f 100644
--- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py
+++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py
@@ -521,7 +521,8 @@ def _to_configspace_configs(self, *, configs: pd.DataFrame) -> List[ConfigSpace.
     ]

     def _extract_config(self, trial: TrialInfo) -> pd.DataFrame:
-        """Convert TrialInfo to a config DataFrame.
+        """
+        Convert TrialInfo to a config DataFrame.
Parameters ---------- From abd3eb630141eab79b892024767e536ccb8f6c63 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 18:08:32 -0500 Subject: [PATCH 20/22] Update mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py Co-authored-by: Brian Kroth --- .../mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py index e630a2d42f..9dc2bbaa6b 100644 --- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py +++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py @@ -541,7 +541,8 @@ def _extract_config(self, trial: TrialInfo) -> pd.DataFrame: def _extract_metadata(trial: TrialInfo) -> pd.DataFrame: - """Convert TrialInfo to a metadata DataFrame. + """ + Convert TrialInfo to a metadata DataFrame. Parameters ---------- From e0ac571ed9893fe0b27309dcdc9fd7a907b651c5 Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 23:09:19 +0000 Subject: [PATCH 21/22] comment --- .../mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py index e5773989b6..cea26b7438 100644 --- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py +++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py @@ -109,8 +109,7 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments intensifier: Optional[Type[AbstractIntensifier]] Sets the intensifier type to use in the optimizer. If not set, the - default intensifier - from the facade will be used + default intensifier from the facade will be used initial_design_class: AbstractInitialDesign Sets the initial design class to be used in the optimizer. From 92345997ab4a74d455a6db3f3109fc542e24bb4c Mon Sep 17 00:00:00 2001 From: Johannes Freischuetz Date: Mon, 8 Jul 2024 23:50:41 +0000 Subject: [PATCH 22/22] comments --- .../optimizers/{README => README.md} | 0 .../bayesian_optimizers/smac_optimizer.py | 16 ++++++-- .../optimizers/optimizer_metadata_test.py | 38 ++++++++++++++----- 3 files changed, 41 insertions(+), 13 deletions(-) rename mlos_core/mlos_core/optimizers/{README => README.md} (100%) diff --git a/mlos_core/mlos_core/optimizers/README b/mlos_core/mlos_core/optimizers/README.md similarity index 100% rename from mlos_core/mlos_core/optimizers/README rename to mlos_core/mlos_core/optimizers/README.md diff --git a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py index b2127f78b1..5c2005b524 100644 --- a/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py +++ b/mlos_core/mlos_core/optimizers/bayesian_optimizers/smac_optimizer.py @@ -105,7 +105,8 @@ def __init__(self, *, # pylint: disable=too-many-locals,too-many-arguments Defaults to `0.1`. Setting this to a higher value favors exploration over exploitation. facade: AbstractFacade - Sets the facade to use for SMAC. More information about the facade can be found here: https://automl.github.io/SMAC3/main/api/smac.facade.html + Sets the facade to use for SMAC. 
More information about the facade
+            can be found here: https://automl.github.io/SMAC3/main/api/smac.facade.html

         intensifier: Optional[Type[AbstractIntensifier]]
             Sets the intensifier type to use in the optimizer. If not set, the
@@ -237,6 +238,9 @@ def __init__(self, *,  # pylint: disable=too-many-locals,too-many-arguments
             **filter_kwargs(facade, **kwargs),
         )

+        # This lock is required because neither our pandas data structures nor
+        # SMAC3's ask-and-tell interface are thread safe, so we lock around the
+        # few critical sections that interact with SMAC3 or update those structures.
         self.lock = threading.Lock()

     def __del__(self) -> None:
@@ -264,7 +268,7 @@
     def _dummy_target_func(
         config: ConfigSpace.Configuration,
         seed: int = 0,
         budget: float = 1,
-        instance: object = None,
+        instance: Optional[str] = None,
     ) -> None:
         """Dummy target function for SMAC optimizer.

@@ -281,7 +285,7 @@
         budget : int
             The budget that was used for evaluating the configuration.

-        instance : object
+        instance : Optional[str]
             The instance that the configuration was evaluated on.
         """
         # NOTE: Providing a target function when using the ask-and-tell interface is an imperfection of the API
@@ -325,6 +329,7 @@ def _register(self, *, configs: pd.DataFrame,
             cost=score, time=0.0, status=StatusType.SUCCESS
         )

+        # Find existing entries in our run history that match the config + metadata
         matching: pd.Series[bool]
         if ctx is None:
             matching = self.trial_info_df["Configuration"] == config
@@ -335,12 +340,13 @@
                 [df_ctx.equals(ctx) for df_ctx in self.trial_info_df["Metadata"]]
             )

-        # make a new entry
+        # If a matching entry already exists, update it with the new trial value
         if sum(matching) > 0:
             info = self.trial_info_df[matching]["TrialInfo"].iloc[-1]
             self.trial_info_df.at[list(matching).index(True), "TrialValue"] = (
                 value
             )
+        # Otherwise make a new entry
         else:
             if ctx is None or "budget" not in ctx or "instance" not in ctx:
                 info = TrialInfo(
@@ -352,6 +358,7 @@
                     info,
                     value,
                 ]
+            # If budget and instance metadata exist, rebuild the SMAC-specific TrialInfo with them
             else:
                 info = TrialInfo(
                     config=config,
@@ -554,6 +561,7 @@ def _extract_metadata(trial: TrialInfo) -> pd.DataFrame:
             Pandas dataframe with a single row containing the metadata.
             Column names are the instance, seed, and budget of the evaluation, if valid.
""" + # Note that the columns extracted are the only columns that exist currently in SMAC return pd.DataFrame( [[trial.instance, trial.seed, trial.budget]], columns=["instance", "seed", "budget"], diff --git a/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py b/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py index 6c21c879a3..f952446b40 100644 --- a/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py +++ b/mlos_core/mlos_core/tests/optimizers/optimizer_metadata_test.py @@ -25,7 +25,7 @@ _LOG.setLevel(logging.DEBUG) -def smac_verify_best(metadata: pd.DataFrame) -> bool: +def smac_verify_best(metadata: pd.DataFrame, best: bool = False) -> bool: """ Function to verify if the metadata used by SMAC is in a legal state @@ -34,23 +34,43 @@ def smac_verify_best(metadata: pd.DataFrame) -> bool: metadata : pd.DataFrame metadata returned by SMAC + best: bool + If we are testing just the best contexts or not + Returns ------- bool if the metadata that is returned is valid """ + max_budget = metadata["budget"].max() - if isinstance(max_budget, float): - return max_budget == 9 - return False + assert isinstance(max_budget, float) + assert max_budget == 9 + + if not best: + min_budget = metadata["budget"].min() + assert isinstance(min_budget, float) + assert min_budget == 1 + + return True @pytest.mark.parametrize(('optimizer_type', 'verify', 'kwargs'), [ # Enumerate all supported Optimizers - *[(member, verify, {"seed": SEED, "facade": MFFacade, "intensifier": SuccessiveHalving, "min_budget": 1, "max_budget": 9}) - for member, verify in [(OptimizerType.SMAC, smac_verify_best)]], + *[(member, verify, kwargs) + for member, verify, kwargs in [( + OptimizerType.SMAC, + smac_verify_best, + { + "seed": SEED, + "facade": MFFacade, + "intensifier": SuccessiveHalving, + "min_budget": 1, + "max_budget": 9 + } + )]], ]) -def test_optimizer_metadata(optimizer_type: OptimizerType, verify: Callable[[pd.DataFrame], bool], kwargs: dict) -> None: +def test_optimizer_metadata(optimizer_type: OptimizerType, verify: Callable[[pd.DataFrame, bool], bool], kwargs: dict) -> None: """ Toy problem to test if metadata is properly being handled for each supporting optimizer """ @@ -89,11 +109,11 @@ def objective(point: pd.DataFrame) -> pd.DataFrame: assert isinstance(all_scores, pd.DataFrame) assert all_contexts is None assert isinstance(all_metadata, pd.DataFrame) - assert smac_verify_best(all_metadata) + assert verify(all_metadata, False) (best_configs, best_scores, best_contexts, best_metadata) = optimizer.get_best_observations() assert isinstance(best_configs, pd.DataFrame) assert isinstance(best_scores, pd.DataFrame) assert best_contexts is None assert isinstance(best_metadata, pd.DataFrame) - assert verifier(best_metadata) + assert verify(best_metadata, True)