From 5e5d0c77d6756301063b8c74d08fdda964e0a8a0 Mon Sep 17 00:00:00 2001
From: wildromi
Date: Wed, 7 Aug 2024 18:08:42 +0200
Subject: [PATCH 1/3] added cosine decaying learning rate option

---
 dadapy/_utils/differentiable_imbalance.py | 28 ++++++++++-------
 dadapy/feature_weighting.py               | 38 ++++++++++++++---------
 2 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/dadapy/_utils/differentiable_imbalance.py b/dadapy/_utils/differentiable_imbalance.py
index 2ab2ec57..cbd2bd1e 100644
--- a/dadapy/_utils/differentiable_imbalance.py
+++ b/dadapy/_utils/differentiable_imbalance.py
@@ -391,7 +391,7 @@ def _optimize_dii(
     l_rate: float = None,
     constrain: bool = False,
     l1_penalty: float = 0.0,
-    decaying_lr: bool = True,
+    decaying_lr: str = "exp",
     period: np.ndarray = None,
     groundtruthperiod: np.ndarray = None,
     cythond: bool = True,
@@ -416,8 +416,8 @@
         Constrain the sum of the weights to sum up to the number of weights. Default: False
     l1_penalty: float
         The l1-regularization strength, if sparsity is needed. Default: 0 (l1-regularization turned off).
-    decaying_lr: bool
-        Use exponentially decaying learning rate in gradient descent or not. Default: True.
+    decaying_lr: string
+        "exp" for an exponentially decaying learning rate (halved every 10 epochs), "cos" for a cosine decaying learning rate, or "static" for a constant learning rate. Default: "exp".
     period : float or numpy.ndarray/list, optional
         D(input data) periods (input should be formatted to be periodic starting at 0). If not a list, the same period is assumed for all D features
         Default is None, which means no periodic boundary conditions are applied. If some of the input features do not have a period, set those to 0.
@@ -483,7 +483,7 @@
     diis[0] = _return_dii(dists_rescaled_A, rank_matrix_B, lambd)
     l1_penalties[0] = l1_penalty * np.sum(np.abs(weights))

-    lrate = l_rate  # for not expon. decaying learning rates
+    lrate = 1 * l_rate  # for non-decaying learning rates; "1 *" copies the value so l_rate itself stays unchanged
     for i_epoch in range(n_epochs):
         # compute gradient * SCALING!!!! to be scale invariant in case of no adaptive lambda

@@ -509,8 +509,10 @@
                 )
                 break
         else:
-            # exponentially decaying lr
-            if decaying_lr == True:
+            # exponentially or cosine decaying lr
+            if decaying_lr == "cos":
+                lrate = l_rate * 0.5 * (1 + np.cos((np.pi * i_epoch) / n_epochs))
+            elif decaying_lr == "exp":
                 lrate = l_rate * 2 ** (
                     -i_epoch / 10
                 )  # every 10 epochs the learning rate will be halved
@@ -576,7 +578,7 @@ def _optimize_dii_static_zeros(
     n_epochs: int = 100,
     l_rate: float = 0.1,
     constrain: bool = False,
-    decaying_lr: bool = True,
+    decaying_lr: str = "exp",
     period: np.ndarray = None,
     groundtruthperiod: np.ndarray = None,
     cythond: bool = True,
@@ -591,7 +593,7 @@
         l_rate (float): learning rate. Has to be tuned, especially if constrain=True (otherwise optimization could fail)
         constrain (bool): if True, rescale the weights so the biggest weight = 1
         l1_penalty (float): l1 regularization strength
-        decaying_lr (bool): default: True. Apply decaying learning rate = l_rate * 2**(-i_epoch/10) - every 10 epochs the learning rate will be halfed
+        decaying_lr (string): "exp" for an exponentially decaying learning rate (halved every 10 epochs), "cos" for a cosine decaying learning rate, or "static" for a constant learning rate. Default: "exp".
         period (float or np.ndarray/list): D(input) periods (input formatted to be 0-period).
             If not a list, the same period is assumed for all D features
         groundtruthperiod (float or np.ndarray/list): D(groundtruth) periods (groundtruth formatted to be 0-period). If not a list, the same period is assumed for all D(groundtruth) features
@@ -672,7 +674,9 @@ def _optimize_dii_static_zeros(
             gradient[weights == 0] = 0

         # exponentially decaying lr
-        if decaying_lr == True:
+        if decaying_lr == "cos":
+            lrate = l_rate * 0.5 * (1 + np.cos((np.pi * i_epoch) / n_epochs))
+        elif decaying_lr == "exp":
             lrate = l_rate * 2 ** (
                 -i_epoch / 10
             )  # every 10 epochs the learning rate will be halved
@@ -732,7 +736,7 @@ def _refine_lasso_optimization(
     n_epochs=50,
     l_rate=None,
     constrain=False,
-    decaying_lr=True,
+    decaying_lr="exp",
     period=None,
     groundtruthperiod=None,
     cythond=True,
@@ -752,8 +756,8 @@
         l_rate (float or None): if None, the learning rate is determined automatically with optimize_learning_rate
         n_epochs (int): number of epochs in each optimization cycle
         constrain (bool): if True, rescale the weights so the biggest weight = 1
-        decaying_lr (bool): default: True. Apply decaying learning rate = l_rate * 2**(-i_epoch/10) - every 10 epochs the learning rate will be halfed
-        period (float or np.ndarray/list): D(input) periods (input formatted to be 0-period). If not a list, the same period is assumed for all D features
+        decaying_lr (string): "exp" for an exponentially decaying learning rate (halved every 10 epochs), "cos" for a cosine decaying learning rate, or "static" for a constant learning rate. Default: "exp".
+        period (float or np.ndarray/list): D(input) periods (input formatted to be 0-period). If not a list, the same period is assumed for all D features
         groundtruthperiod (float or np.ndarray/list): D(groundtruth) periods (groundtruth formatted to be 0-period). If not a list, the same period is assumed for all D(groundtruth) features
         cythond (bool): Flag indicating whether to use Cython-based distance computation methods.
             Should be True (default) unless you want to test the Python-based methods.
diff --git a/dadapy/feature_weighting.py b/dadapy/feature_weighting.py
index 78f2002c..502c5e7d 100644
--- a/dadapy/feature_weighting.py
+++ b/dadapy/feature_weighting.py
@@ -185,7 +185,7 @@ def return_optimal_learning_rate(
         n_samples: int = 200,
         initial_weights: Union[np.ndarray, int, float] = None,
         lambd: float = None,
-        decaying_lr: bool = True,
+        decaying_lr: str = "exp",
         trial_learning_rates: np.ndarray = None,
     ):
         """Find the optimal learning rate for the optimization of the DII by testing several on a reduced set
@@ -197,9 +197,11 @@
             initial_weights (np.ndarray or list): D(input) initial weights for the input features.
                 No zeros allowed here
             lambd (float): softmax scaling. If None (preferred), this is chosen automatically with compute_optimial_lambda
-            decaying_lr (bool): default: True.
-                Apply decaying learning rate = l_rate * 2**(-i_epoch/10)
-                - every 10 epochs the learning rate will be halfed
+            decaying_lr (string): Default: "exp".
+                "exp" for an exponentially decaying learning rate (halved every 10 epochs):
+                lrate = l_rate_initial * 2**(-i_epoch/10);
+                "cos" for a cosine decaying learning rate: lrate = l_rate_initial * 0.5 * (1 + cos((pi * i_epoch)/n_epochs));
+                "static" for no decay of the learning rate.
             trial_learning_rates (np.ndarray or list or None): learning rates to try. If None are given, a sensible set of learning rates is tested.

         Returns:
@@ -366,7 +368,7 @@ def return_weights_optimize_dii(
         lambd: float = None,
         learning_rate: float = None,
         l1_penalty: float = 0.0,
-        decaying_lr: bool = True,
+        decaying_lr: str = "exp",
     ):
         """Optimize the differentiable information imbalance using gradient descent
         of the DII between input data object A and groundtruth data object B.
@@ -389,9 +391,11 @@
            The learning rate of the gradient descent. If None, automatically estimated to be fast.
        l1_penalty: float, optional
            The l1-regularization strength, if sparsity is needed. Default: 0 (l1-regularization turned off).
-       decaying_lr: bool
-           Use exponentially decaying learning rate in gradient descent or not. Default: True.
-
+       decaying_lr: string, optional
+           "exp" for an exponentially decaying learning rate (halved every 10 epochs):
+           lrate = l_rate_initial * 2**(-i_epoch/10);
+           "cos" for a cosine decaying learning rate: lrate = l_rate_initial * 0.5 * (1 + cos((pi * i_epoch)/n_epochs));
+           "static" for no decay of the learning rate. Default: "exp".
        Returns:
            final_weights: np.ndarray, shape (D). Array of the optimized weights.

@@ -454,7 +458,7 @@ def return_backward_greedy_dii_elimination(
         n_epochs: int = 100,
         learning_rate: float = None,
         constrain: bool = False,
-        decaying_lr: bool = True,
+        decaying_lr: str = "exp",
     ):
         """Do a stepwise backward elimination of feature weights, always eliminating the lowest weight;
         after each elimination the DII is optimized by gradient descent using the remaining features
@@ -469,8 +473,11 @@
                 Has to be tuned, especially if constrain=True (otherwise optimization could fail)
             constrain (bool): if True, rescale the weights so the biggest weight = 1
             l1_penalty (float): l1 regularization strength
-            decaying_lr (bool): default: True. Apply decaying learning rate = l_rate * 2**(-i_epoch/10)
-                - every 10 epochs the learning rate will be halfed
+            decaying_lr (string): Default: "exp".
+                "exp" for an exponentially decaying learning rate (halved every 10 epochs):
+                lrate = l_rate_initial * 2**(-i_epoch/10);
+                "cos" for a cosine decaying learning rate: lrate = l_rate_initial * 0.5 * (1 + cos((pi * i_epoch)/n_epochs));
+                "static" for no decay of the learning rate.

         Returns:
             final_diis: np.ndarray, shape (D). Array of the optimized DII for each of the corresponding weights.
@@ -571,7 +578,7 @@ def return_lasso_optimization_dii_search(
         learning_rate: float = None,
         l1_penalties: Union[list, float] = None,
         constrain: bool = False,
-        decaying_lr: bool = True,
+        decaying_lr: str = "exp",
         refine: bool = False,
         plotlasso: bool = True,
     ):
@@ -591,8 +598,11 @@
             l1_penalties (list or None): l1 regularization strengths to be tested.
                 If None (default), a list of 10 sensible l1-penalties is tested,
                 which are chosen depending on the learning rate.
-            decaying_lr (bool): default: True. Apply decaying learning rate = l_rate * 2**(-i_epoch/10)
-                - every 10 epochs the learning rate will be halfed.
+            decaying_lr (string): Default: "exp".
+                "exp" for an exponentially decaying learning rate (halved every 10 epochs):
+                lrate = l_rate_initial * 2**(-i_epoch/10);
+                "cos" for a cosine decaying learning rate: lrate = l_rate_initial * 0.5 * (1 + cos((pi * i_epoch)/n_epochs));
+                "static" for no decay of the learning rate.
             refine (bool): default: False. If True, the l1-penalties are added in between penalties where the number of non-zero weights changes by more than one.
                 This is done to find the optimal l1-penalty for each number of non-zero weights.
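
The two new schedules differ mainly in how quickly they anneal. The following sketch is not part of the patch; it restates the decay formulas that patch 1 wires into _optimize_dii and _optimize_dii_static_zeros as a standalone helper (the name scheduled_lrate is illustrative):

    import numpy as np

    def scheduled_lrate(l_rate, i_epoch, n_epochs, decaying_lr="exp"):
        """Learning rate at epoch i_epoch under the given schedule."""
        if decaying_lr == "exp":
            # halved every 10 epochs
            return l_rate * 2 ** (-i_epoch / 10)
        if decaying_lr == "cos":
            # decays smoothly from l_rate at epoch 0 towards 0 at the final epoch
            return l_rate * 0.5 * (1 + np.cos((np.pi * i_epoch) / n_epochs))
        if decaying_lr == "static":
            return l_rate
        raise ValueError("Invalid value for decaying_lr. Must be 'exp', 'cos', or 'static'.")

    # With l_rate = 0.1 and n_epochs = 100:
    #   i_epoch = 0:  "exp" -> 0.1,      "cos" -> 0.1
    #   i_epoch = 50: "exp" -> 0.003125, "cos" -> 0.05

For example, "exp" falls below 1% of the initial rate after roughly 66 epochs, while "cos" stays above half the initial rate for the entire first half of the run.
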
From 36b57cb7fb3ffe1c0e4ba8bb9682d3f0cea00e18 Mon Sep 17 00:00:00 2001
From: wildromi
Date: Mon, 19 Aug 2024 18:08:59 +0200
Subject: [PATCH 2/3] added ValueError check for decaying_lr

---
 dadapy/_utils/differentiable_imbalance.py | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/dadapy/_utils/differentiable_imbalance.py b/dadapy/_utils/differentiable_imbalance.py
index cbd2bd1e..ffdaad50 100644
--- a/dadapy/_utils/differentiable_imbalance.py
+++ b/dadapy/_utils/differentiable_imbalance.py
@@ -436,7 +436,13 @@ def _optimize_dii(
         l1_penalties: np.ndarray, shape (n_epochs, ). List of the l1_penalty terms that were added to the imbalances in the loss function
     """

-    # weightcheck = 0
+
+    # Validate decaying_lr
+    if decaying_lr not in ["exp", "cos", "static"]:
+        raise ValueError(
+            "Invalid value for decaying_lr. Must be 'exp', 'cos', or 'static'."
+        )
+
     N = data.shape[0]
     D = data.shape[1]
@@ -603,6 +609,12 @@ def _optimize_dii_static_zeros(
     """
     # batch GD optimization with zeroes staying zeros - needed for return_backward_greedy_dii_elimination

+    # Validate decaying_lr
+    if decaying_lr not in ["exp", "cos", "static"]:
+        raise ValueError(
+            "Invalid value for decaying_lr. Must be 'exp', 'cos', or 'static'."
+        )
+
     N = data.shape[0]
     D = data.shape[1]
@@ -767,7 +779,12 @@ def _refine_lasso_optimization(
         opt_l_rate (float): Learning rate, which leads to the optimal unregularized (no l1-penalty) result in the specified number of epochs
         diis_list: values of the DII during optimization in n_epochs using the l_rate. Plot to ensure the optimization went well
     """
-    # TODO: @wildromi typehints
+    # Validate decaying_lr
+    if decaying_lr not in ["exp", "cos", "static"]:
+        raise ValueError(
+            "Invalid value for decaying_lr. Must be 'exp', 'cos', or 'static'."
+        )
+
     # Find where to refine the lasso and decide on new l1 penalties
     gs[np.isnan(gs)] = 0
     l0gs = np.linalg.norm(gs[:, -1, :], 0, axis=1)
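
With the validation in place, an unsupported schedule name fails fast instead of silently running without decay (the old decaying_lr == True comparison let any other value fall through). A minimal usage sketch, not part of the patch, with toy data and import paths assumed to match the test module updated in patch 3:

    import numpy as np
    from dadapy import Data
    from dadapy.feature_weighting import FeatureWeighting

    data = np.random.rand(20, 3)
    fw = FeatureWeighting(data)
    try:
        # "linear" is not a supported schedule, so this raises immediately
        fw.return_weights_optimize_dii(Data(data), decaying_lr="linear")
    except ValueError as err:
        print(err)  # Invalid value for decaying_lr. Must be 'exp', 'cos', or 'static'.
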
From 9bd35cce1501f0145fd3f21cc55725d7201c6d10 Mon Sep 17 00:00:00 2001
From: wildromi
Date: Mon, 19 Aug 2024 18:09:07 +0200
Subject: [PATCH 3/3] fix unit tests

---
 tests/test_feature_weighting/test_selection.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tests/test_feature_weighting/test_selection.py b/tests/test_feature_weighting/test_selection.py
index 1138a9f2..896d4d5c 100644
--- a/tests/test_feature_weighting/test_selection.py
+++ b/tests/test_feature_weighting/test_selection.py
@@ -45,6 +45,10 @@ def test_optimise_imbalance_typing():
             feature_selection.return_weights_optimize_dii(
                 Data(data), initial_weights=initial_weights
             )
+    for lrs in [2, np.array([2, 2], np.float32), "faz"]:
+        feature_selection = FeatureWeighting(data)
+        with pytest.raises(ValueError):
+            feature_selection.return_weights_optimize_dii(Data(data), decaying_lr=lrs)


 def test_dist_matrix():
@@ -101,7 +105,7 @@
     initial_weightss = [None, 1.0, weights_array]
     lambdas = [1e-5, 1, None]
     l1_penalties = [1.0, 10, 0.0]
-    decays = [True, False]
+    decays = ["cos", "exp", "static"]
     n_epochs = 5

     for (
@@ -146,13 +150,13 @@
     feature_selection = FeatureWeighting(data, period=None)
     weights = feature_selection.return_weights_optimize_dii(
         Data(target_data),
-        n_epochs=50,
+        n_epochs=40,
         learning_rate=None,
         constrain=True,
         initial_weights=np.ones_like(weights_array),
         lambd=None,
         l1_penalty=1e-5,
-        decaying_lr=True,
+        decaying_lr="exp",
     )
     assert np.all(weights[0] >= weights[2:])
     assert np.all(weights[1] >= weights[2:])
@@ -243,10 +247,11 @@ def test_search_lasso_optimization_kernel_imbalance():
     target_data = data * weights_array
     feature_selection = FeatureWeighting(data, period=None)
     l1_penalties_options = [[1e-3, 1e-2, 1e-1], np.array([1e-5]), 1e-5, None]
+    l1_decay_options = ["cos", "exp", "static"]
     n_epochs = 10

-    for l1_penalties, constrain, decaying_lr, refine in itertools.product(
-        l1_penalties_options, *([[True, False]] * 3)
+    for l1_penalties, constrain, refine, decaying_lr in itertools.product(
+        l1_penalties_options, *([[True, False]] * 2), l1_decay_options
     ):
         (
             num_nonzero_features,
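
Taken together, the series lets callers pick a schedule per optimization run. A closing sketch, not part of the patches, with toy data and the API as exercised by the tests above (data and names are illustrative):

    import numpy as np
    from dadapy import Data
    from dadapy.feature_weighting import FeatureWeighting

    data = np.random.rand(50, 3)
    # ground truth that weights the three input features differently
    target = Data(data * np.array([1.0, 0.5, 0.1]))
    for schedule in ["exp", "cos", "static"]:
        fw = FeatureWeighting(data)
        weights = fw.return_weights_optimize_dii(target, n_epochs=40, decaying_lr=schedule)
        print(schedule, np.round(weights, 3))
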