diff --git a/dadapy/_utils/differentiable_imbalance.py b/dadapy/_utils/differentiable_imbalance.py
index 2ab2ec57..cbd2bd1e 100644
--- a/dadapy/_utils/differentiable_imbalance.py
+++ b/dadapy/_utils/differentiable_imbalance.py
@@ -391,7 +391,7 @@ def _optimize_dii(
     l_rate: float = None,
     constrain: bool = False,
     l1_penalty: float = 0.0,
-    decaying_lr: bool = True,
+    decaying_lr: str = "exp",
     period: np.ndarray = None,
     groundtruthperiod: np.ndarray = None,
     cythond: bool = True,
@@ -416,8 +416,8 @@ def _optimize_dii(
         Constrain the sum of the weights to sum up to the number of weights. Default: False
     l1_penalty: float
         The l1-regularization strength, if sparcity is needed. Default: 0 (l1-regularization turned off).
-    decaying_lr: bool
-        Use exponentially decaying learning rate in gradient descent or not. Default: True.
+    decaying_lr: string
+        "exp" for exponentially decaying learning rate (cut in half every 10 epochs) or "cos" for cosine decaying learning rate. "static" for no decay in the learning rate. Default: "exp"
     period : float or numpy.ndarray/list, optional
         D(input data) periods (input should be formatted to be periodic starting at 0). If not a list, the same period is assumed for all D features
         Default is None, which means no periodic boundary conditions are applied. If some of the input feature do not have a a period, set those to 0.
@@ -483,7 +483,7 @@ def _optimize_dii(
     diis[0] = _return_dii(dists_rescaled_A, rank_matrix_B, lambd)
     l1_penalties[0] = l1_penalty * np.sum(np.abs(weights))
 
-    lrate = l_rate  # for not expon. decaying learning rates
+    lrate = 1 * l_rate  # for not expon. decaying learning rates; "1*" ensures deepcopy
 
     for i_epoch in range(n_epochs):
         # compute gradient * SCALING!!!! to be scale invariant in case of no adaptive lambda
@@ -509,8 +509,10 @@ def _optimize_dii(
             )
             break
         else:
-            # exponentially decaying lr
-            if decaying_lr == True:
+            # exponentially or cosine decaying lr
+            if decaying_lr == "cos":
+                lrate = l_rate * 0.5 * (1 + np.cos((np.pi * i_epoch) / n_epochs))
+            elif decaying_lr == "exp":
                 lrate = l_rate * 2 ** (
                     -i_epoch / 10
                 )  # every 10 epochs the learning rate will be halfed
@@ -576,7 +578,7 @@ def _optimize_dii_static_zeros(
     n_epochs: int = 100,
     l_rate: float = 0.1,
     constrain: bool = False,
-    decaying_lr: bool = True,
+    decaying_lr: str = "exp",
     period: np.ndarray = None,
     groundtruthperiod: np.ndarray = None,
     cythond: bool = True,
@@ -591,7 +593,7 @@ def _optimize_dii_static_zeros(
         l_rate (float): learning rate. Has to be tuned, especially if constrain=True (otherwise optmization could fail)
         constrain (bool): if True, rescale the weights so the biggest weight = 1
         l1_penalty (float): l1 regularization strength
-        decaying_lr (bool): default: True. Apply decaying learning rate = l_rate * 2**(-i_epoch/10) - every 10 epochs the learning rate will be halfed
+        decaying_lr (string): "exp" for exponentially decaying learning rate (cut in half every 10 epochs) or "cos" for cosine decaying learning rate. "static" for no decay in the learning rate. Default: "exp"
         period (float or np.ndarray/list): D(input) periods (input formatted to be 0-period). If not a list, the same period is assumed for all D features
         groundtruthperiod (float or np.ndarray/list): D(groundtruth) periods (groundtruth formatted to be 0-period).
             If not a list, the same period is assumed for all D(groundtruth) features
@@ -672,7 +674,9 @@ def _optimize_dii_static_zeros(
         gradient[weights == 0] = 0
 
         # exponentially decaying lr
-        if decaying_lr == True:
+        if decaying_lr == "cos":
+            lrate = l_rate * 0.5 * (1 + np.cos((np.pi * i_epoch) / n_epochs))
+        elif decaying_lr == "exp":
             lrate = l_rate * 2 ** (
                 -i_epoch / 10
             )  # every 10 epochs the learning rate will be halfed
@@ -732,7 +736,7 @@ def _refine_lasso_optimization(
     n_epochs=50,
     l_rate=None,
     constrain=False,
-    decaying_lr=True,
+    decaying_lr="exp",
     period=None,
     groundtruthperiod=None,
     cythond=True,
@@ -752,8 +756,8 @@ def _refine_lasso_optimization(
         l_rate (float or None): if None, the learning rate is determined automatically with optimize_learning_rate
         n_epochs (int): number of epochs in each optimization cycle
        constrain (bool): if True, rescale the weights so the biggest weight = 1
-        decaying_lr (bool): default: True. Apply decaying learning rate = l_rate * 2**(-i_epoch/10) - every 10 epochs the learning rate will be halfed
-        period (float or np.ndarray/list): D(input) periods (input formatted to be 0-period). If not a list, the same period is assumed for all D features
+        decaying_lr (string):
+            "exp" for exponentially decaying learning rate (cut in half every 10 epochs) or "cos" for cosine decaying learning rate. "static" for no decay in the learning rate. Default: "exp" period (float or np.ndarray/list): D(input) periods (input formatted to be 0-period). If not a list, the same period is assumed for all D features
         groundtruthperiod (float or np.ndarray/list): D(groundtruth) periods (groundtruth formatted to be 0-period).
             If not a list, the same period is assumed for all D(groundtruth) features
         cythond (bool): Flag indicating whether to use Cython-based distance computation methods. Should be True (default) unless you want to test the Python-based methods.
diff --git a/dadapy/feature_weighting.py b/dadapy/feature_weighting.py
index 78f2002c..502c5e7d 100644
--- a/dadapy/feature_weighting.py
+++ b/dadapy/feature_weighting.py
@@ -185,7 +185,7 @@ def return_optimal_learning_rate(
         n_samples: int = 200,
         initial_weights: Union[np.ndarray, int, float] = None,
         lambd: float = None,
-        decaying_lr: bool = True,
+        decaying_lr: str = "exp",
         trial_learning_rates: np.ndarray = None,
     ):
         """Find the optimal learning rate for the optimization of the DII by testing several on a reduced set
@@ -197,9 +197,11 @@ def return_optimal_learning_rate(
             initial_weights (np.ndarray or list): D(input) initial weights for the input features. No zeros allowed here
             lambd (float): softmax scaling. If None (preferred), this chosen automatically with compute_optimial_lambda
-            decaying_lr (bool): default: True.
-                Apply decaying learning rate = l_rate * 2**(-i_epoch/10) -
-                every 10 epochs the learning rate will be halfed
+            decaying_lr (string): Default: "exp".
+                "exp" for exponentially decaying learning rate (cut in half every 10 epochs):
+                lrate = l_rate_initial * 2**(-i_epoch/10),
+                or "cos" for cosine decaying learning rate: lrate = l_rate_initial * 0.5 * (1+ cos((pi * i_epoch)/n_epochs)).
+                "static" for no decay in the learning rate.
             trial_learning_rates (np.ndarray or list or None): learning rates to try.
                 If None are given, a sensible set of learning rates is tested.
 
         Returns:
@@ -366,7 +368,7 @@ def return_weights_optimize_dii(
         lambd: float = None,
         learning_rate: float = None,
         l1_penalty: float = 0.0,
-        decaying_lr: bool = True,
+        decaying_lr: str = "exp",
     ):
         """Optimize the differentiable information imbalance using gradient descent
         of the DII between input data object A and groundtruth data object B.
@@ -389,9 +391,11 @@ def return_weights_optimize_dii(
            The learning rate of the gradient descent. If None, automatically estimated to be fast.
        l1_penalty: float, optional
            The l1-regularization strength, if sparcity is needed. Default: 0 (l1-regularization turned off).
-        decaying_lr: bool
-            Use exponentially decaying learning rate in gradient descent or not. Default: True.
-
+        decaying_lr (string): Default: "exp".
+            "exp" for exponentially decaying learning rate (cut in half every 10 epochs):
+            lrate = l_rate_initial * 2**(-i_epoch/10),
+            or "cos" for cosine decaying learning rate: lrate = l_rate_initial * 0.5 * (1+ cos((pi * i_epoch)/n_epochs)).
+            "static" for no decay in the learning rate.
        Returns:
            final_weights: np.ndarray, shape (D).
                Array of the optmized weights.
@@ -454,7 +458,7 @@ def return_backward_greedy_dii_elimination(
         n_epochs: int = 100,
         learning_rate: float = None,
         constrain: bool = False,
-        decaying_lr: bool = True,
+        decaying_lr: str = "exp",
     ):
         """Do a stepwise backward elimination of feature weights, always eliminating the lowest weight;
         after each elimination the DII is optimized by gradient descent using the remaining features
@@ -469,8 +473,11 @@ def return_backward_greedy_dii_elimination(
                 Has to be tuned, especially if constrain=True (otherwise optmization could fail)
             constrain (bool): if True, rescale the weights so the biggest weight = 1
             l1_penalty (float): l1 regularization strength
-            decaying_lr (bool): default: True. Apply decaying learning rate = l_rate * 2**(-i_epoch/10) -
-                every 10 epochs the learning rate will be halfed
+            decaying_lr (string): Default: "exp".
+                "exp" for exponentially decaying learning rate (cut in half every 10 epochs):
+                lrate = l_rate_initial * 2**(-i_epoch/10),
+                or "cos" for cosine decaying learning rate: lrate = l_rate_initial * 0.5 * (1+ cos((pi * i_epoch)/n_epochs)).
+                "static" for no decay in the learning rate.
         Returns:
             final_diis: np.ndarray, shape (D).
                 Array of the optmized DII for each of the according weights.
@@ -571,7 +578,7 @@ def return_lasso_optimization_dii_search(
         learning_rate: float = None,
         l1_penalties: Union[list, float] = None,
         constrain: bool = False,
-        decaying_lr: bool = True,
+        decaying_lr: str = "exp",
         refine: bool = False,
         plotlasso: bool = True,
     ):
@@ -591,8 +598,11 @@ def return_lasso_optimization_dii_search(
             l1_penalties (list or None): l1 regularization strengths to be tested.
                 If None (default), a list of 10 sensible l1-penalties is tested,
                 which are chosen depending on the learning rate.
-            decaying_lr (bool): default: True. Apply decaying learning rate = l_rate * 2**(-i_epoch/10) -
-                every 10 epochs the learning rate will be halfed.
+            decaying_lr (string): Default: "exp".
+                "exp" for exponentially decaying learning rate (cut in half every 10 epochs):
+                lrate = l_rate_initial * 2**(-i_epoch/10),
+                or "cos" for cosine decaying learning rate: lrate = l_rate_initial * 0.5 * (1+ cos((pi * i_epoch)/n_epochs)).
+                "static" for no decay in the learning rate.
             refine (bool): default: False. If True, the l1-penalties are added
                 in between penalties where the number of non-zero weights changes by more than one.
                 This is done to find the optimal l1-penalty for each number of non-zero weights.
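
The hunks above replace the boolean decaying_lr flag with a string ("exp", "cos", or "static") that selects the learning-rate schedule used during gradient descent. Below is a minimal standalone sketch of the three schedules, using only the formulas shown in the diff and plain numpy; the helper name scheduled_lrate is illustrative and is not part of dadapy.

    import numpy as np

    def scheduled_lrate(l_rate_initial, i_epoch, n_epochs, decaying_lr="exp"):
        """Learning rate used at epoch i_epoch, mirroring the logic added in the diff."""
        if decaying_lr == "cos":
            # cosine decay: starts at l_rate_initial and anneals towards 0 at the last epoch
            return l_rate_initial * 0.5 * (1 + np.cos((np.pi * i_epoch) / n_epochs))
        elif decaying_lr == "exp":
            # exponential decay: the learning rate is cut in half every 10 epochs
            return l_rate_initial * 2 ** (-i_epoch / 10)
        # "static" (or any other value): keep the initial learning rate unchanged
        return l_rate_initial

    for schedule in ("exp", "cos", "static"):
        rates = [scheduled_lrate(0.1, e, n_epochs=100, decaying_lr=schedule) for e in (0, 10, 50, 99)]
        print(schedule, [round(r, 5) for r in rates])

With these defaults, decaying_lr="exp" reproduces the previous behaviour of decaying_lr=True (halving every 10 epochs), "cos" anneals the rate to roughly zero over n_epochs, and "static" keeps it constant throughout the optimization.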