diff --git a/README.md b/README.md
index 378a21e..3b32161 100644
--- a/README.md
+++ b/README.md
@@ -98,8 +98,6 @@ for epoch in range(3):
         y = np.random.randint(0, 2, (10, 1))  # dummy labels
         loss = model.train_on_batch(x, y)
         print("Iter {} loss: {}".format(iteration + 1, "%.3f" % loss))
-        if iteration == (24 - 2):
-            K.set_value(model.optimizer.t_cur, -1)  # WARM RESTART: reset cosine annealing argument
     print("EPOCH {} COMPLETED\n".format(epoch + 1))
 ```
 
@@ -113,8 +111,9 @@ for epoch in range(3):
 - `total_iterations_wd` --> set to normalize over _all epochs_ (or other interval `!= total_iterations`) instead of per-WR when using WR; may _sometimes_ yield better results --_My note_
 
 ### Warm restarts
-  - Set `t_cur = -1` to restart schedule multiplier (see _Example_). Can be done at compilation or during training. Non-`-1` is also valid, and will start `eta_t` at another point on the cosine curve. Details in A-2,3
-  - `t_cur` should be set at `iter == total_iterations - 2`; explanation [here](https://github.com/OverLordGoldDragon/keras-adamw/blob/v1.31/tests/test_optimizers.py#L52)
+  - Done automatically with `autorestart=True`, which is the default if `use_cosine_annealing=True`; internally sets `t_cur=0` after `total_iterations` iterations.
+  - Manually: set `t_cur = -1` to restart schedule multiplier (see _Example_). Can be done at compilation or during training. Non-`-1` is also valid, and will start `eta_t` at another point on the cosine curve. Details in A-2,3
+  - `t_cur` should be set at `iter == total_iterations - 2`; explanation [here](https://github.com/OverLordGoldDragon/keras-adamw/blob/v1.35/tests/test_optimizers.py#L52)
   - Set `total_iterations` to the # of expected weight updates _for the given restart_ --_Authors_ (A-1,2)
   - `eta_min=0, eta_max=1` are tunable hyperparameters; e.g., an exponential schedule can be used for `eta_max`. If unsure, the defaults were shown to work well in the paper. --_Authors_
 - **[Save/load](https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model) optimizer state**; WR relies on using the optimizer's update history for effective transitions --_Authors_ (A-2)
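With `autorestart` handling the reset, the README workflow reduces to the following minimal sketch. The toy model and dummy data are hypothetical (not copied from the repo); only the `AdamW(use_cosine_annealing=True, total_iterations=24)` usage follows from the change above:

```python
import numpy as np
from keras.layers import Input, Dense
from keras.models import Model
from keras_adamw import AdamW

# hypothetical toy model; any compiled Keras model works the same way
ipt = Input(shape=(4,))
out = Dense(1, activation='sigmoid')(ipt)
model = Model(ipt, out)

# 24 weight updates per restart; with use_cosine_annealing=True,
# autorestart defaults to True, so t_cur wraps back to 0 automatically
optimizer = AdamW(model=model, use_cosine_annealing=True, total_iterations=24)
model.compile(optimizer, loss='binary_crossentropy')

for epoch in range(3):
    for iteration in range(24):
        x = np.random.rand(10, 4)             # dummy data
        y = np.random.randint(0, 2, (10, 1))  # dummy labels
        loss = model.train_on_batch(x, y)
        print("Iter {} loss: {}".format(iteration + 1, "%.3f" % loss))
        # no manual K.set_value(model.optimizer.t_cur, -1) needed here
    print("EPOCH {} COMPLETED\n".format(epoch + 1))
```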
diff --git a/example.py b/example.py
index 58ad3bd..93aa166 100644
--- a/example.py
+++ b/example.py
@@ -33,8 +33,10 @@
         eta_history.append(K_eval(model.optimizer.eta_t, K))
         lr_history.append(K_eval(model.optimizer.lr_t, K))
         print("Iter {} loss: {}".format(iteration + 1, "%.3f" % loss))
-        if iteration == (24 - 2):
-            K.set_value(model.optimizer.t_cur, -1)  # WARM RESTART
+
+        # MANUAL OPTION if autorestart=False is used
+        # if iteration == (24 - 2):
+        #     K.set_value(model.optimizer.t_cur, -1)  # WARM RESTART
     print("EPOCH {} COMPLETED\n".format(epoch + 1))
 
 # learning rate at iteration `t` (lr_t) is subject to scaling depending on
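For completeness, the manual restart that example.py no longer performs would look roughly like this when `autorestart=False` is passed explicitly (a sketch reusing the hypothetical `model` from above, recompiled with `autorestart=False`; the reset lands at `24 - 2` because `eta_t` is applied as-is and only updated after the iteration, per the test comments further down):

```python
import numpy as np
import keras.backend as K

for epoch in range(3):
    for iteration in range(24):
        x = np.random.rand(10, 4)             # dummy data
        y = np.random.randint(0, 2, (10, 1))  # dummy labels
        loss = model.train_on_batch(x, y)
        if iteration == (24 - 2):
            # manual WARM RESTART: reset the cosine annealing argument
            K.set_value(model.optimizer.t_cur, -1)
    print("EPOCH {} COMPLETED\n".format(epoch + 1))
```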
diff --git a/keras_adamw/__init__.py b/keras_adamw/__init__.py
index 780bde8..f06e6f8 100644
--- a/keras_adamw/__init__.py
+++ b/keras_adamw/__init__.py
@@ -28,4 +28,4 @@
 from .utils import get_weight_decays, fill_dict_in_order
 from .utils import reset_seeds, K_eval
 
-__version__ = '1.32'
+__version__ = '1.35'
diff --git a/keras_adamw/optimizers.py b/keras_adamw/optimizers.py
index 501625a..b894e24 100644
--- a/keras_adamw/optimizers.py
+++ b/keras_adamw/optimizers.py
@@ -3,7 +3,7 @@
 from keras.legacy import interfaces
 from keras.optimizers import Optimizer
 from .utils import _init_weight_decays, _apply_weight_decays, _check_args
-from .utils import _apply_lr_multiplier, _update_t_cur_eta_t
+from .utils import _apply_lr_multiplier, _update_t_cur_eta_t, _set_autorestart
 from .utils import K_eval as KE
 
 
@@ -41,6 +41,13 @@ class AdamW(Optimizer):
         use_cosine_annealing: bool. If True, multiplies lr each train iteration
                               as a function of eta_min, eta_max, total_iterations,
                               and t_cur (current); [2]-Appendix, 2
+        autorestart: bool / None. If True, will automatically do Warm Restarts
+                     by resetting `t_cur=0` after `total_iterations`. If None,
+                     will default to same as `use_cosine_annealing`. If True
+                     but `use_cosine_annealing` is False, will raise ValueError.
+                     Note: once optimizer is built (happens on first model fit),
+                     changing `autorestart` has no effect; optimizer needs to be
+                     re-built.
         eta_min, eta_max: int, int. Min & max values of cosine annealing
                           lr multiplier; [2]-Appendix, 2
         t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -69,7 +76,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                  amsgrad=False, model=None, zero_penalties=True,
                  batch_size=32, total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -101,6 +108,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
         self.init_verbose = init_verbose
         self.use_cosine_annealing = use_cosine_annealing
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = learning_rate  # to print lr_mult setup
         self._init_notified = False
@@ -186,6 +194,7 @@ def get_config(self):
                   'weight_decays': self.weight_decays,
                   'lr_multipliers': self.lr_multipliers,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K_eval(self.t_cur)),
                   'eta_t': float(K_eval(self.eta_t)),
                   'eta_min': float(K_eval(self.eta_min)),
@@ -228,6 +237,13 @@ class NadamW(Optimizer):
         use_cosine_annealing: bool. If True, multiplies lr each train iteration
                               as a function of eta_min, eta_max, total_iterations,
                               and t_cur (current); [3]-Appendix, 2
+        autorestart: bool / None. If True, will automatically do Warm Restarts
+                     by resetting `t_cur=0` after `total_iterations`. If None,
+                     will default to same as `use_cosine_annealing`. If True
+                     but `use_cosine_annealing` is False, will raise ValueError.
+                     Note: once optimizer is built (happens on first model fit),
+                     changing `autorestart` has no effect; optimizer needs to be
+                     re-built.
         eta_min, eta_max: int, int. Min & max values of cosine annealing
                           lr multiplier; [3]-Appendix, 2
         t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -258,7 +274,7 @@ def __init__(self, learning_rate=0.002, beta_1=0.9, beta_2=0.999,
                  model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -289,6 +305,7 @@ def __init__(self, learning_rate=0.002, beta_1=0.9, beta_2=0.999,
         self.use_cosine_annealing = use_cosine_annealing
         self.init_verbose = init_verbose
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = learning_rate  # to print lr_mult setup
         self._init_notified = False
@@ -376,6 +393,7 @@ def get_config(self):
                   'weight_decays': self.weight_decays,
                   'lr_multipliers': self.lr_multipliers,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K_eval(self.t_cur)),
                   'eta_t': float(K_eval(self.eta_t)),
                   'eta_min': float(K_eval(self.eta_min)),
@@ -414,6 +432,13 @@ class SGDW(Optimizer):
         use_cosine_annealing: bool. If True, multiplies lr each train iteration
                               as a function of eta_min, eta_max, total_iterations,
                               and t_cur (current); [2]-Appendix, 2
+        autorestart: bool / None. If True, will automatically do Warm Restarts
+                     by resetting `t_cur=0` after `total_iterations`. If None,
+                     will default to same as `use_cosine_annealing`. If True
+                     but `use_cosine_annealing` is False, will raise ValueError.
+                     Note: once optimizer is built (happens on first model fit),
+                     changing `autorestart` has no effect; optimizer needs to be
+                     re-built.
         eta_min, eta_max: int, int. Min & max values of cosine annealing
                           lr multiplier; [2]-Appendix, 2
         t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -443,7 +468,7 @@ def __init__(self, learning_rate=0.01, momentum=0., nesterov=False,
                  model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -473,6 +498,7 @@ def __init__(self, learning_rate=0.01, momentum=0., nesterov=False,
         self.init_verbose = init_verbose
         self.use_cosine_annealing = use_cosine_annealing
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = learning_rate  # to print lr_mult setup
         self._init_notified = False
@@ -535,6 +561,7 @@ def get_config(self):
                   'weight_decays': self.weight_decays,
                   'lr_multipliers': self.lr_multipliers,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K_eval(self.t_cur)),
                   'eta_t': float(K_eval(self.eta_t)),
                   'eta_min': float(K_eval(self.eta_min)),
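The new `autorestart` argument is validated identically in all three optimizers via `_set_autorestart` (shown in the `keras_adamw/utils.py` hunk further below). A quick sketch of the resulting behavior, assuming a compiled `model` as in the earlier example:

```python
from keras_adamw import AdamW

# None (the default) follows use_cosine_annealing
opt = AdamW(model=model, use_cosine_annealing=True, total_iterations=24)
print(opt.autorestart)  # True

# explicit opt-out: cosine annealing without automatic restarts
opt = AdamW(model=model, use_cosine_annealing=True, autorestart=False,
            total_iterations=24)
print(opt.autorestart)  # False

# invalid: autorestart requires cosine annealing
try:
    AdamW(model=model, autorestart=True, use_cosine_annealing=False,
          total_iterations=24)
except ValueError as e:
    print(e)  # "`autorestart` can only be used with `use_cosine_annealing`"
```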
diff --git a/keras_adamw/optimizers_225.py b/keras_adamw/optimizers_225.py
index 95c2090..b0ed9bc 100644
--- a/keras_adamw/optimizers_225.py
+++ b/keras_adamw/optimizers_225.py
@@ -2,7 +2,7 @@
 from keras.legacy import interfaces
 from keras.optimizers import Optimizer
 from .utils import _init_weight_decays, _apply_weight_decays, _check_args
-from .utils import _apply_lr_multiplier, _update_t_cur_eta_t
+from .utils import _apply_lr_multiplier, _update_t_cur_eta_t, _set_autorestart
 
 
 class AdamW(Optimizer):
@@ -30,6 +30,13 @@ class AdamW(Optimizer):
         use_cosine_annealing: bool. If True, multiplies lr each train iteration
                               as a function of eta_min, eta_max, total_iterations,
                               and t_cur (current); [2]-Appendix, 2
+        autorestart: bool / None. If True, will automatically do Warm Restarts
+                     by resetting `t_cur=0` after `total_iterations`. If None,
+                     will default to same as `use_cosine_annealing`. If True
+                     but `use_cosine_annealing` is False, will raise ValueError.
+                     Note: once optimizer is built (happens on first model fit),
+                     changing `autorestart` has no effect; optimizer needs to be
+                     re-built.
         eta_min, eta_max: int, int. Min & max values of cosine annealing
                           lr multiplier; [2]-Appendix, 2
         t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -58,7 +65,7 @@ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False,
                  epsilon=None, decay=0.0, model=None, zero_penalties=True,
                  batch_size=32, total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -88,6 +95,7 @@ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False,
         self.init_verbose = init_verbose
         self.use_cosine_annealing = use_cosine_annealing
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = lr  # to print lr_mult setup
         self._init_notified = False
@@ -162,6 +170,7 @@ def get_config(self):
                   'weight_decays': self.weight_decays,
                   'lr_multipliers': self.lr_multipliers,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K.get_value(self.t_cur)),
                   'eta_t': float(K.eval(self.eta_t)),
                   'eta_min': float(K.get_value(self.eta_min)),
@@ -204,6 +213,13 @@ class NadamW(Optimizer):
         use_cosine_annealing: bool. If True, multiplies lr each train iteration
                               as a function of eta_min, eta_max, total_iterations,
                               and t_cur (current); [3]-Appendix, 2
+        autorestart: bool / None. If True, will automatically do Warm Restarts
+                     by resetting `t_cur=0` after `total_iterations`. If None,
+                     will default to same as `use_cosine_annealing`. If True
+                     but `use_cosine_annealing` is False, will raise ValueError.
+                     Note: once optimizer is built (happens on first model fit),
+                     changing `autorestart` has no effect; optimizer needs to be
+                     re-built.
         eta_min, eta_max: int, int. Min & max values of cosine annealing
                           lr multiplier; [3]-Appendix, 2
         t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -235,7 +251,7 @@ def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
                  model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -264,6 +280,7 @@ def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
         self.use_cosine_annealing = use_cosine_annealing
         self.init_verbose = init_verbose
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = lr  # to print lr_mult setup
         self._init_notified = False
@@ -340,6 +357,7 @@ def get_config(self):
                   'weight_decays': self.weight_decays,
                   'lr_multipliers': self.lr_multipliers,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K.get_value(self.t_cur)),
                   'eta_t': float(K.eval(self.eta_t)),
                   'eta_min': float(K.get_value(self.eta_min)),
@@ -378,6 +396,13 @@ class SGDW(Optimizer):
         use_cosine_annealing: bool. If True, multiplies lr each train iteration
                               as a function of eta_min, eta_max, total_iterations,
                               and t_cur (current); [2]-Appendix, 2
+        autorestart: bool / None. If True, will automatically do Warm Restarts
+                     by resetting `t_cur=0` after `total_iterations`. If None,
+                     will default to same as `use_cosine_annealing`. If True
+                     but `use_cosine_annealing` is False, will raise ValueError.
+                     Note: once optimizer is built (happens on first model fit),
+                     changing `autorestart` has no effect; optimizer needs to be
+                     re-built.
         eta_min, eta_max: int, int. Min & max values of cosine annealing
                           lr multiplier; [2]-Appendix, 2
         t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -407,7 +432,7 @@ def __init__(self, lr=0.01, momentum=0., nesterov=False, decay=0.0,
                  model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -435,6 +460,7 @@ def __init__(self, lr=0.01, momentum=0., nesterov=False, decay=0.0,
         self.init_verbose = init_verbose
         self.use_cosine_annealing = use_cosine_annealing
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = lr  # to print lr_mult setup
         self._init_notified = False
@@ -495,6 +521,7 @@ def get_config(self):
                   'weight_decays': self.weight_decays,
                   'lr_multipliers': self.lr_multipliers,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K.get_value(self.t_cur)),
                   'eta_t': float(K.eval(self.eta_t)),
                   'eta_min': float(K.get_value(self.eta_min)),
diff --git a/keras_adamw/optimizers_225tf.py b/keras_adamw/optimizers_225tf.py
index 9e1b4b9..d975d89 100644
--- a/keras_adamw/optimizers_225tf.py
+++ b/keras_adamw/optimizers_225tf.py
@@ -6,7 +6,7 @@
 from tensorflow.python.util.tf_export import keras_export
 from tensorflow.keras import backend as K
 from .utils import _init_weight_decays, _apply_weight_decays, _check_args
-from .utils import _update_t_cur_eta_t_v2, _apply_lr_multiplier
+from .utils import _update_t_cur_eta_t_v2, _apply_lr_multiplier, _set_autorestart
 
 
 @keras_export('keras.optimizers.AdamW')
@@ -86,7 +86,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                  model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, name="AdamW", **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -113,6 +113,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
         self.epsilon = epsilon or backend_config.epsilon()
         self.amsgrad = amsgrad
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = kwargs.get('lr', learning_rate)  # to print lr_mult setup
         self._updates_processed = 0  # to track num calls to '_resource_apply_...'
@@ -270,6 +271,7 @@ def get_config(self):
                   'total_iterations': int(self.total_iterations),
                   'weight_decays': self.weight_decays,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K.get_value(self.t_cur)),
                   'eta_t': float(K.get_value(self.eta_t)),
                   'eta_min': float(K.get_value(self.eta_min)),
@@ -350,7 +352,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                  epsilon=1e-7, model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, name="NadamW", **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -386,6 +388,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
         self.use_cosine_annealing = use_cosine_annealing
         self.epsilon = epsilon or backend_config.epsilon()
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = kwargs.get('lr', learning_rate)  # to print lr_mult setup
         self._updates_processed = 0  # to track num calls to '_resource_apply_...'
@@ -557,6 +560,7 @@ def get_config(self):
                   'total_iterations': int(self.total_iterations),
                   'weight_decays': self.weight_decays,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K.get_value(self.t_cur)),
                   'eta_t': float(K.get_value(self.eta_t)),
                   'eta_min': float(K.get_value(self.eta_min)),
@@ -630,7 +634,7 @@ def __init__(self, learning_rate=0.01, momentum=0.0, nesterov=False,
                  model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, name="SGDW", **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -661,6 +665,7 @@ def __init__(self, learning_rate=0.01, momentum=0.0, nesterov=False,
         self.init_verbose = init_verbose
         self.use_cosine_annealing = use_cosine_annealing
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = kwargs.get('lr', learning_rate)  # to print lr_mult setup
         self._updates_processed = 0  # to track num calls to '_resource_apply_...'
@@ -772,6 +777,7 @@ def get_config(self):
                   'total_iterations': int(self.total_iterations),
                   'weight_decays': self.weight_decays,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K.get_value(self.t_cur)),
                   'eta_t': float(K.get_value(self.eta_t)),
                   'eta_min': float(K.get_value(self.eta_min)),
diff --git a/keras_adamw/optimizers_v2.py b/keras_adamw/optimizers_v2.py
index ebdea1f..1b0658b 100644
--- a/keras_adamw/optimizers_v2.py
+++ b/keras_adamw/optimizers_v2.py
@@ -6,7 +6,7 @@
 from tensorflow.python.util.tf_export import keras_export
 from tensorflow.python.keras import backend as K
 from .utils import _init_weight_decays, _apply_weight_decays, _check_args
-from .utils import _update_t_cur_eta_t_v2, _apply_lr_multiplier
+from .utils import _update_t_cur_eta_t_v2, _apply_lr_multiplier, _set_autorestart
 from .utils import K_eval as KE
 
 
@@ -62,6 +62,13 @@ class AdamW(OptimizerV2):
         use_cosine_annealing: bool. If True, multiplies lr each train iteration
                               as a function of eta_min, eta_max, total_iterations,
                               and t_cur (current); [2]-Appendix, 2
+        autorestart: bool / None. If True, will automatically do Warm Restarts
+                     by resetting `t_cur=0` after `total_iterations`. If None,
+                     will default to same as `use_cosine_annealing`. If True
+                     but `use_cosine_annealing` is False, will raise ValueError.
+                     Note: once optimizer is built (happens on first model fit),
+                     changing `autorestart` has no effect; optimizer needs to be
+                     re-built.
         eta_min, eta_max: int, int. Min & max values of cosine annealing
                           lr multiplier; [2]-Appendix, 2
         t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -91,8 +98,9 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                  model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
-                 eta_min=0, eta_max=1, t_cur=0, name="AdamW", **kwargs):
+                 weight_decays=None, autorestart=None,
+                 init_verbose=True, eta_min=0, eta_max=1, t_cur=0,
+                 name="AdamW", **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
                                                 weight_decays)
@@ -118,6 +126,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
         self.epsilon = epsilon or backend_config.epsilon()
         self.amsgrad = amsgrad
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = kwargs.get('lr', learning_rate)  # to print lr_mult setup
         self._updates_processed = 0  # to track num calls to '_resource_apply_...'
@@ -271,6 +280,7 @@ def get_config(self):
                   'total_iterations': int(self.total_iterations),
                   'weight_decays': self.weight_decays,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K_eval(self.t_cur)),
                   'eta_t': float(K_eval(self.eta_t)),
                   'eta_min': float(K_eval(self.eta_min)),
@@ -321,6 +331,13 @@ class NadamW(OptimizerV2):
         use_cosine_annealing: bool. If True, multiplies lr each train iteration
                               as a function of eta_min, eta_max, total_iterations,
                               and t_cur (current); [3]-Appendix, 2
+        autorestart: bool / None. If True, will automatically do Warm Restarts
+                     by resetting `t_cur=0` after `total_iterations`. If None,
+                     will default to same as `use_cosine_annealing`. If True
+                     but `use_cosine_annealing` is False, will raise ValueError.
+                     Note: once optimizer is built (happens on first model fit),
+                     changing `autorestart` has no effect; optimizer needs to be
+                     re-built.
         eta_min, eta_max: int, int. Min & max values of cosine annealing
                           lr multiplier; [3]-Appendix, 2
         t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -351,7 +368,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                  epsilon=1e-7, model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, name="NadamW", **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -387,6 +404,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
         self.use_cosine_annealing = use_cosine_annealing
         self.epsilon = epsilon or backend_config.epsilon()
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = kwargs.get('lr', learning_rate)  # to print lr_mult setup
         self._updates_processed = 0  # to track num calls to '_resource_apply_...'
@@ -557,6 +575,7 @@ def get_config(self):
                   'total_iterations': int(self.total_iterations),
                   'weight_decays': self.weight_decays,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K_eval(self.t_cur)),
                   'eta_t': float(K_eval(self.eta_t)),
                   'eta_min': float(K_eval(self.eta_min)),
@@ -601,6 +620,13 @@ class SGDW(OptimizerV2):
         use_cosine_annealing: bool. If True, multiplies lr each train iteration
                               as a function of eta_min, eta_max, total_iterations,
                               and t_cur (current); [2]-Appendix, 2
+        autorestart: bool / None. If True, will automatically do Warm Restarts
+                     by resetting `t_cur=0` after `total_iterations`. If None,
+                     will default to same as `use_cosine_annealing`. If True
+                     but `use_cosine_annealing` is False, will raise ValueError.
+                     Note: once optimizer is built (happens on first model fit),
+                     changing `autorestart` has no effect; optimizer needs to be
+                     re-built.
         eta_min, eta_max: int, int. Min & max values of cosine annealing
                           lr multiplier; [2]-Appendix, 2
         t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -630,7 +656,7 @@ def __init__(self, learning_rate=0.01, momentum=0.0, nesterov=False,
                  model=None, zero_penalties=True, batch_size=32,
                  total_iterations=0, total_iterations_wd=None,
                  use_cosine_annealing=False, lr_multipliers=None,
-                 weight_decays=None, init_verbose=True,
+                 weight_decays=None, autorestart=None, init_verbose=True,
                  eta_min=0, eta_max=1, t_cur=0, name="SGDW", **kwargs):
         if total_iterations > 1:
             weight_decays = _init_weight_decays(model, zero_penalties,
@@ -661,6 +687,7 @@ def __init__(self, learning_rate=0.01, momentum=0.0, nesterov=False,
         self.init_verbose = init_verbose
         self.use_cosine_annealing = use_cosine_annealing
 
+        _set_autorestart(self, autorestart, use_cosine_annealing)
         _check_args(self, total_iterations, use_cosine_annealing, weight_decays)
         self._init_lr = kwargs.get('lr', learning_rate)  # to print lr_mult setup
         self._updates_processed = 0  # to track num calls to '_resource_apply_...'
@@ -771,6 +798,7 @@ def get_config(self):
                   'total_iterations': int(self.total_iterations),
                   'weight_decays': self.weight_decays,
                   'use_cosine_annealing': self.use_cosine_annealing,
+                  'autorestart': self.autorestart,
                   't_cur': int(K_eval(self.t_cur)),
                   'eta_t': float(K_eval(self.eta_t)),
                   'eta_min': float(K_eval(self.eta_min)),
diff --git a/keras_adamw/utils.py b/keras_adamw/utils.py
index 2783f08..eb7ee66 100644
--- a/keras_adamw/utils.py
+++ b/keras_adamw/utils.py
@@ -3,8 +3,8 @@
 import tensorflow as tf
 from termcolor import colored
 from tensorflow.python import ops
-from tensorflow.python.ops import math_ops, state_ops
-
+from tensorflow.python.ops import math_ops, state_ops, control_flow_ops
+from . import TF_KERAS
 
 WARN = colored('WARNING:', 'red')
 
@@ -66,7 +66,7 @@ def _apply_lr_multiplier(self, lr_t, var):
 
 
 def _update_t_cur_eta_t(self):  # keras
-    self.updates.append(state_ops.assign_add(self.t_cur, 1))
+    self.updates.append(_update_t_cur(self))
     # Cosine annealing
     if self.use_cosine_annealing:
         # ensure eta_t is updated AFTER t_cur
@@ -79,10 +79,9 @@ def _update_t_cur_eta_t_v2(self, lr_t=None, var=None):  # tf.keras
     t_cur_update, eta_t_update = None, None  # in case not assigned
 
     # update `t_cur` if iterating last `(grad, var)`
-    iteration_done = self._updates_processed == (self._updates_per_iter - 1)
+    iteration_done = (self._updates_processed == (self._updates_per_iter - 1))
     if iteration_done:
-        t_cur_update = state_ops.assign_add(self.t_cur, 1,
-                                            use_locking=self._use_locking)
+        t_cur_update = _update_t_cur(self)
         self._updates_processed = 0  # reset
     else:
         self._updates_processed += 1
@@ -98,6 +97,27 @@ def _update_t_cur_eta_t_v2(self, lr_t=None, var=None):  # tf.keras
     return iteration_done, t_cur_update, eta_t_update
 
 
+def _update_t_cur(self):
+    kw = {'use_locking': self._use_locking} if TF_KERAS else {}
+    if self.autorestart:
+        return control_flow_ops.cond(
+            math_ops.equal(self.t_cur, self.total_iterations - 1),
+            lambda: state_ops.assign(self.t_cur, 0, **kw),
+            lambda: state_ops.assign_add(self.t_cur, 1, **kw),
+        )
+    return state_ops.assign_add(self.t_cur, 1, **kw)
+
+
+def _set_autorestart(self, autorestart, use_cosine_annealing):
+    if autorestart is None:
+        self.autorestart = bool(use_cosine_annealing)
+    elif autorestart and not use_cosine_annealing:
+        raise ValueError("`autorestart` can only be used with "
+                         "`use_cosine_annealing`")
+    else:
+        self.autorestart = autorestart
+
+
 def _check_args(self, total_iterations, use_cosine_annealing, weight_decays):
     if use_cosine_annealing and total_iterations > 1:
         print('Using cosine annealing learning rates')
@@ -106,6 +126,7 @@ def _check_args(self, total_iterations, use_cosine_annealing, weight_decays):
               + " to use cosine annealing and/or weight decays; "
               "proceeding without either")
         self.use_cosine_annealing = False
+        self.autorestart = False
         self.weight_decays = {}
 
 
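The effect of `_update_t_cur` with `autorestart=True` can be visualized with a small NumPy-only simulation. This is a sketch: the `eta_t` expression below is assumed from the `eta_min=0, eta_max=1` defaults and the cosine-annealing formula the tests check against, not copied from the library:

```python
import numpy as np

def simulate_eta(total_iterations, num_epochs, eta_min=0, eta_max=1):
    """Rough stand-in for autorestart=True: t_cur counts up each iteration
    and wraps to 0 at the restart boundary, restarting the cosine schedule."""
    t_cur, eta_history = 0, []
    for _ in range(num_epochs * total_iterations):
        eta_t = eta_min + 0.5 * (eta_max - eta_min) * (
            1 + np.cos(np.pi * t_cur / total_iterations))
        eta_history.append(eta_t)
        # mirrors the cond() above: reset at total_iterations - 1, else increment
        t_cur = 0 if t_cur == total_iterations - 1 else t_cur + 1
    return eta_history

eta = simulate_eta(total_iterations=24, num_epochs=3)
# eta decays from 1 toward 0 over 24 iterations, then restarts -- 3 times here
print(len(eta), eta[0], round(min(eta), 3))
```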
diff --git a/tests/test_optimizers.py b/tests/test_optimizers.py
index b2bfc98..edb291c 100644
--- a/tests/test_optimizers.py
+++ b/tests/test_optimizers.py
@@ -16,7 +16,7 @@
 from time import time
 from termcolor import cprint
 
-from backend import K, TF_KERAS, TF_2
+from backend import K, TF_KERAS, TF_2, TF_EAGER
 from backend import Input, Dense, GRU, Bidirectional, Embedding
 from backend import Model, load_model
 from backend import l1, l2, l1_l2
@@ -49,6 +49,7 @@
 def test_main():  # Save/Load, Warm Restarts (w/ cosine annealing)
     eta_history = []  # for stop-introspection
     t_cur_history = []  # for stop-introspection
+    # // Explanation for "manual option" when autorestart=False
     # eta_t is first applied as-is, and only updated AFTER iteration;
     # setting t_cur does not immediately change eta_t.
     # Thus, t_cur must be reset 1 iteration BEFORE epoch ends
@@ -60,8 +61,8 @@ def test_main():  # Save/Load, Warm Restarts (w/ cosine annealing)
             t_cur_history += [K_eval(model.optimizer.t_cur, K)]
             eta_history += [K_eval(model.optimizer.eta_t, K)]
             model.train_on_batch(X[batch_num], Y[batch_num])
-            if batch_num == (num_batches - 2):
-                K.set_value(model.optimizer.t_cur, -1)
+            # if batch_num == (num_batches - 2):  Manual Option
+            #     K.set_value(model.optimizer.t_cur, -1)
 
     assert _valid_cosine_annealing(eta_history, total_iterations, num_epochs)
     assert model.optimizer.get_config()  # ensure value evaluation won't error
@@ -225,7 +226,7 @@ def _test_save_load(model, X, optimizer_name, optimizer):
     modelpath = os.path.join(tempfile.gettempdir(), test_name)
     model.save(modelpath)
     del model
-    if TF_2 and not TF_KERAS:
+    if TF_2 and not TF_EAGER and not TF_KERAS:
         tf.compat.v1.experimental.output_all_intermediates(True)  # bug fix
     model = load_model(modelpath, custom_objects={optimizer_name: optimizer})
 
@@ -376,7 +377,6 @@ def _valid_cosine_annealing(eta_history, total_iterations, num_epochs):
                                    dtype='float32')
             value = np.array([0.5 * (1 + np.cos(arg))], dtype='float32')
             eta_history_simul.append(value[0][0])
-            # 1 + np.cos(np.pi * iteration / total_iterations)))
     return np.allclose(eta_history, eta_history_simul, rtol=0, atol=2e-7)
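Since `autorestart` is now part of `get_config()`, it survives the save/load round-trip that the warm-restart workflow depends on. A sketch of that round-trip, assuming a `model` compiled with `AdamW` as in the earlier examples (the `custom_objects` pattern mirrors what `_test_save_load` exercises):

```python
import os
import tempfile
from keras.models import load_model
from keras_adamw import AdamW

modelpath = os.path.join(tempfile.gettempdir(), "model_with_adamw.h5")
model.save(modelpath)

# t_cur, eta_t, autorestart, etc. are restored from the optimizer config;
# the optimizer class must be passed through custom_objects
model = load_model(modelpath, custom_objects={'AdamW': AdamW})
```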