Add autorestart
**FEATURE**: `autorestart` option which automatically handles Warm Restarts by resetting `t_cur=0` after `total_iterations` iterations.

 - Defaults to `True` if `use_cosine_annealing=True`, else `False`
 - Must use `use_cosine_annealing=True` if using `autorestart=True`

Updated README and `example.py`.
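
For context, a minimal usage sketch (illustrative only, modeled on the repository's README example; the toy model and dummy data are assumptions):

```python
import numpy as np
from keras.layers import Input, Dense
from keras.models import Model
from keras_adamw import AdamW

# toy model
ipt = Input(shape=(4,))
out = Dense(1, activation='sigmoid')(ipt)
model = Model(ipt, out)

# use_cosine_annealing=True makes autorestart default to True,
# so t_cur is reset to 0 every `total_iterations` iterations automatically
model.compile(AdamW(model=model, use_cosine_annealing=True, total_iterations=24),
              loss='binary_crossentropy')

for iteration in range(48):  # two cosine cycles, no manual t_cur reset needed
    x = np.random.rand(10, 4)              # dummy data
    y = np.random.randint(0, 2, (10, 1))   # dummy labels
    model.train_on_batch(x, y)
```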
OverLordGoldDragon authored Jul 8, 2020
1 parent 8d362e2 commit 29aa8f2
Showing 9 changed files with 145 additions and 35 deletions.
7 changes: 3 additions & 4 deletions README.md
@@ -98,8 +98,6 @@ for epoch in range(3):
y = np.random.randint(0, 2, (10, 1)) # dummy labels
loss = model.train_on_batch(x, y)
print("Iter {} loss: {}".format(iteration + 1, "%.3f" % loss))
if iteration == (24 - 2):
K.set_value(model.optimizer.t_cur, -1) # WARM RESTART: reset cosine annealing argument
print("EPOCH {} COMPLETED\n".format(epoch + 1))
```
<img src="https://user-images.githubusercontent.com/16495490/83707138-51d56c00-a62a-11ea-9eba-60284490992b.png" width="470">
@@ -113,8 +111,9 @@ for epoch in range(3):
- `total_iterations_wd` --> set to normalize over _all epochs_ (or other interval `!= total_iterations`) instead of per-WR when using WR; may _sometimes_ yield better results --_My note_

### Warm restarts
- Set `t_cur = -1` to restart schedule multiplier (see _Example_). Can be done at compilation or during training. Non-`-1` is also valid, and will start `eta_t` at another point on the cosine curve. Details in A-2,3
- `t_cur` should be set at `iter == total_iterations - 2`; explanation [here](https://github.com/OverLordGoldDragon/keras-adamw/blob/v1.31/tests/test_optimizers.py#L52)
- Done automatically with `autorestart=True`, which is the default if `use_cosine_annealing=True`; internally sets `t_cur=0` after `total_iterations` iterations.
- Manually: set `t_cur = -1` to restart schedule multiplier (see _Example_). Can be done at compilation or during training. Non-`-1` is also valid, and will start `eta_t` at another point on the cosine curve. Details in A-2,3
- `t_cur` should be set at `iter == total_iterations - 2`; explanation [here](https://github.com/OverLordGoldDragon/keras-adamw/blob/v1.35/tests/test_optimizers.py#L52)
- Set `total_iterations` to the # of expected weight updates _for the given restart_ --_Authors_ (A-1,2)
- `eta_min=0, eta_max=1` are tunable hyperparameters; e.g., an exponential schedule can be used for `eta_max`. If unsure, the defaults were shown to work well in the paper. --_Authors_
- **[Save/load](https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model) optimizer state**; WR relies on using the optimizer's update history for effective transitions --_Authors_ (A-2)
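
To make the schedule above concrete, here is a rough standalone illustration (not the library's code) of the cosine-annealing multiplier and of what `autorestart=True` does with `t_cur`; variable names mirror the optimizer attributes:

```python
import numpy as np

def eta(t_cur, total_iterations, eta_min=0, eta_max=1):
    # cosine-annealing multiplier applied to the learning rate each iteration
    return eta_min + 0.5 * (eta_max - eta_min) * (
        1 + np.cos(np.pi * t_cur / total_iterations))

t_cur, total_iterations = 0, 24
for iteration in range(72):                  # three restart cycles
    lr_scale = eta(t_cur, total_iterations)  # 1 at cycle start, 0 at cycle end
    t_cur += 1
    if t_cur >= total_iterations:            # roughly what autorestart=True does
        t_cur = 0                            # (manual option: t_cur = -1, set one iteration earlier)
```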
6 changes: 4 additions & 2 deletions example.py
@@ -33,8 +33,10 @@
eta_history.append(K_eval(model.optimizer.eta_t, K))
lr_history.append(K_eval(model.optimizer.lr_t, K))
print("Iter {} loss: {}".format(iteration + 1, "%.3f" % loss))
if iteration == (24 - 2):
K.set_value(model.optimizer.t_cur, -1) # WARM RESTART

# MANUAL OPTION if autorestart=False is used
# if iteration == (24 - 2):
# K.set_value(model.optimizer.t_cur, -1) # WARM RESTART
print("EPOCH {} COMPLETED\n".format(epoch + 1))

# learning rate at iteration `t` (lr_t) is subject to scaling depending on
2 changes: 1 addition & 1 deletion keras_adamw/__init__.py
@@ -28,4 +28,4 @@
from .utils import get_weight_decays, fill_dict_in_order
from .utils import reset_seeds, K_eval

__version__ = '1.32'
__version__ = '1.35'
35 changes: 31 additions & 4 deletions keras_adamw/optimizers.py
@@ -3,7 +3,7 @@
from keras.legacy import interfaces
from keras.optimizers import Optimizer
from .utils import _init_weight_decays, _apply_weight_decays, _check_args
from .utils import _apply_lr_multiplier, _update_t_cur_eta_t
from .utils import _apply_lr_multiplier, _update_t_cur_eta_t, _set_autorestart
from .utils import K_eval as KE


@@ -41,6 +41,13 @@ class AdamW(Optimizer):
use_cosine_annealing: bool. If True, multiplies lr each train iteration
as a function of eta_min, eta_max, total_iterations,
and t_cur (current); [2]-Appendix, 2
autorestart: bool / None. If True, will automatically do Warm Restarts
by resetting `t_cur=0` after `total_iterations`. If None,
will default to same as `use_cosine_annealing`. If True
but `use_cosine_annealing` is False, will raise ValueError.
Note: once optimizer is built (happens on first model fit),
changing `autorestart` has no effect; optimizer needs to be
re-built.
eta_min, eta_max: int, int. Min & max values of cosine annealing
lr multiplier; [2]-Appendix, 2
t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -69,7 +76,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
amsgrad=False, model=None, zero_penalties=True,
batch_size=32, total_iterations=0, total_iterations_wd=None,
use_cosine_annealing=False, lr_multipliers=None,
weight_decays=None, init_verbose=True,
weight_decays=None, autorestart=None, init_verbose=True,
eta_min=0, eta_max=1, t_cur=0, **kwargs):
if total_iterations > 1:
weight_decays = _init_weight_decays(model, zero_penalties,
@@ -101,6 +108,7 @@ def __init__(self, learning_rate=0.001, beta_1=0.9, beta_2=0.999,
self.init_verbose = init_verbose
self.use_cosine_annealing = use_cosine_annealing

_set_autorestart(self, autorestart, use_cosine_annealing)
_check_args(self, total_iterations, use_cosine_annealing, weight_decays)
self._init_lr = learning_rate # to print lr_mult setup
self._init_notified = False
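
`_set_autorestart` itself lives in `utils.py` and is not shown in this diff; going only by the docstring above, its core logic might look roughly like this sketch:

```python
def _set_autorestart(self, autorestart, use_cosine_annealing):
    # default: follow `use_cosine_annealing` when `autorestart` isn't given
    if autorestart is None:
        self.autorestart = bool(use_cosine_annealing)
    elif autorestart and not use_cosine_annealing:
        raise ValueError("`autorestart` can only be used with "
                         "`use_cosine_annealing=True`")
    else:
        self.autorestart = autorestart
```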
@@ -186,6 +194,7 @@ def get_config(self):
'weight_decays': self.weight_decays,
'lr_multipliers': self.lr_multipliers,
'use_cosine_annealing': self.use_cosine_annealing,
'autorestart': self.autorestart,
't_cur': int(K_eval(self.t_cur)),
'eta_t': float(K_eval(self.eta_t)),
'eta_min': float(K_eval(self.eta_min)),
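
With `autorestart` included in `get_config` (above), the setting should survive a standard Keras save/load round trip; a minimal sketch, assuming the `model` from the earlier example and the usual `custom_objects` mechanism:

```python
from keras.models import load_model
from keras_adamw import AdamW

model.save('model.h5')  # serializes optimizer config, incl. t_cur and autorestart
model = load_model('model.h5', custom_objects={'AdamW': AdamW})
```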
@@ -228,6 +237,13 @@ class NadamW(Optimizer):
use_cosine_annealing: bool. If True, multiplies lr each train iteration
as a function of eta_min, eta_max, total_iterations,
and t_cur (current); [3]-Appendix, 2
autorestart: bool / None. If True, will automatically do Warm Restarts
by resetting `t_cur=0` after `total_iterations`. If None,
will default to same as `use_cosine_annealing`. If True
but `use_cosine_annealing` is False, will raise ValueError.
Note: once optimizer is built (happens on first model fit),
changing `autorestart` has no effect; optimizer needs to be
re-built.
eta_min, eta_max: int, int. Min & max values of cosine annealing
lr multiplier; [3]-Appendix, 2
t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -258,7 +274,7 @@ def __init__(self, learning_rate=0.002, beta_1=0.9, beta_2=0.999,
model=None, zero_penalties=True, batch_size=32,
total_iterations=0, total_iterations_wd=None,
use_cosine_annealing=False, lr_multipliers=None,
weight_decays=None, init_verbose=True,
weight_decays=None, autorestart=None, init_verbose=True,
eta_min=0, eta_max=1, t_cur=0, **kwargs):
if total_iterations > 1:
weight_decays = _init_weight_decays(model, zero_penalties,
@@ -289,6 +305,7 @@ def __init__(self, learning_rate=0.002, beta_1=0.9, beta_2=0.999,
self.use_cosine_annealing = use_cosine_annealing
self.init_verbose = init_verbose

_set_autorestart(self, autorestart, use_cosine_annealing)
_check_args(self, total_iterations, use_cosine_annealing, weight_decays)
self._init_lr = learning_rate # to print lr_mult setup
self._init_notified = False
@@ -376,6 +393,7 @@ def get_config(self):
'weight_decays': self.weight_decays,
'lr_multipliers': self.lr_multipliers,
'use_cosine_annealing': self.use_cosine_annealing,
'autorestart': self.autorestart,
't_cur': int(K_eval(self.t_cur)),
'eta_t': float(K_eval(self.eta_t)),
'eta_min': float(K_eval(self.eta_min)),
@@ -414,6 +432,13 @@ class SGDW(Optimizer):
use_cosine_annealing: bool. If True, multiplies lr each train iteration
as a function of eta_min, eta_max, total_iterations,
and t_cur (current); [2]-Appendix, 2
autorestart: bool / None. If True, will automatically do Warm Restarts
by resetting `t_cur=0` after `total_iterations`. If None,
will default to same as `use_cosine_annealing`. If True
but `use_cosine_annealing` is False, will raise ValueError.
Note: once optimizer is built (happens on first model fit),
changing `autorestart` has no effect; optimizer needs to be
re-built.
eta_min, eta_max: int, int. Min & max values of cosine annealing
lr multiplier; [2]-Appendix, 2
t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -443,7 +468,7 @@ def __init__(self, learning_rate=0.01, momentum=0., nesterov=False,
model=None, zero_penalties=True, batch_size=32,
total_iterations=0, total_iterations_wd=None,
use_cosine_annealing=False, lr_multipliers=None,
weight_decays=None, init_verbose=True,
weight_decays=None, autorestart=None, init_verbose=True,
eta_min=0, eta_max=1, t_cur=0, **kwargs):
if total_iterations > 1:
weight_decays = _init_weight_decays(model, zero_penalties,
@@ -473,6 +498,7 @@ def __init__(self, learning_rate=0.01, momentum=0., nesterov=False,
self.init_verbose = init_verbose
self.use_cosine_annealing = use_cosine_annealing

_set_autorestart(self, autorestart, use_cosine_annealing)
_check_args(self, total_iterations, use_cosine_annealing, weight_decays)
self._init_lr = learning_rate # to print lr_mult setup
self._init_notified = False
@@ -535,6 +561,7 @@ def get_config(self):
'weight_decays': self.weight_decays,
'lr_multipliers': self.lr_multipliers,
'use_cosine_annealing': self.use_cosine_annealing,
'autorestart': self.autorestart,
't_cur': int(K_eval(self.t_cur)),
'eta_t': float(K_eval(self.eta_t)),
'eta_min': float(K_eval(self.eta_min)),
35 changes: 31 additions & 4 deletions keras_adamw/optimizers_225.py
@@ -2,7 +2,7 @@
from keras.legacy import interfaces
from keras.optimizers import Optimizer
from .utils import _init_weight_decays, _apply_weight_decays, _check_args
from .utils import _apply_lr_multiplier, _update_t_cur_eta_t
from .utils import _apply_lr_multiplier, _update_t_cur_eta_t, _set_autorestart


class AdamW(Optimizer):
@@ -30,6 +30,13 @@ class AdamW(Optimizer):
use_cosine_annealing: bool. If True, multiplies lr each train iteration
as a function of eta_min, eta_max, total_iterations,
and t_cur (current); [2]-Appendix, 2
autorestart: bool / None. If True, will automatically do Warm Restarts
by resetting `t_cur=0` after `total_iterations`. If None,
will default to same as `use_cosine_annealing`. If True
but `use_cosine_annealing` is False, will raise ValueError.
Note: once optimizer is built (happens on first model fit),
changing `autorestart` has no effect; optimizer needs to be
re-built.
eta_min, eta_max: int, int. Min & max values of cosine annealing
lr multiplier; [2]-Appendix, 2
t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -58,7 +65,7 @@ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False,
epsilon=None, decay=0.0, model=None, zero_penalties=True,
batch_size=32, total_iterations=0, total_iterations_wd=None,
use_cosine_annealing=False, lr_multipliers=None,
weight_decays=None, init_verbose=True,
weight_decays=None, autorestart=None, init_verbose=True,
eta_min=0, eta_max=1, t_cur=0, **kwargs):
if total_iterations > 1:
weight_decays = _init_weight_decays(model, zero_penalties,
@@ -88,6 +95,7 @@ def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False,
self.init_verbose = init_verbose
self.use_cosine_annealing = use_cosine_annealing

_set_autorestart(self, autorestart, use_cosine_annealing)
_check_args(self, total_iterations, use_cosine_annealing, weight_decays)
self._init_lr = lr # to print lr_mult setup
self._init_notified = False
@@ -162,6 +170,7 @@ def get_config(self):
'weight_decays': self.weight_decays,
'lr_multipliers': self.lr_multipliers,
'use_cosine_annealing': self.use_cosine_annealing,
'autorestart': self.autorestart,
't_cur': int(K.get_value(self.t_cur)),
'eta_t': float(K.eval(self.eta_t)),
'eta_min': float(K.get_value(self.eta_min)),
@@ -204,6 +213,13 @@ class NadamW(Optimizer):
use_cosine_annealing: bool. If True, multiplies lr each train iteration
as a function of eta_min, eta_max, total_iterations,
and t_cur (current); [3]-Appendix, 2
autorestart: bool / None. If True, will automatically do Warm Restarts
by resetting `t_cur=0` after `total_iterations`. If None,
will default to same as `use_cosine_annealing`. If True
but `use_cosine_annealing` is False, will raise ValueError.
Note: once optimizer is built (happens on first model fit),
changing `autorestart` has no effect; optimizer needs to be
re-built.
eta_min, eta_max: int, int. Min & max values of cosine annealing
lr multiplier; [3]-Appendix, 2
t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -235,7 +251,7 @@ def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
model=None, zero_penalties=True, batch_size=32,
total_iterations=0, total_iterations_wd=None,
use_cosine_annealing=False, lr_multipliers=None,
weight_decays=None, init_verbose=True,
weight_decays=None, autorestart=None, init_verbose=True,
eta_min=0, eta_max=1, t_cur=0, **kwargs):
if total_iterations > 1:
weight_decays = _init_weight_decays(model, zero_penalties,
@@ -264,6 +280,7 @@ def __init__(self, lr=0.002, beta_1=0.9, beta_2=0.999,
self.use_cosine_annealing = use_cosine_annealing
self.init_verbose = init_verbose

_set_autorestart(self, autorestart, use_cosine_annealing)
_check_args(self, total_iterations, use_cosine_annealing, weight_decays)
self._init_lr = lr # to print lr_mult setup
self._init_notified = False
@@ -340,6 +357,7 @@ def get_config(self):
'weight_decays': self.weight_decays,
'lr_multipliers': self.lr_multipliers,
'use_cosine_annealing': self.use_cosine_annealing,
'autorestart': self.autorestart,
't_cur': int(K.get_value(self.t_cur)),
'eta_t': float(K.eval(self.eta_t)),
'eta_min': float(K.get_value(self.eta_min)),
@@ -378,6 +396,13 @@ class SGDW(Optimizer):
use_cosine_annealing: bool. If True, multiplies lr each train iteration
as a function of eta_min, eta_max, total_iterations,
and t_cur (current); [2]-Appendix, 2
autorestart: bool / None. If True, will automatically do Warm Restarts
by resetting `t_cur=0` after `total_iterations`. If None,
will default to same as `use_cosine_annealing`. If True
but `use_cosine_annealing` is False, will raise ValueError.
Note: once optimizer is built (happens on first model fit),
changing `autorestart` has no effect; optimizer needs to be
re-built.
eta_min, eta_max: int, int. Min & max values of cosine annealing
lr multiplier; [2]-Appendix, 2
t_cur: int. Value to initialize t_cur to - used for 'warm restarts'.
@@ -407,7 +432,7 @@ def __init__(self, lr=0.01, momentum=0., nesterov=False, decay=0.0,
model=None, zero_penalties=True, batch_size=32,
total_iterations=0, total_iterations_wd=None,
use_cosine_annealing=False, lr_multipliers=None,
weight_decays=None, init_verbose=True,
weight_decays=None, autorestart=None, init_verbose=True,
eta_min=0, eta_max=1, t_cur=0, **kwargs):
if total_iterations > 1:
weight_decays = _init_weight_decays(model, zero_penalties,
@@ -435,6 +460,7 @@ def __init__(self, lr=0.01, momentum=0., nesterov=False, decay=0.0,
self.init_verbose = init_verbose
self.use_cosine_annealing = use_cosine_annealing

_set_autorestart(self, autorestart, use_cosine_annealing)
_check_args(self, total_iterations, use_cosine_annealing, weight_decays)
self._init_lr = lr # to print lr_mult setup
self._init_notified = False
@@ -495,6 +521,7 @@ def get_config(self):
'weight_decays': self.weight_decays,
'lr_multipliers': self.lr_multipliers,
'use_cosine_annealing': self.use_cosine_annealing,
'autorestart': self.autorestart,
't_cur': int(K.get_value(self.t_cur)),
'eta_t': float(K.eval(self.eta_t)),
'eta_min': float(K.get_value(self.eta_min)),