# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=invalid-name
# pylint: disable=g-classes-have-attributes
"""Legacy v1 optimizer classes.

For more examples see the base class `tf.compat.v1.keras.optimizers.Optimizer`.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from six.moves import zip  # pylint: disable=redefined-builtin

from tensorflow.python.distribute import distribution_strategy_context
from tensorflow.python.eager import backprop
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend as K
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import training_util
from tensorflow.python.training.tracking import base as trackable
from tensorflow.python.util import nest


class Optimizer(object):
  """Abstract optimizer base class.

  Note: this is the parent class of all optimizers, not an actual optimizer
  that can be used for training models.

  All Keras optimizers support the following keyword arguments:

      clipnorm: float >= 0. Gradients will be clipped
          when their L2 norm exceeds this value.
      clipvalue: float >= 0. Gradients will be clipped
          when their absolute value exceeds this value.
  """

  def __init__(self, **kwargs):
    allowed_kwargs = {'clipnorm', 'clipvalue'}
    for k in kwargs:
      if k not in allowed_kwargs:
        raise TypeError('Unexpected keyword argument '
                        'passed to optimizer: ' + str(k))
      # checks that clipnorm >= 0 and clipvalue >= 0
      if kwargs[k] < 0:
        raise ValueError('Expected {} >= 0, received: {}'.format(k, kwargs[k]))
    self.__dict__.update(kwargs)
    self.updates = []
    self.weights = []

  # Set this to False, indicating `apply_gradients` does not take the
  # `experimental_aggregate_gradients` argument.
  _HAS_AGGREGATE_GRAD = False

  def _create_all_weights(self, params):
    """Creates and sets all optimizer weights.

    Args:
      params: list or tuple of `Variable` objects that will be minimized
        using this optimizer.

    Returns:
      Specific weight values that are used in `get_updates`.
    """
    raise NotImplementedError

  def get_updates(self, loss, params):
    raise NotImplementedError

  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    Args:
      loss: Loss tensor.
      params: List of variables.

    Returns:
      List of gradient tensors.

    Raises:
      ValueError: In case any gradient cannot be computed (e.g. if gradient
        function not implemented).
    """
    grads = K.gradients(loss, params)
    if any(g is None for g in grads):
      raise ValueError('An operation has `None` for gradient. '
                       'Please make sure that all of your ops have a '
                       'gradient defined (i.e. are differentiable). '
                       'Common ops without gradient: '
                       'K.argmax, K.round, K.eval.')
    if hasattr(self, 'clipnorm'):
      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, 'clipvalue'):
      grads = [
          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
          for g in grads
      ]
    return grads

  def set_weights(self, weights):
    """Sets the weights of the optimizer, from Numpy arrays.

    Should only be called after computing the gradients
    (otherwise the optimizer has no weights).

    Args:
      weights: a list of Numpy arrays. The number of arrays and their shapes
        must match those of the optimizer's weights
        (i.e. it should match the output of `get_weights`).

    Raises:
      ValueError: in case of incompatible weight shapes.
    """
    params = self.weights
    if len(params) != len(weights):
      raise ValueError('Length of the specified weight list (' +
                       str(len(weights)) +
                       ') does not match the number of weights '
                       'of the optimizer (' + str(len(params)) + ')')
    weight_value_tuples = []
    param_values = K.batch_get_value(params)
    for pv, p, w in zip(param_values, params, weights):
      if pv.shape != w.shape:
        raise ValueError('Optimizer weight shape ' + str(pv.shape) +
                         ' not compatible with '
                         'provided weight shape ' + str(w.shape))
      weight_value_tuples.append((p, w))
    K.batch_set_value(weight_value_tuples)

  def get_weights(self):
    """Returns the current value of the weights of the optimizer.

    Returns:
      A list of numpy arrays.
    """
    return K.batch_get_value(self.weights)

  def get_config(self):
    config = {}
    if hasattr(self, 'clipnorm'):
      config['clipnorm'] = self.clipnorm
    if hasattr(self, 'clipvalue'):
      config['clipvalue'] = self.clipvalue
    return config

  @classmethod
  def from_config(cls, config):
    return cls(**config)
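

# Illustrative sketch (not part of the original module): mirrors how the base
# `Optimizer.get_gradients` applies the `clipnorm` / `clipvalue` keyword
# arguments, using plain NumPy instead of TF ops. The helper name is
# hypothetical and only meant as a worked example.
def _example_clip_gradient(grad, clipnorm=None, clipvalue=None):
  import numpy as np  # local import so the sketch stays self-contained
  grad = np.asarray(grad, dtype='float64')
  if clipnorm is not None:
    norm = np.linalg.norm(grad)
    if norm > clipnorm:  # clip_by_norm only rescales when the norm is too big
      grad = grad * (clipnorm / norm)
  if clipvalue is not None:
    grad = np.clip(grad, -clipvalue, clipvalue)  # element-wise value clipping
  return grad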


class SGD(Optimizer):
  """Stochastic gradient descent optimizer.

  Includes support for momentum,
  learning rate decay, and Nesterov momentum.

  Args:
    lr: float >= 0. Learning rate.
    momentum: float >= 0. Parameter that accelerates SGD in the relevant
      direction and dampens oscillations.
    decay: float >= 0. Learning rate decay over each update.
    nesterov: boolean. Whether to apply Nesterov momentum.
  """

  def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
    super(SGD, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.momentum = K.variable(momentum, name='momentum')
      self.decay = K.variable(decay, name='decay')
    self.initial_decay = decay
    self.nesterov = nesterov

  def _create_all_weights(self, params):
    shapes = [K.int_shape(p) for p in params]
    moments = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    return moments

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))
    # momentum
    moments = self._create_all_weights(params)
    for p, g, m in zip(params, grads, moments):
      v = self.momentum * m - lr * g  # velocity
      self.updates.append(state_ops.assign(m, v))

      if self.nesterov:
        new_p = p + self.momentum * v - lr * g
      else:
        new_p = p + v

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(K.get_value(self.lr)),
        'momentum': float(K.get_value(self.momentum)),
        'decay': float(K.get_value(self.decay)),
        'nesterov': self.nesterov
    }
    base_config = super(SGD, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
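

# Illustrative sketch (not part of the original module): one SGD step with the
# decayed learning rate, momentum velocity and optional Nesterov correction
# computed in `SGD.get_updates` above, written with plain Python floats.
# The helper name is hypothetical.
def _example_sgd_step(p, g, m, lr=0.01, momentum=0.9, decay=0., iterations=0,
                      nesterov=False):
  lr_t = lr * (1. / (1. + decay * iterations))  # time-based learning rate decay
  v = momentum * m - lr_t * g  # new velocity, stored back into the moment slot
  if nesterov:
    new_p = p + momentum * v - lr_t * g
  else:
    new_p = p + v
  return new_p, v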


class RMSprop(Optimizer):
  """RMSProp optimizer.

  It is recommended to leave the parameters of this optimizer
  at their default values
  (except the learning rate, which can be freely tuned).

  Args:
    lr: float >= 0. Learning rate.
    rho: float >= 0.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
    super(RMSprop, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.rho = K.variable(rho, name='rho')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = K.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    accumulators = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))
      new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(K.get_value(self.lr)),
        'rho': float(K.get_value(self.rho)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(RMSprop, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
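

# Illustrative sketch (not part of the original module): the per-parameter
# RMSprop update from `RMSprop.get_updates`, where `a` is the running average
# of squared gradients. The helper name is hypothetical.
def _example_rmsprop_step(p, g, a, lr=0.001, rho=0.9, epsilon=1e-7):
  new_a = rho * a + (1. - rho) * g ** 2          # decayed squared-gradient average
  new_p = p - lr * g / (new_a ** 0.5 + epsilon)  # scale step by RMS of gradients
  return new_p, new_a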


class Adagrad(Optimizer):
  """Adagrad optimizer.

  Adagrad is an optimizer with parameter-specific learning rates,
  which are adapted relative to how frequently a parameter gets
  updated during training. The more updates a parameter receives,
  the smaller the updates.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Args:
    lr: float >= 0. Initial learning rate.
    epsilon: float >= 0. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.

  References:
    - [Adaptive Subgradient Methods for Online Learning and Stochastic
      Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
  """

  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
    super(Adagrad, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = K.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)

    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      new_a = a + math_ops.square(g)  # update accumulator
      self.updates.append(state_ops.assign(a, new_a))
      new_p = p - lr * g / (K.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(K.get_value(self.lr)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adagrad, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
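

# Illustrative sketch (not part of the original module): the Adagrad update
# from `Adagrad.get_updates`, which accumulates *all* past squared gradients so
# frequently updated parameters get smaller steps. The helper name is
# hypothetical.
def _example_adagrad_step(p, g, a, lr=0.01, epsilon=1e-7):
  new_a = a + g ** 2                             # monotonically growing accumulator
  new_p = p - lr * g / (new_a ** 0.5 + epsilon)
  return new_p, new_a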


class Adadelta(Optimizer):
  """Adadelta optimizer.

  Adadelta is a more robust extension of Adagrad
  that adapts learning rates based on a moving window of gradient updates,
  instead of accumulating all past gradients. This way, Adadelta continues
  learning even when many updates have been done. Compared to Adagrad, in the
  original version of Adadelta you don't have to set an initial learning
  rate. In this version, initial learning rate and decay factor can
  be set, as in most other Keras optimizers.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Args:
    lr: float >= 0. Initial learning rate, defaults to 1.
      It is recommended to leave it at the default value.
    rho: float >= 0. Adadelta decay factor, corresponding to the fraction of
      gradient to keep at each time step.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Initial learning rate decay.

  References:
    - [Adadelta - an adaptive learning rate
      method](http://arxiv.org/abs/1212.5701)
  """

  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
    super(Adadelta, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.lr = K.variable(lr, name='lr')
      self.decay = K.variable(decay, name='decay')
      self.iterations = K.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = K.epsilon()
    self.rho = rho
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    shapes = [K.int_shape(p) for p in params]
    accumulators = [K.zeros(shape) for shape in shapes]
    delta_accumulators = [K.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    return accumulators, delta_accumulators

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]
    accumulators, delta_accumulators = self._create_all_weights(params)

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))

      # use the new accumulator and the *old* delta_accumulator
      update = g * K.sqrt(d_a + self.epsilon) / K.sqrt(new_a + self.epsilon)
      new_p = p - lr * update

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))

      # update delta_accumulator
      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
      self.updates.append(state_ops.assign(d_a, new_d_a))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(K.get_value(self.lr)),
        'rho': self.rho,
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adadelta, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
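

# Illustrative sketch (not part of the original module): the Adadelta update
# from `Adadelta.get_updates`, which rescales each gradient by the ratio of the
# RMS of past *updates* to the RMS of past gradients. The helper name is
# hypothetical.
def _example_adadelta_step(p, g, a, d_a, lr=1.0, rho=0.95, epsilon=1e-7):
  new_a = rho * a + (1. - rho) * g ** 2                    # squared-gradient window
  update = g * (d_a + epsilon) ** 0.5 / (new_a + epsilon) ** 0.5
  new_p = p - lr * update
  new_d_a = rho * d_a + (1. - rho) * update ** 2           # squared-update window
  return new_p, new_a, new_d_a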


class Adam(Optimizer):
  """Adam optimizer.

  Default parameters follow those provided in the original paper.

  Args:
    lr: float >= 0. Learning rate.
    beta_1: float, 0 < beta < 1. Generally close to 1.
    beta_2: float, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
  """

  def __init__(self,
               lr=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               amsgrad=False,
               **kwargs):
    super(Adam, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    if epsilon is None:
      epsilon = K.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay
    self.amsgrad = amsgrad

  def _create_all_weights(self, params):
    ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    if self.amsgrad:
      vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
    else:
      vhats = [K.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats
    return ms, vs, vhats

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())
    lr_t = lr * (
        K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
        (1. - math_ops.pow(self.beta_1, t)))

    ms, vs, vhats = self._create_all_weights(params)
    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
      if self.amsgrad:
        vhat_t = math_ops.maximum(vhat, v_t)
        p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
        self.updates.append(state_ops.assign(vhat, vhat_t))
      else:
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad
    }
    base_config = super(Adam, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
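

# Illustrative sketch (not part of the original module): one Adam step with the
# bias-corrected learning rate used in `Adam.get_updates`,
# lr_t = lr * sqrt(1 - beta_2**t) / (1 - beta_1**t). `t` is the 1-based step
# count; the helper name is hypothetical.
def _example_adam_step(p, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999,
                       epsilon=1e-7):
  lr_t = lr * ((1. - beta_2 ** t) ** 0.5) / (1. - beta_1 ** t)
  m_t = beta_1 * m + (1. - beta_1) * g             # first-moment estimate
  v_t = beta_2 * v + (1. - beta_2) * g ** 2        # second-moment estimate
  new_p = p - lr_t * m_t / (v_t ** 0.5 + epsilon)
  return new_p, m_t, v_t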


class Adamax(Optimizer):
  """Adamax optimizer from Adam paper's Section 7.

  It is a variant of Adam based on the infinity norm.
  Default parameters follow those provided in the paper.

  Args:
    lr: float >= 0. Learning rate.
    beta_1: float, 0 < beta < 1. Generally close to 1.
    beta_2: float, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               **kwargs):
    super(Adamax, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
      self.decay = K.variable(decay, name='decay')
    if epsilon is None:
      epsilon = K.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    shapes = [K.int_shape(p) for p in params]
    # zero init of 1st moment
    ms = [K.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [K.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us
    return ms, us

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (  # pylint: disable=g-no-augmented-assignment
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations, K.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())
    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))

    ms, us = self._create_all_weights(params)

    for p, g, m, u in zip(params, grads, ms, us):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
      p_t = p - lr_t * m_t / (u_t + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(u, u_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'decay': float(K.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adamax, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
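

# Illustrative sketch (not part of the original module): the Adamax update from
# `Adamax.get_updates`, which replaces Adam's second moment with an
# exponentially weighted infinity norm `u`. `t` is the 1-based step count; the
# helper name is hypothetical.
def _example_adamax_step(p, g, m, u, t, lr=0.002, beta_1=0.9, beta_2=0.999,
                         epsilon=1e-7):
  lr_t = lr / (1. - beta_1 ** t)                   # bias correction for the mean
  m_t = beta_1 * m + (1. - beta_1) * g
  u_t = max(beta_2 * u, abs(g))                    # infinity-norm accumulator
  new_p = p - lr_t * m_t / (u_t + epsilon)
  return new_p, m_t, u_t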


class Nadam(Optimizer):
  """Nesterov Adam optimizer.

  Much like Adam is essentially RMSprop with momentum,
  Nadam is Adam with Nesterov momentum.

  Default parameters follow those provided in the paper.
  It is recommended to leave the parameters of this optimizer
  at their default values.

  Args:
    lr: float >= 0. Learning rate.
    beta_1: float, 0 < beta < 1. Generally close to 1.
    beta_2: float, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               schedule_decay=0.004,
               **kwargs):
    super(Nadam, self).__init__(**kwargs)
    with K.name_scope(self.__class__.__name__):
      self.iterations = K.variable(0, dtype='int64', name='iterations')
      self.m_schedule = K.variable(1., name='m_schedule')
      self.lr = K.variable(lr, name='lr')
      self.beta_1 = K.variable(beta_1, name='beta_1')
      self.beta_2 = K.variable(beta_2, name='beta_2')
    if epsilon is None:
      epsilon = K.epsilon()
    self.epsilon = epsilon
    self.schedule_decay = schedule_decay

  def _create_all_weights(self, params):
    shapes = [K.int_shape(p) for p in params]
    ms = [K.zeros(shape) for shape in shapes]
    vs = [K.zeros(shape) for shape in shapes]

    self.weights = [self.iterations, self.m_schedule] + ms + vs
    return ms, vs

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, K.floatx())

    # Warming momentum schedule, as recommended in the Nadam paper.
    momentum_cache_t = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    self.updates.append((self.m_schedule, m_schedule_new))

    ms, vs = self._create_all_weights(params)

    for p, g, m, v in zip(params, grads, ms, vs):
      # The following equations are given in the Nadam paper.
      g_prime = g / (1. - m_schedule_new)
      m_t = self.beta_1 * m + (1. - self.beta_1) * g
      m_t_prime = m_t / (1. - m_schedule_next)
      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)
      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
      m_t_bar = (1. -
                 momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))

      p_t = p - self.lr * m_t_bar / (K.sqrt(v_t_prime) + self.epsilon)
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(K.get_value(self.lr)),
        'beta_1': float(K.get_value(self.beta_1)),
        'beta_2': float(K.get_value(self.beta_2)),
        'epsilon': self.epsilon,
        'schedule_decay': self.schedule_decay
    }
    base_config = super(Nadam, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
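

# Illustrative sketch (not part of the original module): the warming momentum
# schedule and bias-corrected estimates used in `Nadam.get_updates`.
# `m_schedule` is the running product of the per-step momentum caches (starts
# at 1.), `t` is the 1-based step count, and the helper name is hypothetical.
def _example_nadam_step(p, g, m, v, t, m_schedule, lr=0.002, beta_1=0.9,
                        beta_2=0.999, epsilon=1e-7, schedule_decay=0.004):
  momentum_cache_t = beta_1 * (1. - 0.5 * 0.96 ** (t * schedule_decay))
  momentum_cache_t_1 = beta_1 * (1. - 0.5 * 0.96 ** ((t + 1) * schedule_decay))
  m_schedule_new = m_schedule * momentum_cache_t
  m_schedule_next = m_schedule_new * momentum_cache_t_1
  g_prime = g / (1. - m_schedule_new)
  m_t = beta_1 * m + (1. - beta_1) * g
  m_t_prime = m_t / (1. - m_schedule_next)
  v_t = beta_2 * v + (1. - beta_2) * g ** 2
  v_t_prime = v_t / (1. - beta_2 ** t)
  m_t_bar = (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime
  new_p = p - lr * m_t_bar / (v_t_prime ** 0.5 + epsilon)
  return new_p, m_t, v_t, m_schedule_new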


class TFOptimizer(Optimizer, trackable.Trackable):
  """Wrapper class for native TensorFlow optimizers."""

  def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
    self.optimizer = optimizer
    self._track_trackable(optimizer, name='optimizer')
    if iterations is None:
      with K.name_scope(self.__class__.__name__):
        self.iterations = K.variable(0, dtype='int64', name='iterations')
    else:
      self.iterations = iterations
    self._track_trackable(self.iterations, name='global_step')

  def _clip_gradients(self, grads):
    """Clip gradients according to the clipnorm and clipvalue attributes."""
    # TFOptimizer wrapper has no gradient clipping options.
    return grads

  def minimize(self, loss, var_list, grad_loss=None, tape=None):
    """Mimics the `OptimizerV2.minimize` API."""
    if not callable(loss) and tape is None:
      raise ValueError('`tape` is required when a `Tensor` loss is passed.')
    tape = tape if tape is not None else backprop.GradientTape()

    if callable(loss):
      with tape:
        if not callable(var_list):
          tape.watch(var_list)
        loss = loss()
        if callable(var_list):
          var_list = var_list()

    var_list = nest.flatten(var_list)
    if var_list:
      grads = tape.gradient(loss, var_list, grad_loss)
      grads_and_vars = list(zip(grads, var_list))
      self.apply_gradients(grads_and_vars)

  def apply_gradients(self, grads_and_vars):
    self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations)

  def get_grads(self, loss, params):
    return self.optimizer.compute_gradients(loss, params)

  def get_updates(self, loss, params):
    if distribution_strategy_context.has_strategy():
      self.updates = []

      if not params:
        # After the model vars have been created, get_updates is called a
        # second time with params as an empty list. This ensures that we call
        # compute_gradients with params=None.
        grads = self.optimizer.compute_gradients(loss)
      else:
        grads = self.optimizer.compute_gradients(loss, params)
      global_step = training_util.get_global_step()
      opt_update = self.optimizer.apply_gradients(grads, global_step)
    else:
      if not params:
        self.updates = [state_ops.assign_add(self.iterations, 1)]
        return self.updates

      # Updates list starts out empty because the iterations variable is
      # incremented in optimizer.apply_gradients().
      self.updates = []
      grads = self.optimizer.compute_gradients(loss, params)
      opt_update = self.optimizer.apply_gradients(
          grads, global_step=self.iterations)

    self.updates.append(opt_update)
    return self.updates

  @property
  def weights(self):
    raise NotImplementedError

  def get_config(self):
    raise NotImplementedError

  def from_config(self, config):
    raise NotImplementedError
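

# Illustrative sketch (not part of the original module): wrapping a native
# TF1-style optimizer so Keras-style code can drive it through `TFOptimizer`.
# Assumes graph-mode / `tf.compat.v1` style training; the function name is
# hypothetical and the wrapped optimizer is just one possible choice.
def _example_wrap_native_optimizer(learning_rate=0.01):
  from tensorflow.python.training import gradient_descent
  native = gradient_descent.GradientDescentOptimizer(learning_rate)
  return TFOptimizer(native)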


# Aliases.

sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam
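

# Illustrative sketch (not part of the original module): the `get_config` /
# `from_config` round trip supported by every optimizer above, shown here with
# the lowercase `sgd` alias. The helper name is hypothetical.
def _example_config_roundtrip():
  opt = sgd(lr=0.01, momentum=0.9, nesterov=True, clipnorm=1.0)
  config = opt.get_config()       # plain-Python dict of hyperparameter values
  return sgd.from_config(config)  # rebuilds an equivalent optimizer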