# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam for TensorFlow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
from tensorflow.python.util.tf_export import tf_export


@tf_export(v1=["train.AdamOptimizer"])
class AdamOptimizer(optimizer.Optimizer):
  """Optimizer that implements the Adam algorithm.

  References:
    Adam - A Method for Stochastic Optimization:
      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))

  @compatibility(TF2)
  `tf.compat.v1.train.AdamOptimizer` is compatible with eager mode and
  `tf.function`.
  When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
  `epsilon` can each be a callable that takes no arguments and returns the
  actual value to use. This can be useful for changing these values across
  different invocations of optimizer functions.

  To switch to native TF2 style, use [`tf.keras.optimizers.Adam`]
  (https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam)
  instead. Note that, due to implementation differences,
  `tf.keras.optimizers.Adam` and `tf.compat.v1.train.AdamOptimizer` may
  differ slightly in floating point numerics even though the formula used
  for the variable updates is the same.

  #### Structural Mapping to Native TF2

  Before:

  ```python
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
  ```

  After:

  ```python
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  ```

  #### How to Map Arguments

  |TF1 Arg Name |TF2 Arg Name |Note                                     |
  |-------------|-------------|-----------------------------------------|
  |learning_rate|learning_rate|Be careful of setting learning_rate as a
  : : : tensor value computed from the global step. In TF1 this was usually :
  : : : meant to imply a dynamic learning rate and would recompute it in    :
  : : : each step. In TF2 (eager + function) it will treat it as a scalar   :
  : : : value that only gets computed once instead of a symbolic            :
  : : : placeholder to be computed each time.                               :
  |beta1        |beta_1       |                                         |
  |beta2        |beta_2       |                                         |
  |epsilon      |epsilon      |Default value is 1e-08 in TF1, but 1e-07
  : : : in TF2.                                                             :
  |use_locking  |N/A          |Not applicable in TF2.                   |
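
  In TF2, a dynamic learning rate is usually expressed by passing a
  `tf.keras.optimizers.schedules.LearningRateSchedule` (or a zero-argument
  callable) to the optimizer, rather than a tensor computed from the global
  step. A minimal sketch with illustrative values:

  ```python
  # Decays the learning rate from 0.001 by a factor of 0.96 every 1000 steps.
  lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
      initial_learning_rate=0.001, decay_steps=1000, decay_rate=0.96)
  optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
  ```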

  #### Before & After Usage Example

  Before:

  ```python
  x = tf.Variable([1, 2, 3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```

  After:

  ```python
  x = tf.Variable([1, 2, 3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```

  @end_compatibility
  """

  def __init__(self,
               learning_rate=0.001,
               beta1=0.9,
               beta2=0.999,
               epsilon=1e-8,
               use_locking=False,
               name="Adam"):
    r"""Construct a new Adam optimizer.

    Initialization:

    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
    $$t := 0 \text{(Initialize timestep)}$$

    The update rule for `variable` with gradient `g` uses an optimization
    described at the end of section 2 of the paper:

    $$t := t + 1$$
    $$\text{lr}_t := \mathrm{learning\_rate} *
      \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

    $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
    $$\text{variable} := \text{variable} -
      \text{lr}_t * m_t / (\sqrt{v_t} + \epsilon)$$

    The default value of 1e-8 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet a
    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the dense
    behavior (in contrast to some momentum implementations which ignore
    momentum unless a variable slice was actually used).

    Args:
      learning_rate: A Tensor or a floating point value. The learning rate.
      beta1: A float value or a constant float tensor. The exponential decay
        rate for the 1st moment estimates.
      beta2: A float value or a constant float tensor. The exponential decay
        rate for the 2nd moment estimates.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper.
      use_locking: If True use locks for update operations.
      name: Optional name for the operations created when applying gradients.
        Defaults to "Adam".
    """
    super(AdamOptimizer, self).__init__(use_locking, name)
    self._lr = learning_rate
    self._beta1 = beta1
    self._beta2 = beta2
    self._epsilon = epsilon
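    # Each of these may be a float, a tensor, or a zero-argument callable;
    # callables are resolved to concrete values in _prepare() via
    # _call_if_callable(), which is what allows the values to change between
    # invocations when executing eagerly (see the class docstring).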

    # Tensor versions of the constructor arguments, created in _prepare().
    self._lr_t = None
    self._beta1_t = None
    self._beta2_t = None
    self._epsilon_t = None

  def _get_beta_accumulators(self):
    with ops.init_scope():
      if context.executing_eagerly():
        graph = None
      else:
        graph = ops.get_default_graph()
      return (self._get_non_slot_variable("beta1_power", graph=graph),
              self._get_non_slot_variable("beta2_power", graph=graph))

  def _create_slots(self, var_list):
    # Create the beta1 and beta2 accumulators on the same device as the first
    # variable. Sort the var_list to make sure this device is consistent across
    # workers (these need to go on the same PS, otherwise some updates are
    # silently ignored).
    first_var = min(var_list, key=lambda x: x.name)
    self._create_non_slot_variable(
        initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
    self._create_non_slot_variable(
        initial_value=self._beta2, name="beta2_power", colocate_with=first_var)

    # Create slots for the first and second moments.
    for v in var_list:
      self._zeros_slot(v, "m", self._name)
      self._zeros_slot(v, "v", self._name)

  def _prepare(self):
    lr = self._call_if_callable(self._lr)
    beta1 = self._call_if_callable(self._beta1)
    beta2 = self._call_if_callable(self._beta2)
    epsilon = self._call_if_callable(self._epsilon)

    self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
    self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
    self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
    self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")

  def _apply_dense(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
    return training_ops.apply_adam(
        var,
        m,
        v,
        math_ops.cast(beta1_power, var.dtype.base_dtype),
        math_ops.cast(beta2_power, var.dtype.base_dtype),
        math_ops.cast(self._lr_t, var.dtype.base_dtype),
        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op

  def _resource_apply_dense(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
    return training_ops.resource_apply_adam(
        var.handle,
        m.handle,
        v.handle,
        math_ops.cast(beta1_power, grad.dtype.base_dtype),
        math_ops.cast(beta2_power, grad.dtype.base_dtype),
        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
        grad,
        use_locking=self._use_locking)

  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
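    # The update runs in two phases so the sparse path matches the dense
    # behavior described in the class docstring: first decay the entire m and
    # v accumulators by beta1/beta2, then scatter-add the scaled gradient into
    # only the rows selected by `indices`. The control dependencies below
    # ensure each decay happens before the corresponding scatter-add.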
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = scatter_add(m, indices, m_scaled_g_values)
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = scatter_add(v, indices, v_scaled_g_values)
    v_sqrt = math_ops.sqrt(v_t)
    var_update = state_ops.assign_sub(
        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])

  def _apply_sparse(self, grad, var):
    return self._apply_sparse_shared(
        grad.values,
        var,
        grad.indices,
        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
            x,
            i,
            v,
            use_locking=self._use_locking))

  def _resource_scatter_add(self, x, i, v):
    # Return the variable's value with a dependency on the scatter update so
    # that callers can order later ops after the update.
    with ops.control_dependencies(
        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
      return x.value()

  def _resource_apply_sparse(self, grad, var, indices):
    return self._apply_sparse_shared(grad, var, indices,
                                     self._resource_scatter_add)

  def _finish(self, update_ops, name_scope):
    # Update the power accumulators.
    with ops.control_dependencies(update_ops):
      beta1_power, beta2_power = self._get_beta_accumulators()
      with ops.colocate_with(beta1_power):
        update_beta1 = beta1_power.assign(
            beta1_power * self._beta1_t, use_locking=self._use_locking)
        update_beta2 = beta2_power.assign(
            beta2_power * self._beta2_t, use_locking=self._use_locking)
    return control_flow_ops.group(
        *update_ops + [update_beta1, update_beta2], name=name_scope)