# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam for TensorFlow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
from tensorflow.python.util.tf_export import tf_export


@tf_export(v1=["train.AdamOptimizer"])
class AdamOptimizer(optimizer.Optimizer):
  """Optimizer that implements the Adam algorithm.

  References:
    Adam - A Method for Stochastic Optimization:
      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))

  @compatibility(TF2)
  `tf.compat.v1.train.AdamOptimizer` is compatible with eager mode and
  `tf.function`.
  When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
  `epsilon` can each be a callable that takes no arguments and returns the
  actual value to use. This can be useful for changing these values across
  different invocations of optimizer functions.
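
  For illustration, a minimal sketch of passing a callable hyperparameter
  under eager execution (the `lr_value` Python variable is hypothetical):

  ```python
  lr_value = 0.001
  # The callable is re-evaluated whenever the optimizer needs the learning
  # rate, so later changes to `lr_value` take effect on subsequent steps.
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lambda: lr_value)
  ```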

  To switch to native TF2 style, use
  [`tf.keras.optimizers.Adam`](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam)
  instead. Note that, due to implementation differences,
  `tf.keras.optimizers.Adam` and
  `tf.compat.v1.train.AdamOptimizer` may have slight differences in
  floating point numerics even though the formula used for the variable
  updates still matches.

  #### Structural Mapping to Native TF2

  Before:

  ```python
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
  ```

  After:

  ```python
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  ```

  #### How to Map Arguments

  | TF1 Arg Name  | TF2 Arg Name  | Note |
  |---------------|---------------|------|
  | learning_rate | learning_rate | Be careful when setting `learning_rate` to a tensor computed from the global step. In TF1 this usually implied a dynamic learning rate that was recomputed at each step. In TF2 (eager + function) it is treated as a scalar value that is computed only once, not as a symbolic placeholder recomputed each time; see the sketch below for a TF2-style dynamic learning rate. |
  | beta1         | beta_1        |      |
  | beta2         | beta_2        |      |
  | epsilon       | epsilon       | Default value is 1e-08 in TF1, but 1e-07 in TF2. |
  | use_locking   | N/A           | Not applicable in TF2. |
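
  If the TF1 code relied on a learning rate tensor recomputed from the global
  step, a rough TF2 equivalent is to pass a schedule object instead of a fixed
  scalar; a minimal sketch (the decay values below are illustrative only):

  ```python
  lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
      initial_learning_rate=0.001, decay_steps=10000, decay_rate=0.96)
  optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
  ```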

  #### Before & After Usage Example

  Before:

  ```python
  x = tf.Variable([1, 2, 3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```

  After:

  ```python
  x = tf.Variable([1, 2, 3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```

  @end_compatibility
  """

  def __init__(self,
               learning_rate=0.001,
               beta1=0.9,
               beta2=0.999,
               epsilon=1e-8,
               use_locking=False,
               name="Adam"):
    r"""Construct a new Adam optimizer.

    Initialization:

    $$m_0 := 0 \text{ (Initialize 1st moment vector)}$$
    $$v_0 := 0 \text{ (Initialize 2nd moment vector)}$$
    $$t := 0 \text{ (Initialize timestep)}$$

    The update rule for `variable` with gradient `g` uses an optimization
    described at the end of Section 2 of the paper:

    $$t := t + 1$$
    $$\text{lr}_t := \mathrm{learning\_rate} *
      \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

    $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
    $$\text{variable} := \text{variable} -
      \text{lr}_t * m_t / (\sqrt{v_t} + \epsilon)$$

    The default value of 1e-8 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet, a
    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.
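
    As an illustration of where this epsilon enters, here is a minimal NumPy
    sketch of a single update step (a rough transcription of the formulas
    above, not the fused kernel this class actually calls):

    ```python
    import numpy as np

    def adam_step(var, g, m, v, t, lr=0.001, beta1=0.9, beta2=0.999,
                  epsilon=1e-8):
      lr_t = lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)  # bias correction
      m = beta1 * m + (1 - beta1) * g
      v = beta2 * v + (1 - beta2) * g * g
      # epsilon is added outside the square root ("epsilon hat" in the paper).
      var = var - lr_t * m / (np.sqrt(v) + epsilon)
      return var, m, v
    ```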

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the dense
    behavior (in contrast to some momentum implementations which ignore momentum
    unless a variable slice was actually used).
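
    For example, a gather in the forward pass yields an IndexedSlices gradient
    and therefore takes this sparse path; a minimal sketch assuming TF1-style
    graph execution (shapes and values are illustrative only):

    ```python
    emb = tf.compat.v1.get_variable("emb", shape=[10, 4])
    ids = tf.constant([1, 3])
    loss = tf.reduce_sum(tf.gather(emb, ids))  # only rows 1 and 3 are used
    opt = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
    train_op = opt.minimize(loss)  # gradient w.r.t. `emb` is an IndexedSlices
    ```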

    Args:
      learning_rate: A Tensor or a floating point value. The learning rate.
      beta1: A float value or a constant float tensor. The exponential decay
        rate for the 1st moment estimates.
      beta2: A float value or a constant float tensor. The exponential decay
        rate for the 2nd moment estimates.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper.
      use_locking: If True use locks for update operations.
      name: Optional name for the operations created when applying gradients.
        Defaults to "Adam".
    """

    super(AdamOptimizer, self).__init__(use_locking, name)
    self._lr = learning_rate
    self._beta1 = beta1
    self._beta2 = beta2
    self._epsilon = epsilon

    # Tensor versions of the constructor arguments, created in _prepare().
    self._lr_t = None
    self._beta1_t = None
    self._beta2_t = None
    self._epsilon_t = None

  def _get_beta_accumulators(self):
    with ops.init_scope():
      if context.executing_eagerly():
        graph = None
      else:
        graph = ops.get_default_graph()
      return (self._get_non_slot_variable("beta1_power", graph=graph),
              self._get_non_slot_variable("beta2_power", graph=graph))

  def _create_slots(self, var_list):
    # Create the beta1 and beta2 accumulators on the same device as the first
    # variable. Sort the var_list to make sure this device is consistent across
    # workers (these need to go on the same PS, otherwise some updates are
    # silently ignored).
    first_var = min(var_list, key=lambda x: x.name)
    self._create_non_slot_variable(
        initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
    self._create_non_slot_variable(
        initial_value=self._beta2, name="beta2_power", colocate_with=first_var)

    # Create slots for the first and second moments.
    for v in var_list:
      self._zeros_slot(v, "m", self._name)
      self._zeros_slot(v, "v", self._name)

  def _prepare(self):
    lr = self._call_if_callable(self._lr)
    beta1 = self._call_if_callable(self._beta1)
    beta2 = self._call_if_callable(self._beta2)
    epsilon = self._call_if_callable(self._epsilon)

    self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
    self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
    self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
    self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")

  def _apply_dense(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
    return training_ops.apply_adam(
        var,
        m,
        v,
        math_ops.cast(beta1_power, var.dtype.base_dtype),
        math_ops.cast(beta2_power, var.dtype.base_dtype),
        math_ops.cast(self._lr_t, var.dtype.base_dtype),
        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op

  def _resource_apply_dense(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
    return training_ops.resource_apply_adam(
        var.handle,
        m.handle,
        v.handle,
        math_ops.cast(beta1_power, grad.dtype.base_dtype),
        math_ops.cast(beta2_power, grad.dtype.base_dtype),
        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
        grad,
        use_locking=self._use_locking)

  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
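    # Bias-corrected step size: lr * sqrt(1 - beta2^t) / (1 - beta1^t).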
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = scatter_add(m, indices, m_scaled_g_values)
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = scatter_add(v, indices, v_scaled_g_values)
    v_sqrt = math_ops.sqrt(v_t)
    var_update = state_ops.assign_sub(
        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])

  def _apply_sparse(self, grad, var):
    return self._apply_sparse_shared(
        grad.values,
        var,
        grad.indices,
        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
            x,
            i,
            v,
            use_locking=self._use_locking))

  def _resource_scatter_add(self, x, i, v):
    with ops.control_dependencies(
        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
      return x.value()

  def _resource_apply_sparse(self, grad, var, indices):
    return self._apply_sparse_shared(grad, var, indices,
                                     self._resource_scatter_add)

  def _finish(self, update_ops, name_scope):
    # Update the power accumulators.
    with ops.control_dependencies(update_ops):
      beta1_power, beta2_power = self._get_beta_accumulators()
      with ops.colocate_with(beta1_power):
        update_beta1 = beta1_power.assign(
            beta1_power * self._beta1_t, use_locking=self._use_locking)
        update_beta2 = beta2_power.assign(
            beta2_power * self._beta2_t, use_locking=self._use_locking)
    return control_flow_ops.group(
        *update_ops + [update_beta1, update_beta2], name=name_scope)