# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Various learning rate decay functions."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.util.tf_export import tf_export


@tf_export("train.exponential_decay")
def exponential_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False,
                      name=None):
  """Applies exponential decay to the learning rate.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies an exponential decay function
  to a provided initial learning rate.  It requires a `global_step` value to
  compute the decayed learning rate.  You can just pass a TensorFlow variable
  that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  decayed_learning_rate = learning_rate *
                          decay_rate ^ (global_step / decay_steps)
  ```

  If the argument `staircase` is `True`, then `global_step / decay_steps` is an
  integer division and the decayed learning rate follows a staircase function.

  Example: decay every 100000 steps with a base of 0.96:

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.1
  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                             100000, 0.96, staircase=True)
  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Must be positive.  See the decay computation above.
    decay_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The decay rate.
    staircase: Boolean.  If `True`, decay the learning rate at discrete
      intervals.
    name: String.  Optional name of the operation.  Defaults to
      'ExponentialDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.

  Raises:
    ValueError: if `global_step` is not supplied.
  """
  if global_step is None:
    raise ValueError("global_step is required for exponential_decay.")
  with ops.name_scope(
      name, "ExponentialDecay",
      [learning_rate, global_step, decay_steps, decay_rate]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    decay_rate = math_ops.cast(decay_rate, dtype)
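    # Exponent p = global_step / decay_steps; with staircase=True it is floored
    # so the rate drops in discrete jumps every `decay_steps` steps.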
    p = global_step / decay_steps
    if staircase:
      p = math_ops.floor(p)
    return math_ops.multiply(
        learning_rate, math_ops.pow(decay_rate, p), name=name)


@tf_export("train.piecewise_constant")
def piecewise_constant(x, boundaries, values, name=None):
110  """Piecewise constant from boundaries and interval values.
111
112  Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
113    for the next 10000 steps, and 0.1 for any additional steps.
114
115  ```python
116  global_step = tf.Variable(0, trainable=False)
117  boundaries = [100000, 110000]
118  values = [1.0, 0.5, 0.1]
119  learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
120
121  # Later, whenever we perform an optimization step, we increment global_step.
122  ```
123
124  Args:
125    x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`,
126      `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`.
127    boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
128      increasing entries, and with all elements having the same type as `x`.
129    values: A list of `Tensor`s or `float`s or `int`s that specifies the values
130      for the intervals defined by `boundaries`. It should have one more element
131      than `boundaries`, and all elements should have the same type.
132    name: A string. Optional name of the operation. Defaults to
133      'PiecewiseConstant'.
134
135  Returns:
136    A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`,
137    `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ...,
138    and values[-1] when `x > boundaries[-1]`.
139
140  Raises:
141    ValueError: if types of `x` and `boundaries` do not match, or types of all
142        `values` do not match or
143        the number of elements in the lists does not match.
144  """
  if len(boundaries) != len(values) - 1:
    raise ValueError(
        "The length of boundaries should be 1 less than the length of values")
  with ops.name_scope(name, "PiecewiseConstant",
                      [x, boundaries, values, name]) as name:
    x = ops.convert_to_tensor(x)
    # Avoid explicit conversion to x's dtype. This could result in faulty
    # comparisons, for example if floats are converted to integers.
    boundaries = ops.convert_n_to_tensor(boundaries)
    for i, b in enumerate(boundaries):
      if b.dtype.base_dtype != x.dtype.base_dtype:
        # We can promote int32 boundaries to int64 without loss of precision.
        # This covers the most common case where the user passes in boundaries
        # as an array of Python integers.
        if (b.dtype.base_dtype == dtypes.int32 and
            x.dtype.base_dtype == dtypes.int64):
          b = math_ops.cast(b, x.dtype.base_dtype)
          boundaries[i] = b
        else:
          raise ValueError(
              "Boundaries (%s) must have the same dtype as x (%s)." %
              (b.dtype.base_dtype, x.dtype.base_dtype))
    # TODO(rdipietro): Ensure that boundaries' elements are strictly increasing.
    values = ops.convert_n_to_tensor(values)
    for v in values[1:]:
      if v.dtype.base_dtype != values[0].dtype.base_dtype:
        raise ValueError(
            "Values must have elements all with the same dtype (%s vs %s)." %
            (values[0].dtype.base_dtype, v.dtype.base_dtype))
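    # Build (predicate, value_fn) pairs covering the half-open intervals
    # (-inf, b_0], (b_0, b_1], ..., (b_{n-1}, inf); exactly one predicate is
    # true for any x, so the pairs are mutually exclusive and exhaustive.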
    pred_fn_pairs = []
    pred_fn_pairs.append((x <= boundaries[0], lambda: values[0]))
    pred_fn_pairs.append((x > boundaries[-1], lambda: values[-1]))
    for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]):
      # Need to bind v here; can do this with lambda v=v: ...
      pred = (x > low) & (x <= high)
      pred_fn_pairs.append((pred, lambda v=v: v))

    # The default isn't needed here because our conditions are mutually
    # exclusive and exhaustive, but tf.case requires it.
    default = lambda: values[0]
    return control_flow_ops.case(pred_fn_pairs, default, exclusive=True)


@tf_export("train.polynomial_decay")
def polynomial_decay(learning_rate,
                     global_step,
                     decay_steps,
                     end_learning_rate=0.0001,
                     power=1.0,
                     cycle=False,
                     name=None):
196  """Applies a polynomial decay to the learning rate.
197
198  It is commonly observed that a monotonically decreasing learning rate, whose
199  degree of change is carefully chosen, results in a better performing model.
200  This function applies a polynomial decay function to a provided initial
201  `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.
202
203  It requires a `global_step` value to compute the decayed learning rate.  You
204  can just pass a TensorFlow variable that you increment at each training step.
205
206  The function returns the decayed learning rate.  It is computed as:
207
208  ```python
209  global_step = min(global_step, decay_steps)
210  decayed_learning_rate = (learning_rate - end_learning_rate) *
211                          (1 - global_step / decay_steps) ^ (power) +
212                          end_learning_rate
213
214  ```
215
216  If `cycle` is True then a multiple of `decay_steps` is used, the first one
217  that is bigger than `global_steps`.
218
219  ```python
220  decay_steps = decay_steps * ceil(global_step / decay_steps)
221  decayed_learning_rate = (learning_rate - end_learning_rate) *
222                          (1 - global_step / decay_steps) ^ (power) +
223                          end_learning_rate
224
225  ```
226
227  Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):
228
229  ```python
230  ...
231  global_step = tf.Variable(0, trainable=False)
232  starter_learning_rate = 0.1
233  end_learning_rate = 0.01
234  decay_steps = 10000
235  learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step,
236                                            decay_steps, end_learning_rate,
237                                            power=0.5)
238  # Passing global_step to minimize() will increment it at each step.
239  learning_step = (
240      tf.train.GradientDescentOptimizer(learning_rate)
241      .minimize(...my loss..., global_step=global_step)
242  )
243  ```
244
245  Args:
246    learning_rate: A scalar `float32` or `float64` `Tensor` or a
247      Python number.  The initial learning rate.
248    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
249      Global step to use for the decay computation.  Must not be negative.
250    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
251      Must be positive.  See the decay computation above.
252    end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
253      Python number.  The minimal end learning rate.
254    power: A scalar `float32` or `float64` `Tensor` or a
255      Python number.  The power of the polynomial. Defaults to linear, 1.0.
256    cycle: A boolean, whether or not it should cycle beyond decay_steps.
257    name: String.  Optional name of the operation. Defaults to
258      'PolynomialDecay'.
259
260  Returns:
261    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
262    learning rate.
263
264  Raises:
265    ValueError: if `global_step` is not supplied.
266  """
  if global_step is None:
    raise ValueError("global_step is required for polynomial_decay.")
  with ops.name_scope(
      name, "PolynomialDecay",
      [learning_rate, global_step, decay_steps, end_learning_rate, power
      ]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    end_learning_rate = math_ops.cast(end_learning_rate, dtype)
    power = math_ops.cast(power, dtype)
    if cycle:
      # Find the first multiple of decay_steps that is bigger than global_step.
      # If global_step is zero set the multiplier to 1
      multiplier = control_flow_ops.cond(
          math_ops.equal(global_step, 0), lambda: 1.0,
          lambda: math_ops.ceil(global_step / decay_steps))
      decay_steps = math_ops.multiply(decay_steps, multiplier)
    else:
      # Make sure that the global_step used is not bigger than decay_steps.
      global_step = math_ops.minimum(global_step, decay_steps)

    p = math_ops.div(global_step, decay_steps)
    return math_ops.add(
        math_ops.multiply(learning_rate - end_learning_rate,
                          math_ops.pow(1 - p, power)),
        end_learning_rate,
        name=name)


@tf_export("train.natural_exp_decay")
def natural_exp_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False,
                      name=None):
305  """Applies natural exponential decay to the initial learning rate.
306
307  When training a model, it is often recommended to lower the learning rate as
308  the training progresses.  This function applies an exponential decay function
309  to a provided initial learning rate.  It requires an `global_step` value to
310  compute the decayed learning rate.  You can just pass a TensorFlow variable
311  that you increment at each training step.
312
313  The function returns the decayed learning rate.  It is computed as:
314
315  ```python
316  decayed_learning_rate = learning_rate * exp(-decay_rate * global_step)
317  ```
318
319  Example: decay exponentially with a base of 0.96:
320
321  ```python
322  ...
323  global_step = tf.Variable(0, trainable=False)
324  learning_rate = 0.1
325  k = 0.5
326  learning_rate = tf.train.exponential_time_decay(learning_rate, global_step, k)
327
328  # Passing global_step to minimize() will increment it at each step.
329  learning_step = (
330      tf.train.GradientDescentOptimizer(learning_rate)
331      .minimize(...my loss..., global_step=global_step)
332  )
333  ```
334
335  Args:
336    learning_rate: A scalar `float32` or `float64` `Tensor` or a
337      Python number.  The initial learning rate.
338    global_step: A Python number.
339      Global step to use for the decay computation.  Must not be negative.
340    decay_steps: How often to apply decay.
341    decay_rate: A Python number.  The decay rate.
342    staircase: Whether to apply decay in a discrete staircase, as opposed to
343      continuous, fashion.
344    name: String.  Optional name of the operation.  Defaults to
345      'ExponentialTimeDecay'.
346
347  Returns:
348    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
349    learning rate.
350
351  Raises:
352    ValueError: if `global_step` is not supplied.
353  """
  if global_step is None:
    raise ValueError("global_step is required for natural_exp_decay.")
  with ops.name_scope(name, "NaturalExpDecay",
                      [learning_rate, global_step, decay_rate]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    decay_rate = math_ops.cast(decay_rate, dtype)
    p = global_step / decay_steps
    if staircase:
      p = math_ops.floor(p)
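    # Decayed rate = learning_rate * exp(-decay_rate * p), with
    # p = global_step / decay_steps (floored when staircase=True).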
    exponent = math_ops.exp(math_ops.multiply(math_ops.negative(decay_rate), p))
    return math_ops.multiply(learning_rate, exponent, name=name)


@tf_export("train.inverse_time_decay")
def inverse_time_decay(learning_rate,
                       global_step,
                       decay_steps,
                       decay_rate,
                       staircase=False,
                       name=None):
377  """Applies inverse time decay to the initial learning rate.
378
379  When training a model, it is often recommended to lower the learning rate as
380  the training progresses.  This function applies an inverse decay function
381  to a provided initial learning rate.  It requires an `global_step` value to
382  compute the decayed learning rate.  You can just pass a TensorFlow variable
383  that you increment at each training step.
384
385  The function returns the decayed learning rate.  It is computed as:
386
387  ```python
388  decayed_learning_rate = learning_rate / (1 + decay_rate * global_step /
389  decay_step)
390  ```
391
392  or, if `staircase` is `True`, as:
393
394  ```python
395  decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step /
396  decay_step))
397  ```
398
399  Example: decay 1/t with a rate of 0.5:
400
401  ```python
402  ...
403  global_step = tf.Variable(0, trainable=False)
404  learning_rate = 0.1
405  decay_steps = 1.0
406  decay_rate = 0.5
407  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step,
408  decay_steps, decay_rate)
409
410  # Passing global_step to minimize() will increment it at each step.
411  learning_step = (
412      tf.train.GradientDescentOptimizer(learning_rate)
413      .minimize(...my loss..., global_step=global_step)
414  )
415  ```
416
417  Args:
418    learning_rate: A scalar `float32` or `float64` `Tensor` or a
419      Python number.  The initial learning rate.
420    global_step: A Python number.
421      Global step to use for the decay computation.  Must not be negative.
422    decay_steps: How often to apply decay.
423    decay_rate: A Python number.  The decay rate.
424    staircase: Whether to apply decay in a discrete staircase, as opposed to
425      continuous, fashion.
426    name: String.  Optional name of the operation.  Defaults to
427      'InverseTimeDecay'.
428
429  Returns:
430    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
431    learning rate.
432
433  Raises:
434    ValueError: if `global_step` is not supplied.
435  """
  if global_step is None:
    raise ValueError("global_step is required for inverse_time_decay.")
  with ops.name_scope(name, "InverseTimeDecay",
                      [learning_rate, global_step, decay_rate]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    decay_rate = math_ops.cast(decay_rate, dtype)
    p = global_step / decay_steps
    if staircase:
      p = math_ops.floor(p)
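    # Decayed rate = learning_rate / (1 + decay_rate * p), with
    # p = global_step / decay_steps (floored when staircase=True).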
    const = math_ops.cast(constant_op.constant(1), learning_rate.dtype)
    denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
    return math_ops.div(learning_rate, denom, name=name)


@tf_export("train.cosine_decay")
def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None):
455  """Applies cosine decay to the learning rate.
456
457  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
458  with Warm Restarts. https://arxiv.org/abs/1608.03983
459
460  When training a model, it is often recommended to lower the learning rate as
461  the training progresses.  This function applies a cosine decay function
462  to a provided initial learning rate.  It requires a `global_step` value to
463  compute the decayed learning rate.  You can just pass a TensorFlow variable
464  that you increment at each training step.
465
466  The function returns the decayed learning rate.  It is computed as:
467  ```python
468  global_step = min(global_step, decay_steps)
469  cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps))
470  decayed = (1 - alpha) * cosine_decay + alpha
471  decayed_learning_rate = learning_rate * decayed
472  ```
473
474  Example usage:
475  ```python
476  decay_steps = 1000
477  lr_decayed = cosine_decay(learning_rate, global_step, decay_steps)
478  ```
479
480  Args:
481    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
482      The initial learning rate.
483    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
484      Global step to use for the decay computation.
485    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
486      Number of steps to decay over.
487    alpha: A scalar `float32` or `float64` Tensor or a Python number.
488      Minimum learning rate value as a fraction of learning_rate.
489    name: String. Optional name of the operation.  Defaults to 'CosineDecay'.
490  Returns:
491    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
492    learning rate.
493  Raises:
494    ValueError: if `global_step` is not supplied.
495  """
  if global_step is None:
    raise ValueError("cosine decay requires global_step")
  with ops.name_scope(name, "CosineDecay",
                      [learning_rate, global_step]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    global_step = math_ops.minimum(global_step, decay_steps)
    completed_fraction = global_step / decay_steps
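    # Cosine annealing: 0.5 * (1 + cos(pi * fraction)) falls from 1 to 0 as the
    # completed fraction goes from 0 to 1, and is then rescaled to [alpha, 1].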
    cosine_decayed = 0.5 * (
        1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))

    decayed = (1 - alpha) * cosine_decayed + alpha
    return math_ops.multiply(learning_rate, decayed)


@tf_export("train.cosine_decay_restarts")
def cosine_decay_restarts(learning_rate,
                          global_step,
                          first_decay_steps,
                          t_mul=2.0,
                          m_mul=1.0,
                          alpha=0.0,
                          name=None):
521  """Applies cosine decay with restarts to the learning rate.
522
523  See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent
524  with Warm Restarts. https://arxiv.org/abs/1608.03983
525
526  When training a model, it is often recommended to lower the learning rate as
527  the training progresses.  This function applies a cosine decay function with
528  restarts to a provided initial learning rate.  It requires a `global_step`
529  value to compute the decayed learning rate.  You can just pass a TensorFlow
530  variable that you increment at each training step.
531
532  The function returns the decayed learning rate while taking into account
533  possible warm restarts. The learning rate multiplier first decays
534  from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
535  restart is performed. Each new warm restart runs for `t_mul` times more steps
536  and with `m_mul` times smaller initial learning rate.
537
538  Example usage:
539  ```python
540  first_decay_steps = 1000
541  lr_decayed = cosine_decay_restarts(learning_rate, global_step,
542                                     first_decay_steps)
543  ```
544
545  Args:
546    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
547      The initial learning rate.
548    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
549      Global step to use for the decay computation.
550    first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
551      Number of steps to decay over.
552    t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
553      Used to derive the number of iterations in the i-th period
554    m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
555      Used to derive the initial learning rate of the i-th period:
556    alpha: A scalar `float32` or `float64` Tensor or a Python number.
557      Minimum learning rate value as a fraction of the learning_rate.
558    name: String. Optional name of the operation.  Defaults to 'SGDRDecay'.
559  Returns:
560    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
561    learning rate.
562  Raises:
563    ValueError: if `global_step` is not supplied.
564  """
  if global_step is None:
    raise ValueError("cosine decay restarts requires global_step")
  with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name:
    learning_rate = ops.convert_to_tensor(
        learning_rate, name="initial_learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    first_decay_steps = math_ops.cast(first_decay_steps, dtype)
    alpha = math_ops.cast(alpha, dtype)
    t_mul = math_ops.cast(t_mul, dtype)
    m_mul = math_ops.cast(m_mul, dtype)

    completed_fraction = global_step / first_decay_steps

    def compute_step(completed_fraction, geometric=False):
      if geometric:
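        # Period lengths grow geometrically (first_decay_steps * t_mul**i), so
        # the index of the current restart follows from the geometric-series
        # sum of the periods completed so far.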
        i_restart = math_ops.floor(
            math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
            math_ops.log(t_mul))

        sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
        completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart

      else:
        i_restart = math_ops.floor(completed_fraction)
        completed_fraction = completed_fraction - i_restart

      return i_restart, completed_fraction

    i_restart, completed_fraction = control_flow_ops.cond(
        math_ops.equal(t_mul, 1.0),
        lambda: compute_step(completed_fraction, geometric=False),
        lambda: compute_step(completed_fraction, geometric=True))

    m_fac = m_mul**i_restart
    cosine_decayed = 0.5 * m_fac * (
        1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction))
    decayed = (1 - alpha) * cosine_decayed + alpha

  return math_ops.multiply(learning_rate, decayed, name=name)


@tf_export("train.linear_cosine_decay")
def linear_cosine_decay(learning_rate,
                        global_step,
                        decay_steps,
                        num_periods=0.5,
                        alpha=0.0,
                        beta=0.001,
                        name=None):
615  """Applies linear cosine decay to the learning rate.
616
617  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
618  https://arxiv.org/abs/1709.07417
619
620  For the idea of warm starts here controlled by `num_periods`,
621  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
622  with Warm Restarts. https://arxiv.org/abs/1608.03983
623
624  Note that linear cosine decay is more aggressive than cosine decay and
625  larger initial learning rates can typically be used.
626
627  When training a model, it is often recommended to lower the learning rate as
628  the training progresses.  This function applies a linear cosine decay function
629  to a provided initial learning rate.  It requires a `global_step` value to
630  compute the decayed learning rate.  You can just pass a TensorFlow variable
631  that you increment at each training step.
632
633  The function returns the decayed learning rate.  It is computed as:
634  ```python
635  global_step = min(global_step, decay_steps)
636  linear_decay = (decay_steps - global_step) / decay_steps)
637  cosine_decay = 0.5 * (
638      1 + cos(pi * 2 * num_periods * global_step / decay_steps))
639  decayed = (alpha + linear_decay) * cosine_decay + beta
640  decayed_learning_rate = learning_rate * decayed
641  ```
642
643  Example usage:
644  ```python
645  decay_steps = 1000
646  lr_decayed = linear_cosine_decay(learning_rate, global_step, decay_steps)
647  ```
648
649  Args:
650    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
651      The initial learning rate.
652    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
653      Global step to use for the decay computation.
654    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
655      Number of steps to decay over.
656    num_periods: Number of periods in the cosine part of the decay.
657      See computation above.
658    alpha: See computation above.
659    beta: See computation above.
660    name: String.  Optional name of the operation.  Defaults to
661      'LinearCosineDecay'.
662  Returns:
663    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
664    learning rate.
665  Raises:
666    ValueError: if `global_step` is not supplied.
667  """
  if global_step is None:
    raise ValueError("linear cosine decay requires global_step")
  with ops.name_scope(name, "LinearCosineDecay",
                      [learning_rate, global_step]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    num_periods = math_ops.cast(num_periods, dtype)
    global_step = math_ops.minimum(global_step, decay_steps)
    alpha = math_ops.cast(alpha, dtype)
    beta = math_ops.cast(beta, dtype)

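    # Linear ramp from 1 to 0 over decay_steps, modulated by a cosine with
    # `num_periods` full periods; `alpha` offsets the linear term and `beta`
    # sets a lower bound on the decayed value.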
    linear_decayed = (decay_steps - global_step) / decay_steps
    completed_fraction = global_step / decay_steps
    fraction = 2.0 * num_periods * completed_fraction
    cosine_decayed = 0.5 * (
        1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))

    linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
    return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name)


@tf_export("train.noisy_linear_cosine_decay")
def noisy_linear_cosine_decay(learning_rate,
                              global_step,
                              decay_steps,
                              initial_variance=1.0,
                              variance_decay=0.55,
                              num_periods=0.5,
                              alpha=0.0,
                              beta=0.001,
                              name=None):
701  """Applies noisy linear cosine decay to the learning rate.
702
703  See [Bello et al., ICML2017] Neural Optimizer Search with RL.
704  https://arxiv.org/abs/1709.07417
705
706  For the idea of warm starts here controlled by `num_periods`,
707  see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
708  with Warm Restarts. https://arxiv.org/abs/1608.03983
709
710  Note that linear cosine decay is more aggressive than cosine decay and
711  larger initial learning rates can typically be used.
712
713  When training a model, it is often recommended to lower the learning rate as
714  the training progresses.  This function applies a noisy linear
715  cosine decay function to a provided initial learning rate.
716  It requires a `global_step` value to compute the decayed learning rate.
717  You can just pass a TensorFlow variable that you increment at each
718  training step.
719
720  The function returns the decayed learning rate.  It is computed as:
721  ```python
722  global_step = min(global_step, decay_steps)
723  linear_decay = (decay_steps - global_step) / decay_steps)
724  cosine_decay = 0.5 * (
725      1 + cos(pi * 2 * num_periods * global_step / decay_steps))
726  decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
727  decayed_learning_rate = learning_rate * decayed
728  ```
729  where eps_t is 0-centered gaussian noise with variance
730  initial_variance / (1 + global_step) ** variance_decay
731
732  Example usage:
733  ```python
734  decay_steps = 1000
735  lr_decayed = noisy_linear_cosine_decay(
736    learning_rate, global_step, decay_steps)
737  ```
738
739  Args:
740    learning_rate: A scalar `float32` or `float64` Tensor or a Python number.
741      The initial learning rate.
742    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
743      Global step to use for the decay computation.
744    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
745      Number of steps to decay over.
746    initial_variance: initial variance for the noise. See computation above.
747    variance_decay: decay for the noise's variance. See computation above.
748    num_periods: Number of periods in the cosine part of the decay.
749      See computation above.
750    alpha: See computation above.
751    beta: See computation above.
752    name: String.  Optional name of the operation.  Defaults to
753      'NoisyLinearCosineDecay'.
754  Returns:
755    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
756    learning rate.
757  Raises:
758    ValueError: if `global_step` is not supplied.
759  """
  if global_step is None:
    raise ValueError("noisy linear cosine decay requires global_step")
  with ops.name_scope(name, "NoisyLinearCosineDecay",
                      [learning_rate, global_step]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    global_step = math_ops.minimum(global_step, decay_steps)
    initial_variance = math_ops.cast(initial_variance, dtype)
    variance_decay = math_ops.cast(variance_decay, dtype)
    num_periods = math_ops.cast(num_periods, dtype)
    alpha = math_ops.cast(alpha, dtype)
    beta = math_ops.cast(beta, dtype)

    linear_decayed = (decay_steps - global_step) / decay_steps
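    # The noise standard deviation shrinks over training:
    # variance = initial_variance / (1 + global_step) ** variance_decay.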
    variance = initial_variance / (
        math_ops.pow(1.0 + global_step, variance_decay))
    std = math_ops.sqrt(variance)
    noisy_linear_decayed = (
        linear_decayed +
        random_ops.random_normal(linear_decayed.shape, stddev=std))

    completed_fraction = global_step / decay_steps
    fraction = 2.0 * num_periods * completed_fraction
    cosine_decayed = 0.5 * (
        1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
    noisy_linear_cosine_decayed = (
        (alpha + noisy_linear_decayed) * cosine_decayed + beta)

    return math_ops.multiply(
        learning_rate, noisy_linear_cosine_decayed, name=name)