1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Various learning rate decay functions.""" 16from __future__ import absolute_import 17from __future__ import division 18from __future__ import print_function 19 20import math 21 22from tensorflow.python.framework import constant_op 23from tensorflow.python.framework import dtypes 24from tensorflow.python.framework import ops 25from tensorflow.python.ops import control_flow_ops 26from tensorflow.python.ops import math_ops 27from tensorflow.python.ops import random_ops 28from tensorflow.python.util.tf_export import tf_export 29 30 31@tf_export("train.exponential_decay") 32def exponential_decay(learning_rate, 33 global_step, 34 decay_steps, 35 decay_rate, 36 staircase=False, 37 name=None): 38 """Applies exponential decay to the learning rate. 39 40 When training a model, it is often recommended to lower the learning rate as 41 the training progresses. This function applies an exponential decay function 42 to a provided initial learning rate. It requires a `global_step` value to 43 compute the decayed learning rate. You can just pass a TensorFlow variable 44 that you increment at each training step. 45 46 The function returns the decayed learning rate. It is computed as: 47 48 ```python 49 decayed_learning_rate = learning_rate * 50 decay_rate ^ (global_step / decay_steps) 51 ``` 52 53 If the argument `staircase` is `True`, then `global_step / decay_steps` is an 54 integer division and the decayed learning rate follows a staircase function. 55 56 Example: decay every 100000 steps with a base of 0.96: 57 58 ```python 59 ... 60 global_step = tf.Variable(0, trainable=False) 61 starter_learning_rate = 0.1 62 learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step, 63 100000, 0.96, staircase=True) 64 # Passing global_step to minimize() will increment it at each step. 65 learning_step = ( 66 tf.train.GradientDescentOptimizer(learning_rate) 67 .minimize(...my loss..., global_step=global_step) 68 ) 69 ``` 70 71 Args: 72 learning_rate: A scalar `float32` or `float64` `Tensor` or a 73 Python number. The initial learning rate. 74 global_step: A scalar `int32` or `int64` `Tensor` or a Python number. 75 Global step to use for the decay computation. Must not be negative. 76 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 77 Must be positive. See the decay computation above. 78 decay_rate: A scalar `float32` or `float64` `Tensor` or a 79 Python number. The decay rate. 80 staircase: Boolean. If `True` decay the learning rate at discrete intervals 81 name: String. Optional name of the operation. Defaults to 82 'ExponentialDecay'. 83 84 Returns: 85 A scalar `Tensor` of the same type as `learning_rate`. The decayed 86 learning rate. 87 88 Raises: 89 ValueError: if `global_step` is not supplied. 90 """ 91 if global_step is None: 92 raise ValueError("global_step is required for exponential_decay.") 93 with ops.name_scope( 94 name, "ExponentialDecay", 95 [learning_rate, global_step, decay_steps, decay_rate]) as name: 96 learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") 97 dtype = learning_rate.dtype 98 global_step = math_ops.cast(global_step, dtype) 99 decay_steps = math_ops.cast(decay_steps, dtype) 100 decay_rate = math_ops.cast(decay_rate, dtype) 101 p = global_step / decay_steps 102 if staircase: 103 p = math_ops.floor(p) 104 return math_ops.multiply( 105 learning_rate, math_ops.pow(decay_rate, p), name=name) 106 107 108@tf_export("train.piecewise_constant") 109def piecewise_constant(x, boundaries, values, name=None): 110 """Piecewise constant from boundaries and interval values. 111 112 Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 113 for the next 10000 steps, and 0.1 for any additional steps. 114 115 ```python 116 global_step = tf.Variable(0, trainable=False) 117 boundaries = [100000, 110000] 118 values = [1.0, 0.5, 0.1] 119 learning_rate = tf.train.piecewise_constant(global_step, boundaries, values) 120 121 # Later, whenever we perform an optimization step, we increment global_step. 122 ``` 123 124 Args: 125 x: A 0-D scalar `Tensor`. Must be one of the following types: `float32`, 126 `float64`, `uint8`, `int8`, `int16`, `int32`, `int64`. 127 boundaries: A list of `Tensor`s or `int`s or `float`s with strictly 128 increasing entries, and with all elements having the same type as `x`. 129 values: A list of `Tensor`s or `float`s or `int`s that specifies the values 130 for the intervals defined by `boundaries`. It should have one more element 131 than `boundaries`, and all elements should have the same type. 132 name: A string. Optional name of the operation. Defaults to 133 'PiecewiseConstant'. 134 135 Returns: 136 A 0-D Tensor. Its value is `values[0]` when `x <= boundaries[0]`, 137 `values[1]` when `x > boundaries[0]` and `x <= boundaries[1]`, ..., 138 and values[-1] when `x > boundaries[-1]`. 139 140 Raises: 141 ValueError: if types of `x` and `boundaries` do not match, or types of all 142 `values` do not match or 143 the number of elements in the lists does not match. 144 """ 145 if len(boundaries) != len(values) - 1: 146 raise ValueError( 147 "The length of boundaries should be 1 less than the length of values") 148 with ops.name_scope(name, "PiecewiseConstant", 149 [x, boundaries, values, name]) as name: 150 x = ops.convert_to_tensor(x) 151 # Avoid explicit conversion to x's dtype. This could result in faulty 152 # comparisons, for example if floats are converted to integers. 153 boundaries = ops.convert_n_to_tensor(boundaries) 154 for i, b in enumerate(boundaries): 155 if b.dtype.base_dtype != x.dtype.base_dtype: 156 # We can promote int32 boundaries to int64 without loss of precision. 157 # This covers the most common case where the user passes in boundaries 158 # as an array of Python integers. 159 if (b.dtype.base_dtype == dtypes.int32 and 160 x.dtype.base_dtype == dtypes.int64): 161 b = math_ops.cast(b, x.dtype.base_dtype) 162 boundaries[i] = b 163 else: 164 raise ValueError( 165 "Boundaries (%s) must have the same dtype as x (%s)." % 166 (b.dtype.base_dtype, x.dtype.base_dtype)) 167 # TODO(rdipietro): Ensure that boundaries' elements are strictly increasing. 168 values = ops.convert_n_to_tensor(values) 169 for v in values[1:]: 170 if v.dtype.base_dtype != values[0].dtype.base_dtype: 171 raise ValueError( 172 "Values must have elements all with the same dtype (%s vs %s)." % 173 (values[0].dtype.base_dtype, v.dtype.base_dtype)) 174 pred_fn_pairs = [] 175 pred_fn_pairs.append((x <= boundaries[0], lambda: values[0])) 176 pred_fn_pairs.append((x > boundaries[-1], lambda: values[-1])) 177 for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]): 178 # Need to bind v here; can do this with lambda v=v: ... 179 pred = (x > low) & (x <= high) 180 pred_fn_pairs.append((pred, lambda v=v: v)) 181 182 # The default isn't needed here because our conditions are mutually 183 # exclusive and exhaustive, but tf.case requires it. 184 default = lambda: values[0] 185 return control_flow_ops.case(pred_fn_pairs, default, exclusive=True) 186 187 188@tf_export("train.polynomial_decay") 189def polynomial_decay(learning_rate, 190 global_step, 191 decay_steps, 192 end_learning_rate=0.0001, 193 power=1.0, 194 cycle=False, 195 name=None): 196 """Applies a polynomial decay to the learning rate. 197 198 It is commonly observed that a monotonically decreasing learning rate, whose 199 degree of change is carefully chosen, results in a better performing model. 200 This function applies a polynomial decay function to a provided initial 201 `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`. 202 203 It requires a `global_step` value to compute the decayed learning rate. You 204 can just pass a TensorFlow variable that you increment at each training step. 205 206 The function returns the decayed learning rate. It is computed as: 207 208 ```python 209 global_step = min(global_step, decay_steps) 210 decayed_learning_rate = (learning_rate - end_learning_rate) * 211 (1 - global_step / decay_steps) ^ (power) + 212 end_learning_rate 213 214 ``` 215 216 If `cycle` is True then a multiple of `decay_steps` is used, the first one 217 that is bigger than `global_steps`. 218 219 ```python 220 decay_steps = decay_steps * ceil(global_step / decay_steps) 221 decayed_learning_rate = (learning_rate - end_learning_rate) * 222 (1 - global_step / decay_steps) ^ (power) + 223 end_learning_rate 224 225 ``` 226 227 Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5): 228 229 ```python 230 ... 231 global_step = tf.Variable(0, trainable=False) 232 starter_learning_rate = 0.1 233 end_learning_rate = 0.01 234 decay_steps = 10000 235 learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step, 236 decay_steps, end_learning_rate, 237 power=0.5) 238 # Passing global_step to minimize() will increment it at each step. 239 learning_step = ( 240 tf.train.GradientDescentOptimizer(learning_rate) 241 .minimize(...my loss..., global_step=global_step) 242 ) 243 ``` 244 245 Args: 246 learning_rate: A scalar `float32` or `float64` `Tensor` or a 247 Python number. The initial learning rate. 248 global_step: A scalar `int32` or `int64` `Tensor` or a Python number. 249 Global step to use for the decay computation. Must not be negative. 250 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 251 Must be positive. See the decay computation above. 252 end_learning_rate: A scalar `float32` or `float64` `Tensor` or a 253 Python number. The minimal end learning rate. 254 power: A scalar `float32` or `float64` `Tensor` or a 255 Python number. The power of the polynomial. Defaults to linear, 1.0. 256 cycle: A boolean, whether or not it should cycle beyond decay_steps. 257 name: String. Optional name of the operation. Defaults to 258 'PolynomialDecay'. 259 260 Returns: 261 A scalar `Tensor` of the same type as `learning_rate`. The decayed 262 learning rate. 263 264 Raises: 265 ValueError: if `global_step` is not supplied. 266 """ 267 if global_step is None: 268 raise ValueError("global_step is required for polynomial_decay.") 269 with ops.name_scope( 270 name, "PolynomialDecay", 271 [learning_rate, global_step, decay_steps, end_learning_rate, power 272 ]) as name: 273 learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") 274 dtype = learning_rate.dtype 275 global_step = math_ops.cast(global_step, dtype) 276 decay_steps = math_ops.cast(decay_steps, dtype) 277 end_learning_rate = math_ops.cast(end_learning_rate, dtype) 278 power = math_ops.cast(power, dtype) 279 if cycle: 280 # Find the first multiple of decay_steps that is bigger than global_step. 281 # If global_step is zero set the multiplier to 1 282 multiplier = control_flow_ops.cond( 283 math_ops.equal(global_step, 0), lambda: 1.0, 284 lambda: math_ops.ceil(global_step / decay_steps)) 285 decay_steps = math_ops.multiply(decay_steps, multiplier) 286 else: 287 # Make sure that the global_step used is not bigger than decay_steps. 288 global_step = math_ops.minimum(global_step, decay_steps) 289 290 p = math_ops.div(global_step, decay_steps) 291 return math_ops.add( 292 math_ops.multiply(learning_rate - end_learning_rate, 293 math_ops.pow(1 - p, power)), 294 end_learning_rate, 295 name=name) 296 297 298@tf_export("train.natural_exp_decay") 299def natural_exp_decay(learning_rate, 300 global_step, 301 decay_steps, 302 decay_rate, 303 staircase=False, 304 name=None): 305 """Applies natural exponential decay to the initial learning rate. 306 307 When training a model, it is often recommended to lower the learning rate as 308 the training progresses. This function applies an exponential decay function 309 to a provided initial learning rate. It requires an `global_step` value to 310 compute the decayed learning rate. You can just pass a TensorFlow variable 311 that you increment at each training step. 312 313 The function returns the decayed learning rate. It is computed as: 314 315 ```python 316 decayed_learning_rate = learning_rate * exp(-decay_rate * global_step) 317 ``` 318 319 Example: decay exponentially with a base of 0.96: 320 321 ```python 322 ... 323 global_step = tf.Variable(0, trainable=False) 324 learning_rate = 0.1 325 k = 0.5 326 learning_rate = tf.train.exponential_time_decay(learning_rate, global_step, k) 327 328 # Passing global_step to minimize() will increment it at each step. 329 learning_step = ( 330 tf.train.GradientDescentOptimizer(learning_rate) 331 .minimize(...my loss..., global_step=global_step) 332 ) 333 ``` 334 335 Args: 336 learning_rate: A scalar `float32` or `float64` `Tensor` or a 337 Python number. The initial learning rate. 338 global_step: A Python number. 339 Global step to use for the decay computation. Must not be negative. 340 decay_steps: How often to apply decay. 341 decay_rate: A Python number. The decay rate. 342 staircase: Whether to apply decay in a discrete staircase, as opposed to 343 continuous, fashion. 344 name: String. Optional name of the operation. Defaults to 345 'ExponentialTimeDecay'. 346 347 Returns: 348 A scalar `Tensor` of the same type as `learning_rate`. The decayed 349 learning rate. 350 351 Raises: 352 ValueError: if `global_step` is not supplied. 353 """ 354 if global_step is None: 355 raise ValueError("global_step is required for natural_exp_decay.") 356 with ops.name_scope(name, "NaturalExpDecay", 357 [learning_rate, global_step, decay_rate]) as name: 358 learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") 359 dtype = learning_rate.dtype 360 global_step = math_ops.cast(global_step, dtype) 361 decay_steps = math_ops.cast(decay_steps, dtype) 362 decay_rate = math_ops.cast(decay_rate, dtype) 363 p = global_step / decay_steps 364 if staircase: 365 p = math_ops.floor(p) 366 exponent = math_ops.exp(math_ops.multiply(math_ops.negative(decay_rate), p)) 367 return math_ops.multiply(learning_rate, exponent, name=name) 368 369 370@tf_export("train.inverse_time_decay") 371def inverse_time_decay(learning_rate, 372 global_step, 373 decay_steps, 374 decay_rate, 375 staircase=False, 376 name=None): 377 """Applies inverse time decay to the initial learning rate. 378 379 When training a model, it is often recommended to lower the learning rate as 380 the training progresses. This function applies an inverse decay function 381 to a provided initial learning rate. It requires an `global_step` value to 382 compute the decayed learning rate. You can just pass a TensorFlow variable 383 that you increment at each training step. 384 385 The function returns the decayed learning rate. It is computed as: 386 387 ```python 388 decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / 389 decay_step) 390 ``` 391 392 or, if `staircase` is `True`, as: 393 394 ```python 395 decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / 396 decay_step)) 397 ``` 398 399 Example: decay 1/t with a rate of 0.5: 400 401 ```python 402 ... 403 global_step = tf.Variable(0, trainable=False) 404 learning_rate = 0.1 405 decay_steps = 1.0 406 decay_rate = 0.5 407 learning_rate = tf.train.inverse_time_decay(learning_rate, global_step, 408 decay_steps, decay_rate) 409 410 # Passing global_step to minimize() will increment it at each step. 411 learning_step = ( 412 tf.train.GradientDescentOptimizer(learning_rate) 413 .minimize(...my loss..., global_step=global_step) 414 ) 415 ``` 416 417 Args: 418 learning_rate: A scalar `float32` or `float64` `Tensor` or a 419 Python number. The initial learning rate. 420 global_step: A Python number. 421 Global step to use for the decay computation. Must not be negative. 422 decay_steps: How often to apply decay. 423 decay_rate: A Python number. The decay rate. 424 staircase: Whether to apply decay in a discrete staircase, as opposed to 425 continuous, fashion. 426 name: String. Optional name of the operation. Defaults to 427 'InverseTimeDecay'. 428 429 Returns: 430 A scalar `Tensor` of the same type as `learning_rate`. The decayed 431 learning rate. 432 433 Raises: 434 ValueError: if `global_step` is not supplied. 435 """ 436 if global_step is None: 437 raise ValueError("global_step is required for inverse_time_decay.") 438 with ops.name_scope(name, "InverseTimeDecay", 439 [learning_rate, global_step, decay_rate]) as name: 440 learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") 441 dtype = learning_rate.dtype 442 global_step = math_ops.cast(global_step, dtype) 443 decay_steps = math_ops.cast(decay_steps, dtype) 444 decay_rate = math_ops.cast(decay_rate, dtype) 445 p = global_step / decay_steps 446 if staircase: 447 p = math_ops.floor(p) 448 const = math_ops.cast(constant_op.constant(1), learning_rate.dtype) 449 denom = math_ops.add(const, math_ops.multiply(decay_rate, p)) 450 return math_ops.div(learning_rate, denom, name=name) 451 452 453@tf_export("train.cosine_decay") 454def cosine_decay(learning_rate, global_step, decay_steps, alpha=0.0, name=None): 455 """Applies cosine decay to the learning rate. 456 457 See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent 458 with Warm Restarts. https://arxiv.org/abs/1608.03983 459 460 When training a model, it is often recommended to lower the learning rate as 461 the training progresses. This function applies a cosine decay function 462 to a provided initial learning rate. It requires a `global_step` value to 463 compute the decayed learning rate. You can just pass a TensorFlow variable 464 that you increment at each training step. 465 466 The function returns the decayed learning rate. It is computed as: 467 ```python 468 global_step = min(global_step, decay_steps) 469 cosine_decay = 0.5 * (1 + cos(pi * global_step / decay_steps)) 470 decayed = (1 - alpha) * cosine_decay + alpha 471 decayed_learning_rate = learning_rate * decayed 472 ``` 473 474 Example usage: 475 ```python 476 decay_steps = 1000 477 lr_decayed = cosine_decay(learning_rate, global_step, decay_steps) 478 ``` 479 480 Args: 481 learning_rate: A scalar `float32` or `float64` Tensor or a Python number. 482 The initial learning rate. 483 global_step: A scalar `int32` or `int64` `Tensor` or a Python number. 484 Global step to use for the decay computation. 485 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 486 Number of steps to decay over. 487 alpha: A scalar `float32` or `float64` Tensor or a Python number. 488 Minimum learning rate value as a fraction of learning_rate. 489 name: String. Optional name of the operation. Defaults to 'CosineDecay'. 490 Returns: 491 A scalar `Tensor` of the same type as `learning_rate`. The decayed 492 learning rate. 493 Raises: 494 ValueError: if `global_step` is not supplied. 495 """ 496 if global_step is None: 497 raise ValueError("cosine decay requires global_step") 498 with ops.name_scope(name, "CosineDecay", 499 [learning_rate, global_step]) as name: 500 learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") 501 dtype = learning_rate.dtype 502 global_step = math_ops.cast(global_step, dtype) 503 decay_steps = math_ops.cast(decay_steps, dtype) 504 global_step = math_ops.minimum(global_step, decay_steps) 505 completed_fraction = global_step / decay_steps 506 cosine_decayed = 0.5 * ( 507 1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction)) 508 509 decayed = (1 - alpha) * cosine_decayed + alpha 510 return math_ops.multiply(learning_rate, decayed) 511 512 513@tf_export("train.cosine_decay_restarts") 514def cosine_decay_restarts(learning_rate, 515 global_step, 516 first_decay_steps, 517 t_mul=2.0, 518 m_mul=1.0, 519 alpha=0.0, 520 name=None): 521 """Applies cosine decay with restarts to the learning rate. 522 523 See [Loshchilov & Hutter, ICLR2016], SGDR: Stochastic Gradient Descent 524 with Warm Restarts. https://arxiv.org/abs/1608.03983 525 526 When training a model, it is often recommended to lower the learning rate as 527 the training progresses. This function applies a cosine decay function with 528 restarts to a provided initial learning rate. It requires a `global_step` 529 value to compute the decayed learning rate. You can just pass a TensorFlow 530 variable that you increment at each training step. 531 532 The function returns the decayed learning rate while taking into account 533 possible warm restarts. The learning rate multiplier first decays 534 from 1 to `alpha` for `first_decay_steps` steps. Then, a warm 535 restart is performed. Each new warm restart runs for `t_mul` times more steps 536 and with `m_mul` times smaller initial learning rate. 537 538 Example usage: 539 ```python 540 first_decay_steps = 1000 541 lr_decayed = cosine_decay_restarts(learning_rate, global_step, 542 first_decay_steps) 543 ``` 544 545 Args: 546 learning_rate: A scalar `float32` or `float64` Tensor or a Python number. 547 The initial learning rate. 548 global_step: A scalar `int32` or `int64` `Tensor` or a Python number. 549 Global step to use for the decay computation. 550 first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 551 Number of steps to decay over. 552 t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. 553 Used to derive the number of iterations in the i-th period 554 m_mul: A scalar `float32` or `float64` `Tensor` or a Python number. 555 Used to derive the initial learning rate of the i-th period: 556 alpha: A scalar `float32` or `float64` Tensor or a Python number. 557 Minimum learning rate value as a fraction of the learning_rate. 558 name: String. Optional name of the operation. Defaults to 'SGDRDecay'. 559 Returns: 560 A scalar `Tensor` of the same type as `learning_rate`. The decayed 561 learning rate. 562 Raises: 563 ValueError: if `global_step` is not supplied. 564 """ 565 if global_step is None: 566 raise ValueError("cosine decay restarts requires global_step") 567 with ops.name_scope(name, "SGDRDecay", [learning_rate, global_step]) as name: 568 learning_rate = ops.convert_to_tensor( 569 learning_rate, name="initial_learning_rate") 570 dtype = learning_rate.dtype 571 global_step = math_ops.cast(global_step, dtype) 572 first_decay_steps = math_ops.cast(first_decay_steps, dtype) 573 alpha = math_ops.cast(alpha, dtype) 574 t_mul = math_ops.cast(t_mul, dtype) 575 m_mul = math_ops.cast(m_mul, dtype) 576 577 completed_fraction = global_step / first_decay_steps 578 579 def compute_step(completed_fraction, geometric=False): 580 if geometric: 581 i_restart = math_ops.floor( 582 math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / 583 math_ops.log(t_mul)) 584 585 sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) 586 completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart 587 588 else: 589 i_restart = math_ops.floor(completed_fraction) 590 completed_fraction = completed_fraction - i_restart 591 592 return i_restart, completed_fraction 593 594 i_restart, completed_fraction = control_flow_ops.cond( 595 math_ops.equal(t_mul, 1.0), 596 lambda: compute_step(completed_fraction, geometric=False), 597 lambda: compute_step(completed_fraction, geometric=True)) 598 599 m_fac = m_mul**i_restart 600 cosine_decayed = 0.5 * m_fac * ( 601 1.0 + math_ops.cos(constant_op.constant(math.pi) * completed_fraction)) 602 decayed = (1 - alpha) * cosine_decayed + alpha 603 604 return math_ops.multiply(learning_rate, decayed, name=name) 605 606 607@tf_export("train.linear_cosine_decay") 608def linear_cosine_decay(learning_rate, 609 global_step, 610 decay_steps, 611 num_periods=0.5, 612 alpha=0.0, 613 beta=0.001, 614 name=None): 615 """Applies linear cosine decay to the learning rate. 616 617 See [Bello et al., ICML2017] Neural Optimizer Search with RL. 618 https://arxiv.org/abs/1709.07417 619 620 For the idea of warm starts here controlled by `num_periods`, 621 see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent 622 with Warm Restarts. https://arxiv.org/abs/1608.03983 623 624 Note that linear cosine decay is more aggressive than cosine decay and 625 larger initial learning rates can typically be used. 626 627 When training a model, it is often recommended to lower the learning rate as 628 the training progresses. This function applies a linear cosine decay function 629 to a provided initial learning rate. It requires a `global_step` value to 630 compute the decayed learning rate. You can just pass a TensorFlow variable 631 that you increment at each training step. 632 633 The function returns the decayed learning rate. It is computed as: 634 ```python 635 global_step = min(global_step, decay_steps) 636 linear_decay = (decay_steps - global_step) / decay_steps) 637 cosine_decay = 0.5 * ( 638 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) 639 decayed = (alpha + linear_decay) * cosine_decay + beta 640 decayed_learning_rate = learning_rate * decayed 641 ``` 642 643 Example usage: 644 ```python 645 decay_steps = 1000 646 lr_decayed = linear_cosine_decay(learning_rate, global_step, decay_steps) 647 ``` 648 649 Args: 650 learning_rate: A scalar `float32` or `float64` Tensor or a Python number. 651 The initial learning rate. 652 global_step: A scalar `int32` or `int64` `Tensor` or a Python number. 653 Global step to use for the decay computation. 654 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 655 Number of steps to decay over. 656 num_periods: Number of periods in the cosine part of the decay. 657 See computation above. 658 alpha: See computation above. 659 beta: See computation above. 660 name: String. Optional name of the operation. Defaults to 661 'LinearCosineDecay'. 662 Returns: 663 A scalar `Tensor` of the same type as `learning_rate`. The decayed 664 learning rate. 665 Raises: 666 ValueError: if `global_step` is not supplied. 667 """ 668 if global_step is None: 669 raise ValueError("linear cosine decay requires global_step") 670 with ops.name_scope(name, "LinearCosineDecay", 671 [learning_rate, global_step]) as name: 672 learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") 673 dtype = learning_rate.dtype 674 global_step = math_ops.cast(global_step, dtype) 675 decay_steps = math_ops.cast(decay_steps, dtype) 676 num_periods = math_ops.cast(num_periods, dtype) 677 global_step = math_ops.minimum(global_step, decay_steps) 678 alpha = math_ops.cast(alpha, dtype) 679 beta = math_ops.cast(beta, dtype) 680 681 linear_decayed = (decay_steps - global_step) / decay_steps 682 completed_fraction = global_step / decay_steps 683 fraction = 2.0 * num_periods * completed_fraction 684 cosine_decayed = 0.5 * ( 685 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) 686 687 linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta 688 return math_ops.multiply(learning_rate, linear_cosine_decayed, name=name) 689 690 691@tf_export("train.noisy_linear_cosine_decay") 692def noisy_linear_cosine_decay(learning_rate, 693 global_step, 694 decay_steps, 695 initial_variance=1.0, 696 variance_decay=0.55, 697 num_periods=0.5, 698 alpha=0.0, 699 beta=0.001, 700 name=None): 701 """Applies noisy linear cosine decay to the learning rate. 702 703 See [Bello et al., ICML2017] Neural Optimizer Search with RL. 704 https://arxiv.org/abs/1709.07417 705 706 For the idea of warm starts here controlled by `num_periods`, 707 see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent 708 with Warm Restarts. https://arxiv.org/abs/1608.03983 709 710 Note that linear cosine decay is more aggressive than cosine decay and 711 larger initial learning rates can typically be used. 712 713 When training a model, it is often recommended to lower the learning rate as 714 the training progresses. This function applies a noisy linear 715 cosine decay function to a provided initial learning rate. 716 It requires a `global_step` value to compute the decayed learning rate. 717 You can just pass a TensorFlow variable that you increment at each 718 training step. 719 720 The function returns the decayed learning rate. It is computed as: 721 ```python 722 global_step = min(global_step, decay_steps) 723 linear_decay = (decay_steps - global_step) / decay_steps) 724 cosine_decay = 0.5 * ( 725 1 + cos(pi * 2 * num_periods * global_step / decay_steps)) 726 decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta 727 decayed_learning_rate = learning_rate * decayed 728 ``` 729 where eps_t is 0-centered gaussian noise with variance 730 initial_variance / (1 + global_step) ** variance_decay 731 732 Example usage: 733 ```python 734 decay_steps = 1000 735 lr_decayed = noisy_linear_cosine_decay( 736 learning_rate, global_step, decay_steps) 737 ``` 738 739 Args: 740 learning_rate: A scalar `float32` or `float64` Tensor or a Python number. 741 The initial learning rate. 742 global_step: A scalar `int32` or `int64` `Tensor` or a Python number. 743 Global step to use for the decay computation. 744 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 745 Number of steps to decay over. 746 initial_variance: initial variance for the noise. See computation above. 747 variance_decay: decay for the noise's variance. See computation above. 748 num_periods: Number of periods in the cosine part of the decay. 749 See computation above. 750 alpha: See computation above. 751 beta: See computation above. 752 name: String. Optional name of the operation. Defaults to 753 'NoisyLinearCosineDecay'. 754 Returns: 755 A scalar `Tensor` of the same type as `learning_rate`. The decayed 756 learning rate. 757 Raises: 758 ValueError: if `global_step` is not supplied. 759 """ 760 if global_step is None: 761 raise ValueError("noisy linear cosine decay requires global_step") 762 with ops.name_scope(name, "NoisyLinearCosineDecay", 763 [learning_rate, global_step]) as name: 764 learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate") 765 dtype = learning_rate.dtype 766 global_step = math_ops.cast(global_step, dtype) 767 decay_steps = math_ops.cast(decay_steps, dtype) 768 global_step = math_ops.minimum(global_step, decay_steps) 769 initial_variance = math_ops.cast(initial_variance, dtype) 770 variance_decay = math_ops.cast(variance_decay, dtype) 771 num_periods = math_ops.cast(num_periods, dtype) 772 alpha = math_ops.cast(alpha, dtype) 773 beta = math_ops.cast(beta, dtype) 774 775 linear_decayed = (decay_steps - global_step) / decay_steps 776 variance = initial_variance / ( 777 math_ops.pow(1.0 + global_step, variance_decay)) 778 std = math_ops.sqrt(variance) 779 noisy_linear_decayed = ( 780 linear_decayed + 781 random_ops.random_normal(linear_decayed.shape, stddev=std)) 782 783 completed_fraction = global_step / decay_steps 784 fraction = 2.0 * num_periods * completed_fraction 785 cosine_decayed = 0.5 * ( 786 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) 787 noisy_linear_cosine_decayed = ( 788 (alpha + noisy_linear_decayed) * cosine_decayed + beta) 789 790 return math_ops.multiply( 791 learning_rate, noisy_linear_cosine_decayed, name=name) 792