# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Adagrad for TensorFlow."""
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
from tensorflow.python.util.tf_export import tf_export


@tf_export(v1=["train.AdagradOptimizer"])
class AdagradOptimizer(optimizer.Optimizer):
  """Optimizer that implements the Adagrad algorithm.

  References:
    Adaptive Subgradient Methods for Online Learning and Stochastic
    Optimization:
      [Duchi et al., 2011](http://jmlr.org/papers/v12/duchi11a.html)
      ([pdf](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf))

  @compatibility(TF2)
  `tf.compat.v1.train.AdagradOptimizer` is compatible with eager mode and
  `tf.function`.
  When eager execution is enabled, `learning_rate`,
  `initial_accumulator_value`, and `epsilon` can each be a callable that
  takes no arguments and returns the actual value to use. This can be useful
  for changing these values across different invocations of optimizer
  functions, as in the sketch after the argument table below.

  To switch to native TF2 style, use
  [`tf.keras.optimizers.Adagrad`](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adagrad)
  instead. Note that due to implementation differences,
  `tf.keras.optimizers.Adagrad` and `tf.compat.v1.train.AdagradOptimizer`
  may have slight differences in floating point numerics even though the
  formula used for the variable updates still matches.

  #### Structural mapping to native TF2

  Before:

  ```python
  optimizer = tf.compat.v1.train.AdagradOptimizer(
      learning_rate=learning_rate,
      initial_accumulator_value=initial_accumulator_value)
  ```

  After:

  ```python
  optimizer = tf.keras.optimizers.Adagrad(
      learning_rate=learning_rate,
      initial_accumulator_value=initial_accumulator_value,
      epsilon=1e-07)
  ```

  #### How to map arguments

  | TF1 Arg Name | TF2 Arg Name | Note |
  | ------------------ | --------------- | ----------------------------- |
  | `learning_rate` | `learning_rate` | Be careful when setting the |
  : : : learning rate to a tensor value computed from the global step. In :
  : : : TF1 this usually implied a dynamic learning rate that was          :
  : : : recomputed at each step. In TF2 (eager + function) it is treated   :
  : : : as a scalar value that is computed only once, rather than as a     :
  : : : symbolic placeholder re-evaluated at each step.                    :
  | `initial_accumulator_value` | `initial_accumulator_value` | The |
  : : : argument can be zero in TF2, which is not accepted in TF1.         :
  | - | `epsilon` | `epsilon` is configurable in TF2. The |
  : : : default value changed from 1e-8 to 1e-7.                           :
  | `use_locking` | - | Not applicable in TF2. |
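
  As noted above, when eager execution is enabled the hyperparameters may
  also be supplied as zero-argument callables that are re-evaluated on each
  invocation of the optimizer functions. A minimal sketch (the
  `lr_variable` name is illustrative, not part of the API):

  ```python
  lr_variable = tf.Variable(0.001)  # may be reassigned between steps

  optimizer = tf.compat.v1.train.AdagradOptimizer(
      learning_rate=lambda: lr_variable.read_value())
  ```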

  #### Before & after usage example

  Before:

  ```python
  x = tf.Variable([1, 2, 3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.compat.v1.train.AdagradOptimizer(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```

  After:

  ```python
  x = tf.Variable([1, 2, 3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```
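
  Because the update formula is the same, the two optimizers should yield
  nearly identical variable values after a step. The snippet below is a
  sketch of how one might check this in eager mode (the tolerance is
  illustrative, not a guaranteed bound):

  ```python
  import numpy as np

  grad = tf.constant([0.1, 0.2, 0.3])

  x_v1 = tf.Variable([1.0, 2.0, 3.0])
  tf.compat.v1.train.AdagradOptimizer(learning_rate=0.001).apply_gradients(
      [(grad, x_v1)])

  x_v2 = tf.Variable([1.0, 2.0, 3.0])
  tf.keras.optimizers.Adagrad(learning_rate=0.001).apply_gradients(
      [(grad, x_v2)])

  np.testing.assert_allclose(x_v1.numpy(), x_v2.numpy(), rtol=1e-4)
  ```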

  @end_compatibility
  """

  def __init__(self, learning_rate, initial_accumulator_value=0.1,
               use_locking=False, name="Adagrad"):
    """Construct a new Adagrad optimizer.

    Args:
      learning_rate: A `Tensor` or a floating point value. The learning rate.
      initial_accumulator_value: A floating point value. Starting value for
        the accumulators; must be positive.
      use_locking: If `True`, use locks for update operations.
      name: Optional name prefix for the operations created when applying
        gradients. Defaults to "Adagrad".

    Raises:
      ValueError: If `initial_accumulator_value` is invalid.
    """
    if initial_accumulator_value <= 0.0:
      raise ValueError("initial_accumulator_value must be positive: %s" %
                       initial_accumulator_value)
    super(AdagradOptimizer, self).__init__(use_locking, name)
    self._learning_rate = learning_rate
    self._initial_accumulator_value = initial_accumulator_value
    # Created in _prepare().
    self._learning_rate_tensor = None

  def _create_slots(self, var_list):
    for v in var_list:
      dtype = v.dtype.base_dtype
      if v.get_shape().is_fully_defined():
        init = init_ops.constant_initializer(self._initial_accumulator_value,
                                             dtype=dtype)
      else:
        init = self._init_constant_op(v, dtype)
      # Each variable gets an "accumulator" slot holding the running sum of
      # squared gradients.
      self._get_or_make_slot_with_initializer(v, init, v.get_shape(), dtype,
                                              "accumulator", self._name)

  def _init_constant_op(self, v, dtype):
    def init():
      # Use a Tensor instead of an initializer if the variable does not have
      # a static shape.
      init_constant = gen_array_ops.fill(array_ops.shape(v),
                                         self._initial_accumulator_value)
      return math_ops.cast(init_constant, dtype)
    return init

  def _prepare(self):
    learning_rate = self._call_if_callable(self._learning_rate)
    self._learning_rate_tensor = ops.convert_to_tensor(
        learning_rate, name="learning_rate")

  def _apply_dense(self, grad, var):
    acc = self.get_slot(var, "accumulator")
    return training_ops.apply_adagrad(
        var,
        acc,
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking)

  def _resource_apply_dense(self, grad, var):
    acc = self.get_slot(var, "accumulator")
    return training_ops.resource_apply_adagrad(
        var.handle,
        acc.handle,
        math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
        grad,
        use_locking=self._use_locking)

  def _apply_sparse(self, grad, var):
    acc = self.get_slot(var, "accumulator")
    return training_ops.sparse_apply_adagrad(
        var,
        acc,
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        grad.values,
        grad.indices,
        use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices):
    acc = self.get_slot(var, "accumulator")
    return training_ops.resource_sparse_apply_adagrad(
        var.handle,
        acc.handle,
        math_ops.cast(self._learning_rate_tensor, grad.dtype),
        grad,
        indices,
        use_locking=self._use_locking)