# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adagrad Dual Averaging for TensorFlow."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
from tensorflow.python.util.tf_export import tf_export


@tf_export(v1=["train.AdagradDAOptimizer"])
class AdagradDAOptimizer(optimizer.Optimizer):
  """Adagrad Dual Averaging algorithm for sparse linear models.

  See this [paper](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).

  Features that were absent from a mini batch still receive the regularized
  update they would have received had they been present: a closed-form rule
  applied on the next step where the feature appears is equivalent to having
  updated the feature on every intervening mini-batch.

  AdagradDA is typically chosen when strong sparsity is wanted in the trained
  model. The sparsity guarantee holds only for linear models; for deep
  networks the gradient accumulators require careful initialization before
  training behaves well.
  """

  def __init__(self,
               learning_rate,
               global_step,
               initial_gradient_squared_accumulator_value=0.1,
               l1_regularization_strength=0.0,
               l2_regularization_strength=0.0,
               use_locking=False,
               name="AdagradDA"):
    """Construct a new AdagradDA optimizer.

    Args:
      learning_rate: A `Tensor` or a floating point value. The learning rate.
      global_step: A `Tensor` containing the current training step number.
      initial_gradient_squared_accumulator_value: A floating point value.
        Starting value for the accumulators, must be positive.
      l1_regularization_strength: A float value, must be greater than or
        equal to zero.
      l2_regularization_strength: A float value, must be greater than or
        equal to zero.
      use_locking: If `True` use locks for update operations.
      name: Optional name prefix for the operations created when applying
        gradients. Defaults to "AdagradDA".

    Raises:
      ValueError: If the `initial_gradient_squared_accumulator_value` is
        invalid.
    """
    # Guard clause: a non-positive accumulator start value would make the
    # per-coordinate learning rate undefined.
    if initial_gradient_squared_accumulator_value <= 0.0:
      raise ValueError("initial_gradient_squared_accumulator_value must be "
                       "positive: %s" %
                       initial_gradient_squared_accumulator_value)
    super(AdagradDAOptimizer, self).__init__(use_locking, name)
    self._learning_rate = learning_rate
    self._global_step = global_step
    self._initial_gradient_squared_accumulator_value = (
        initial_gradient_squared_accumulator_value)
    self._l1_regularization_strength = l1_regularization_strength
    self._l2_regularization_strength = l2_regularization_strength
    # Both are materialized lazily in _prepare().
    self._learning_rate_tensor = None
    self._global_step_on_worker = None

  def _create_slots(self, var_list):
    # One pair of accumulators per variable: the running sum of gradients and
    # the running sum of squared gradients, the latter seeded with the
    # configured positive initial value.
    for var in var_list:
      with ops.colocate_with(var):
        shape = var.get_shape()
        dtype = var.dtype.base_dtype
        zero_init = constant_op.constant(0.0, shape=shape, dtype=dtype)
        squared_init = constant_op.constant(
            self._initial_gradient_squared_accumulator_value,
            shape=shape,
            dtype=dtype)
        self._get_or_make_slot(var, zero_init, "gradient_accumulator",
                               self._name)
        self._get_or_make_slot(var, squared_init,
                               "gradient_squared_accumulator", self._name)

  def _prepare(self):
    self._learning_rate_tensor = ops.convert_to_tensor(
        self._learning_rate, name="learning_rate")
    # Each worker keeps its own copy of the global step so lookups do not
    # hammer the parameter server that owns the real counter. The +1 accounts
    # for the step being applied now.
    with ops.colocate_with(self._learning_rate_tensor):
      self._global_step_on_worker = array_ops.identity(self._global_step) + 1

  def _global_step_for(self, var):
    # Read the worker-local global-step copy on the variable's device.
    with ops.device(var.device):
      return array_ops.identity(self._global_step_on_worker)

  def _cast_hyperparameters(self, dtype):
    # Cast (learning_rate, l1, l2) to the dtype the kernel expects.
    return (math_ops.cast(self._learning_rate_tensor, dtype),
            math_ops.cast(self._l1_regularization_strength, dtype),
            math_ops.cast(self._l2_regularization_strength, dtype))

  def _apply_dense(self, grad, var):
    grad_acc = self.get_slot(var, "gradient_accumulator")
    sq_acc = self.get_slot(var, "gradient_squared_accumulator")
    step = self._global_step_for(var)
    lr, l1, l2 = self._cast_hyperparameters(var.dtype.base_dtype)
    return training_ops.apply_adagrad_da(
        var,
        grad_acc,
        sq_acc,
        grad,
        lr,
        l1,
        l2,
        step,
        use_locking=self._use_locking)

  def _resource_apply_dense(self, grad, var):
    grad_acc = self.get_slot(var, "gradient_accumulator")
    sq_acc = self.get_slot(var, "gradient_squared_accumulator")
    step = self._global_step_for(var)
    lr, l1, l2 = self._cast_hyperparameters(grad.dtype.base_dtype)
    return training_ops.resource_apply_adagrad_da(
        var.handle,
        grad_acc.handle,
        sq_acc.handle,
        grad,
        lr,
        l1,
        l2,
        step,
        use_locking=self._use_locking)

  def _apply_sparse(self, grad, var):
    grad_acc = self.get_slot(var, "gradient_accumulator")
    sq_acc = self.get_slot(var, "gradient_squared_accumulator")
    step = self._global_step_for(var)
    lr, l1, l2 = self._cast_hyperparameters(var.dtype.base_dtype)
    return training_ops.sparse_apply_adagrad_da(
        var,
        grad_acc,
        sq_acc,
        grad.values,
        grad.indices,
        lr,
        l1,
        l2,
        step,
        use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices):
    grad_acc = self.get_slot(var, "gradient_accumulator")
    sq_acc = self.get_slot(var, "gradient_squared_accumulator")
    step = self._global_step_for(var)
    lr, l1, l2 = self._cast_hyperparameters(grad.dtype)
    return training_ops.resource_sparse_apply_adagrad_da(
        var.handle,
        grad_acc.handle,
        sq_acc.handle,
        grad,
        indices,
        lr,
        l1,
        l2,
        step,
        use_locking=self._use_locking)