1syntax = "proto3"; 2 3package tensorflow.tpu; 4 5import "google/protobuf/wrappers.proto"; 6 7message ClippingLimits { 8 google.protobuf.FloatValue lower = 1; // -inf if not set 9 google.protobuf.FloatValue upper = 2; // +inf if not set 10} 11 12// Dynamic learning rate specification in the TPUEmbeddingConfiguration. The 13// actual learning rates are provided as a scalar input list to the 14// SendTPUEmbeddingGradients Op indexed by their tag specified through the 15// following proto. 16message DynamicLearningRate { 17 // For tables where learning rates are dynamically computed and communicated 18 // to the TPU embedding program, a tag must be specified for the learning 19 // rate. 20 // 21 // The tag must be a non-negative integer. The total number of unique tags 22 // must be less than or equal to the number of tables in the TPU embedding 23 // configuration (a table does not specify any tag if it uses a constant 24 // learning rate, and specifies exactly one tag if it uses dynamic learning 25 // rates). 26 // 27 // All tags in the range [0, number_of_unique_tags) must be present in the TPU 28 // embedding configuration, i.e. a tag cannot be skipped if a different tag 29 // numerically greater than it is used in the configuration. 30 // 31 // If multiple tables specify the same tag, they *MUST* have 32 // the same dynamic learning rate, for example, their dynamic learning rate 33 // could be computed by the same TensorFlow sub-graph. The partitioning of the 34 // embedding layer would be more optimal if the number_of_unique_tags is as 35 // *LOW* as possible, i.e., if many tables share the same tag. 36 // 37 // The learning_rate input of the SendTPUEmbeddingGradients op is used to 38 // communicate dynamic learning rates to the TPU embedding program. 39 // The learning_rate input is a list of scalars where the size of the list is 40 // equal to the number of unique tags. The learning rate associated with a 41 // particular tag is specified by populating its corresponding index in the 42 // list of learning_rate scalars. 43 int32 tag = 1; 44} 45 46// Source of learning rate to use. 47message LearningRate { 48 oneof learning_rate { 49 float constant = 1; 50 DynamicLearningRate dynamic = 2; 51 } 52} 53 54// Each optimizer's parameter proto has a link to its documentation and CPU 55// implementation (if available) for user reference. 56 57// https://www.tensorflow.org/api_docs/python/tf/train/AdagradOptimizer 58// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L151 59message AdagradParameters { 60 float initial_accumulator = 1; 61} 62 63// Algorithm in http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. 64message BoundedAdagradParameters { 65 // Whether to use the updated or the old value of the accumulator when 66 // computing the effective learning rate. When update_accumulator_first is set 67 // to True, the updated value of the accumulator is used. 68 bool update_accumulator_first = 1; 69 // The max_var_update value to use. Set value to 0 (default) to disable using 70 // max_var_update to clip the gradient. 71 float max_var_update = 2; 72 // The maximum value of the accumulator. Set max_accumulator to 0 (default) 73 // to disable using max_accumulator to clip the accumulator. 

// Each optimizer's parameter proto has a link to its documentation and CPU
// implementation (if available) for user reference.

// https://www.tensorflow.org/api_docs/python/tf/train/AdagradOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L151
message AdagradParameters {
  float initial_accumulator = 1;
}

// Algorithm in http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf.
message BoundedAdagradParameters {
  // Whether to use the updated or the old value of the accumulator when
  // computing the effective learning rate. When update_accumulator_first is
  // set to True, the updated value of the accumulator is used.
  bool update_accumulator_first = 1;
  // The max_var_update value to use. Set value to 0 (default) to disable using
  // max_var_update to clip the gradient.
  float max_var_update = 2;
  // The maximum value of the accumulator. Set max_accumulator to 0 (default)
  // to disable using max_accumulator to clip the accumulator.
  float max_accumulator = 3;
}

// https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L423
message StochasticGradientDescentParameters {}

// https://www.tensorflow.org/api_docs/python/tf/train/FtrlOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L192
message FtrlParameters {
  float l1 = 1;
  float l2 = 2;
  float lr_power = 3;
  float initial_accum = 4;
  float initial_linear = 5;
}

// The Adam optimizer does not implement hyper-parameter update; use the
// dynamic learning rate feature instead, setting the learning rate to:
//   user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
// Here, t is the current timestep.
//
// https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
// https://github.com/tensorflow/tensorflow/blob/ab51450c817674c8ff08a7ae4f8ac50cdc4bed8b/tensorflow/python/training/adam.py#L54
//
// Note that the code by default implements the lazy version of Adam
// (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/LazyAdamOptimizer)
// unless the use_non_lazy_adam parameter is set, in which case it implements
// the normal version of Adam that updates all parameters in the embedding
// table, even for entries that are not used in the current minibatch
// (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
// use_non_lazy_adam is enabled, gradient accumulation is also required to be
// enabled in order to get correct results; a warning will be printed otherwise
// (which may change to an error in the future). If use_sum_inside_sqrt is set,
// the Adam variable update formula will be changed from m / (sqrt(v) + epsilon)
// to m / sqrt(v + epsilon**2); this option improves the performance of TPU
// training and is not expected to harm model quality.
message AdamParameters {
  float beta1 = 3;
  float beta2 = 4;
  float epsilon = 5;
  float initial_m = 6;
  float initial_v = 7;
  bool use_non_lazy_adam = 8;
  bool use_sum_inside_sqrt = 10;
}
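
// Example (illustrative; values are arbitrary): to emulate Adam's built-in
// bias correction with user learning_rate = 0.001, beta1 = 0.9 and
// beta2 = 0.999, the scalar fed through the dynamic learning rate feature at
// step t = 10 would be
//   0.001 * sqrt(1 - 0.999^10) / (1 - 0.9^10) ~= 1.53e-4,
// with a configuration along the lines of
//   adam { beta1: 0.9 beta2: 0.999 epsilon: 1e-08 }
//   learning_rate { dynamic { tag: 0 } }
// and the host graph computing the scalar above at each step.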

// https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L271
message MomentumParameters {
  float momentum = 1;
  bool use_nesterov = 2;
  float initial_accum = 3;
}

// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L356
message RmsPropParameters {
  float rho = 1;
  float momentum = 2;
  float epsilon = 3;
  float initial_ms = 4;
  float initial_mom = 5;
}

// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L372
message CenteredRmsPropParameters {
  float rho = 1;
  float momentum = 2;
  float epsilon = 3;
  float initial_ms = 4;
  float initial_mom = 5;
  float initial_mg = 6;
}

// Variant of algorithm in http://proceedings.mlr.press/v44/shamir15.pdf
message MdlAdagradLightParameters {
  float l2 = 1;
  float lr_power = 2;
  float min_servable_mdl_benefit = 3;
  float mdl_mix_in_margin = 4;
  float mdl_benefit_rampup_coeff = 5;
  float mdl_min_weight = 6;
  float benefit_revisit_scale = 7;
  float max_event_benefit = 8;
  float max_total_benefit = 9;
  float mdl_hard_limit = 10;
  bool hard_limit_min_benefit = 11;
  bool mdl_regularize = 12;
  float initial_accumulator = 13;
  float initial_weight = 14;
  float initial_benefit = 15;
}

// https://www.tensorflow.org/api_docs/python/tf/train/AdadeltaOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L68
message AdadeltaParameters {
  float rho = 1;
  float epsilon = 2;
  float initial_accumulator = 3;
  float initial_update = 4;
}

// https://www.tensorflow.org/api_docs/python/tf/train/ProximalAdagradOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L164
message ProximalAdagradParameters {
  float l1 = 1;
  float l2 = 2;
  float initial_accumulator = 3;
}

// The online Yogi optimizer does not implement hyper-parameter update; use the
// dynamic learning rate feature instead, setting the learning rate to:
//   user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
// Here, t is the current timestep.
//
// https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization.pdf
// plus some extensions based on FTRL.
//
// Note that the code by default implements the lazy version of online Yogi.
message OnlineYogiParameters {
  // The L1 regularization parameter (used analogously to the one in FTRL).
  float l1 = 1;

  // The L2 regularization parameter (used analogously to the one in FTRL).
  float l2 = 2;

  // \beta_2 from Algorithm 2 in the paper.
  float beta2 = 3;

  // x -> copysign(1, x) (i.e., return 1 for an input of +0 rather than 0).
  message SignActivation {}

  // x -> tanh(x * 10)
  message TanhActivation {}

  // Activation to use to replace the sign function in the v_t update in
  // Algorithm 2 of the paper.
  oneof activation {
    SignActivation sign = 6;
    TanhActivation tanh = 7;
  }
}
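
// For reference, a sketch of the second-moment update in Algorithm 2 of the
// paper (ignoring the FTRL-style extensions mentioned above):
//   v_t = v_{t-1} - (1 - beta2) * sign(v_{t-1} - g_t^2) * g_t^2
// The activation selected in OnlineYogiParameters (and in
// ProximalYogiParameters below) replaces sign(.) in this update, e.g. with the
// smoother tanh(10 * x).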

// The proximal Yogi optimizer does not implement hyper-parameter update; use
// the dynamic learning rate feature instead, setting the learning rate to:
//   user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
// Here, t is the current timestep.
//
// https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization.pdf
// plus some extensions based on FTRL.
//
// Note that the code by default implements the lazy version of proximal Yogi.
message ProximalYogiParameters {
  // The L1 regularization parameter.
  float l1 = 1;

  // The L2 regularization parameter.
  float l2 = 2;

  // The exponential decay rate for the 1st moment estimates.
  float beta1 = 3;

  // The exponential decay rate for the 2nd moment estimates.
  float beta2 = 4;

  // A constant trading off adaptivity and noise.
  float epsilon = 5;

  // x -> copysign(1, x) (i.e., return 1 for an input of +0 rather than 0).
  message SignActivation {}

  // x -> tanh(x * 10)
  message TanhActivation {}

  // Activation to use to replace the sign function in the v_t update in
  // Algorithm 2 of the paper.
  oneof activation {
    SignActivation sign = 8;
    TanhActivation tanh = 9;
  }
}

// Status of using gradient accumulation (doing two passes over the input
// gradients: one to accumulate them into a temporary array and another to
// apply them using the actual optimization algorithm). The extra message is to
// wrap the enum for scoping.
message GradientAccumulationStatus {
  // If UNSPECIFIED (default), gradient accumulation is ENABLED.
  enum Status {
    UNSPECIFIED = 0;
    ENABLED = 1;
    DISABLED = 2;
  }
}

// Configuration proto for hot ID optimization. This is an experimental feature
// that is currently disabled (by default).
message HotIdReplicationConfiguration {
  // Whether to enable or disable hot ID optimization.
  // If UNSPECIFIED (default), hot ID optimization is DISABLED.
  enum Status {
    UNSPECIFIED = 0;
    ENABLED = 1;
    DISABLED = 2;
  }
  Status status = 1;
}

message OptimizationParameters {
  // Learning rate used for updating the embedding layer parameters.
  LearningRate learning_rate = 13;
  reserved 1;  // Old learning rate tag.

  // Limits to which to clip the weight values after the backward pass; not
  // present means no limits are applied.
  ClippingLimits clipping_limits = 2;

  // Limits to which to clip the backward pass gradient before using it for
  // updates; not present means no limits are applied.
  ClippingLimits gradient_clipping_limits = 7;

  // Amount of weight decay to apply; see weight_decay_optimizers.py for
  // details. Almost all optimizers are supported with this option (MDL Adagrad
  // Light does not work, and SGD does not behave as expected if it is
  // enabled). Although there is no check, users who want weight decay will
  // probably also want to enable gradient accumulation so that the decay
  // happens once per minibatch.
  float weight_decay_factor = 16;

  // If true, the weight decay factor is multiplied by the current learning
  // rate before use; this is to match the note in DecoupledWeightDecayExtension
  // in weight_decay_optimizers.py.
  bool multiply_weight_decay_factor_by_learning_rate = 22;

  // Status of using gradient accumulation (doing two passes over the input
  // gradients: one to accumulate them into a temporary array and another to
  // apply them using the actual optimization algorithm).
  GradientAccumulationStatus.Status gradient_accumulation_status = 17;

  // Configuration proto for hot ID replication. This is an experimental
  // feature that is currently disabled (by default).
  HotIdReplicationConfiguration hot_id_replication_configuration = 18;

  // Optimization algorithm parameters; which field is selected determines
  // which algorithm to use.
  oneof parameters {
    AdagradParameters adagrad = 3;
    BoundedAdagradParameters bounded_adagrad = 19;
    StochasticGradientDescentParameters stochastic_gradient_descent = 4;
    FtrlParameters ftrl = 5;
    AdamParameters adam = 6;
    MomentumParameters momentum = 8;
    RmsPropParameters rms_prop = 9;
    CenteredRmsPropParameters centered_rms_prop = 10;
    MdlAdagradLightParameters mdl_adagrad_light = 11;
    AdadeltaParameters adadelta = 12;
    ProximalAdagradParameters proximal_adagrad = 14;
    OnlineYogiParameters online_yogi = 20;
    ProximalYogiParameters proximal_yogi = 21;
  }

  reserved 15;  // Old use_gradient_accumulation.
}
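
// Example (illustrative sketch; values are arbitrary): a text-format
// OptimizationParameters message selecting Adagrad with a constant learning
// rate, gradient clipping, and gradient accumulation explicitly enabled might
// look like:
//
//   learning_rate { constant: 0.1 }
//   gradient_clipping_limits { lower { value: -1.0 } upper { value: 1.0 } }
//   gradient_accumulation_status: ENABLED
//   adagrad { initial_accumulator: 0.1 }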

// Specification of an optimization algorithm's state variables (both the main
// value vector and any extra accumulators, etc.). This proto is only used
// internally by the TPU software and is not exposed directly to the TF model.
message StateVariableSpecification {
  // Parameter name for the state variable.
  string name = 1;

  // A normal state variable that should be saved and restored in checkpoints
  // and used as an input or output to non-debug TensorFlow ops.
  message UserDefined {
    // For padding embedding rows, this field specifies the initial value to be
    // used. Separate initial values need to be specified for the embeddings
    // and any extra accumulators. The initial values should be specified so as
    // to maintain two invariants during model training:
    // (1) The embedding vector multiplied by zero returns a vector containing
    //     all zeros. To maintain this invariant, the embedding values should
    //     never be NaNs or +-infinity.
    // (2) Repeatedly applying the optimizer using a gradient vector of all
    //     zeros does not cause the embeddings or slot variables to become NaNs
    //     or +-infinity.
    // The padding row is looked up when no embedding IDs are present for a
    // feature. The semantics of embedding lookup dictate that the output must
    // be zero under this scenario.
    double padding_initial_value = 1;
  }

  // A state variable that should be filled with a constant and normally hidden
  // from users (used for intermediate gradients being accumulated, for
  // example).
  message FillWithConstant {
    double initial_value = 1;
  }

  // Usage type of this state variable.
  oneof usage {
    UserDefined user_defined = 2;
    FillWithConstant fill_with_constant = 3;
  }
}
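
// Example (illustrative, assuming the standard Adagrad update
//   accum += g^2;  var -= learning_rate * g / sqrt(accum)
// ): a padding row whose accumulator is initialized to 0 would produce
// 0 / sqrt(0) = NaN under an all-zero gradient, violating invariant (2) above;
// initializing the accumulator's padding row to a small positive value keeps
// the update finite.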