syntax = "proto3";

package tensorflow.tpu;

import "google/protobuf/wrappers.proto";

message ClippingLimits {
  google.protobuf.FloatValue lower = 1;  // -inf if not set
  google.protobuf.FloatValue upper = 2;  // +inf if not set
}
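
// Illustrative example (not part of the original schema; values are
// arbitrary): in text format, clipping weights to [-1, 1] would look like
//   clipping_limits { lower { value: -1.0 } upper { value: 1.0 } }
// Leaving either wrapper field unset keeps the corresponding bound at
// -inf or +inf.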

// Dynamic learning rate specification in the TPUEmbeddingConfiguration. The
// actual learning rates are provided as a scalar input list to the
// SendTPUEmbeddingGradients Op, indexed by the tag specified through the
// following proto.
message DynamicLearningRate {
  // For tables where learning rates are dynamically computed and communicated
  // to the TPU embedding program, a tag must be specified for the learning
  // rate.
  //
  // The tag must be a non-negative integer. The total number of unique tags
  // must be less than or equal to the number of tables in the TPU embedding
  // configuration (a table does not specify any tag if it uses a constant
  // learning rate, and specifies exactly one tag if it uses dynamic learning
  // rates).
  //
  // All tags in the range [0, number_of_unique_tags) must be present in the TPU
  // embedding configuration, i.e. a tag cannot be skipped if a different tag
  // numerically greater than it is used in the configuration.
  //
  // If multiple tables specify the same tag, they *MUST* have the same dynamic
  // learning rate; for example, their dynamic learning rate could be computed
  // by the same TensorFlow sub-graph. The partitioning of the embedding layer
  // is more efficient when number_of_unique_tags is as *LOW* as possible,
  // i.e., when many tables share the same tag.
  //
  // The learning_rate input of the SendTPUEmbeddingGradients op is used to
  // communicate dynamic learning rates to the TPU embedding program.
  // The learning_rate input is a list of scalars where the size of the list is
  // equal to the number of unique tags. The learning rate associated with a
  // particular tag is specified by populating its corresponding index in the
  // list of learning_rate scalars.
  int32 tag = 1;
}

// Source of learning rate to use.
message LearningRate {
  oneof learning_rate {
    float constant = 1;
    DynamicLearningRate dynamic = 2;
  }
}
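
// Illustrative examples (not part of the original schema; values are
// arbitrary): in text format, a constant learning rate is written as
//   learning_rate { constant: 0.01 }
// while a dynamic learning rate that reads the scalar at index 0 of the
// learning_rate input list of SendTPUEmbeddingGradients is written as
//   learning_rate { dynamic { tag: 0 } }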

// Each optimizer's parameter proto has a link to its documentation and CPU
// implementation (if available) for user reference.

// https://www.tensorflow.org/api_docs/python/tf/train/AdagradOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L151
message AdagradParameters {
  float initial_accumulator = 1;
}

// Algorithm in http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf.
message BoundedAdagradParameters {
  // Whether to use the updated or the old value of the accumulator when
  // computing the effective learning rate. When update_accumulator_first is set
  // to True, the updated value of the accumulator is used.
  bool update_accumulator_first = 1;
  // The max_var_update value to use. Set value to 0 (default) to disable using
  // max_var_update to clip the gradient.
  float max_var_update = 2;
  // The maximum value of the accumulator. Set max_accumulator to 0 (default)
  // to disable using max_accumulator to clip the accumulator.
  float max_accumulator = 3;
}
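
// Illustrative example (not part of the original schema; values are
// arbitrary): in text format, a bounded Adagrad configuration that clips
// per-step variable updates to 1.0 and caps the accumulator at 100.0:
//   bounded_adagrad {
//     update_accumulator_first: true
//     max_var_update: 1.0
//     max_accumulator: 100.0
//   }
// Setting either bound to 0 (the default) disables that clipping.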

// https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L423
message StochasticGradientDescentParameters {}

// https://www.tensorflow.org/api_docs/python/tf/train/FtrlOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L192
message FtrlParameters {
  float l1 = 1;
  float l2 = 2;
  float lr_power = 3;
  float initial_accum = 4;
  float initial_linear = 5;
}

// The Adam optimizer does not implement hyper-parameter update; use the dynamic
// learning rate feature instead, setting the learning rate to:
// user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
// Here, t is the current timestep.
//
// https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer
// https://github.com/tensorflow/tensorflow/blob/ab51450c817674c8ff08a7ae4f8ac50cdc4bed8b/tensorflow/python/training/adam.py#L54
//
// Note that the code by default implements the lazy version of Adam
// (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/LazyAdamOptimizer)
// unless the use_non_lazy_adam parameter is set, in which case it implements
// the normal version of Adam that updates all parameters in the embedding
// table, even for entries that are not used in the current minibatch
// (https://www.tensorflow.org/api_docs/python/tf/contrib/opt/AdamOptimizer). If
// use_non_lazy_adam is enabled, gradient accumulation is also required to be
// enabled in order to get correct results; a warning will be printed otherwise
// (which may change to an error in the future). If use_sum_inside_sqrt is set,
// the Adam variable update formula will be changed from m / (sqrt(v) + epsilon)
// to m / sqrt(v + epsilon**2); this option improves the performance of TPU
// training and is not expected to harm model quality.
message AdamParameters {
  float beta1 = 3;
  float beta2 = 4;
  float epsilon = 5;
  float initial_m = 6;
  float initial_v = 7;
  bool use_non_lazy_adam = 8;
  bool use_sum_inside_sqrt = 10;
}
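
// Illustrative example (not part of the original schema; values are
// arbitrary): in text format, an Adam configuration with a dynamic learning
// rate tag might look like
//   learning_rate { dynamic { tag: 0 } }
//   adam { beta1: 0.9 beta2: 0.999 epsilon: 1e-8 }
// where, per the note above, the host computes the scalar fed for tag 0 at
// each step t as
//   user_learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
// to account for bias correction.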

// https://www.tensorflow.org/api_docs/python/tf/train/MomentumOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L271
message MomentumParameters {
  float momentum = 1;
  bool use_nesterov = 2;
  float initial_accum = 3;
}

// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L356
message RmsPropParameters {
  float rho = 1;
  float momentum = 2;
  float epsilon = 3;
  float initial_ms = 4;
  float initial_mom = 5;
}

// https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L372
message CenteredRmsPropParameters {
  float rho = 1;
  float momentum = 2;
  float epsilon = 3;
  float initial_ms = 4;
  float initial_mom = 5;
  float initial_mg = 6;
}

// Variant of the algorithm in http://proceedings.mlr.press/v44/shamir15.pdf.
message MdlAdagradLightParameters {
  float l2 = 1;
  float lr_power = 2;
  float min_servable_mdl_benefit = 3;
  float mdl_mix_in_margin = 4;
  float mdl_benefit_rampup_coeff = 5;
  float mdl_min_weight = 6;
  float benefit_revisit_scale = 7;
  float max_event_benefit = 8;
  float max_total_benefit = 9;
  float mdl_hard_limit = 10;
  bool hard_limit_min_benefit = 11;
  bool mdl_regularize = 12;
  float initial_accumulator = 13;
  float initial_weight = 14;
  float initial_benefit = 15;
}

// https://www.tensorflow.org/api_docs/python/tf/train/AdadeltaOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L68
message AdadeltaParameters {
  float rho = 1;
  float epsilon = 2;
  float initial_accumulator = 3;
  float initial_update = 4;
}

// https://www.tensorflow.org/api_docs/python/tf/train/ProximalAdagradOptimizer
// https://github.com/tensorflow/tensorflow/blob/c19e29306ce1777456b2dbb3a14f511edf7883a8/tensorflow/core/kernels/training_ops.cc#L164
message ProximalAdagradParameters {
  float l1 = 1;
  float l2 = 2;
  float initial_accumulator = 3;
}

// The online Yogi optimizer does not implement hyper-parameter update; use the
// dynamic learning rate feature instead, setting the learning rate to:
// user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
// Here, t is the current timestep.
//
// https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization.pdf
// plus some extensions based on FTRL.
//
// Note that the code by default implements the lazy version of online Yogi.
message OnlineYogiParameters {
  // The L1 regularization parameter (used analogously to the one in FTRL).
  float l1 = 1;

  // The L2 regularization parameter (used analogously to the one in FTRL).
  float l2 = 2;

  // \beta_2 from Algorithm 2 in the paper.
  float beta2 = 3;

  // x -> copysign(1, x) (i.e., return 1 for an input of +0 rather than 0).
  message SignActivation {}

  // x -> tanh(x * 10)
  message TanhActivation {}

  // Activation to use in place of the sign function in the v_t update in
  // Algorithm 2 of the paper.
  oneof activation {
    SignActivation sign = 6;
    TanhActivation tanh = 7;
  }
}
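
// Illustrative example (not part of the original schema; values are
// arbitrary): in text format, an online Yogi configuration that replaces the
// sign function with the tanh activation:
//   online_yogi { l1: 0.0 l2: 0.0 beta2: 0.999 tanh {} }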

// The proximal Yogi optimizer does not implement hyper-parameter update; use
// the dynamic learning rate feature instead, setting the learning rate to:
// user learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t)
// Here, t is the current timestep.
//
// https://papers.nips.cc/paper/8186-adaptive-methods-for-nonconvex-optimization.pdf
// plus some extensions based on FTRL.
//
// Note that the code by default implements the lazy version of proximal Yogi.
message ProximalYogiParameters {
  // The L1 regularization parameter.
  float l1 = 1;

  // The L2 regularization parameter.
  float l2 = 2;

  // The exponential decay rate for the 1st moment estimates.
  float beta1 = 3;

  // The exponential decay rate for the 2nd moment estimates.
  float beta2 = 4;

  // A constant trading off adaptivity and noise.
  float epsilon = 5;

  // x -> copysign(1, x) (i.e., return 1 for an input of +0 rather than 0).
  message SignActivation {}

  // x -> tanh(x * 10)
  message TanhActivation {}

  // Activation to use in place of the sign function in the v_t update in
  // Algorithm 2 of the paper.
  oneof activation {
    SignActivation sign = 8;
    TanhActivation tanh = 9;
  }
}

// Status of using gradient accumulation (doing two passes over the input
// gradients: one to accumulate them into a temporary array and another to apply
// them using the actual optimization algorithm). The extra message is to wrap
// the enum for scoping.
message GradientAccumulationStatus {
  // If UNSPECIFIED (default), gradient accumulation is ENABLED.
  enum Status {
    UNSPECIFIED = 0;
    ENABLED = 1;
    DISABLED = 2;
  }
}

// Configuration proto for hot ID optimization. This is an experimental feature
// that is currently disabled (by default).
message HotIdReplicationConfiguration {
  // Whether to enable or disable hot ID optimization.
  // If UNSPECIFIED (default), hot ID optimization is DISABLED.
  enum Status {
    UNSPECIFIED = 0;
    ENABLED = 1;
    DISABLED = 2;
  }
  Status status = 1;
}

message OptimizationParameters {
  // Learning rate used for updating the embedding layer parameters.
  LearningRate learning_rate = 13;
  reserved 1;  // Old learning rate tag.

  // Limits to which to clip the weight values after the backward pass; if not
  // present, no limits are applied.
  ClippingLimits clipping_limits = 2;

  // Limits to which to clip the backward pass gradient before using it for
  // updates; if not present, no limits are applied.
  ClippingLimits gradient_clipping_limits = 7;

  // Amount of weight decay to apply; see weight_decay_optimizers.py for
  // details. Almost all optimizers are supported with this option (MDL Adagrad
  // Light does not work, and SGD does not behave as expected if it is enabled).
  // Although there is no check, users who want weight decay will probably also
  // want to enable gradient accumulation so that the decay happens once per
  // minibatch.
  float weight_decay_factor = 16;

  // If true, the weight decay factor is multiplied by the current learning rate
  // before use; this is to match the note in DecoupledWeightDecayExtension in
  // weight_decay_optimizers.py.
  bool multiply_weight_decay_factor_by_learning_rate = 22;

  // Status of using gradient accumulation (doing two passes over the input
  // gradients: one to accumulate them into a temporary array and another to
  // apply them using the actual optimization algorithm).
  GradientAccumulationStatus.Status gradient_accumulation_status = 17;

  // Configuration proto for hot ID replication. This is an experimental
  // feature that is currently disabled (by default).
  HotIdReplicationConfiguration hot_id_replication_configuration = 18;

  // Optimization algorithm parameters; which field is selected determines which
  // algorithm to use.
  oneof parameters {
    AdagradParameters adagrad = 3;
    BoundedAdagradParameters bounded_adagrad = 19;
    StochasticGradientDescentParameters stochastic_gradient_descent = 4;
    FtrlParameters ftrl = 5;
    AdamParameters adam = 6;
    MomentumParameters momentum = 8;
    RmsPropParameters rms_prop = 9;
    CenteredRmsPropParameters centered_rms_prop = 10;
    MdlAdagradLightParameters mdl_adagrad_light = 11;
    AdadeltaParameters adadelta = 12;
    ProximalAdagradParameters proximal_adagrad = 14;
    OnlineYogiParameters online_yogi = 20;
    ProximalYogiParameters proximal_yogi = 21;
  }

  reserved 15;  // Old use_gradient_accumulation.
}
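
// Illustrative example (not part of the original schema; values are
// arbitrary): in text format, a complete OptimizationParameters message
// selecting Adagrad with gradient clipping, weight decay, and gradient
// accumulation enabled might look like
//   learning_rate { constant: 0.05 }
//   gradient_clipping_limits { lower { value: -10.0 } upper { value: 10.0 } }
//   weight_decay_factor: 0.0001
//   multiply_weight_decay_factor_by_learning_rate: true
//   gradient_accumulation_status: ENABLED
//   adagrad { initial_accumulator: 0.1 }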

// Specification of an optimization algorithm's state variables (both the main
// value vector and any extra accumulators, etc.). This proto is only used
// internally by the TPU software and is not exposed directly to the TF model.
message StateVariableSpecification {
  // Parameter name for the state variable.
  string name = 1;

  // A normal state variable that should be saved and restored in checkpoints
  // and used as an input or output to non-debug TensorFlow ops.
  message UserDefined {
    // For padding embedding rows, this field specifies the initial value to be
    // used. Separate initial values need to be specified for the embeddings and
    // any extra accumulators. The initial values should be specified so as to
    // maintain two invariants during model training:
    // (1) The embedding vector multiplied by zero returns a vector containing
    //     all zeros. To maintain this invariant, the embedding values should
    //     never be NaNs or +-infinity.
    // (2) Repeatedly applying the optimizer using a gradient vector of all
    //     zeros does not cause the embeddings or slot variables to become NaNs
    //     or +-infinity.
    // The padding row is looked up when no embedding IDs are present for a
    // feature. The semantics of embedding lookup dictate that the output must
    // be zero under this scenario.
    double padding_initial_value = 1;
  }

  // A state variable that should be filled with a constant and normally hidden
  // from users (used for intermediate gradients being accumulated, for
  // example).
  message FillWithConstant {
    double initial_value = 1;
  }

  // Usage type of this state variable.
  oneof usage {
    UserDefined user_defined = 2;
    FillWithConstant fill_with_constant = 3;
  }
}