# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Cudnn RNN operators."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
from tensorflow.contrib.checkpoint.python import split_dependency
from tensorflow.contrib.rnn.python.ops import lstm_ops
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed
from tensorflow.python.keras.engine import base_layer
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_cudnn_rnn_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import rnn_cell_impl
from tensorflow.python.ops import state_ops
from tensorflow.python.ops import variable_scope as vs
from tensorflow.python.training import saver
from tensorflow.python.training.tracking import tracking as trackable_lib

CUDNN_RNN_UNIDIRECTION = "unidirectional"
CUDNN_RNN_BIDIRECTION = "bidirectional"
CUDNN_LSTM = "lstm"
CUDNN_GRU = "gru"
CUDNN_RNN_RELU = "rnn_relu"
CUDNN_RNN_TANH = "rnn_tanh"

# Half for cell input, half for hidden states.
CUDNN_LSTM_PARAMS_PER_LAYER = 8
CUDNN_GRU_PARAMS_PER_LAYER = 6
CUDNN_RNN_TANH_PARAMS_PER_LAYER = 2
CUDNN_RNN_RELU_PARAMS_PER_LAYER = 2

CUDNN_INPUT_LINEAR_MODE = "linear_input"
CUDNN_INPUT_SKIP_MODE = "skip_input"
CUDNN_INPUT_AUTO_MODE = "auto_select"

# pylint:disable=protected-access
_BIAS_VARIABLE_NAME = rnn_cell_impl._BIAS_VARIABLE_NAME
_WEIGHTS_VARIABLE_NAME = rnn_cell_impl._WEIGHTS_VARIABLE_NAME
# pylint:enable=protected-access


class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell):
  """Cudnn Compatible LSTMCell.

  A simple wrapper around `tf.contrib.rnn.LSTMBlockCell` to use along with
  `tf.contrib.cudnn_rnn.CudnnLSTM`. The latter's params can be used by
  this cell seamlessly.
  """

  def __init__(self, num_units, reuse=None):
    super(CudnnCompatibleLSTMCell, self).__init__(
        num_units, forget_bias=0, cell_clip=None, use_peephole=False,
        reuse=reuse, name="cudnn_compatible_lstm_cell")
    self._names.update({"scope": "cudnn_compatible_lstm_cell"})

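# Illustrative sketch, not part of this module: restoring parameters trained
# with `tf.contrib.cudnn_rnn.CudnnLSTM` into a stack of the
# `CudnnCompatibleLSTMCell`s defined above for CPU inference. The variable
# scope, shapes and checkpoint path are hypothetical and must match how the
# Cudnn model was trained and saved.
#
#   with tf.variable_scope("cudnn_lstm"):
#     cells = [CudnnCompatibleLSTMCell(num_units) for _ in range(num_layers)]
#     outputs, _ = tf.nn.dynamic_rnn(
#         tf.nn.rnn_cell.MultiRNNCell(cells), inputs, dtype=tf.float32,
#         time_major=True)
#   ckpt_saver = tf.train.Saver()
#   ckpt_saver.restore(sess, "/tmp/cudnn_checkpoint")  # hypothetical path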

class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell):
  r"""Cudnn Compatible GRUCell.

  A GRU impl akin to `tf.nn.rnn_cell.GRUCell` to use along with
  `tf.contrib.cudnn_rnn.CudnnGRU`. The latter's params can be used by
  it seamlessly.

  It differs from platform-independent GRUs in how the new memory gate is
  calculated. Nvidia picks this variant based on the GRU authors'[1] suggestion
  and the fact that it has no accuracy impact[2].
  [1] https://arxiv.org/abs/1406.1078
  [2] http://svail.github.io/diff_graphs/

  Cudnn compatible GRU (from Cudnn library user guide):
  ```python
  # reset gate
  $$r_t = \sigma(x_t * W_r + h_t-1 * R_r + b_{Wr} + b_{Rr})$$
  # update gate
  $$u_t = \sigma(x_t * W_u + h_t-1 * R_u + b_{Wu} + b_{Ru})$$
  # new memory gate
  $$h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_{Rh}) + b_{Wh})$$
  $$h_t = (1 - u_t) .* h'_t + u_t .* h_t-1$$
  ```

  Other GRU (see `tf.nn.rnn_cell.GRUCell` and `tf.contrib.rnn.GRUBlockCell`):
  ```python
  # new memory gate
  \\(h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_{Wh})\\)
  ```
  which is not equivalent to Cudnn GRU: in addition to the extra bias term b_Rh,
  ```python
  \\(r .* (h * R) != (r .* h) * R\\)
  ```
  """

  def __init__(self, num_units, reuse=None, kernel_initializer=None):
    super(CudnnCompatibleGRUCell, self).__init__(
        num_units,
        activation=None,
        reuse=reuse,
        kernel_initializer=kernel_initializer)

  def build(self, inputs_shape):
    if inputs_shape[1].value is None:
      raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s"
                       % inputs_shape)

    input_depth = inputs_shape[1].value
    self._gate_kernel = self.add_variable(
        "gates/%s" % _WEIGHTS_VARIABLE_NAME,
        shape=[input_depth + self._num_units, 2 * self._num_units],
        initializer=self._kernel_initializer)
    self._gate_bias = self.add_variable(
        "gates/%s" % _BIAS_VARIABLE_NAME,
        shape=[2 * self._num_units],
        initializer=(
            self._bias_initializer
            if self._bias_initializer is not None
            else init_ops.constant_initializer(1.0, dtype=self.dtype)))

    self._candidate_input_kernel = self.add_variable(
        "candidate/input_projection/%s" % _WEIGHTS_VARIABLE_NAME,
        shape=[input_depth, self._num_units],
        initializer=self._kernel_initializer)
    self._candidate_hidden_kernel = self.add_variable(
        "candidate/hidden_projection/%s" % _WEIGHTS_VARIABLE_NAME,
        shape=[self._num_units, self._num_units],
        initializer=self._kernel_initializer)

    self._candidate_input_bias = self.add_variable(
        "candidate/input_projection/%s" % _BIAS_VARIABLE_NAME,
        shape=[self._num_units],
        initializer=(
            self._bias_initializer
            if self._bias_initializer is not None
            else init_ops.zeros_initializer(dtype=self.dtype)))
    self._candidate_hidden_bias = self.add_variable(
        "candidate/hidden_projection/%s" % _BIAS_VARIABLE_NAME,
        shape=[self._num_units],
        initializer=(
            self._bias_initializer
            if self._bias_initializer is not None
            else init_ops.zeros_initializer(dtype=self.dtype)))

  def call(self, inputs, state):
    """Gated recurrent unit (GRU) with num_units cells."""
    gate_inputs = math_ops.matmul(
        array_ops.concat([inputs, state], 1), self._gate_kernel)
    gate_inputs = nn_ops.bias_add(gate_inputs, self._gate_bias)

    value = math_ops.sigmoid(gate_inputs)
    r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1)

    candidate = nn_ops.bias_add(
        math_ops.matmul(inputs, self._candidate_input_kernel),
        self._candidate_input_bias)
    candidate += r * nn_ops.bias_add(
        math_ops.matmul(state, self._candidate_hidden_kernel),
        self._candidate_hidden_bias)
    candidate = self._activation(candidate)
    new_h = (1 - u) * candidate + u * state
    return new_h, new_h

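# Illustrative sketch, not part of this module: a small NumPy check of the
# non-equivalence noted in the CudnnCompatibleGRUCell docstring above, i.e.
# applying the reset gate after the recurrent matmul (Cudnn) generally gives
# a different candidate than applying it before (tf.nn.rnn_cell.GRUCell).
# All values are made up.
#
#   import numpy as np
#   rng = np.random.RandomState(0)
#   r = rng.rand(1, 4)   # reset gate activations
#   h = rng.rand(1, 4)   # previous hidden state
#   R = rng.rand(4, 4)   # recurrent kernel for the candidate
#   cudnn_style = r * h.dot(R)   # r .* (h * R)
#   tf_style = (r * h).dot(R)    # (r .* h) * R
#   assert not np.allclose(cudnn_style, tf_style)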

class CudnnParamsFormatConverter(object):
  """Abstract class that converts between params of Cudnn Rnn and TF Rnn."""

  def __init__(self,
               num_layers,
               num_units,
               input_size,
               input_mode=CUDNN_INPUT_LINEAR_MODE,
               direction=CUDNN_RNN_UNIDIRECTION):
    """Constructor.

    Args:
      num_layers: the number of layers for the RNN model.
      num_units: the number of units within the RNN model.
      input_size: the size of the input; it could be different from
        num_units.
      input_mode: indicate whether there is a linear projection between the
        input and the actual computation before the first layer. It could be one
        of 'linear_input', 'skip_input' or 'auto_select'. * 'linear_input'
        (default) always applies a linear projection of input onto RNN hidden
        state. (standard RNN behavior). * 'skip_input' is only allowed when
        input_size == num_units; * 'auto_select' implies 'skip_input' when
        input_size == num_units; otherwise, it implies 'linear_input'.
      direction: the direction the model operates in. Could be either
        'unidirectional' or 'bidirectional'.
    """
    self._num_layers = num_layers
    self._input_size = input_size
    self._num_units = num_units
    self._input_mode = input_mode
    self._direction = direction
    self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2
    self._num_params = (
        self._num_params_per_layer * self._num_layers * self._num_dirs)

  def tf_canonical_to_opaque(self, tf_canonicals):
    r"""Converts tf canonical weights to cudnn opaque param."""
    cu_weights, cu_biases = self._tf_canonical_to_cu_canonical(tf_canonicals)
    cu_weights = [array_ops.reshape(w, [-1]) for w in cu_weights]
    opaque_params = self._cu_canonical_to_opaque(cu_weights, cu_biases)
    return opaque_params

  def opaque_to_tf_canonical(self, opaque_param):
    r"""Converts cudnn opaque param to tf canonical weights."""
    cu_weights, cu_biases = self._opaque_to_cu_canonical(opaque_param)
    weights, biases = self._cu_canonical_to_tf_canonical(cu_weights, cu_biases)
    return weights, biases

  def _opaque_to_cu_canonical(self, opaque_param):
    """Converts opaque params to Cudnn canonical format.

    Args:
      opaque_param: An opaque tensor storing cudnn rnn params (weights and
        biases).
    Returns:
      2 lists, for weights and biases respectively.
    """
    with ops.device("/gpu:0"):
      weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical(
          num_layers=self._num_layers,
          num_units=self._num_units,
          input_size=self._input_size,
          params=opaque_param,
          num_params=self._num_params,
          rnn_mode=self._rnn_mode,
          input_mode=self._input_mode,
          direction=self._direction)
      return (weights, biases)

  def _cu_canonical_to_opaque(self, cu_weights, cu_biases):
    """Converts from Cudnn canonical format to opaque params.

    Args:
      cu_weights: a list of tensors, Cudnn canonical weights.
      cu_biases: a list of tensors, Cudnn canonical biases.
    Returns:
      a single opaque tensor.
    """
    with ops.device("/gpu:0"):
      return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params(
          num_layers=self._num_layers,
          num_units=self._num_units,
          input_size=self._input_size,
          weights=cu_weights,
          biases=cu_biases,
          rnn_mode=self._rnn_mode,
          input_mode=self._input_mode,
          direction=self._direction)

  def _cu_canonical_to_tf_canonical(self, cu_weights, cu_biases):
    r"""Transform from Cudnn canonical to tf canonical.

    The elements of argument lists are laid out in the following format:
        ------------------------------------------------------------
        | weights                    | biases                      |
        ------------------------------------------------------------
        \                             \
         \                             \
          -------------------------------
          | layer1     |layer2     |... |
          -------------------------------
          \             \
           ---------------
           |fwd   |bak   |
           ---------------
    Args:
      cu_weights: a list of tensors of Cudnn canonical weights.
      cu_biases: a list of tensors of Cudnn canonical biases.
    Returns:
      1 tuple, tf canonical weights and biases.
    """
    tf_weights, tf_biases = [], []

    layer_weights_num = self._num_params_per_layer * self._num_dirs
    layer_biases_num = layer_weights_num

    for i in range(self._num_layers):
      layer_weights = cu_weights[i * layer_weights_num:(i + 1) *
                                 layer_weights_num]
      layer_biases = cu_biases[i * layer_biases_num:(i + 1) * layer_biases_num]
      if self._direction == CUDNN_RNN_UNIDIRECTION:
        self._cu_canonical_to_tf_canonical_single_layer(
            layer_weights, layer_biases, tf_weights, tf_biases)
      else:
        fw_weights = layer_weights[:len(layer_weights) // 2]
        bw_weights = layer_weights[len(layer_weights) // 2:]
        fw_biases = layer_biases[:len(layer_biases) // 2]
        bw_biases = layer_biases[len(layer_biases) // 2:]

        self._cu_canonical_to_tf_canonical_single_layer(
            fw_weights,
            fw_biases,
            tf_weights,
            tf_biases,
        )

        self._cu_canonical_to_tf_canonical_single_layer(
            bw_weights,
            bw_biases,
            tf_weights,
            tf_biases,
        )
    return (tf_weights, tf_biases)

  def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
                                                 tf_weights, tf_biases):
    r"""Transform single layer Cudnn canonicals to tf canonicals.

    The elements of cu_weights, cu_biases are laid out in the following format:
    -------------------------------------------------------------------------
    | gate0 param on inputs | gate0 param on hidden state | gate1 ..........|
    -------------------------------------------------------------------------
    Args:
      cu_weights: a list of tensors, single layer weights.
      cu_biases: a list of tensors, single layer biases.
      tf_weights: a list where transformed weights are stored.
      tf_biases: a list where transformed biases are stored.
    """
    raise NotImplementedError("Abstract method")

  def _tf_canonical_to_cu_canonical(self, tf_canonicals):
    r"""Transform from tf canonical to Cudnn canonical.

    This is the reverse routine of `_cu_canonical_to_tf_canonical()`.
    Args:
      tf_canonicals: a list of tensors of tf canonical params. The elements are
        laid out in the following format:
        ------------------------------------------------------------
        | weights                    | biases                      |
        ------------------------------------------------------------
        \                             \
         \                             \
          -------------------------------
          | layer1     |layer2     |... |
          -------------------------------
          \             \
           ---------------
           |fwd   |bak   |
           ---------------
    Returns:
      2 lists: the recovered cudnn canonical weights and biases.
    """
    weights = tf_canonicals[:len(tf_canonicals) // 2]
    biases = tf_canonicals[len(tf_canonicals) // 2:]

    cu_weights, cu_biases = [], []
    layer_weights_num = len(weights) // self._num_layers
    layer_biases_num = len(biases) // self._num_layers
    for i in range(self._num_layers):
      layer_weights = weights[i * layer_weights_num:(i + 1) * layer_weights_num]
      layer_biases = biases[i * layer_biases_num:(i + 1) * layer_biases_num]
      if self._direction == CUDNN_RNN_UNIDIRECTION:
        cu_weights.extend(self._tf_to_cudnn_weights(i, *layer_weights))
        cu_biases.extend(self._tf_to_cudnn_biases(*layer_biases))
      else:
        fw_weights, bw_weights = layer_weights[:len(
            layer_weights) // 2], layer_weights[len(layer_weights) // 2:]
        fw_biases, bw_biases = layer_biases[:len(
            layer_biases) // 2], layer_biases[len(layer_biases) // 2:]
        cu_weights.extend(self._tf_to_cudnn_weights(i, *fw_weights))
        cu_biases.extend(self._tf_to_cudnn_biases(*fw_biases))

        cu_weights.extend(self._tf_to_cudnn_weights(i, *bw_weights))
        cu_biases.extend(self._tf_to_cudnn_biases(*bw_biases))
    return cu_weights, cu_biases

  def _cudnn_to_tf_weights(self, *cu_weights):
    r"""Stitches cudnn canonical weights to generate tf canonical weights."""
    raise NotImplementedError("Abstract method")

  def _tf_to_cudnn_weights(self, layer, *tf_weights):
    r"""Reverses the operations in `_cudnn_to_tf_weights()`."""
    raise NotImplementedError("Abstract method")

  def _cudnn_to_tf_biases(self, *biases):
    r"""Stitches cudnn canonical biases to generate tf canonical biases."""
    raise NotImplementedError("Abstract method")

  def _tf_to_cudnn_biases(self, *tf_biases):
    r"""Reverses the operations in `_cudnn_to_tf_biases()`."""
    raise NotImplementedError("Abstract method")

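# Illustrative sketch, not part of this module: round-tripping a Cudnn LSTM
# opaque buffer through the LSTM converter defined below. The sizes and the
# `opaque_params` tensor are hypothetical and must match the configuration
# used to create the buffer; the underlying ops must run on a GPU device.
#
#   converter = CudnnParamsFormatConverterLSTM(
#       num_layers=2, num_units=128, input_size=64)
#   tf_weights, tf_biases = converter.opaque_to_tf_canonical(opaque_params)
#   rebuilt = converter.tf_canonical_to_opaque(tf_weights + tf_biases)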

class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter):
  """Helper class that converts between params of Cudnn and TF LSTM."""
  _rnn_mode = CUDNN_LSTM
  _num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER

  def _cudnn_to_tf_gate_params(self, *cu_gate_order):
    i_g, f_g, c_g, o_g = cu_gate_order
    return [i_g, c_g, f_g, o_g]

  def _tf_to_cudnn_gate_params(self, *tf_gate_order):
    i_g, c_g, f_g, o_g = tf_gate_order
    return [i_g, f_g, c_g, o_g]

  def _cudnn_to_tf_weights(self, *cu_weights):
    r"""Stitches cudnn canonical weights to generate tf canonical weights."""
    w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o = cu_weights

    # pylint: disable=invalid-name
    W_i = array_ops.concat([w_i, r_i], axis=1)
    W_f = array_ops.concat([w_f, r_f], axis=1)
    W_c = array_ops.concat([w_c, r_c], axis=1)
    W_o = array_ops.concat([w_o, r_o], axis=1)
    # pylint: enable=invalid-name
    # Cudnn LSTM weights are in ifco order, other tf LSTMs are in icfo order.
    reordered = self._cudnn_to_tf_gate_params(* [W_i, W_f, W_c, W_o])
    return (array_ops.transpose(array_ops.concat(reordered, axis=0)),)

  def _tf_to_cudnn_weights(self, layer, *tf_weights):
    r"""Reverses the operations in `_cudnn_to_tf_weights()`."""
    input_size = self._input_size
    num_units = self._num_units
    if layer == 0:
      input_weight_width = input_size
    else:
      input_weight_width = num_units
      if self._direction == CUDNN_RNN_BIDIRECTION:
        input_weight_width *= 2

    (tf_weight,) = tf_weights
    w = array_ops.transpose(tf_weight)
    # pylint: disable=invalid-name
    W_i, W_f, W_c, W_o = self._tf_to_cudnn_gate_params(*array_ops.split(
        w, 4, axis=0))

    w_i, r_i = array_ops.split(W_i, [input_weight_width, num_units], axis=1)
    w_c, r_c = array_ops.split(W_c, [input_weight_width, num_units], axis=1)
    w_f, r_f = array_ops.split(W_f, [input_weight_width, num_units], axis=1)
    w_o, r_o = array_ops.split(W_o, [input_weight_width, num_units], axis=1)
    return w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o
    # pylint: enable=invalid-name

  def _cudnn_to_tf_biases(self, *cu_biases):
    r"""Stitches cudnn canonical biases to generate tf canonical biases."""
    b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro = cu_biases
    # Save only the sum instead of individual biases. When recovering, return
    # two biases each with half the value. Since RNN does not regularize by
    # weight decay, it has no side effect in training or inference.
    # pylint: disable=invalid-name
    B_i = b_wi + b_ri
    B_f = b_wf + b_rf
    B_c = b_wc + b_rc
    B_o = b_wo + b_ro
    # pylint: enable=invalid-name
    reordered = self._cudnn_to_tf_gate_params(* [B_i, B_f, B_c, B_o])
    return (array_ops.concat(reordered, axis=0),)

  def _tf_to_cudnn_biases(self, *tf_biases):
    r"""Reverses the operations in `_cudnn_to_tf_biases()`."""
    (tf_bias,) = tf_biases
    # pylint: disable=invalid-name
    B_i, B_f, B_c, B_o = self._tf_to_cudnn_gate_params(*array_ops.split(
        tf_bias, 4, axis=0))
    # pylint: enable=invalid-name
    # pylint: disable=unbalanced-tuple-unpacking
    b_wi, b_ri = (B_i * 0.5,) * 2
    b_wf, b_rf = (B_f * 0.5,) * 2
    b_wc, b_rc = (B_c * 0.5,) * 2
    b_wo, b_ro = (B_o * 0.5,) * 2
    # pylint: enable=unbalanced-tuple-unpacking
    # Return ifco order for Cudnn LSTM.
    return b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro

  def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
                                                 tf_weights, tf_biases):
    (w,) = self._cudnn_to_tf_weights(*cu_weights)
    (b,) = self._cudnn_to_tf_biases(*cu_biases)
    tf_weights.append(w)
    tf_biases.append(b)


class CudnnParamsFormatConverterGRU(CudnnParamsFormatConverter):
  """Helper class that converts between params of Cudnn and TF GRU."""

  _rnn_mode = CUDNN_GRU
  _num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER

  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)

  def _cudnn_to_tf_weights(self, *cu_weights):
    r"""Stitches cudnn canonical weights to generate tf canonical weights."""
    w_i, w_r, w_h, r_i, r_r, r_h = cu_weights

    # pylint: disable=invalid-name
    W_i = array_ops.concat([w_i, r_i], axis=1)
    W_r = array_ops.concat([w_r, r_r], axis=1)
    # pylint: enable=invalid-name
    return (array_ops.transpose(array_ops.concat([W_i, W_r], axis=0)),
            array_ops.transpose(w_h), array_ops.transpose(r_h))

  def _tf_to_cudnn_weights(self, layer, *tf_weights):
    r"""Reverses the operations in `_cudnn_to_tf_weights()`."""
    input_size = self._input_size
    num_units = self._num_units
    if layer == 0:
      input_weight_width = input_size
    else:
      input_weight_width = num_units
      if self._direction == CUDNN_RNN_BIDIRECTION:
        input_weight_width *= 2
    # pylint: disable=invalid-name
    W_ir, w_h, r_h = tf_weights
    W_ir = array_ops.transpose(W_ir)
    w_h = array_ops.transpose(w_h)
    r_h = array_ops.transpose(r_h)

    W_i, W_r = array_ops.split(W_ir, 2, axis=0)
    w_i, r_i = array_ops.split(W_i, [input_weight_width, num_units], axis=1)
    w_r, r_r = array_ops.split(W_r, [input_weight_width, num_units], axis=1)
    # pylint: enable=invalid-name
    return w_i, w_r, w_h, r_i, r_r, r_h

  def _cudnn_to_tf_biases(self, *biases):
    r"""Stitches cudnn canonical biases to generate tf canonical biases."""
    b_wi, b_wr, b_wh, b_ri, b_rr, b_rh = biases
    return (
        # Save only the sum instead of individual biases. When recovering,
        # return two biases each with half the value. Since RNN does not
        # regularize by weight decay, it has no side effect in training or
        # inference.
        array_ops.concat([b_wi, b_wr], axis=0) + array_ops.concat(
            [b_ri, b_rr], axis=0),
        b_wh,
        b_rh)

  def _tf_to_cudnn_biases(self, *tf_biases):
    r"""Reverses the operations in `_cudnn_to_tf_biases()`."""
    # b_ir is the summed bias of reset and update gate.
    b_ir, b_wh, b_rh = tf_biases
    bi, br = b_ir * 0.5, b_ir * 0.5
    b_wi, b_wr = array_ops.split(bi, 2, axis=0)
    b_ri, b_rr = array_ops.split(br, 2, axis=0)
    return b_wi, b_wr, b_wh, b_ri, b_rr, b_rh

  def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases,
                                                 tf_weights, tf_biases):
    # pylint: disable=invalid-name
    W_ir, w_h, r_h = self._cudnn_to_tf_weights(*cu_weights)
    b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*cu_biases)
    # pylint: enable=invalid-name
    tf_weights.extend([W_ir, w_h, r_h])
    tf_biases.extend([b_ir, b_wh, b_rh])


class CudnnParamsFormatConverterBasic(CudnnParamsFormatConverterLSTM):
  """Helper class that converts between params of Cudnn and TF Relu/Tanh RNN."""

  def _cudnn_to_tf_weights(self, *cu_weights):
    r"""Stitches cudnn canonical weights to generate tf canonical weights."""
    w_i, w_h = cu_weights
    W = array_ops.concat([w_i, w_h], axis=1)  # pylint: disable=invalid-name
    return (array_ops.transpose(W),)

  def _tf_to_cudnn_weights(self, layer, *tf_weights):
    r"""Reverses the operations in `_cudnn_to_tf_weights()`."""
    input_size = self._input_size
    num_units = self._num_units
    if layer == 0:
      input_weight_width = input_size
    else:
      input_weight_width = num_units
      if self._direction == CUDNN_RNN_BIDIRECTION:
        input_weight_width *= 2

    (tf_weight,) = tf_weights
    # pylint: disable=invalid-name
    W = array_ops.transpose(tf_weight)
    w_i, w_h = array_ops.split(W, [input_weight_width, num_units], axis=1)
    return w_i, w_h
    # pylint: enable=invalid-name

  def _cudnn_to_tf_biases(self, *cu_biases):
    r"""Stitches cudnn canonical biases to generate tf canonical biases."""
    # Save only the sum instead of individual biases. When recovering, return
    # two biases each with half the value. Since RNN does not regularize by
    # weight decay, it has no side effect in training or inference.
    b_wi, b_wh = cu_biases
    return (b_wi + b_wh,)

  def _tf_to_cudnn_biases(self, *tf_biases):
    r"""Reverses the operations in `_cudnn_to_tf_biases()`."""
    (tf_bias,) = tf_biases
    b_i = tf_bias * 0.5
    b_h = tf_bias * 0.5
    return b_i, b_h


class CudnnParamsFormatConverterTanh(CudnnParamsFormatConverterBasic):
  """Helper class that converts between params of Cudnn and TF Tanh RNN."""
  _rnn_mode = CUDNN_RNN_TANH
  _num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER


class CudnnParamsFormatConverterRelu(CudnnParamsFormatConverterBasic):
  """Helper class that converts between params of Cudnn and TF Relu RNN."""
  _rnn_mode = CUDNN_RNN_RELU
  _num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER


# TODO(yaozhang): make sure we only save the canonical version of params and
# don't save the platform-specific version to avoid potential race
# conditions where params is updated by both versions when being restored.
# Currently, checkpointing will function properly, despite that we save both
# versions, because Saver restores customized savables after Variables.
# However, it is good to not rely on this restoring order of Saver and to
# avoid unnecessary storage. Add a test to check only the canonical version is
# saved.
class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
  """Abstract SaveableObject implementation handling Cudnn opaque params."""

  def __init__(self,
               opaque_params,
               num_layers,
               num_units,
               input_size,
               input_mode=CUDNN_INPUT_LINEAR_MODE,
               direction=CUDNN_RNN_UNIDIRECTION,
               scope=None,
               name="cudnn_rnn_saveable"):
    """Creates a CudnnOpaqueParamsSaveable object.

       CudnnOpaqueParamsSaveable is saveable/restorable in a checkpoint file
       and is used to save/restore the weights and biases parameters in a
       canonical format which is directly consumable by platform-independent tf
       RNN cells. Parameters are saved as tensors layer by layer with weight
       tensors followed by bias tensors, and forward direction followed by
       backward direction (if applicable). When restoring, a user could name
       param_variables as desired, and restore weight and bias tensors to these
       variables.

       For CudnnRNNRelu or CudnnRNNTanh, there are 2 tensors per weight and per
       bias for each layer: tensor 0 is applied to the input from the previous
       layer and tensor 1 to the recurrent input.

       For CudnnLSTM, there are 8 tensors per weight and per bias for each
       layer: tensor 0-3 are applied to the input from the previous layer and
       tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate;
       tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate;
       tensor 3 and 7 the output gate.

       For CudnnGRU, there are 6 tensors per weight and per bias for each layer:
       tensor 0-2 are applied to the input from the previous layer and
       tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate;
       tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate.

    Args:
      opaque_params: a variable, Cudnn RNN opaque params.
      num_layers: the number of layers for the RNN model.
      num_units: the number of units within the RNN model.
      input_size: the size of the input; it could be different from
        num_units.
      input_mode: indicate whether there is a linear projection between the
        input and the actual computation before the first layer. It could be
        'linear_input', 'skip_input' or 'auto_select'. 'linear_input' (default)
        always applies a linear projection of input onto RNN hidden state.
        (standard RNN behavior). 'skip_input' is only allowed when input_size ==
        num_units; 'auto_select' implies 'skip_input' when input_size ==
        num_units; otherwise, it implies 'linear_input'.
      direction: the direction the model operates in. Could be either
        'unidirectional' or 'bidirectional'.
      scope: string or VariableScope; the scope of the equivalent subgraph
        consisting only of platform-independent tf RNN cells.
      name: the name of the CudnnOpaqueParamsSaveable object.
    """
    # Define in subclasses.
    self._num_layers = num_layers
    self._input_size = input_size
    self._num_units = num_units
    self._input_mode = input_mode
    self._direction = direction
    if scope is not None:
      scope_name = scope.name if isinstance(scope, vs.VariableScope) else scope
      self._scope = scope_name or None
    else:
      self._scope = None

    self._variables = opaque_params
    self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2
    # Defined in subclasses.
    self._format_converter = None

    tf_weights, tf_biases = (
        self.format_converter.opaque_to_tf_canonical(self._variables))
    tf_weight_names, tf_bias_names = self._tf_canonical_names()
    # We currently don't use slice_spec. It might be useful in a distributed
    # setting where each parameter server node stores a slice of variable,
    # instead of having the master pull all slices and then save them.
    slice_spec = ""
    params = tf_weights + tf_biases
    self._weight_names = tf_weight_names
    self._bias_names = tf_bias_names
    self._param_names = tf_weight_names + tf_bias_names
    prefixed_param_names = tf_weight_names + tf_bias_names
    if self._scope:
      prefixed_param_names = [
          "%s/%s" % (self._scope, pn) for pn in prefixed_param_names
      ]
    specs = [
        saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name)
        for param, param_name in zip(params, prefixed_param_names)
    ]
    super(CudnnOpaqueParamsSaveable, self).__init__(
        array_ops.identity(self._variables), specs, name)

  @property
  def format_converter(self):
    if self._format_converter is None:
      self._format_converter = self._format_converter_cls(
          self._num_layers, self._num_units, self._input_size, self._input_mode,
          self._direction)
    return self._format_converter

  def restore(self, restored_tensors, restored_shapes):
    opaque_params = self.format_converter.tf_canonical_to_opaque(
        restored_tensors)
    return state_ops.assign(
        self._variables, opaque_params, validate_shape=False)

  def _trackable_save(self, save_buffer):
    weights, biases = self.format_converter.opaque_to_tf_canonical(
        self._variables)
    for name, tensor in zip(self._param_names, weights + biases):
      save_buffer[name] = array_ops.identity(tensor)

  def _trackable_restore(self, restore_buffer):
    tensors = [
        array_ops.identity(restore_buffer[name]) for name in self._param_names
    ]
    return self.restore(
        restored_tensors=tensors,
        restored_shapes=None  # Unused
    )

  def _add_trackable_dependencies(self, trackable, dtype):
    """Add canonical weight dependencies to `trackable`.

    When saving or restoring, converts to or from the opaque buffer
    format. Weights are saved and loaded in the configuration expected by
    cuDNN-compatible cells.

    Args:
      trackable: An object inheriting from `Trackable` to add
        dependencies to (typically the cuDNN `Layer`).
      dtype: The dtype for the canonical parameter Tensors.
    """
    split_dependencies = split_dependency.split_dependency(
        component_names=self._param_names,
        component_dtypes=(dtype,) * len(self._param_names),
        fill_save_buffer_fn=self._trackable_save,
        consume_restore_buffer_fn=self._trackable_restore)
    self._trackable_track_params(trackable, split_dependencies)

  def _trackable_track_params(self, trackable, params):
    """Tracks parameters in a canonical configuration."""
    return  # NotImplementedError raised by the Layer.

  def _tf_canonical_names(self):
    tf_weights_names, tf_biases_names = [], []
    for i in range(self._num_layers):
      if self._direction == CUDNN_RNN_UNIDIRECTION:
        prefix = self._tf_canonical_name_prefix(i)
        self._tf_canonical_names_single_layer(prefix, tf_weights_names,
                                              tf_biases_names)
      else:
        fwd_prefix = self._tf_canonical_name_prefix(i, is_fwd=True)
        bak_prefix = self._tf_canonical_name_prefix(i, is_fwd=False)

        self._tf_canonical_names_single_layer(fwd_prefix, tf_weights_names,
                                              tf_biases_names)
        self._tf_canonical_names_single_layer(bak_prefix, tf_weights_names,
                                              tf_biases_names)
    return tf_weights_names, tf_biases_names

  def _tf_canonical_name_prefix(self, layer, is_fwd=True):
    if self._direction == CUDNN_RNN_UNIDIRECTION:
      return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name)
    else:
      if is_fwd:
        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/fw/%s" %
                (layer, self._rnn_cell_name))
      else:
        return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/bw/%s" %
                (layer, self._rnn_cell_name))

  def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
                                       tf_biases_names):
    raise NotImplementedError("Abstract method")


class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable):
  """SaveableObject implementation handling Cudnn LSTM opaque params."""

  _format_converter_cls = CudnnParamsFormatConverterLSTM
  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__)

  def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
                                       tf_bias_names):
    tf_weights_names.append(prefix + "/kernel")
    tf_bias_names.append(prefix + "/bias")

  def _trackable_track_params(self, trackable, params):
    """Track parameters for compatibility with CudnnCompatibleLSTMCell."""
    biases = []
    weights = []
    for name in self._weight_names:
      weights.append(params[name])
    for name in self._bias_names:
      biases.append(params[name])
    assert len(params) == len(weights) + len(biases)
    if len(weights) == 1 and len(biases) == 1:
      # For single-layer cells, allow substituting a cell with no MultiRNNCell
      # wrapping.
      kernel, = weights  # pylint: disable=unbalanced-tuple-unpacking
      bias, = biases  # pylint: disable=unbalanced-tuple-unpacking
      trackable._track_trackable(kernel, name="kernel")  # pylint: disable=protected-access
      trackable._track_trackable(bias, name="bias")  # pylint: disable=protected-access
    assert len(biases) == len(weights)
    for cell_index, (bias, kernel) in enumerate(zip(biases, weights)):
      cell = trackable_lib.AutoTrackable()
      trackable._track_trackable(cell, name="cell-%d" % cell_index)  # pylint: disable=protected-access
      cell.bias = bias
      cell.kernel = kernel

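# Illustrative sketch, not part of this module: wrapping an opaque LSTM
# parameter buffer in the CudnnLSTMSaveable defined above so that a Saver
# checkpoints it in the canonical, cell-compatible format. The `opaque_params`
# variable and the sizes are hypothetical.
#
#   saveable = CudnnLSTMSaveable(
#       opaque_params, num_layers=2, num_units=128, input_size=64)
#   ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
#   ckpt_saver = saver.Saver()  # picks up SAVEABLE_OBJECTS automatically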

class CudnnGRUSaveable(CudnnOpaqueParamsSaveable):
  """SaveableObject implementation handling Cudnn GRU opaque params."""

  _format_converter_cls = CudnnParamsFormatConverterGRU
  _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__)

  def _tf_canonical_names_single_layer(self, prefix, tf_weights_names,
                                       tf_bias_names):
    tf_weights_names.append(prefix + "/gates/kernel")
    tf_weights_names.append(prefix + "/candidate/input_projection/kernel")
    tf_weights_names.append(prefix + "/candidate/hidden_projection/kernel")

    tf_bias_names.append(prefix + "/gates/bias")
    tf_bias_names.append(prefix + "/candidate/input_projection/bias")
    tf_bias_names.append(prefix + "/candidate/hidden_projection/bias")


class CudnnRNNTanhSaveable(CudnnLSTMSaveable):
  _format_converter_cls = CudnnParamsFormatConverterTanh
  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)


class CudnnRNNReluSaveable(CudnnLSTMSaveable):
  _format_converter_cls = CudnnParamsFormatConverterRelu
  _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__)

_cudnn_rnn_common_doc_string = """
  Cudnn RNN has an opaque parameter buffer that can be used for inference and
  training. But it is possible that the layout of the parameter buffers
  changes between generations. So it is highly recommended to use
  CudnnOpaqueParamsSaveable to save and restore weights and biases in a
  canonical format.

  This is a typical use case:

    * The user creates a CudnnRNN model.
    * The user queries the parameter buffer size.
    * The user creates a variable of that size that serves as the parameter
        buffers.
    * The user either initializes the parameter buffer, or loads the canonical
        weights into the parameter buffer.
    * The user calls the model with the parameter buffer for inference, or
        training.
    * If training, the user creates a Saver object.
    * If training, the user creates a CudnnOpaqueParamsSaveable object from the
        parameter buffer for it to be later saved in the canonical format. When
        creating a CudnnOpaqueParamsSaveable object, a name could be provided,
        which is useful in distinguishing the names of multiple
        CudnnOpaqueParamsSaveable objects (e.g. for an encoder-decoder model).
    * Once in a while, the user saves the parameter buffer into model
        checkpoints with Saver.save().
    * When restoring, the user creates a CudnnOpaqueParamsSaveable object and
      uses Saver.restore() to restore the parameter buffer from the canonical
      format to a user-defined format, as well as to restore other savable
      objects in the checkpoint file.
"""

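# Illustrative sketch, not part of this module, of the use case described
# above. It assumes the `tf.contrib.cudnn_rnn.CudnnLSTM` layer builds the
# opaque parameter buffer and registers its canonical saveable itself; the
# shapes and paths are hypothetical.
#
#   lstm = tf.contrib.cudnn_rnn.CudnnLSTM(num_layers=2, num_units=128)
#   outputs, _ = lstm(inputs, training=True)  # inputs: [time, batch, depth]
#   ckpt_saver = tf.train.Saver()
#   ckpt_saver.save(sess, "/tmp/cudnn_model")  # hypothetical checkpoint prefix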

def _check_rnn_mode(rnn_mode):
  if rnn_mode not in (CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH, CUDNN_RNN_RELU):
    raise ValueError("Invalid rnn_mode: %s, expect one of (%s, %s, %s, %s)" %
                     (rnn_mode, CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH,
                      CUDNN_RNN_RELU))


def _get_seed(seed):
  seed, seed2 = random_seed.get_seed(seed)
  if seed is None and seed2 is None:
    seed, seed2 = 0, 0
  return seed, seed2


def check_direction(direction):
  """Check validity of direction."""
  if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION):
    raise ValueError("Invalid direction: %s, expecting %s or %s" %
                     (direction, CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION))


def check_input_mode(input_mode):
  if input_mode not in (CUDNN_INPUT_LINEAR_MODE, CUDNN_INPUT_SKIP_MODE,
                        CUDNN_INPUT_AUTO_MODE):
    raise ValueError("Invalid input_mode: %s, expect one of (%s, %s, %s)" %
                     (input_mode, CUDNN_INPUT_LINEAR_MODE,
                      CUDNN_INPUT_SKIP_MODE, CUDNN_INPUT_AUTO_MODE))


def _get_num_params(rnn_mode, num_layers, direction):
  """Return num params for given Cudnn config."""
  if rnn_mode == CUDNN_LSTM:
    num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
  elif rnn_mode == CUDNN_GRU:
    num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
  elif rnn_mode == CUDNN_RNN_RELU:
    num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER
  elif rnn_mode == CUDNN_RNN_TANH:
    num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER
  else:
    raise ValueError("Invalid \'rnn_mode\': %s" % rnn_mode)
  num_params = num_layers * num_params_per_layer
  if direction != CUDNN_RNN_UNIDIRECTION:
    num_params *= 2
  return num_params

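# For example, a 2-layer bidirectional LSTM yields
# _get_num_params(CUDNN_LSTM, 2, CUDNN_RNN_BIDIRECTION)
# == 8 parameter regions per layer * 2 layers * 2 directions == 32.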

def _cudnn_rnn(inputs,
               input_h,
               input_c,
               params,
               is_training,
               rnn_mode,
               sequence_lengths=None,
               time_major=True,
               input_mode=CUDNN_INPUT_LINEAR_MODE,
               direction=CUDNN_RNN_UNIDIRECTION,
               dropout=0.,
               seed=0,
               name=None):
  """Cudnn RNN.

  Args:
    inputs: the input sequence to the RNN model. If `time_major` is True
        (default), the Tensor shape is [max_time, batch_size, input_size]. If
        `time_major` is False, the shape is [batch_size, max_time, input_size].
    input_h: the initial hidden state for h. If `time_major` is True
        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
        `time_major` is False, the shape is [batch_size, num_layers, num_units].
    input_c: the initial hidden state for c. This is only relevant for LSTM.
      A Tensor of the same shape as input_h.
    params: the parameter buffer created for this model.
    is_training: whether this operation will be used in training or inference.
    rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh').
    sequence_lengths: an int32 array representing the variable sequence lengths
      in a batch. The size of the array has to equal the batch_size. Defaults to
      None, in which case sequences in the batch are assumed to have the same
      length, which is inferred from inputs.
    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
      By default this function accepts input and emits output in time-major
      form. This param is only effective when 'sequence_lengths' is used.
    input_mode: indicate whether there is a linear projection between the
      input and the actual computation before the first layer. It could be
      'linear_input', 'skip_input' or 'auto_select'.
      'linear_input' (default) always applies a linear projection of input
      onto RNN hidden state. (standard RNN behavior).
      'skip_input' is only allowed when input_size == num_units;
      'auto_select' implies 'skip_input' when input_size == num_units;
      otherwise, it implies 'linear_input'.
    direction: the direction the model operates in. Could be either
        'unidirectional' or 'bidirectional'.
    dropout: whether to enable dropout. When it is 0, dropout is disabled.
    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
        for behavior.
    name: name of the operation.
  Returns:
    outputs, output_h, output_c
  """
  _check_rnn_mode(rnn_mode)
  check_direction(direction)
  check_input_mode(input_mode)
  seed, seed2 = random_seed.get_seed(seed)
  # TODO(jamesqin): switch default value to "1" on May 25th 2018, and get rid
  # of V1 ops.
  use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0")
  args = {
      "input": inputs,
      "input_h": input_h,
      "input_c": input_c,
      "params": params,
      "is_training": is_training,
      "rnn_mode": rnn_mode,
      "input_mode": input_mode,
      "direction": direction,
      "dropout": dropout,
      "seed": seed,
      "seed2": seed2,
      "name": name
  }
  if sequence_lengths is not None:
    args["sequence_lengths"] = sequence_lengths
    args["time_major"] = time_major
    outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
  elif time_major is False:
    batch_size = array_ops.shape(inputs)[0]
    max_time = array_ops.shape(inputs)[1]
    sequence_lengths = array_ops.fill([batch_size], max_time)
    args["sequence_lengths"] = sequence_lengths
    args["time_major"] = time_major
    outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args)
  elif use_cudnn_v2 != "1":
    outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args)
  else:
    outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args)
  return (outputs, output_h, output_c)

def cudnn_lstm(inputs,
               input_h,
               input_c,
               params,
               is_training,
               sequence_lengths=None,
               time_major=True,
               input_mode=CUDNN_INPUT_LINEAR_MODE,
               direction=CUDNN_RNN_UNIDIRECTION,
               dropout=0.,
               seed=0,
               name=None):
  """Cudnn LSTM.

  Args:
    inputs: the input sequence to the RNN model. If `time_major` is True
        (default), the Tensor shape is [max_time, batch_size, input_size]. If
        `time_major` is False, the shape is [batch_size, max_time, input_size].
    input_h: the initial hidden state for h. If `time_major` is True
        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
        `time_major` is False, the shape is [batch_size, num_layers, num_units].
    input_c: the initial hidden state for c. This is only relevant for LSTM.
      A Tensor of the same shape as input_h.
    params: the parameter buffer created for this model.
    is_training: whether this operation will be used in training or inference.
    sequence_lengths: an int32 array representing the variable sequence lengths
      in a batch. The size of the array has to equal the batch_size. Defaults to
      None, in which case sequences in the batch are assumed to have the same
      length, which is inferred from inputs.
    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
      By default this function accepts input and emits output in time-major
      form. This param is only effective when 'sequence_lengths' is used.
    input_mode: indicate whether there is a linear projection between the
        input and the actual computation before the first layer. It could be
        'linear_input', 'skip_input' or 'auto_select'.
        'linear_input' (default) always applies a linear projection of input
        onto RNN hidden state. (standard RNN behavior).
        'skip_input' is only allowed when input_size == num_units;
        'auto_select' implies 'skip_input' when input_size == num_units;
        otherwise, it implies 'linear_input'.
    direction: the direction the model operates in. Could be either
        'unidirectional' or 'bidirectional'.
    dropout: whether to enable dropout. When it is 0, dropout is disabled.
    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
        for behavior.
    name: name of the operation.
  Returns:
    outputs, output_h, output_c
  """
  return _cudnn_rnn(inputs, input_h, input_c, params, is_training, CUDNN_LSTM,
                    sequence_lengths, time_major, input_mode, direction,
                    dropout, seed, name)

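# Illustrative sketch, not part of this module: calling cudnn_lstm (defined
# above) on a time-major batch. The opaque `params` buffer is assumed to have
# been created and initialized elsewhere with the same
# num_layers/num_units/input_size/direction configuration; all shapes are
# hypothetical.
#
#   # inputs: [max_time, batch_size, input_size]
#   # input_h, input_c: [num_layers, batch_size, num_units]
#   outputs, out_h, out_c = cudnn_lstm(
#       inputs, input_h, input_c, params, is_training=True,
#       direction=CUDNN_RNN_UNIDIRECTION, dropout=0.2, seed=42)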

def _cudnn_rnn_no_input_c(inputs,
                          input_h,
                          params,
                          is_training,
                          rnn_mode,
                          sequence_lengths=None,
                          time_major=True,
                          input_mode=CUDNN_INPUT_LINEAR_MODE,
                          direction=CUDNN_RNN_UNIDIRECTION,
                          dropout=0.,
                          seed=0,
                          name=None):
  """Cudnn RNN w/o input_c.

  Args:
    inputs: the input sequence to the RNN model. If `time_major` is True
        (default), the Tensor shape is [max_time, batch_size, input_size]. If
        `time_major` is False, the shape is [batch_size, max_time, input_size].
    input_h: the initial hidden state for h. If `time_major` is True
        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
        `time_major` is False, the shape is [batch_size, num_layers, num_units].
    params: the parameter buffer created for this model.
    is_training: whether this operation will be used in training or inference.
    rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh').
    sequence_lengths: an int32 array representing the variable sequence lengths
      in a batch. The size of the array has to equal the batch_size. Defaults to
      None, in which case sequences in the batch are assumed to have the same
      length, which is inferred from inputs.
    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
      By default this function accepts input and emits output in time-major
      form. This param is only effective when 'sequence_lengths' is used.
    input_mode: indicate whether there is a linear projection between the
      input and the actual computation before the first layer. It could be
      'linear_input', 'skip_input' or 'auto_select'.
      'linear_input' (default) always applies a linear projection of input
      onto RNN hidden state. (standard RNN behavior).
      'skip_input' is only allowed when input_size == num_units;
      'auto_select' implies 'skip_input' when input_size == num_units;
      otherwise, it implies 'linear_input'.
    direction: the direction the model operates in. Could be either
        'unidirectional' or 'bidirectional'.
    dropout: whether to enable dropout. When it is 0, dropout is disabled.
    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
        for behavior.
    name: name of the operation.
  Returns:
    outputs, output_h
  """
  input_c = array_ops.constant([], dtype=input_h.dtype)
  outputs, output_h, _ = _cudnn_rnn(
      inputs, input_h, input_c, params, is_training, rnn_mode, sequence_lengths,
      time_major, input_mode, direction, dropout, seed, name)
  return outputs, output_h


def cudnn_gru(inputs,
              input_h,
              params,
              is_training,
              sequence_lengths=None,
              time_major=True,
              input_mode=CUDNN_INPUT_LINEAR_MODE,
              direction=CUDNN_RNN_UNIDIRECTION,
              dropout=0.,
              seed=0,
              name=None):
  """Cudnn GRU.

  Args:
    inputs: the input sequence to the RNN model. If `time_major` is True
        (default), the Tensor shape is [max_time, batch_size, input_size]. If
        `time_major` is False, the shape is [batch_size, max_time, input_size].
    input_h: the initial hidden state for h. If `time_major` is True
        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
        `time_major` is False, the shape is [batch_size, num_layers, num_units].
    params: the parameter buffer created for this model.
    is_training: whether this operation will be used in training or inference.
    sequence_lengths: an int32 array representing the variable sequence lengths
      in a batch. The size of the array has to equal the batch_size. Defaults to
      None, in which case sequences in the batch are assumed to have the same
      length, which is inferred from inputs.
    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
      By default this function accepts input and emits output in time-major
      form. This param is only effective when 'sequence_lengths' is used.
    input_mode: indicate whether there is a linear projection between the
        input and the actual computation before the first layer. It could be
        'linear_input', 'skip_input' or 'auto_select'.
        'linear_input' (default) always applies a linear projection of input
        onto RNN hidden state. (standard RNN behavior).
        'skip_input' is only allowed when input_size == num_units;
        'auto_select' implies 'skip_input' when input_size == num_units;
        otherwise, it implies 'linear_input'.
    direction: the direction the model operates in. Could be either
        'unidirectional' or 'bidirectional'.
    dropout: whether to enable dropout. When it is 0, dropout is disabled.
    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
        for behavior.
    name: name of the operation.
  Returns:
    outputs, output_h
  """
  return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training, CUDNN_GRU,
                               sequence_lengths, time_major, input_mode,
                               direction, dropout, seed, name)


1210def cudnn_rnn_relu(inputs,
1211                   input_h,
1212                   params,
1213                   is_training,
1214                   input_mode=CUDNN_INPUT_LINEAR_MODE,
1215                   direction=CUDNN_RNN_UNIDIRECTION,
1216                   dropout=0.,
1217                   seed=0,
1218                   sequence_lengths=None,
1219                   time_major=True,
1220                   name=None):
1221  """Cudnn RNN Relu.
1222
1223  Args:
1224    inputs: the input sequence to the RNN model. If `time_major` is True
1225        (default), the Tensor shape is [max_time, batch_size, input_size]. If
1226        `time_major` is False, the shape is [batch_size, max_time, input_size].
1227    input_h: the initial hidden state for h. If `time_major` is True
1228        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
1229        `time_major` is False, the shape is [batch_size, num_layers, num_units].
1230    params: the parameter buffer created for this model.
    is_training: whether this operation will be used in training or inference.
    input_mode: indicates whether there is a linear projection between the
      input and the actual computation before the first layer. It could be
      'linear_input', 'skip_input' or 'auto_select'. 'linear_input' (default)
      always applies a linear projection of input onto the RNN hidden state
      (standard RNN behavior). 'skip_input' is only allowed when input_size ==
      num_units; 'auto_select' implies 'skip_input' when input_size ==
      num_units; otherwise, it implies 'linear_input'.
    direction: the direction mode in which the model operates. Can be either
      'unidirectional' or 'bidirectional'.
    dropout: whether to enable dropout. When it is 0, dropout is disabled.
1242    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
1243      for behavior.
    sequence_lengths: an int32 array representing the variable sequence lengths
      in a batch. The size of the array has to equal the batch_size. If not
      provided, all sequences in the batch are assumed to have the same
      length, which is inferred from inputs.
1247    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
1248      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
1249      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
1250      By default this function accepts input and emits output in time-major
1251      form. This param is only effective when 'sequence_lengths' is used.
1252    name: name of the operation.
1253
1254  Returns:
1255    outputs, output_h
1256  """
1257  return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
1258                               CUDNN_RNN_RELU, sequence_lengths, time_major,
1259                               input_mode, direction, dropout, seed, name)
1260
1261
1262def cudnn_rnn_tanh(inputs,
1263                   input_h,
1264                   params,
1265                   is_training,
1266                   sequence_lengths=None,
1267                   time_major=True,
1268                   input_mode=CUDNN_INPUT_LINEAR_MODE,
1269                   direction=CUDNN_RNN_UNIDIRECTION,
1270                   dropout=0.,
1271                   seed=0,
1272                   name=None):
1273  """Cudnn RNN Tanh.
1274
1275  Args:
1276    inputs: the input sequence to the RNN model. If `time_major` is True
1277        (default), the Tensor shape is [max_time, batch_size, input_size]. If
1278        `time_major` is False, the shape is [batch_size, max_time, input_size].
1279    input_h: the initial hidden state for h. If `time_major` is True
1280        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
1281        `time_major` is False, the shape is [batch_size, num_layers, num_units].
1282    params: the parameter buffer created for this model.
    is_training: whether this operation will be used in training or inference.
    input_mode: indicates whether there is a linear projection between the
        input and the actual computation before the first layer. It could be
        'linear_input', 'skip_input' or 'auto_select'.
        'linear_input' (default) always applies a linear projection of input
        onto the RNN hidden state (standard RNN behavior).
        'skip_input' is only allowed when input_size == num_units;
        'auto_select' implies 'skip_input' when input_size == num_units;
        otherwise, it implies 'linear_input'.
    sequence_lengths: an int32 array representing the variable sequence lengths
      in a batch. The size of the array has to equal the batch_size. Defaults
      to None, in which case sequences in the batch are assumed to have the
      same length, which is inferred from inputs.
1296    time_major: The shape format of the `inputs` and `outputs` Tensors. If true,
1297      these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If
1298      false, these Tensors must be shaped ['batch_size', 'max_time', 'depth'].
1299      By default this function accepts input and emits output in time-major
1300      form. This param is only effective when 'sequence_lengths' is used.
    direction: the direction mode in which the model operates. Can be either
        'unidirectional' or 'bidirectional'.
    dropout: whether to enable dropout. When it is 0, dropout is disabled.
1304    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
1305        for behavior.
1306    name: name of the operation.
1307  Returns:
1308    outputs, output_h
1309  """
1310  return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training,
1311                               CUDNN_RNN_TANH, sequence_lengths, time_major,
1312                               input_mode, direction, dropout, seed, name)
1313
1314
1315def cudnn_rnn_opaque_params_to_canonical(rnn_mode,
1316                                         num_layers,
1317                                         num_units,
1318                                         input_size,
1319                                         params,
1320                                         input_mode=CUDNN_INPUT_LINEAR_MODE,
1321                                         direction=CUDNN_RNN_UNIDIRECTION,
1322                                         dropout=0,
1323                                         seed=0,
1324                                         name=None):
1325  """Convert cudnn opaque params to canonical.
1326
1327  Args:
    rnn_mode: a string that specifies the mode in which this RNN model runs.
        Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
    num_layers: the number of layers for the RNN model.
    num_units: the number of units within the RNN model.
    input_size: the size of the input; it could be different from
        num_units.
    params: opaque cudnn params var.
    input_mode: indicates whether there is a linear projection between the
        input and the actual computation before the first layer. It could be
        'linear_input', 'skip_input' or 'auto_select'.
        'linear_input' (default) always applies a linear projection of input
        onto the RNN hidden state (standard RNN behavior).
        'skip_input' is only allowed when input_size == num_units;
        'auto_select' implies 'skip_input' when input_size == num_units;
        otherwise, it implies 'linear_input'.
    direction: the direction mode in which the model operates. Can be either
        'unidirectional' or 'bidirectional'.
    dropout: whether to enable dropout. When it is 0, dropout is disabled.
1346    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
1347        for behavior.
1348    name: name of the operation.
1349  Returns:
1350    weights list and bias list
1351  Raises:
1352    ValueError: if rnn_mode or direction is invalid.
1353  """
1354
1355  _check_rnn_mode(rnn_mode)
1356  check_direction(direction)
1357  check_input_mode(input_mode)
1358  num_params = _get_num_params(rnn_mode, num_layers, direction)
1359  seed, seed2 = random_seed.get_seed(seed)
1360  weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical(
1361      rnn_mode=rnn_mode,
1362      num_layers=num_layers,
1363      num_units=num_units,
1364      input_size=input_size,
1365      params=params,
1366      input_mode=input_mode,
1367      direction=direction,
1368      dropout=dropout,
1369      seed=seed,
1370      seed2=seed2,
1371      num_params=num_params,
1372      name=name)
1373  return weights, biases
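

# Illustrative sketch, not part of this module's public API: splitting an
# existing opaque parameter buffer into its canonical per-layer weights and
# biases, e.g. to inspect them or to feed them into the CudnnCompatible*
# cells. `opaque_params` is assumed to be a buffer created for the matching
# single-layer LSTM configuration used below.
def _example_opaque_params_to_canonical(opaque_params):
  """Returns canonical (weights, biases) recovered from `opaque_params`."""
  weights, biases = cudnn_rnn_opaque_params_to_canonical(
      rnn_mode=CUDNN_LSTM,
      num_layers=1,
      num_units=128,
      input_size=64,
      params=opaque_params)
  # For a unidirectional LSTM, each layer contributes
  # CUDNN_LSTM_PARAMS_PER_LAYER (8) weight tensors and 8 bias tensors.
  return weights, biases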
1374
1375
1376def cudnn_rnn_canonical_to_opaque_params(rnn_mode,
1377                                         num_layers,
1378                                         num_units,
1379                                         input_size,
1380                                         weights,
1381                                         biases,
1382                                         input_mode=CUDNN_INPUT_LINEAR_MODE,
1383                                         direction=CUDNN_RNN_UNIDIRECTION,
1384                                         dropout=0,
1385                                         seed=0,
1386                                         name=None):
1387  """Converts params from the canonical format to a specific format of cuDNN.
1388
1389  Args:
    rnn_mode: a string that specifies the mode in which this RNN model runs.
        Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
    num_layers: the number of layers for the RNN model.
    num_units: the number of units within the RNN model.
    input_size: the size of the input; it could be different from
        num_units.
    weights: a Tensor for weight parameters.
    biases: a Tensor for bias parameters.
    input_mode: indicates whether there is a linear projection between the
        input and the actual computation before the first layer. It could be
        'linear_input', 'skip_input' or 'auto_select'.
        'linear_input' (default) always applies a linear projection of input
        onto the RNN hidden state (standard RNN behavior).
        'skip_input' is only allowed when input_size == num_units;
        'auto_select' implies 'skip_input' when input_size == num_units;
        otherwise, it implies 'linear_input'.
    direction: the direction mode in which the model operates. Can be either
        'unidirectional' or 'bidirectional'.
    dropout: whether to enable dropout. When it is 0, dropout is disabled.
1409    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
1410        for behavior.
1411    name: name of the operation.
1412  Returns:
1413    an opaque Cudnn param.
1414  Raises:
1415    ValueError: if rnn_mode or direction is invalid.
1416  """
1417  _check_rnn_mode(rnn_mode)
1418  check_direction(direction)
1419  check_input_mode(input_mode)
1420  seed, seed2 = random_seed.get_seed(seed)
1421  return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params(
1422      rnn_mode=rnn_mode,
1423      num_layers=num_layers,
1424      num_units=num_units,
1425      input_size=input_size,
1426      weights=weights,
1427      biases=biases,
1428      input_mode=input_mode,
1429      direction=direction,
1430      dropout=dropout,
1431      seed=seed,
1432      seed2=seed2,
1433      name=name)
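

# Illustrative sketch, not part of this module's public API: repacking
# canonical weights and biases into an opaque Cudnn buffer. Together with
# `cudnn_rnn_opaque_params_to_canonical` above, this forms a round trip for a
# fixed configuration; the shape constants are example assumptions and must
# match those used to produce the canonical lists.
def _example_canonical_to_opaque_params(weights, biases):
  """Returns an opaque params buffer packed from canonical lists."""
  return cudnn_rnn_canonical_to_opaque_params(
      rnn_mode=CUDNN_LSTM,
      num_layers=1,
      num_units=128,
      input_size=64,
      weights=weights,
      biases=biases)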
1434
1435
1436def cudnn_rnn_opaque_params_size(rnn_mode,
1437                                 num_layers,
1438                                 num_units,
1439                                 input_size,
1440                                 input_mode=CUDNN_INPUT_LINEAR_MODE,
1441                                 direction=CUDNN_RNN_UNIDIRECTION,
1442                                 dtype=dtypes.float32,
1443                                 dropout=0,
1444                                 seed=0,
1445                                 name=None):
1446  """Returns opaque params size for specific Cudnn config.
1447
1448  Args:
    rnn_mode: a string that specifies the mode in which this RNN model runs.
        Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
    num_layers: the number of layers for the RNN model.
    num_units: the number of units within the RNN model.
    input_size: the size of the input; it could be different from
        num_units.
    input_mode: indicates whether there is a linear projection between the
        input and the actual computation before the first layer. It could be
        'linear_input', 'skip_input' or 'auto_select'.
        'linear_input' (default) always applies a linear projection of input
        onto the RNN hidden state (standard RNN behavior).
        'skip_input' is only allowed when input_size == num_units;
        'auto_select' implies 'skip_input' when input_size == num_units;
        otherwise, it implies 'linear_input'.
    direction: the direction mode in which the model operates. Can be either
        'unidirectional' or 'bidirectional'.
    dtype: one of tf.float32 or tf.float64.
    dropout: whether to enable dropout. When it is 0, dropout is disabled.
1467    seed: the op seed used for initializing dropout. See `tf.set_random_seed`
1468        for behavior.
1469    name: name of the operation.
1470  Returns:
    an int, the size of the Cudnn opaque params.
1472  Raises:
1473    ValueError: if rnn_mode or direction is invalid.
1474  """
1475  _check_rnn_mode(rnn_mode)
1476  check_direction(direction)
1477  check_input_mode(input_mode)
1478  seed, seed2 = random_seed.get_seed(seed)
1479  return gen_cudnn_rnn_ops.cudnn_rnn_params_size(
1480      rnn_mode=rnn_mode,
1481      num_layers=num_layers,
1482      num_units=num_units,
1483      input_size=input_size,
1484      T=dtype,
1485      S=dtypes.int32,
1486      dropout=dropout,
1487      seed=seed,
1488      seed2=seed2,
1489      input_mode=input_mode,
1490      direction=direction,
1491      name=name)[0]
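

# Illustrative sketch, not part of this module's public API: sizing the
# opaque buffer for a bidirectional GRU. The result is a scalar int32 Tensor
# (the number of `dtype` elements in the buffer) and is typically used to
# build a flat variable holding the packed parameters; the configuration
# values here are arbitrary example numbers.
def _example_opaque_params_size():
  """Returns the opaque params size for an example bidirectional GRU."""
  return cudnn_rnn_opaque_params_size(
      rnn_mode=CUDNN_GRU,
      num_layers=2,
      num_units=256,
      input_size=128,
      direction=CUDNN_RNN_BIDIRECTION,
      dtype=dtypes.float32)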
1492
1493
1494class _CudnnRNN(object):
1495  """Creates an RNN model using the underlying Cudnn implementation.
1496
1497  Note that self._NUM_PARAMS_PER_LAYER is the number of parameter sets of
1498  weight and bias per layer. It needs to be defined in subclasses.
1499  """
1500  __doc__ += _cudnn_rnn_common_doc_string
1501
1502  # TODO(jamesqin): support float16 CuDNN RNN
1503  def __init__(self,
1504               rnn_mode,
1505               num_layers,
1506               num_units,
1507               input_size,
1508               input_mode=CUDNN_INPUT_LINEAR_MODE,
1509               direction=CUDNN_RNN_UNIDIRECTION,
1510               dtype=dtypes.float32,
1511               dropout=0.,
1512               seed=0):
1513    """Creates a CudnnRNN model from model spec.
1514
1515    Args:
      rnn_mode: a string that specifies the mode in which this RNN model runs.
          Can be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'.
      num_layers: the number of layers for the RNN model.
      num_units: the number of units within the RNN model.
      input_size: the size of the input; it could be different from
          num_units.
      input_mode: indicates whether there is a linear projection between the
          input and the actual computation before the first layer. It could be
          'linear_input', 'skip_input' or 'auto_select'.
          'linear_input' (default) always applies a linear projection of input
          onto the RNN hidden state (standard RNN behavior).
          'skip_input' is only allowed when input_size == num_units;
          'auto_select' implies 'skip_input' when input_size == num_units;
          otherwise, it implies 'linear_input'.
      direction: the direction mode in which the model operates. Can be either
          'unidirectional' or 'bidirectional'.
      dtype: dtype of params, tf.float32 or tf.float64.
      dropout: whether to enable dropout. When it is 0, dropout is disabled.
1534      seed: the op seed used for initializing dropout. See `tf.set_random_seed`
1535          for behavior.
1536    Raises:
1537      ValueError: if direction is invalid.
1538    """
1539    self._num_layers = num_layers
1540    self._num_units = num_units
1541    self._input_size = input_size
1542    self._rnn_mode = rnn_mode
1543    self._input_mode = input_mode
1544    self._direction = direction
1545    self._dtype = dtype
1546    self._dropout = dropout
1547    self._seed = seed
1548
1549  @property
1550  def input_mode(self):
1551    return self._input_mode
1552
1553  @property
1554  def input_size(self):
1555    return self._input_size
1556
1557  @property
1558  def num_units(self):
1559    return self._num_units
1560
1561  @property
1562  def num_layers(self):
1563    return self._num_layers
1564
1565  @property
1566  def rnn_mode(self):
1567    return self._rnn_mode
1568
1569  @property
1570  def direction(self):
1571    return self._direction
1572
1573  def params_size(self):
1574    """Calculates the size of the opaque parameter buffer needed for this model.
1575
1576    Returns:
1577      The calculated parameter buffer size.
1578    """
1579    return cudnn_rnn_opaque_params_size(
1580        rnn_mode=self._rnn_mode,
1581        num_layers=self._num_layers,
1582        num_units=self._num_units,
1583        input_size=self._input_size,
1584        dtype=self._dtype,
1585        dropout=self._dropout,
1586        seed=self._seed,
1587        input_mode=self._input_mode,
1588        direction=self._direction)
1589
1590  def __call__(self,
1591               input_data,
1592               input_h,
1593               input_c,
1594               params,
1595               is_training=True,
1596               sequence_lengths=None,
1597               time_major=True):
1598    """Runs the forward step for the RNN model.
1599
1600    Args:
1601      input_data: the input sequence to the RNN model. If `time_major` is True
1602        (default), the Tensor shape is [max_time, batch_size, input_size]. If
1603        `time_major` is False, the shape is [batch_size, max_time, input_size].
1604      input_h: the initial hidden state for h. If `time_major` is True
1605        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
1606        `time_major` is False, the shape is [batch_size, num_layers, num_units].
1607      input_c: the initial hidden state for c. This is only relevant for LSTM. A
1608        Tensor of the same shape as input_h.
1609      params: the parameter buffer created for this model.
1610      is_training: whether this operation will be used in training or inference.
1611      sequence_lengths: an int32 array representing the variable sequence
1612        lengths in a batch. The size of the array has to equal the batch_size.
        Defaults to None, in which case sequences in the batch are assumed to
1614        have the same length, which is inferred from inputs.
1615      time_major: The shape format of the `inputs` and `outputs` Tensors. If
1616        true, these Tensors must be shaped ['max_time', 'batch_size', 'depth'].
1617        If false, these Tensors must be shaped ['batch_size', 'max_time',
1618        'depth']. By default this function accepts input and emits output in
1619        time-major form. This param is only effective when 'sequence_lengths' is
1620        used.
1621
1622    Returns:
1623      output: the output sequence.
1624      output_h: the final state for h.
1625      output_c: the final state for c. This is only relevant for LSTM.
1626    """
1627    return _cudnn_rnn(
1628        input_data,
1629        input_h,
1630        input_c,
1631        params,
1632        is_training,
1633        self._rnn_mode,
1634        sequence_lengths=sequence_lengths,
1635        time_major=time_major,
1636        input_mode=self._input_mode,
1637        direction=self._direction,
1638        dropout=self._dropout,
1639        seed=self._seed)
1640
1641  def params_to_canonical(self, params):
1642    """Converts params from a specific format of cuDNN to the canonical format.
1643
1644    Args:
1645      params: a Variable for weight and bias parameters.
1646
1647    Returns:
      The weight and bias parameters in the canonical format.
1649    """
1650    return cudnn_rnn_opaque_params_to_canonical(
1651        rnn_mode=self._rnn_mode,
1652        num_layers=self._num_layers,
1653        num_units=self._num_units,
1654        input_size=self._input_size,
1655        params=params,
1656        input_mode=self._input_mode,
1657        direction=self._direction,
1658        dropout=self._dropout,
1659        seed=self._seed)
1660
1661  def canonical_to_params(self, weights, biases):
1662    """Converts params from the canonical format to a specific format of cuDNN.
1663
1664    Args:
1665      weights: a Tensor for weight parameters.
1666      biases: a Tensor for bias parameters.
1667
1668    Returns:
      The opaque Cudnn params buffer for this model.
1670    """
1671    return cudnn_rnn_canonical_to_opaque_params(
1672        rnn_mode=self._rnn_mode,
1673        num_layers=self._num_layers,
1674        num_units=self._num_units,
1675        input_size=self._input_size,
1676        weights=weights,
1677        biases=biases,
1678        input_mode=self._input_mode,
1679        direction=self._direction,
1680        dropout=self._dropout,
1681        seed=self._seed)
1682
1683
1684class CudnnLSTM(_CudnnRNN):
1685  """Cudnn implementation of the LSTM model."""
1686  __doc__ += _cudnn_rnn_common_doc_string
1687  # 4 sets of weight and bias parameters for the recurrent input, and 4 for the
1688  # previous layer input.
1689  _NUM_PARAMS_PER_LAYER = CUDNN_LSTM_PARAMS_PER_LAYER
1690
1691  def __init__(self,
1692               num_layers,
1693               num_units,
1694               input_size,
1695               input_mode=CUDNN_INPUT_LINEAR_MODE,
1696               direction=CUDNN_RNN_UNIDIRECTION,
1697               dtype=dtypes.float32,
1698               dropout=0.,
1699               seed=0):
1700    """Creates a Cudnn LSTM model from model spec.
1701
1702    Args:
1703      num_layers: the number of layers for the RNN model.
1704      num_units: the number of units within the RNN model.
      input_size: the size of the input; it could be different from
          num_units.
      input_mode: indicates whether there is a linear projection between the
          input and the actual computation before the first layer. It could be
          'skip_input', 'linear_input' or 'auto_select'.
          'skip_input' is only allowed when input_size == num_units;
          'auto_select' implies 'skip_input' when input_size == num_units;
          otherwise, it implies 'linear_input'.
      direction: the direction mode in which the model operates. Can be either
          'unidirectional' or 'bidirectional'.
      dtype: dtype of params, tf.float32 or tf.float64.
      dropout: whether to enable dropout. When it is 0, dropout is disabled.
1717      seed: the seed used for initializing dropout.
1718    """
1719    super(CudnnLSTM, self).__init__(
1720        CUDNN_LSTM,
1721        num_layers,
1722        num_units,
1723        input_size,
1724        input_mode=input_mode,
1725        direction=direction,
1726        dtype=dtype,
1727        dropout=dropout,
1728        seed=seed)
1729
1730  def __call__(self,
1731               input_data,
1732               input_h,
1733               input_c,
1734               params,
1735               sequence_lengths=None,
1736               time_major=True,
1737               is_training=True):
1738    """Runs the forward step for the Cudnn LSTM model.
1739
1740    Args:
1741      input_data: the input sequence to the RNN model. If `time_major` is True
1742        (default), the Tensor shape is [max_time, batch_size, input_size]. If
1743        `time_major` is False, the shape is [batch_size, max_time, input_size].
1744      input_h: the initial hidden state for h. If `time_major` is True
1745        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
1746        `time_major` is False, the shape is [batch_size, num_layers, num_units].
1747      input_c: the initial hidden state for c. A Tensor of the same shape as
1748        input_h.
1749      params: the parameter buffer created for this model.
1750      sequence_lengths: an int32 array representing the variable sequence
1751        lengths in a batch. The size of the array has to equal the batch_size.
        Defaults to None, in which case sequences in the batch are assumed to
1753        have the same length, which is inferred from inputs.
1754      time_major: The shape format of the `inputs` and `outputs` Tensors. If
1755        true, these Tensors must be shaped ['max_time', 'batch_size', 'depth'].
1756        If false, these Tensors must be shaped ['batch_size', 'max_time',
1757        'depth']. By default this function accepts input and emits output in
1758        time-major form. This param is only effective when 'sequence_lengths'
1759        is used.
1760      is_training: whether this operation will be used in training or inference.
1761    Returns:
1762      output: the output sequence.
1763      output_h: the final state for h.
1764      output_c: the final state for c.
1765    """
1766    output, output_h, output_c = super(CudnnLSTM, self).__call__(
1767        input_data,
1768        input_h,
1769        input_c,
1770        params,
1771        sequence_lengths=sequence_lengths,
1772        time_major=time_major,
1773        is_training=is_training)
1774    return (output, output_h, output_c)
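

# Illustrative end-to-end sketch, not part of this module's public API, of
# driving `CudnnLSTM`: size the opaque buffer with `params_size()`,
# materialize it as a flat variable, then run a time-major forward step. The
# variable name, the zero initial states and the shape constants are
# assumptions for the example; a real model would use a proper initializer or
# restore the buffer from a checkpoint.
def _example_cudnn_lstm_usage():
  """Builds one forward pass of a 2-layer CudnnLSTM (illustrative only)."""
  max_time, batch_size, input_size, num_units, num_layers = 10, 4, 32, 64, 2
  model = CudnnLSTM(num_layers, num_units, input_size)
  params_size_t = model.params_size()
  params = vs.get_variable(
      "lstm_opaque_params",
      initializer=array_ops.zeros(
          array_ops.stack([params_size_t]), dtype=dtypes.float32),
      validate_shape=False)
  input_data = array_ops.zeros([max_time, batch_size, input_size])
  input_h = array_ops.zeros([num_layers, batch_size, num_units])
  input_c = array_ops.zeros([num_layers, batch_size, num_units])
  outputs, output_h, output_c = model(
      input_data, input_h, input_c, params, is_training=True)
  return outputs, output_h, output_c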
1775
1776
1777class _CudnnRNNNoInputC(_CudnnRNN):
1778  """Simple CudnnRNN models without input_c."""
1779  __doc__ += _cudnn_rnn_common_doc_string
1780
1781  def __init__(self,
1782               num_layers,
1783               num_units,
1784               input_size,
1785               input_mode=CUDNN_INPUT_LINEAR_MODE,
1786               direction=CUDNN_RNN_UNIDIRECTION,
1787               dtype=dtypes.float32,
1788               dropout=0.,
1789               seed=0):
    """Creates a Cudnn RNN model without hidden-state C from a model spec.
1791
1792    Args:
1793      num_layers: the number of layers for the RNN model.
1794      num_units: the number of units within the RNN model.
      input_size: the size of the input; it could be different from
          num_units.
      input_mode: indicates whether there is a linear projection between the
          input and the actual computation before the first layer. It could be
          'skip_input', 'linear_input' or 'auto_select'.
          'skip_input' is only allowed when input_size == num_units;
          'auto_select' implies 'skip_input' when input_size == num_units;
          otherwise, it implies 'linear_input'.
      direction: the direction mode in which the model operates. Can be either
          'unidirectional' or 'bidirectional'.
      dtype: dtype of params, tf.float32 or tf.float64.
      dropout: whether to enable dropout. When it is 0, dropout is disabled.
1807      seed: the seed used for initializing dropout.
1808
1809    Raises:
1810      ValueError: if direction is not 'unidirectional' or 'bidirectional'.
1811    """
1812
1813    if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION):
1814      raise ValueError("Invalid direction: %s" % direction)
1815
1816    super(_CudnnRNNNoInputC, self).__init__(
1817        self._rnn_mode,
1818        num_layers,
1819        num_units,
1820        input_size,
1821        input_mode=input_mode,
1822        direction=direction,
1823        dtype=dtype,
1824        dropout=dropout,
1825        seed=seed)
1826
1827  def __call__(self,
1828               input_data,
1829               input_h,
1830               params,
1831               sequence_lengths=None,
1832               time_major=True,
1833               is_training=True):
    """Runs the forward step for the Cudnn RNN model.
1835
1836    Args:
1837      input_data: the input sequence to the RNN model. If `time_major` is True
1838        (default), the Tensor shape is [max_time, batch_size, input_size]. If
1839        `time_major` is False, the shape is [batch_size, max_time, input_size].
1840      input_h: the initial hidden state for h. If `time_major` is True
1841        (default), the Tensor shape is [num_layers, batch_size, num_units]. If
1842        `time_major` is False, the shape is [batch_size, num_layers, num_units].
1843      params: the parameter buffer created for this model.
1844      sequence_lengths: an int32 array representing the variable sequence
1845        lengths in a batch. The size of the array has to equal the batch_size.
        Defaults to None, in which case sequences in the batch are assumed to
1847        have the same length, which is inferred from inputs.
1848      time_major: The shape format of the `inputs` and `outputs` Tensors. If
1849        true, these Tensors must be shaped ['max_time', 'batch_size', 'depth'].
1850        If false, these Tensors must be shaped ['batch_size', 'max_time',
1851        'depth']. By default this function accepts input and emits output in
1852        time-major form. This param is only effective when 'sequence_lengths'
1853        is used.
1854      is_training: whether this operation will be used in training or inference.
1855    Returns:
1856      output: the output sequence.
1857      output_h: the final state for h.
1858    """
1859    return _cudnn_rnn_no_input_c(
1860        input_data,
1861        input_h,
1862        params,
1863        is_training,
1864        self._rnn_mode,
1865        sequence_lengths=sequence_lengths,
1866        time_major=time_major,
1867        input_mode=self._input_mode,
1868        direction=self._direction,
1869        dropout=self._dropout,
1870        seed=self._seed)
1871
1872
1873class CudnnGRU(_CudnnRNNNoInputC):
1874  """Cudnn implementation of the GRU model."""
1875  __doc__ += _cudnn_rnn_common_doc_string
1876  _rnn_mode = CUDNN_GRU
1877  # 3 sets of weight and bias parameters for the recurrent input, and 3 for the
1878  # previous layer input.
1879  _NUM_PARAMS_PER_LAYER = CUDNN_GRU_PARAMS_PER_LAYER
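

# Illustrative sketch, not part of this module's public API: the no-input-c
# models are driven like `CudnnLSTM` above, except that no `input_c` state is
# passed and only (output, output_h) is returned. The zero-filled buffer and
# shape constants are example assumptions.
def _example_cudnn_gru_model_usage():
  """Builds one forward pass of a 1-layer CudnnGRU (illustrative only)."""
  max_time, batch_size, input_size, num_units, num_layers = 10, 4, 32, 32, 1
  model = CudnnGRU(num_layers, num_units, input_size)
  params = array_ops.zeros(
      array_ops.stack([model.params_size()]), dtype=dtypes.float32)
  input_data = array_ops.zeros([max_time, batch_size, input_size])
  input_h = array_ops.zeros([num_layers, batch_size, num_units])
  output, output_h = model(input_data, input_h, params, is_training=False)
  return output, output_h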
1880
1881
1882class CudnnRNNTanh(_CudnnRNNNoInputC):
1883  """Cudnn implementation of the RNN-tanh model."""
1884  __doc__ += _cudnn_rnn_common_doc_string
1885  _rnn_mode = CUDNN_RNN_TANH
1886  # 1 set of weight and bias parameters for the recurrent input, and 1 for the
1887  # previous layer input.
1888  _NUM_PARAMS_PER_LAYER = CUDNN_RNN_TANH_PARAMS_PER_LAYER
1889
1890
1891class CudnnRNNRelu(_CudnnRNNNoInputC):
1892  """Cudnn implementation of the RNN-relu model."""
1893  __doc__ += _cudnn_rnn_common_doc_string
1894  _rnn_mode = CUDNN_RNN_RELU
1895  # 1 set of weight and bias parameters for the recurrent input, and 1 for the
1896  # previous layer input.
1897  _NUM_PARAMS_PER_LAYER = CUDNN_RNN_RELU_PARAMS_PER_LAYER
1898