• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15# pylint: disable=g-classes-have-attributes
16"""Recurrent layers for TF 2."""
17
18import uuid
19
20from tensorflow.python.eager import context
21from tensorflow.python.eager import function
22from tensorflow.python.eager.context import get_device_name
23from tensorflow.python.framework import config
24from tensorflow.python.framework import constant_op
25from tensorflow.python.framework import device
26from tensorflow.python.framework import dtypes
27from tensorflow.python.framework import ops
28from tensorflow.python.keras import activations
29from tensorflow.python.keras import backend
30from tensorflow.python.keras.engine.input_spec import InputSpec
31from tensorflow.python.keras.layers import recurrent
32from tensorflow.python.ops import array_ops
33from tensorflow.python.ops import control_flow_ops
34from tensorflow.python.ops import gen_cudnn_rnn_ops
35from tensorflow.python.ops import math_ops
36from tensorflow.python.ops import nn
37from tensorflow.python.ops import state_ops
38from tensorflow.python.ops import variables
39from tensorflow.python.platform import sysconfig
40from tensorflow.python.platform import tf_logging as logging
41from tensorflow.python.util.tf_export import keras_export
42
43
# String constants used by the Defun approach for the unified backend of LSTM
# and GRU.
_FUNCTION_API_NAME_ATTRIBUTE = 'api_implements'
_FUNCTION_DEVICE_ATTRIBUTE = 'api_preferred_device'
_CPU_DEVICE_NAME = 'CPU'
_GPU_DEVICE_NAME = 'GPU'

# Numeric codes representing the runtime of the defun backend function. The
# CPU and GPU implementations are mathematically the same, so the function
# needs some signal to indicate which one was executed. This exists for testing
# purposes, to verify that the backend function was swapped correctly.
_RUNTIME_UNKNOWN = 0
_RUNTIME_CPU = 1
_RUNTIME_GPU = 2

# Log message templates; `%s` is replaced with the layer name.
_CUDNN_AVAILABLE_MSG = 'Layer %s will use cuDNN kernels when running on GPU.'
_CUDNN_NOT_AVAILABLE_MSG = (
    "Layer %s will not use cuDNN kernels since it doesn't meet the criteria. "
    'It will use a generic GPU kernel as fallback when running on GPU.')
64
65
def _use_new_code():
  """Returns whether the `_DefunWrapper`-based code path is enabled.

  Currently hard-wired to `False`.
  """
  return False
68
69
70# TODO(b/169707691): The wrapper can be removed if TFLite doesn't need to rely
71# on supportive attributes from LSTM/GRU.
class _DefunWrapper(object):
  """Holds the Defun of an LSTM/GRU layer and rebuilds it on deep copy.

  Rather than deep-copying the wrapped function, `__deepcopy__` constructs a
  brand new wrapper from the stored constructor arguments.

  Attributes:
    time_major: Value attached to the function as the `time_major` attribute.
    go_backwards: Value attached to the function as the `go_backwards`
      attribute.
    layer_name: Either `'lstm'` or `'gru'`.
    defun_layer: The object returned by `function.defun_with_attributes` for
      the backend-selection function of the given layer type.
  """

  def __init__(self, time_major, go_backwards, layer_name):
    """Builds the attributed Defun for the requested layer type.

    Args:
      time_major: Stored and forwarded as a function attribute.
      go_backwards: Stored and forwarded as a function attribute.
      layer_name: `'lstm'` or `'gru'`; selects the wrapped function.

    Raises:
      ValueError: If `layer_name` is neither `'lstm'` nor `'gru'`.
    """
    self.time_major = time_major
    self.go_backwards = go_backwards
    self.layer_name = layer_name
    if layer_name not in ('lstm', 'gru'):
      raise ValueError('Defun wrapper only applies to LSTM and GRU layer, '
                       'but given {}'.format(layer_name))

    # `time_major` and `go_backwards` are added to support the TFLite use
    # case. The API name gets a random suffix, so it differs per wrapper.
    api_name = layer_name + '_' + str(uuid.uuid4())
    supportive_attributes = {
        'time_major': time_major,
        'go_backwards': go_backwards,
        _FUNCTION_API_NAME_ATTRIBUTE: api_name,
    }
    layer_func = (lstm_with_backend_selection if layer_name == 'lstm'
                  else gru_with_backend_selection)

    self.defun_layer = function.defun_with_attributes(
        layer_func, attributes=supportive_attributes, autograph=False)

  def __deepcopy__(self, memo):
    """Returns a freshly constructed wrapper instead of copying the Defun."""
    clone = type(self)(self.time_major, self.go_backwards, self.layer_name)
    memo[id(self)] = clone
    return clone
103
104
@keras_export('keras.layers.GRUCell', v1=[])
class GRUCell(recurrent.GRUCell):
  """Cell class for the GRU layer.

  See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn)
  for details about the usage of RNN API.

  This class processes one step within the whole time sequence input, whereas
  `tf.keras.layers.GRU` processes the whole sequence.

  For example:

  >>> inputs = tf.random.normal([32, 10, 8])
  >>> rnn = tf.keras.layers.RNN(tf.keras.layers.GRUCell(4))
  >>> output = rnn(inputs)
  >>> print(output.shape)
  (32, 4)
  >>> rnn = tf.keras.layers.RNN(
  ...    tf.keras.layers.GRUCell(4),
  ...    return_sequences=True,
  ...    return_state=True)
  >>> whole_sequence_output, final_state = rnn(inputs)
  >>> print(whole_sequence_output.shape)
  (32, 10, 4)
  >>> print(final_state.shape)
  (32, 4)

  Args:
    units: Positive integer, dimensionality of the output space.
    activation: Activation function to use. Default: hyperbolic tangent
      (`tanh`). If you pass `None`, no activation is applied
      (i.e. "linear" activation: `a(x) = x`).
    recurrent_activation: Activation function to use for the recurrent step.
      Default: sigmoid (`sigmoid`). If you pass `None`, no activation is
      applied (i.e. "linear" activation: `a(x) = x`).
    use_bias: Boolean (default `True`), whether the layer uses a bias vector.
    kernel_initializer: Initializer for the `kernel` weights matrix, used for
      the linear transformation of the inputs. Default: `glorot_uniform`.
    recurrent_initializer: Initializer for the `recurrent_kernel` weights
      matrix, used for the linear transformation of the recurrent state.
      Default: `orthogonal`.
    bias_initializer: Initializer for the bias vector. Default: `zeros`.
    kernel_regularizer: Regularizer function applied to the `kernel` weights
      matrix. Default: `None`.
    recurrent_regularizer: Regularizer function applied to the
      `recurrent_kernel` weights matrix. Default: `None`.
    bias_regularizer: Regularizer function applied to the bias vector.
      Default: `None`.
    kernel_constraint: Constraint function applied to the `kernel` weights
      matrix. Default: `None`.
    recurrent_constraint: Constraint function applied to the
      `recurrent_kernel` weights matrix. Default: `None`.
    bias_constraint: Constraint function applied to the bias vector.
      Default: `None`.
    dropout: Float between 0 and 1. Fraction of the units to drop for the
      linear transformation of the inputs. Default: 0.
    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop
      for the linear transformation of the recurrent state. Default: 0.
    reset_after: GRU convention (whether to apply reset gate after or
      before matrix multiplication). False = "before",
      True = "after" (default and CuDNN compatible).

  Call arguments:
    inputs: A 2D tensor, with shape of `[batch, feature]`.
    states: A 2D tensor with shape of `[batch, units]`, which is the state
      from the previous time step. For timestep 0, the initial state provided
      by the user will be fed to the cell.
    training: Python boolean indicating whether the layer should behave in
      training mode or in inference mode. Only relevant when `dropout` or
      `recurrent_dropout` is used.
  """

  def __init__(self,
               units,
               activation='tanh',
               recurrent_activation='sigmoid',
               use_bias=True,
               kernel_initializer='glorot_uniform',
               recurrent_initializer='orthogonal',
               bias_initializer='zeros',
               kernel_regularizer=None,
               recurrent_regularizer=None,
               bias_regularizer=None,
               kernel_constraint=None,
               recurrent_constraint=None,
               bias_constraint=None,
               dropout=0.,
               recurrent_dropout=0.,
               reset_after=True,
               **kwargs):
    # `implementation` may arrive through **kwargs; it defaults to 2 here and
    # is popped up front so that it is not forwarded a second time below.
    implementation = kwargs.pop('implementation', 2)
    super(GRUCell, self).__init__(
        units,
        activation=activation,
        recurrent_activation=recurrent_activation,
        use_bias=use_bias,
        kernel_initializer=kernel_initializer,
        recurrent_initializer=recurrent_initializer,
        bias_initializer=bias_initializer,
        kernel_regularizer=kernel_regularizer,
        recurrent_regularizer=recurrent_regularizer,
        bias_regularizer=bias_regularizer,
        kernel_constraint=kernel_constraint,
        recurrent_constraint=recurrent_constraint,
        bias_constraint=bias_constraint,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        implementation=implementation,
        reset_after=reset_after,
        **kwargs)
215
216
@keras_export('keras.layers.GRU', v1=[])
class GRU(recurrent.DropoutRNNCellMixin, recurrent.GRU):
  """Gated Recurrent Unit - Cho et al. 2014.

  See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn)
  for details about the usage of RNN API.

  Based on available runtime hardware and constraints, this layer
  will choose different implementations (cuDNN-based or pure-TensorFlow)
  to maximize the performance. If a GPU is available and all
  the arguments to the layer meet the requirement of the CuDNN kernel
  (see below for details), the layer will use a fast cuDNN implementation.

  The requirements to use the cuDNN implementation are:

  1. `activation` == `tanh`
  2. `recurrent_activation` == `sigmoid`
  3. `recurrent_dropout` == 0
  4. `unroll` is `False`
  5. `use_bias` is `True`
  6. `reset_after` is `True`
  7. Inputs, if use masking, are strictly right-padded.
  8. Eager execution is enabled in the outermost context.

  There are two variants of the GRU implementation. One is based on
  [v3](https://arxiv.org/abs/1406.1078v3) and has reset gate applied to hidden
  state before matrix multiplication (`reset_after=False`). The other one is
  based on [original](https://arxiv.org/abs/1406.1078v1) and has the order
  reversed (`reset_after=True`, which is the default of this layer).

  The second variant is compatible with CuDNNGRU (GPU-only) and allows
  inference on CPU. Thus it has separate biases for `kernel` and
  `recurrent_kernel`. To use this variant, set `reset_after=True` and
  `recurrent_activation='sigmoid'`.

  For example:

  >>> inputs = tf.random.normal([32, 10, 8])
  >>> gru = tf.keras.layers.GRU(4)
  >>> output = gru(inputs)
  >>> print(output.shape)
  (32, 4)
  >>> gru = tf.keras.layers.GRU(4, return_sequences=True, return_state=True)
  >>> whole_sequence_output, final_state = gru(inputs)
  >>> print(whole_sequence_output.shape)
  (32, 10, 4)
  >>> print(final_state.shape)
  (32, 4)

  Args:
    units: Positive integer, dimensionality of the output space.
    activation: Activation function to use.
      Default: hyperbolic tangent (`tanh`).
      If you pass `None`, no activation is applied
      (ie. "linear" activation: `a(x) = x`).
    recurrent_activation: Activation function to use
      for the recurrent step.
      Default: sigmoid (`sigmoid`).
      If you pass `None`, no activation is applied
      (ie. "linear" activation: `a(x) = x`).
    use_bias: Boolean, (default `True`), whether the layer uses a bias vector.
    kernel_initializer: Initializer for the `kernel` weights matrix,
      used for the linear transformation of the inputs. Default:
      `glorot_uniform`.
    recurrent_initializer: Initializer for the `recurrent_kernel`
       weights matrix, used for the linear transformation of the recurrent
       state. Default: `orthogonal`.
    bias_initializer: Initializer for the bias vector. Default: `zeros`.
    kernel_regularizer: Regularizer function applied to the `kernel` weights
      matrix. Default: `None`.
    recurrent_regularizer: Regularizer function applied to the
      `recurrent_kernel` weights matrix. Default: `None`.
    bias_regularizer: Regularizer function applied to the bias vector. Default:
      `None`.
    activity_regularizer: Regularizer function applied to the output of the
      layer (its "activation"). Default: `None`.
    kernel_constraint: Constraint function applied to the `kernel` weights
      matrix. Default: `None`.
    recurrent_constraint: Constraint function applied to the `recurrent_kernel`
      weights matrix. Default: `None`.
    bias_constraint: Constraint function applied to the bias vector. Default:
      `None`.
    dropout: Float between 0 and 1. Fraction of the units to drop for the linear
      transformation of the inputs. Default: 0.
    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for
      the linear transformation of the recurrent state. Default: 0.
    return_sequences: Boolean. Whether to return the last output
      in the output sequence, or the full sequence. Default: `False`.
    return_state: Boolean. Whether to return the last state in addition to the
      output. Default: `False`.
    go_backwards: Boolean (default `False`).
      If True, process the input sequence backwards and return the
      reversed sequence.
    stateful: Boolean (default False). If True, the last state
      for each sample at index i in a batch will be used as initial
      state for the sample of index i in the following batch.
    unroll: Boolean (default False).
      If True, the network will be unrolled,
      else a symbolic loop will be used.
      Unrolling can speed-up a RNN,
      although it tends to be more memory-intensive.
      Unrolling is only suitable for short sequences.
    time_major: The shape format of the `inputs` and `outputs` tensors.
      If True, the inputs and outputs will be in shape
      `[timesteps, batch, feature]`, whereas in the False case, it will be
      `[batch, timesteps, feature]`. Using `time_major = True` is a bit more
      efficient because it avoids transposes at the beginning and end of the
      RNN calculation. However, most TensorFlow data is batch-major, so by
      default this function accepts input and emits output in batch-major
      form.
    reset_after: GRU convention (whether to apply reset gate after or
      before matrix multiplication). False = "before",
      True = "after" (default and CuDNN compatible).

  Call arguments:
    inputs: A 3D tensor, with shape `[batch, timesteps, feature]`.
    mask: Binary tensor of shape `[samples, timesteps]` indicating whether
      a given timestep should be masked  (optional, defaults to `None`).
      An individual `True` entry indicates that the corresponding timestep
      should be utilized, while a `False` entry indicates that the
      corresponding timestep should be ignored.
    training: Python boolean indicating whether the layer should behave in
      training mode or in inference mode. This argument is passed to the cell
      when calling it. This is only relevant if `dropout` or
      `recurrent_dropout` is used  (optional, defaults to `None`).
    initial_state: List of initial state tensors to be passed to the first
      call of the cell  (optional, defaults to `None` which causes creation
      of zero-filled initial state tensors).
  """

  def __init__(self,
               units,
               activation='tanh',
               recurrent_activation='sigmoid',
               use_bias=True,
               kernel_initializer='glorot_uniform',
               recurrent_initializer='orthogonal',
               bias_initializer='zeros',
               kernel_regularizer=None,
               recurrent_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               recurrent_constraint=None,
               bias_constraint=None,
               dropout=0.,
               recurrent_dropout=0.,
               return_sequences=False,
               return_state=False,
               go_backwards=False,
               stateful=False,
               unroll=False,
               time_major=False,
               reset_after=True,
               **kwargs):
    # return_runtime is a flag for testing, which shows the real backend
    # implementation chosen by grappler in graph mode.
    self._return_runtime = kwargs.pop('return_runtime', False)

    super(GRU, self).__init__(
        units,
        activation=activation,
        recurrent_activation=recurrent_activation,
        use_bias=use_bias,
        kernel_initializer=kernel_initializer,
        recurrent_initializer=recurrent_initializer,
        bias_initializer=bias_initializer,
        kernel_regularizer=kernel_regularizer,
        recurrent_regularizer=recurrent_regularizer,
        bias_regularizer=bias_regularizer,
        activity_regularizer=activity_regularizer,
        kernel_constraint=kernel_constraint,
        recurrent_constraint=recurrent_constraint,
        bias_constraint=bias_constraint,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        implementation=kwargs.pop('implementation', 2),
        return_sequences=return_sequences,
        return_state=return_state,
        go_backwards=go_backwards,
        stateful=stateful,
        unroll=unroll,
        time_major=time_major,
        reset_after=reset_after,
        **kwargs)
    # GPU kernel uses following setting by default and not configurable.
    self._could_use_gpu_kernel = (
        self.activation in (activations.tanh, nn.tanh) and
        self.recurrent_activation in (activations.sigmoid, nn.sigmoid) and
        recurrent_dropout == 0 and not unroll and use_bias and
        reset_after and ops.executing_eagerly_outside_functions())
    if config.list_logical_devices('GPU'):
      # Only show the message when there is GPU available, user will not care
      # about the cuDNN if there isn't any GPU.
      if self._could_use_gpu_kernel:
        logging.debug(_CUDNN_AVAILABLE_MSG % self.name)
      else:
        logging.warning(_CUDNN_NOT_AVAILABLE_MSG % self.name)

    if _use_new_code():
      self._defun_wrapper = _DefunWrapper(time_major, go_backwards, 'gru')

  def call(self, inputs, mask=None, training=None, initial_state=None):
    """Runs the layer, picking the generic or the defun-based backend.

    The generic `backend.rnn` loop over `self.cell` is used when the input is
    ragged or when the layer configuration rules out the GPU kernel;
    otherwise the work is delegated to `_defun_gru_call`.

    Returns:
      `[output] + states` when `return_state` is set, `(output, runtime)` when
      the testing-only `return_runtime` flag is set, and `output` otherwise.
      `output` is the full sequence when `return_sequences` is set, else the
      last output.
    """
    # The input should be dense, padded with zeros. If a ragged input is fed
    # into the layer, it is padded and the row lengths are used for masking.
    inputs, row_lengths = backend.convert_inputs_if_ragged(inputs)
    is_ragged_input = (row_lengths is not None)
    self._validate_args_if_ragged(is_ragged_input, mask)

    # GRU does not support constants. Ignore it during process.
    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)

    if isinstance(mask, list):
      mask = mask[0]

    input_shape = backend.int_shape(inputs)
    timesteps = input_shape[0] if self.time_major else input_shape[1]

    # TODO(b/156447398) Investigate why the cuDNN kernel fails with ragged
    # inputs.
    if is_ragged_input or not self._could_use_gpu_kernel:
      kwargs = {'training': training}
      self._maybe_reset_cell_dropout_mask(self.cell)

      def step(cell_inputs, cell_states):
        return self.cell(cell_inputs, cell_states, **kwargs)

      last_output, outputs, states = backend.rnn(
          step,
          inputs,
          initial_state,
          constants=None,
          go_backwards=self.go_backwards,
          mask=mask,
          unroll=self.unroll,
          input_length=row_lengths if row_lengths is not None else timesteps,
          time_major=self.time_major,
          zero_output_for_mask=self.zero_output_for_mask)
      # This is a dummy tensor for testing purpose.
      runtime = _runtime(_RUNTIME_UNKNOWN)
    else:
      last_output, outputs, runtime, states = self._defun_gru_call(
          inputs, initial_state, training, mask, row_lengths)

    if self.stateful:
      # Carry the final state over to the next batch.
      updates = [state_ops.assign(self.states[0], states[0])]
      self.add_update(updates)

    if self.return_sequences:
      output = backend.maybe_convert_to_ragged(
          is_ragged_input, outputs, row_lengths, go_backwards=self.go_backwards)
    else:
      output = last_output

    if self.return_state:
      return [output] + list(states)
    elif self._return_runtime:
      return output, runtime
    else:
      return output

  def _defun_gru_call(self, inputs, initial_state, training, mask,
                      sequence_lengths):
    """Runs the GRU through one of the lifted-weight backend functions.

    Args:
      inputs: Input tensor of the layer; the input dropout mask, if any, is
        applied here before the backend function is called.
      initial_state: List whose first element is the initial hidden state.
      training: Forwarded to `get_dropout_mask_for_cell`.
      mask: Optional mask tensor forwarded to the backend function.
      sequence_lengths: Optional sequence lengths forwarded to the backend
        function.

    Returns:
      A tuple `(last_output, outputs, runtime, states)` where `states` is the
      single-element list `[new_h]`.
    """
    # Use the new defun approach for backend implementation swap.
    # Note that different implementations need to have same function
    # signature, eg, the tensor parameters need to have same shape and dtypes.

    self.reset_dropout_mask()
    dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=3)
    if dropout_mask is not None:
      inputs = inputs * dropout_mask[0]

    # Arguments accepted by every backend implementation.
    gpu_gru_kwargs = {
        'inputs': inputs,
        'init_h': _read_variable_value(initial_state[0]),
        'kernel': _read_variable_value(self.cell.kernel),
        'recurrent_kernel': _read_variable_value(self.cell.recurrent_kernel),
        'bias': _read_variable_value(self.cell.bias),
        'mask': mask,
        'time_major': self.time_major,
        'go_backwards': self.go_backwards,
        'sequence_lengths': sequence_lengths
    }
    # Every implementation except `gpu_gru` additionally takes
    # `zero_output_for_mask`.
    normal_gru_kwargs = dict(
        gpu_gru_kwargs, zero_output_for_mask=self.zero_output_for_mask)

    if _use_new_code():
      (last_output, outputs, new_h,
       runtime) = self._defun_wrapper.defun_layer(**normal_gru_kwargs)
    elif context.executing_eagerly():
      # Under eager context, check the device placement and prefer the GPU
      # implementation when GPU is available.
      device_type = _get_context_device_type()
      can_use_gpu = (
          # Either user specified GPU or unspecified but GPU is available.
          (device_type == _GPU_DEVICE_NAME or
           (device_type is None and config.list_logical_devices('GPU'))) and
          (mask is None or is_cudnn_supported_inputs(mask, self.time_major)))
      if can_use_gpu:
        last_output, outputs, new_h, runtime = gpu_gru(**gpu_gru_kwargs)
      else:
        last_output, outputs, new_h, runtime = standard_gru(
            **normal_gru_kwargs)
    else:
      last_output, outputs, new_h, runtime = gru_with_backend_selection(
          **normal_gru_kwargs)

    states = [new_h]
    return last_output, outputs, runtime, states
539
540
def standard_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask,
                 time_major, go_backwards, sequence_lengths,
                 zero_output_for_mask):
  """GRU with standard kernel implementation.

  This implementation can be run on all types of hardware.

  All layer weights are lifted out and passed in as function parameters, so
  the function takes the same tensor inputs as its CuDNN counterpart. The
  per-step logic is simplified accordingly: there is no dropout handling
  inside the step function.

  Args:
    inputs: Input tensor of GRU layer.
    init_h: Initial state tensor for the cell output.
    kernel: Weights for cell kernel.
    recurrent_kernel: Weights for cell recurrent kernel.
    bias: Weights for cell kernel bias and recurrent bias. It is unstacked
      into the input bias and the recurrent bias.
    mask: Binary tensor of shape `(samples, timesteps)` indicating whether
      a given timestep should be masked. An individual `True` entry indicates
      that the corresponding timestep should be utilized, while a `False`
      entry indicates that the corresponding timestep should be ignored.
    time_major: Boolean, whether the inputs are in the format of
      [time, batch, feature] or [batch, time, feature].
    go_backwards: Boolean (default False). If True, process the input sequence
      backwards and return the reversed sequence.
    sequence_lengths: The lengths of all sequences coming from a variable
      length input, such as ragged tensors. If the input has a fixed timestep
      size, this should be None.
    zero_output_for_mask: Boolean, whether to output zero for masked timestep.

  Returns:
    last_output: output tensor for the last timestep, which has shape
      [batch, units].
    outputs: output tensor for all timesteps, which has shape
      [batch, time, units].
    state_0: the cell output, which has same shape as init_h.
    runtime: the tensor returned by `_runtime(_RUNTIME_CPU)`, marking that
      this generic implementation was executed. This value exists for testing
      purposes and is not meant to be consumed by users.
  """
  time_axis = 0 if time_major else 1
  timesteps = backend.int_shape(inputs)[time_axis]

  input_bias, recurrent_bias = array_ops.unstack(bias)

  def step(cell_inputs, cell_states):
    """Step function that will be used by Keras RNN backend."""
    prev_h = cell_states[0]

    # Project the inputs through all three gate matrices at once.
    gates_x = backend.bias_add(backend.dot(cell_inputs, kernel), input_bias)
    x_z, x_r, x_h = array_ops.split(gates_x, 3, axis=1)

    # Project the previous hidden state through all gate matrices at once.
    gates_h = backend.bias_add(
        backend.dot(prev_h, recurrent_kernel), recurrent_bias)
    h_z, h_r, h_h = array_ops.split(gates_h, 3, axis=1)

    update_gate = nn.sigmoid(x_z + h_z)
    reset_gate = nn.sigmoid(x_r + h_r)
    # The reset gate is applied after the recurrent matrix multiplication.
    candidate = nn.tanh(x_h + reset_gate * h_h)

    # Blend the previous state and the candidate using the update gate.
    new_h = update_gate * prev_h + (1 - update_gate) * candidate
    return new_h, [new_h]

  # Explicit sequence lengths take precedence over the static timestep count.
  input_length = timesteps if sequence_lengths is None else sequence_lengths

  last_output, outputs, new_states = backend.rnn(
      step,
      inputs, [init_h],
      constants=None,
      unroll=False,
      time_major=time_major,
      mask=mask,
      go_backwards=go_backwards,
      input_length=input_length,
      zero_output_for_mask=zero_output_for_mask)
  return last_output, outputs, new_states[0], _runtime(_RUNTIME_CPU)
623
624
def gpu_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major,
            go_backwards, sequence_lengths):
  """GRU with CuDNN implementation which is only available for GPU.

  Two cuDNN ops are used. Fixed-length input (neither `mask` nor
  `sequence_lengths` given) goes through `CudnnRNN`, which is always fed
  time-major data here, so batch-major input is transposed on the way in and
  the outputs are transposed back. Variable-length input goes through
  `CudnnRNNV3`, which receives the `time_major` flag itself, so the data
  is left in its original layout.

  Args:
    inputs: Input tensor of GRU layer.
    init_h: Initial state tensor for the cell output.
    kernel: Weights for cell kernel.
    recurrent_kernel: Weights for cell recurrent kernel.
    bias: Weights for cell kernel bias and recurrent bias; it is flattened
      and split into six pieces before being handed to cuDNN.
    mask: Optional mask tensor. When given, it is converted to sequence
      lengths via `calculate_sequence_by_mask` and overrides
      `sequence_lengths`.
    time_major: Boolean, whether the inputs are in the format of
      [time, batch, feature] or [batch, time, feature].
    go_backwards: Boolean. If True, process the input sequence backwards.
    sequence_lengths: Optional lengths of all sequences for variable length
      input. If the input has a fixed timestep size, this should be None.

  Returns:
    A tuple `(last_output, outputs, h, runtime)`, where `runtime` is the
    tensor returned by `_runtime(_RUNTIME_GPU)`.
  """
  # Whether the variable-length (CudnnRNNV3) path will be taken. The layout
  # decisions below must follow this rather than `mask` alone, otherwise a
  # call with `sequence_lengths` but no mask would transpose batch-major
  # inputs to time-major while still telling CudnnRNNV3 `time_major=False`.
  variable_length = mask is not None or sequence_lengths is not None

  if not time_major and not variable_length:
    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
    seq_axis, batch_axis = (0, 1)
  else:
    seq_axis, batch_axis = (0, 1) if time_major else (1, 0)
  # For init_h, cuDNN expects one more dim of num_layers before or after batch
  # dim for time major or batch major inputs respectively
  init_h = array_ops.expand_dims(init_h, axis=seq_axis)

  weights = array_ops.split(kernel, 3, axis=1)
  weights += array_ops.split(recurrent_kernel, 3, axis=1)
  # Note that the bias was initialized as shape (2, 3 * units), flat it into
  # (6 * units)
  bias = array_ops.split(backend.flatten(bias), 6)

  if sysconfig.get_build_info()['is_cuda_build']:
    # Note that the gate order for CuDNN is different from the canonical format.
    # canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap need
    # to be done for kernel, recurrent_kernel, input_bias, recurrent_bias.
    # z is update gate weights.
    # r is reset gate weights.
    # h is output gate weights.
    weights[0], weights[1] = weights[1], weights[0]
    weights[3], weights[4] = weights[4], weights[3]
    bias[0], bias[1] = bias[1], bias[0]
    bias[3], bias[4] = bias[4], bias[3]

  params = _canonical_to_params(
      weights=weights,
      biases=bias,
      shape=constant_op.constant([-1]),
      transpose_weights=True)

  if mask is not None:
    sequence_lengths = calculate_sequence_by_mask(mask, time_major)

  if sequence_lengths is not None:
    if go_backwards:
      # Three reversals are required. E.g.,
      # normal input = [1, 2, 3, 0, 0]  # where 0 need to be masked
      # reversed_input_to_cudnn = [3, 2, 1, 0, 0]
      # output_from_cudnn = [6, 5, 4, 0, 0]
      # expected_output = [0, 0, 6, 5 ,4]
      inputs = array_ops.reverse_sequence_v2(
          inputs, sequence_lengths, seq_axis=seq_axis, batch_axis=batch_axis)
    outputs, h, _, _, _ = gen_cudnn_rnn_ops.CudnnRNNV3(
        input=inputs,
        input_h=init_h,
        input_c=0,
        params=params,
        is_training=True,
        rnn_mode='gru',
        sequence_lengths=sequence_lengths,
        time_major=time_major)
    if go_backwards:
      outputs = array_ops.reverse_sequence_v2(
          outputs, sequence_lengths, seq_axis=seq_axis, batch_axis=batch_axis)
      outputs = array_ops.reverse(outputs, axis=[seq_axis])
  else:
    if go_backwards:
      # Reverse axis 0 since the input is already convert to time major.
      inputs = array_ops.reverse(inputs, axis=[0])
    outputs, h, _, _ = gen_cudnn_rnn_ops.CudnnRNN(
        input=inputs, input_h=init_h, input_c=0, params=params,
        is_training=True, rnn_mode='gru')

  last_output = outputs[-1]
  # Undo the time-major transpose, which was only applied on the
  # fixed-length path.
  if not time_major and not variable_length:
    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
  h = array_ops.squeeze(h, axis=seq_axis)

  # In the case of variable length input, the cudnn kernel will fill zeros for
  # the output, whereas the default keras behavior is to bring over the previous
  # output for t-1, so that in the return_sequence=False case, user can quickly
  # get the final effect output instead just 0s at the last timestep.
  # In order to mimic the default keras behavior, we copy the final h state as
  # the last_output, since it is numerically same as the output.
  if variable_length:
    last_output = h

  return last_output, outputs, h, _runtime(_RUNTIME_GPU)
709
710
def gru_with_backend_selection(inputs, init_h, kernel, recurrent_kernel, bias,
                               mask, time_major, go_backwards, sequence_lengths,
                               zero_output_for_mask):
  """Runs a GRU while letting the runtime pick the generic or CuDNN kernel.

  Two implementations are made available here: `standard_gru`, which is
  associated with the CPU device name (and used as the default), and a GPU
  variant that calls `gpu_gru` and falls back to `standard_gru` when the mask
  is not supported by the CuDNN kernel.

  If `_use_new_code()` is true, the choice between the two is delegated to
  `control_flow_ops.execute_fn_for_device`. Otherwise both implementations are
  wrapped with `_generate_defun_backend` under one shared, unique API name;
  the standard version is called with the GRU parameters, while the GPU
  version is not called but only registered in the graph, so that Grappler can
  rewrite the graph and swap in the optimized function based on device
  placement.

  Args:
    inputs: Input tensor of GRU layer.
    init_h: Initial state tensor for the cell output.
    kernel: Weights for cell kernel.
    recurrent_kernel: Weights for cell recurrent kernel.
    bias: Bias weights; forwarded unchanged to whichever implementation runs.
    mask: Boolean tensor used to mask out steps within the sequence. A `True`
      entry means the corresponding timestep should be utilized, a `False`
      entry means it should be ignored.
    time_major: Boolean, whether the inputs are laid out as
      [time, batch, feature] or [batch, time, feature].
    go_backwards: Boolean (default False). If True, process the input sequence
      backwards and return the reversed sequence.
    sequence_lengths: The lengths of all sequences coming from a variable
      length input, such as ragged tensors. Should be None when the input has
      a fixed timestep size.
    zero_output_for_mask: Boolean, whether to output zero for masked timestep.
      Only forwarded to `standard_gru`; `gpu_gru` does not receive it.

  Returns:
    A tuple `(last_output, outputs, new_h, runtime)` as produced by the
    selected implementation.
  """
  params = dict(
      inputs=inputs,
      init_h=init_h,
      kernel=kernel,
      recurrent_kernel=recurrent_kernel,
      bias=bias,
      mask=mask,
      time_major=time_major,
      go_backwards=go_backwards,
      sequence_lengths=sequence_lengths,
      zero_output_for_mask=zero_output_for_mask)

  def gpu_gru_with_fallback(inputs, init_h, kernel, recurrent_kernel, bias,
                            mask, time_major, go_backwards, sequence_lengths,
                            zero_output_for_mask):
    """Use CuDNN kernel when mask is none or strictly right padded."""
    # Arguments accepted by both `gpu_gru` and `standard_gru`.
    shared_kwargs = dict(
        inputs=inputs,
        init_h=init_h,
        kernel=kernel,
        recurrent_kernel=recurrent_kernel,
        bias=bias,
        mask=mask,
        time_major=time_major,
        go_backwards=go_backwards,
        sequence_lengths=sequence_lengths)

    def cudnn_gru_fn():
      return gpu_gru(**shared_kwargs)

    # Without a mask there is nothing to validate: go straight to CuDNN.
    if mask is None:
      return cudnn_gru_fn()

    def standard_gru_fn():
      return standard_gru(
          zero_output_for_mask=zero_output_for_mask, **shared_kwargs)

    # With a mask, decide at runtime whether the CuDNN kernel can handle it.
    return control_flow_ops.cond(
        is_cudnn_supported_inputs(mask, time_major),
        true_fn=cudnn_gru_fn,
        false_fn=standard_gru_fn)

  if _use_new_code():
    # Chooses the implementation dynamically based on the running device.
    per_device_fns = {
        _CPU_DEVICE_NAME: lambda: standard_gru(**params),
        _GPU_DEVICE_NAME: lambda: gpu_gru_with_fallback(**params),
    }
    last_output, outputs, new_h, runtime = (
        control_flow_ops.execute_fn_for_device(
            per_device_fns, lambda: standard_gru(**params)))
    return last_output, outputs, new_h, runtime

  # Every invocation gets its own API name so that Grappler can pair up the
  # CPU/GPU implementations of this particular GRU, even when several GRU
  # layers live in the same graph.
  api_name = 'gru_' + str(uuid.uuid4())
  supportive_attribute = {
      'time_major': time_major,
      'go_backwards': go_backwards,
  }
  defun_standard_gru = _generate_defun_backend(
      api_name, _CPU_DEVICE_NAME, standard_gru, supportive_attribute)
  defun_gpu_gru = _generate_defun_backend(
      api_name, _GPU_DEVICE_NAME, gpu_gru_with_fallback, supportive_attribute)

  # Call the normal GRU impl and register the CuDNN impl function. The
  # grappler will kick in during session execution to optimize the graph.
  last_output, outputs, new_h, runtime = defun_standard_gru(**params)
  _function_register(defun_gpu_gru, **params)

  return last_output, outputs, new_h, runtime
838
839
@keras_export('keras.layers.LSTMCell', v1=[])
class LSTMCell(recurrent.LSTMCell):
  """Single-step cell used by the LSTM layer.

  See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn)
  for details about the usage of RNN API.

  A cell handles exactly one step of the time sequence input; processing the
  whole sequence is the job of `tf.keras.layers.LSTM` (or of a
  `tf.keras.layers.RNN` wrapping this cell).

  All constructor arguments are forwarded to `recurrent.LSTMCell`. The only
  thing this subclass adds is that `implementation=2` is passed to the parent
  unless the caller supplies `implementation` explicitly.

  For example:

  >>> inputs = tf.random.normal([32, 10, 8])
  >>> rnn = tf.keras.layers.RNN(tf.keras.layers.LSTMCell(4))
  >>> output = rnn(inputs)
  >>> print(output.shape)
  (32, 4)
  >>> rnn = tf.keras.layers.RNN(
  ...    tf.keras.layers.LSTMCell(4),
  ...    return_sequences=True,
  ...    return_state=True)
  >>> whole_seq_output, final_memory_state, final_carry_state = rnn(inputs)
  >>> print(whole_seq_output.shape)
  (32, 10, 4)
  >>> print(final_memory_state.shape)
  (32, 4)
  >>> print(final_carry_state.shape)
  (32, 4)

  Args:
    units: Positive integer, dimensionality of the output space.
    activation: Activation function to use. Default: hyperbolic tangent
      (`tanh`). Passing `None` applies no activation (i.e. "linear"
      activation: `a(x) = x`).
    recurrent_activation: Activation function to use for the recurrent step.
      Default: sigmoid (`sigmoid`). Passing `None` applies no activation
      (i.e. "linear" activation: `a(x) = x`).
    use_bias: Boolean (default `True`), whether the layer uses a bias vector.
    kernel_initializer: Initializer for the `kernel` weights matrix, used for
      the linear transformation of the inputs. Default: `glorot_uniform`.
    recurrent_initializer: Initializer for the `recurrent_kernel` weights
      matrix, used for the linear transformation of the recurrent state.
      Default: `orthogonal`.
    bias_initializer: Initializer for the bias vector. Default: `zeros`.
    unit_forget_bias: Boolean (default `True`). If True, add 1 to the bias of
      the forget gate at initialization. Setting it to true will also force
      `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
        al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
    kernel_regularizer: Regularizer function applied to the `kernel` weights
      matrix. Default: `None`.
    recurrent_regularizer: Regularizer function applied to the
      `recurrent_kernel` weights matrix. Default: `None`.
    bias_regularizer: Regularizer function applied to the bias vector.
      Default: `None`.
    kernel_constraint: Constraint function applied to the `kernel` weights
      matrix. Default: `None`.
    recurrent_constraint: Constraint function applied to the
      `recurrent_kernel` weights matrix. Default: `None`.
    bias_constraint: Constraint function applied to the bias vector. Default:
      `None`.
    dropout: Float between 0 and 1. Fraction of the units to drop for the
      linear transformation of the inputs. Default: 0.
    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop
      for the linear transformation of the recurrent state. Default: 0.

  Call arguments:
    inputs: A 2D tensor, with shape of `[batch, feature]`.
    states: List of 2 tensors corresponding to the cell's units. Both have
      shape `[batch, units]`; the first is the memory state from the previous
      time step, the second is the carry state from the previous time step.
      For timestep 0, the initial state provided by the user is fed to the
      cell.
    training: Python boolean indicating whether the layer should behave in
      training mode or in inference mode. Only relevant when `dropout` or
      `recurrent_dropout` is used.
  """

  def __init__(self,
               units,
               activation='tanh',
               recurrent_activation='sigmoid',
               use_bias=True,
               kernel_initializer='glorot_uniform',
               recurrent_initializer='orthogonal',
               bias_initializer='zeros',
               unit_forget_bias=True,
               kernel_regularizer=None,
               recurrent_regularizer=None,
               bias_regularizer=None,
               kernel_constraint=None,
               recurrent_constraint=None,
               bias_constraint=None,
               dropout=0.,
               recurrent_dropout=0.,
               **kwargs):
    # Take `implementation` out of kwargs so it is passed exactly once;
    # it falls back to 2 when the caller did not specify it.
    implementation = kwargs.pop('implementation', 2)
    super().__init__(
        units,
        activation=activation,
        recurrent_activation=recurrent_activation,
        use_bias=use_bias,
        kernel_initializer=kernel_initializer,
        recurrent_initializer=recurrent_initializer,
        bias_initializer=bias_initializer,
        unit_forget_bias=unit_forget_bias,
        kernel_regularizer=kernel_regularizer,
        recurrent_regularizer=recurrent_regularizer,
        bias_regularizer=bias_regularizer,
        kernel_constraint=kernel_constraint,
        recurrent_constraint=recurrent_constraint,
        bias_constraint=bias_constraint,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        implementation=implementation,
        **kwargs)
954
955
@keras_export('keras.layers.LSTM', v1=[])
class LSTM(recurrent.DropoutRNNCellMixin, recurrent.LSTM):
  """Long Short-Term Memory layer - Hochreiter 1997.

  See [the Keras RNN API guide](https://www.tensorflow.org/guide/keras/rnn)
  for details about the usage of RNN API.

  Depending on the available runtime hardware and on the layer configuration,
  this layer picks between a cuDNN-based and a pure-TensorFlow implementation
  to maximize performance. When a GPU is available and every requirement of
  the cuDNN kernel listed below is met, the fast cuDNN implementation is used.

  The requirements to use the cuDNN implementation are:

  1. `activation` == `tanh`
  2. `recurrent_activation` == `sigmoid`
  3. `recurrent_dropout` == 0
  4. `unroll` is `False`
  5. `use_bias` is `True`
  6. Inputs, if use masking, are strictly right-padded.
  7. Eager execution is enabled in the outermost context.

  Ragged inputs always take the generic (non-cuDNN) code path.

  For example:

  >>> inputs = tf.random.normal([32, 10, 8])
  >>> lstm = tf.keras.layers.LSTM(4)
  >>> output = lstm(inputs)
  >>> print(output.shape)
  (32, 4)
  >>> lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True)
  >>> whole_seq_output, final_memory_state, final_carry_state = lstm(inputs)
  >>> print(whole_seq_output.shape)
  (32, 10, 4)
  >>> print(final_memory_state.shape)
  (32, 4)
  >>> print(final_carry_state.shape)
  (32, 4)

  Args:
    units: Positive integer, dimensionality of the output space.
    activation: Activation function to use.
      Default: hyperbolic tangent (`tanh`). Passing `None` applies no
      activation (i.e. "linear" activation: `a(x) = x`).
    recurrent_activation: Activation function to use for the recurrent step.
      Default: sigmoid (`sigmoid`). Passing `None` applies no activation
      (i.e. "linear" activation: `a(x) = x`).
    use_bias: Boolean (default `True`), whether the layer uses a bias vector.
    kernel_initializer: Initializer for the `kernel` weights matrix, used for
      the linear transformation of the inputs. Default: `glorot_uniform`.
    recurrent_initializer: Initializer for the `recurrent_kernel` weights
      matrix, used for the linear transformation of the recurrent state.
      Default: `orthogonal`.
    bias_initializer: Initializer for the bias vector. Default: `zeros`.
    unit_forget_bias: Boolean (default `True`). If True, add 1 to the bias of
      the forget gate at initialization. Setting it to true will also force
      `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
          al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf).
    kernel_regularizer: Regularizer function applied to the `kernel` weights
      matrix. Default: `None`.
    recurrent_regularizer: Regularizer function applied to the
      `recurrent_kernel` weights matrix. Default: `None`.
    bias_regularizer: Regularizer function applied to the bias vector.
      Default: `None`.
    activity_regularizer: Regularizer function applied to the output of the
      layer (its "activation"). Default: `None`.
    kernel_constraint: Constraint function applied to the `kernel` weights
      matrix. Default: `None`.
    recurrent_constraint: Constraint function applied to the
      `recurrent_kernel` weights matrix. Default: `None`.
    bias_constraint: Constraint function applied to the bias vector. Default:
      `None`.
    dropout: Float between 0 and 1. Fraction of the units to drop for the
      linear transformation of the inputs. Default: 0.
    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop
      for the linear transformation of the recurrent state. Default: 0.
    return_sequences: Boolean. Whether to return the last output in the
      output sequence, or the full sequence. Default: `False`.
    return_state: Boolean. Whether to return the last state in addition to
      the output. Default: `False`.
    go_backwards: Boolean (default `False`). If True, process the input
      sequence backwards and return the reversed sequence.
    stateful: Boolean (default `False`). If True, the last state for each
      sample at index i in a batch will be used as initial state for the
      sample of index i in the following batch.
    time_major: The shape format of the `inputs` and `outputs` tensors.
      If True, the inputs and outputs will be in shape
      `[timesteps, batch, feature]`, whereas in the False case, it will be
      `[batch, timesteps, feature]`. Using `time_major = True` is a bit more
      efficient because it avoids transposes at the beginning and end of the
      RNN calculation. However, most TensorFlow data is batch-major, so by
      default this function accepts input and emits output in batch-major
      form.
    unroll: Boolean (default `False`). If True, the network will be unrolled,
      else a symbolic loop will be used. Unrolling can speed-up a RNN,
      although it tends to be more memory-intensive. Unrolling is only
      suitable for short sequences.

  Call arguments:
    inputs: A 3D tensor with shape `[batch, timesteps, feature]`.
    mask: Binary tensor of shape `[batch, timesteps]` indicating whether
      a given timestep should be masked (optional, defaults to `None`).
      An individual `True` entry indicates that the corresponding timestep
      should be utilized, while a `False` entry indicates that the
      corresponding timestep should be ignored.
    training: Python boolean indicating whether the layer should behave in
      training mode or in inference mode. This argument is passed to the cell
      when calling it. This is only relevant if `dropout` or
      `recurrent_dropout` is used (optional, defaults to `None`).
    initial_state: List of initial state tensors to be passed to the first
      call of the cell (optional, defaults to `None` which causes creation
      of zero-filled initial state tensors).
  """

  def __init__(self,
               units,
               activation='tanh',
               recurrent_activation='sigmoid',
               use_bias=True,
               kernel_initializer='glorot_uniform',
               recurrent_initializer='orthogonal',
               bias_initializer='zeros',
               unit_forget_bias=True,
               kernel_regularizer=None,
               recurrent_regularizer=None,
               bias_regularizer=None,
               activity_regularizer=None,
               kernel_constraint=None,
               recurrent_constraint=None,
               bias_constraint=None,
               dropout=0.,
               recurrent_dropout=0.,
               return_sequences=False,
               return_state=False,
               go_backwards=False,
               stateful=False,
               time_major=False,
               unroll=False,
               **kwargs):
    # Testing-only flag: when set (and `return_state` is not), `call` also
    # returns the runtime marker of the backend implementation that was used.
    self.return_runtime = kwargs.pop('return_runtime', False)
    # `implementation` falls back to 2 unless the caller provides it.
    implementation = kwargs.pop('implementation', 2)

    super().__init__(
        units,
        activation=activation,
        recurrent_activation=recurrent_activation,
        use_bias=use_bias,
        kernel_initializer=kernel_initializer,
        recurrent_initializer=recurrent_initializer,
        bias_initializer=bias_initializer,
        unit_forget_bias=unit_forget_bias,
        kernel_regularizer=kernel_regularizer,
        recurrent_regularizer=recurrent_regularizer,
        bias_regularizer=bias_regularizer,
        activity_regularizer=activity_regularizer,
        kernel_constraint=kernel_constraint,
        recurrent_constraint=recurrent_constraint,
        bias_constraint=bias_constraint,
        dropout=dropout,
        recurrent_dropout=recurrent_dropout,
        implementation=implementation,
        return_sequences=return_sequences,
        return_state=return_state,
        go_backwards=go_backwards,
        stateful=stateful,
        time_major=time_major,
        unroll=unroll,
        **kwargs)

    # Two states (h and c), each of shape (batch, units).
    self.state_spec = [
        InputSpec(shape=(None, self.units)),
        InputSpec(shape=(None, self.units)),
    ]
    # The fused GPU kernel is only an option for the default activations,
    # no recurrent dropout, no unrolling, a bias, and eager execution in the
    # outermost context.
    self._could_use_gpu_kernel = (
        self.activation in (activations.tanh, nn.tanh) and
        self.recurrent_activation in (activations.sigmoid, nn.sigmoid) and
        recurrent_dropout == 0 and not unroll and use_bias and
        ops.executing_eagerly_outside_functions())
    # Only report cuDNN availability when a GPU actually exists; without one
    # the user has no reason to care.
    if config.list_logical_devices('GPU'):
      if not self._could_use_gpu_kernel:
        logging.warning(_CUDNN_NOT_AVAILABLE_MSG % self.name)
      else:
        logging.debug(_CUDNN_AVAILABLE_MSG % self.name)

    if _use_new_code():
      self._defun_wrapper = _DefunWrapper(time_major, go_backwards, 'lstm')

  def call(self, inputs, mask=None, training=None, initial_state=None):
    # Ragged inputs are converted to dense, zero-padded tensors; their row
    # lengths are then used for masking.
    inputs, row_lengths = backend.convert_inputs_if_ragged(inputs)
    is_ragged_input = row_lengths is not None
    self._validate_args_if_ragged(is_ragged_input, mask)

    # LSTM does not support constants, so None is passed and the returned
    # constants are ignored.
    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)

    if isinstance(mask, list):
      mask = mask[0]

    static_shape = backend.int_shape(inputs)
    timesteps = static_shape[0 if self.time_major else 1]

    # TODO(b/156447398) Investigate why the cuDNN kernel fails with ragged
    # inputs.
    if is_ragged_input or not self._could_use_gpu_kernel:
      # Generic path: drive the cell step by step through `backend.rnn`.
      self._maybe_reset_cell_dropout_mask(self.cell)

      def step(inputs, states):
        return self.cell(inputs, states, training=training)

      last_output, outputs, states = backend.rnn(
          step,
          inputs,
          initial_state,
          constants=None,
          go_backwards=self.go_backwards,
          mask=mask,
          unroll=self.unroll,
          input_length=row_lengths if is_ragged_input else timesteps,
          time_major=self.time_major,
          zero_output_for_mask=self.zero_output_for_mask)
      runtime = _runtime(_RUNTIME_UNKNOWN)
    else:
      # Backend-swappable path. The interchangeable implementations must
      # share the same function signature (same tensor parameters with the
      # same shapes and dtypes), so the same set of tensors, bias included,
      # is handed to every implementation.
      self.reset_dropout_mask()
      dropout_mask = self.get_dropout_mask_for_cell(inputs, training, count=4)
      if dropout_mask is not None:
        inputs = inputs * dropout_mask[0]

      use_new_code = _use_new_code()
      # Arguments understood by `gpu_lstm`.
      cudnn_kwargs = {
          'inputs': inputs,
          'init_h': _read_variable_value(initial_state[0]),
          'init_c': _read_variable_value(initial_state[1]),
          'kernel': _read_variable_value(self.cell.kernel),
          'recurrent_kernel': _read_variable_value(self.cell.recurrent_kernel),
          'bias': _read_variable_value(self.cell.bias),
          'mask': mask,
          'time_major': self.time_major,
          'go_backwards': self.go_backwards,
          'sequence_lengths': row_lengths,
      }
      # Every other implementation additionally takes `zero_output_for_mask`.
      full_kwargs = dict(
          cudnn_kwargs, zero_output_for_mask=self.zero_output_for_mask)

      if use_new_code:
        (last_output, outputs, new_h, new_c,
         runtime) = self._defun_wrapper.defun_layer(**full_kwargs)
      elif context.executing_eagerly():
        # Under eager context, check the device placement and prefer the
        # GPU implementation when GPU is available.
        device_type = _get_context_device_type()
        # Either user specified GPU or unspecified but GPU is available.
        gpu_selected = (
            device_type == _GPU_DEVICE_NAME or
            (device_type is None and config.list_logical_devices('GPU')))
        can_use_gpu = gpu_selected and (
            mask is None or is_cudnn_supported_inputs(mask, self.time_major))
        if can_use_gpu:
          results = gpu_lstm(**cudnn_kwargs)
        else:
          results = standard_lstm(**full_kwargs)
        last_output, outputs, new_h, new_c, runtime = results
      else:
        (last_output, outputs, new_h, new_c,
         runtime) = lstm_with_backend_selection(**full_kwargs)

      states = [new_h, new_c]

    if self.stateful:
      # Carry the final states over into the layer's state variables.
      updates = [
          state_ops.assign(state_var, new_value)
          for state_var, new_value in zip(self.states, states)
      ]
      self.add_update(updates)

    if self.return_sequences:
      output = backend.maybe_convert_to_ragged(
          is_ragged_input, outputs, row_lengths, go_backwards=self.go_backwards)
    else:
      output = last_output

    # `return_state` takes precedence over `return_runtime`.
    if self.return_state:
      return [output] + list(states)
    if self.return_runtime:
      return output, runtime
    return output
1289
1290
def _canonical_to_params(weights, biases, shape, transpose_weights=False):
  """Packs weights and biases into a single tensor usable as CuDNN params.

  Each weight is optionally transposed and then reshaped to `shape`; each bias
  is reshaped to `shape` as well. The reshaped weights, followed by the
  reshaped biases, are concatenated along axis 0.

  Keras and CuDNN do not store kernels in the same layout. If the input
  weights need to be converted to a unified format, set
  `transpose_weights=True` so that every weight is transposed before it is
  reshaped.

  Args:
    weights: list of weights for the individual kernels and recurrent kernels.
    biases: list of biases for individual gate.
    shape: the shape for the converted variables that will be fed to CuDNN.
    transpose_weights: boolean, whether to transpose the weights.

  Returns:
    The converted weights that can be fed to CuDNN ops as param.
  """
  flattened = []
  for weight in weights:
    if transpose_weights:
      weight = array_ops.transpose(weight)
    flattened.append(array_ops.reshape(weight, shape))
  # Biases are never transposed, only reshaped; they follow all the weights.
  for bias in biases:
    flattened.append(array_ops.reshape(bias, shape))
  return array_ops.concat(flattened, axis=0)
1320
1321
def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
                  mask, time_major, go_backwards, sequence_lengths,
                  zero_output_for_mask):
  """LSTM built from standard ops, driven by the Keras `backend.rnn` loop.

  This implementation can be run on all types of hardware.

  All layer weights are lifted out and passed in as function parameters, so
  that this function takes the same tensor inputs as its CuDNN counterpart.
  The step logic is simplified: it uses fixed sigmoid/tanh activations and
  applies no dropout inside the step. The whole `bias` tensor is added to the
  combined gate pre-activations before they are split into four gates.

  Args:
    inputs: input tensor of LSTM layer.
    init_h: initial state tensor for the cell output.
    init_c: initial state tensor for the cell hidden state.
    kernel: weights for cell kernel.
    recurrent_kernel: weights for cell recurrent kernel.
    bias: bias added to the sum of the input and recurrent projections.
    mask: Boolean tensor for mask out the steps within sequence.
      An individual `True` entry indicates that the corresponding timestep
      should be utilized, while a `False` entry indicates that the
      corresponding timestep should be ignored.
    time_major: boolean, whether the inputs are in the format of
      [time, batch, feature] or [batch, time, feature].
    go_backwards: Boolean (default False). If True, process the input sequence
      backwards and return the reversed sequence.
    sequence_lengths: The lengths of all sequences coming from a variable
      length input, such as ragged tensors. If the input has a fixed timestep
      size, this should be None; the static timestep dimension of `inputs` is
      then used as the input length.
    zero_output_for_mask: Boolean, whether to output zero for masked timestep.

  Returns:
    last_output: output tensor for the last timestep.
    outputs: output tensor for all timesteps.
    state_0: the cell output, which has same shape as init_h.
    state_1: the cell hidden state, which has same shape as init_c.
    runtime: constant string tensor which indicate real runtime hardware. This
      value is for testing purpose and should not be used by user.
  """
  static_shape = backend.int_shape(inputs)
  timesteps = static_shape[0 if time_major else 1]
  input_length = timesteps if sequence_lengths is None else sequence_lengths

  def step(cell_inputs, cell_states):
    """Step function that will be used by Keras RNN backend."""
    prev_h = cell_states[0]  # previous memory state
    prev_c = cell_states[1]  # previous carry state

    # Joint pre-activation of all four gates.
    gates = (backend.dot(cell_inputs, kernel) +
             backend.dot(prev_h, recurrent_kernel))
    gates = backend.bias_add(gates, bias)

    gate_i, gate_f, gate_c, gate_o = array_ops.split(gates, 4, axis=1)

    input_gate = nn.sigmoid(gate_i)
    forget_gate = nn.sigmoid(gate_f)
    next_c = forget_gate * prev_c + input_gate * nn.tanh(gate_c)
    output_gate = nn.sigmoid(gate_o)

    next_h = output_gate * nn.tanh(next_c)
    return next_h, [next_h, next_c]

  last_output, outputs, final_states = backend.rnn(
      step,
      inputs,
      [init_h, init_c],
      constants=None,
      unroll=False,
      time_major=time_major,
      mask=mask,
      go_backwards=go_backwards,
      input_length=input_length,
      zero_output_for_mask=zero_output_for_mask)
  final_h, final_c = final_states[0], final_states[1]
  return last_output, outputs, final_h, final_c, _runtime(_RUNTIME_CPU)
1405
1406
def gpu_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask,
             time_major, go_backwards, sequence_lengths):
  """LSTM with either CuDNN or ROCm implementation, only available for GPU.

  Note that currently only right padded data is supported, or the result will be
  polluted by the unmasked data which should be filtered.

  Two kernels are used:
    * `CudnnRNN` when every sequence has the same length (no mask and no
      `sequence_lengths`). It is always fed time major data, so batch major
      inputs are transposed before the call and the outputs transposed back.
    * `CudnnRNNV3` when per-sample lengths are known (from `mask` or from
      `sequence_lengths`). It receives the `time_major` flag, so the inputs
      are passed through in their original layout.

  Args:
    inputs: Input tensor of LSTM layer.
    init_h: Initial state tensor for the cell output.
    init_c: Initial state tensor for the cell hidden state.
    kernel: Weights for cell kernel.
    recurrent_kernel: Weights for cell recurrent kernel.
    bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias
      is used in this case.
    mask: Boolean tensor for mask out the steps within sequence.
      An individual `True` entry indicates that the corresponding timestep
      should be utilized, while a `False` entry indicates that the corresponding
      timestep should be ignored. When given, it overrides `sequence_lengths`.
    time_major: Boolean, whether the inputs are in the format of [time, batch,
      feature] or [batch, time, feature].
    go_backwards: Boolean (default False). If True, process the input sequence
      backwards and return the reversed sequence.
    sequence_lengths: The lengths of all sequences coming from a variable length
      input, such as ragged tensors. If the input has a fixed timestep size,
      this should be None.

  Returns:
    last_output: Output tensor for the last timestep, which has shape
      [batch, units].
    outputs: Output tensor for all timesteps, which has shape
      [batch, time, units].
    state_0: The cell output, which has same shape as init_h.
    state_1: The cell hidden state, which has same shape as init_c.
    runtime: Constant tensor produced by `_runtime(_RUNTIME_GPU)` which
      indicates the real runtime hardware. This value is for testing purpose
      and should not be used by user.
  """
  # A mask is converted to per-sample lengths first, so that every later
  # decision depends on a single question: "are the lengths variable?".
  if mask is not None:
    sequence_lengths = calculate_sequence_by_mask(mask, time_major)

  # Only the fixed-length kernel needs the data converted to time major.
  # Transposing for the variable-length kernel would contradict the
  # `time_major` flag handed to CudnnRNNV3 below.
  transposed_to_time_major = not time_major and sequence_lengths is None
  if transposed_to_time_major:
    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
    seq_axis, batch_axis = (0, 1)
  else:
    seq_axis, batch_axis = (0, 1) if time_major else (1, 0)
  # For init_h and init_c, cuDNN expects one more dim of num_layers before or
  # after batch dim for time major or batch major inputs respectively
  init_h = array_ops.expand_dims(init_h, axis=seq_axis)
  init_c = array_ops.expand_dims(init_c, axis=seq_axis)

  weights = array_ops.split(kernel, 4, axis=1)
  weights += array_ops.split(recurrent_kernel, 4, axis=1)
  # CuDNN has an extra set of bias for inputs, we disable them (setting to 0),
  # so that mathematically it is same as the canonical LSTM implementation.
  full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0)

  if sysconfig.get_build_info()['is_rocm_build']:
    # ROCm MIOpen's weight sequence for LSTM is different from both canonical
    # and Cudnn format
    # MIOpen: [i, f, o, c] Cudnn/Canonical: [i, f, c, o]
    # i is input gate weights.
    # f is forget gate weights.
    # o is output gate weights.
    # c is cell gate weights.
    weights = [weights[x] for x in (0, 1, 3, 2, 4, 5, 7, 6)]
    # full_bias is a tensor of shape (8*n,)
    full_bias = array_ops.split(full_bias, 8, axis=0)
    full_bias = [full_bias[x] for x in (0, 1, 3, 2, 4, 5, 7, 6)]

  params = _canonical_to_params(
      weights=weights,
      biases=array_ops.split(full_bias, 8),
      shape=constant_op.constant([-1]),
      transpose_weights=True)

  if sequence_lengths is not None:
    if go_backwards:
      # Three reversals are required. E.g.,
      # normal input = [1, 2, 3, 0, 0]  # where 0 need to be masked
      # reversed_input_to_cudnn = [3, 2, 1, 0, 0]
      # output_from_cudnn = [6, 5, 4, 0, 0]
      # expected_output = [0, 0, 6, 5 ,4]
      inputs = array_ops.reverse_sequence_v2(
          inputs, sequence_lengths, seq_axis=seq_axis, batch_axis=batch_axis)
    outputs, h, c, _, _ = gen_cudnn_rnn_ops.CudnnRNNV3(
        input=inputs,
        input_h=init_h,
        input_c=init_c,
        params=params,
        is_training=True,
        rnn_mode='lstm',
        sequence_lengths=sequence_lengths,
        time_major=time_major)
    if go_backwards:
      outputs = array_ops.reverse_sequence_v2(
          outputs, sequence_lengths, seq_axis=seq_axis, batch_axis=batch_axis)
      outputs = array_ops.reverse(outputs, axis=[seq_axis])
  else:
    if go_backwards:
      # Reverse axis 0 since the input is already convert to time major.
      inputs = array_ops.reverse(inputs, axis=[0])
    outputs, h, c, _ = gen_cudnn_rnn_ops.CudnnRNN(
        input=inputs, input_h=init_h, input_c=init_c, params=params,
        is_training=True, rnn_mode='lstm')

  h = array_ops.squeeze(h, axis=seq_axis)
  c = array_ops.squeeze(c, axis=seq_axis)

  if sequence_lengths is not None:
    # In the case of variable length input, the cudnn kernel will fill zeros
    # for the output, whereas the default keras behavior is to bring over the
    # previous output for t-1, so that in the return_sequence=False case, user
    # can quickly get the final effect output instead just 0s at the last
    # timestep. In order to mimic the default keras behavior, we copy the final
    # h state as the last_output, since it is numerically same as the output.
    last_output = h
  else:
    # Fixed-length outputs are time major here, so index -1 is the last step.
    last_output = outputs[-1]
    if transposed_to_time_major:
      outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
  return last_output, outputs, h, c, _runtime(_RUNTIME_GPU)
1530
1531
def lstm_with_backend_selection(inputs, init_h, init_c, kernel,
                                recurrent_kernel, bias, mask, time_major,
                                go_backwards, sequence_lengths,
                                zero_output_for_mask):
  """Run the LSTM and let the device placement pick the kernel.

  Two implementations are prepared: the generic `standard_lstm`, which can run
  on any device, and a CuDNN based one that is only valid on GPU (and that
  itself falls back to `standard_lstm` when the mask is not supported).

  When `_use_new_code()` is true, the implementation is selected through
  `control_flow_ops.execute_fn_for_device`. Otherwise both implementations are
  wrapped as TF functions sharing a unique API name; the standard one is
  called and the GPU one is only registered in the graph, so that Grappler can
  swap it in based on the device placement.

  Args:
    inputs: Input tensor of LSTM layer.
    init_h: Initial state tensor for the cell output.
    init_c: Initial state tensor for the cell hidden state.
    kernel: Weights for cell kernel.
    recurrent_kernel: Weights for cell recurrent kernel.
    bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias
      is used in this case.
    mask: Boolean tensor for mask out the steps within sequence.
      An individual `True` entry indicates that the corresponding timestep
      should be utilized, while a `False` entry indicates that the corresponding
      timestep should be ignored.
    time_major: Boolean, whether the inputs are in the format of
      [time, batch, feature] or [batch, time, feature].
    go_backwards: Boolean (default False). If True, process the input sequence
      backwards and return the reversed sequence.
    sequence_lengths: The lengths of all sequences coming from a variable length
      input, such as ragged tensors. If the input has a fixed timestep size,
      this should be None.
    zero_output_for_mask: Boolean, whether to output zero for masked timestep.

  Returns:
    Tuple `(last_output, outputs, new_h, new_c, runtime)`, same as
    standard_lstm.
  """
  params = dict(
      inputs=inputs,
      init_h=init_h,
      init_c=init_c,
      kernel=kernel,
      recurrent_kernel=recurrent_kernel,
      bias=bias,
      mask=mask,
      time_major=time_major,
      go_backwards=go_backwards,
      sequence_lengths=sequence_lengths,
      zero_output_for_mask=zero_output_for_mask,
  )

  def gpu_lstm_with_fallback(inputs, init_h, init_c, kernel, recurrent_kernel,
                             bias, mask, time_major, go_backwards,
                             sequence_lengths, zero_output_for_mask):
    """Use CuDNN kernel when mask is none or strictly right padded."""
    # Arguments shared by both kernels; `standard_lstm` additionally takes
    # `zero_output_for_mask`.
    shared_kwargs = dict(
        inputs=inputs,
        init_h=init_h,
        init_c=init_c,
        kernel=kernel,
        recurrent_kernel=recurrent_kernel,
        bias=bias,
        mask=mask,
        time_major=time_major,
        go_backwards=go_backwards,
        sequence_lengths=sequence_lengths)

    # Without a mask there is nothing to validate: go straight to CuDNN.
    if mask is None:
      return gpu_lstm(**shared_kwargs)

    def run_cudnn():
      return gpu_lstm(**shared_kwargs)

    def run_standard():
      return standard_lstm(
          zero_output_for_mask=zero_output_for_mask, **shared_kwargs)

    # The mask is only known at run time, so the choice is a graph-level cond.
    return control_flow_ops.cond(
        is_cudnn_supported_inputs(mask, time_major),
        true_fn=run_cudnn,
        false_fn=run_standard)

  if _use_new_code():
    # Chooses the implementation dynamically based on the running device.
    device_branches = {
        _CPU_DEVICE_NAME: lambda: standard_lstm(**params),
        _GPU_DEVICE_NAME: lambda: gpu_lstm_with_fallback(**params),
    }
    results = control_flow_ops.execute_fn_for_device(
        device_branches, lambda: standard_lstm(**params))
  else:
    # Each time a `tf.function` is called, we will give it a unique
    # identifiable API name, so that Grappler won't get confused when it
    # sees multiple LSTM layers added into same graph, and it will be able
    # to pair up the different implementations across them.
    api_name = 'lstm_' + str(uuid.uuid4())
    supportive_attribute = dict(
        time_major=time_major, go_backwards=go_backwards)
    defun_standard_lstm = _generate_defun_backend(
        api_name, _CPU_DEVICE_NAME, standard_lstm, supportive_attribute)
    defun_gpu_lstm = _generate_defun_backend(
        api_name, _GPU_DEVICE_NAME, gpu_lstm_with_fallback,
        supportive_attribute)

    # Call the normal LSTM impl and register the CuDNN impl function. The
    # grappler will kick in during session execution to optimize the graph.
    results = defun_standard_lstm(**params)
    _function_register(defun_gpu_lstm, **params)

  last_output, outputs, new_h, new_c, runtime = results
  return last_output, outputs, new_h, new_c, runtime
1665
1666
def is_sequence_right_padded(mask):
  """Tell whether every row of `mask` is strictly right padded.

  The CuDNN kernel relies on a sequence length to skip the trailing timesteps.
  It therefore cannot handle left padded data, nor data with masked values in
  the middle of the sequence.

  Left padded data: [[False, False, True, True, True]].
  Right padded data: [[True, True, True, False, False]].
  Mixture of mask/unmasked data: [[True, False, True, False, False]].

  In the mixed example, the RNN should only see the two `True` steps (index 0
  and 2); the `False` at index 1 must be ignored and must not pollute the
  internal states.

  The check rebuilds, for each row, the right padded mask that has the same
  number of `True` entries and compares it with the given mask.

  Args:
    mask: the Boolean tensor with shape [batch, timestep]

  Returns:
    boolean scalar tensor, whether the mask is strictly right padded.
  """
  timesteps = array_ops.shape(mask)[1]
  mask_as_int = math_ops.cast(mask, dtypes.int32)
  valid_steps_per_row = math_ops.reduce_sum(mask_as_int, axis=1)
  # Mask each row would have if all its `True` entries came first.
  expected_mask = array_ops.sequence_mask(
      valid_steps_per_row, maxlen=timesteps)
  matches_expected = math_ops.equal(mask, expected_mask)
  return math_ops.reduce_all(matches_expected)
1694
1695
def has_fully_masked_sequence(mask):
  """Tell whether any row of `mask` consists only of `False` entries.

  See https://github.com/tensorflow/tensorflow/issues/33148 for more details.
  Cudnn kernel will error out if the input sequence contains any fully masked
  data. We work around this issue by rerouting the computation to standard
  kernel, until the issue on cudnn side has been fixed.

  Args:
    mask: Boolean tensor; the reduction over axis 1 treats each index along
      axis 0 as one sequence.

  Returns:
    Boolean scalar tensor, True if at least one sequence is fully masked.
  """
  # A fully masked sequence is all `False`; once inverted it is all `True`,
  # which is what `reduce_all` detects per row.
  inverted_mask = math_ops.logical_not(mask)
  row_fully_masked = math_ops.reduce_all(inverted_mask, axis=1)
  return math_ops.reduce_any(row_fully_masked)
1707
1708
def is_cudnn_supported_inputs(mask, time_major):
  """Tell whether `mask` describes inputs the CuDNN kernel can process.

  The mask is accepted when it is strictly right padded and none of its
  sequences is fully masked.

  Args:
    mask: Boolean mask tensor; it is transposed first when `time_major` is
      true so that both checks see it in the same layout.
    time_major: Boolean, whether `mask` needs to be transposed before the
      checks.

  Returns:
    Boolean scalar tensor, True if the CuDNN kernel can be used.
  """
  checked_mask = array_ops.transpose(mask) if time_major else mask
  right_padded = is_sequence_right_padded(checked_mask)
  no_empty_sequence = math_ops.logical_not(
      has_fully_masked_sequence(checked_mask))
  return math_ops.logical_and(right_padded, no_empty_sequence)
1716
1717
def calculate_sequence_by_mask(mask, time_major):
  """Derive the 1-D sequence length tensor from a masking tensor.

  The masking tensor is a 2D boolean tensor in which every timestep that
  should be masked is False. The length of a sequence is the number of True
  entries along its timestep axis. For example:
    a = [[True, True, False, False],
         [True, True, True, False]]
  is a (2, 4) batch major tensor whose sequence lengths are [2, 3]. Note that
  the masking tensor must be right padded, which could be checked by, e.g.,
  `is_sequence_right_padded()`.

  Args:
    mask: Boolean tensor with shape [batch, timestep] or [timestep, batch] if
      time_major=True.
    time_major: Boolean, which indicates whether the mask is time major or batch
      major.
  Returns:
    sequence_length: 1D int32 tensor.
  """
  true_as_one = math_ops.cast(mask, dtypes.int32)
  # Sum over whichever axis holds the timesteps.
  if time_major:
    return math_ops.reduce_sum(true_as_one, axis=0)
  return math_ops.reduce_sum(true_as_one, axis=1)
1741
1742
def _generate_defun_backend(unique_api_name, preferred_device, func,
                            supportive_attributes):
  """Wrap `func` with `defun_with_attributes`, tagging it for backend swap.

  Args:
    unique_api_name: value stored under `_FUNCTION_API_NAME_ATTRIBUTE`.
    preferred_device: value stored under `_FUNCTION_DEVICE_ATTRIBUTE`.
    func: the Python function to wrap.
    supportive_attributes: extra attributes merged on top of the two above
      (an entry with the same key replaces it).

  Returns:
    The object returned by `function.defun_with_attributes`, created with
    autograph disabled.
  """
  attributes = {}
  attributes[_FUNCTION_API_NAME_ATTRIBUTE] = unique_api_name
  attributes[_FUNCTION_DEVICE_ATTRIBUTE] = preferred_device
  attributes.update(supportive_attributes)
  return function.defun_with_attributes(
      func=func, attributes=attributes, autograph=False)
1753
1754
def _get_context_device_type():
  """Return the device type (eg CPU/GPU) of the current context, or None.

  None is returned when `get_device_name()` reports no device.
  """
  device_name = get_device_name()
  if device_name is not None:
    return device.DeviceSpec.from_string(device_name).device_type
  return None
1761
1762
def _runtime(runtime_name):
  """Build the float32 constant named 'runtime' holding `runtime_name`.

  The constant is created inside an `ops.device('/cpu:0')` scope.
  """
  with ops.device('/cpu:0'):
    runtime_tensor = constant_op.constant(
        runtime_name, dtype=dtypes.float32, name='runtime')
  return runtime_tensor
1767
1768
def _read_variable_value(v):
  """Return `v.read_value()` for a `variables.Variable`, else `v` unchanged."""
  return v.read_value() if isinstance(v, variables.Variable) else v
1774
1775
def _function_register(func, *args, **kwargs):
  """Register a specialization of a `Function` into the graph.

  This won't actually call the function with the inputs, and only put the
  function definition into graph. Register function with different input param
  will result into multiple version of functions registered in graph.

  Args:
    func: the `Function` instance that generated by a @defun
    *args: input arguments for the Python function.
    **kwargs: input keyword arguments for the Python function.

  Returns:
    a `ConcreteFunction` object specialized to inputs and execution context.

  Raises:
    ValueError: When the input function is not a defun wrapped python function.
  """
  # Validate up front so that callers get the documented ValueError instead of
  # an AttributeError from the lookup below. The check is duck-typed: anything
  # exposing a callable `get_concrete_function` is accepted.
  if not callable(getattr(func, 'get_concrete_function', None)):
    raise ValueError('Only defun function is allowed to be registered. '
                     'Got type: %s' % type(func))
  concrete_func = func.get_concrete_function(*args, **kwargs)
  # Adding the function (and its gradient functions) to the graph is what
  # "registers" it; the function itself is never invoked here.
  concrete_func.add_to_graph()
  concrete_func.add_gradient_functions_to_graph()
  return concrete_func
1798