1# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Cudnn RNN operators.""" 16from __future__ import absolute_import 17from __future__ import division 18from __future__ import print_function 19 20import os 21from tensorflow.contrib.checkpoint.python import split_dependency 22from tensorflow.contrib.rnn.python.ops import lstm_ops 23from tensorflow.python.framework import dtypes 24from tensorflow.python.framework import ops 25from tensorflow.python.framework import random_seed 26from tensorflow.python.keras.engine import base_layer 27from tensorflow.python.ops import array_ops 28from tensorflow.python.ops import gen_cudnn_rnn_ops 29from tensorflow.python.ops import init_ops 30from tensorflow.python.ops import math_ops 31from tensorflow.python.ops import nn_ops 32from tensorflow.python.ops import rnn_cell_impl 33from tensorflow.python.ops import state_ops 34from tensorflow.python.ops import variable_scope as vs 35from tensorflow.python.training import saver 36from tensorflow.python.training.tracking import tracking as trackable_lib 37 38CUDNN_RNN_UNIDIRECTION = "unidirectional" 39CUDNN_RNN_BIDIRECTION = "bidirectional" 40CUDNN_LSTM = "lstm" 41CUDNN_GRU = "gru" 42CUDNN_RNN_RELU = "rnn_relu" 43CUDNN_RNN_TANH = "rnn_tanh" 44 45# Half for cell input, half for hidden states. 46CUDNN_LSTM_PARAMS_PER_LAYER = 8 47CUDNN_GRU_PARAMS_PER_LAYER = 6 48CUDNN_RNN_TANH_PARAMS_PER_LAYER = 2 49CUDNN_RNN_RELU_PARAMS_PER_LAYER = 2 50 51CUDNN_INPUT_LINEAR_MODE = "linear_input" 52CUDNN_INPUT_SKIP_MODE = "skip_input" 53CUDNN_INPUT_AUTO_MODE = "auto_select" 54 55# pylint:disable=protected-access 56_BIAS_VARIABLE_NAME = rnn_cell_impl._BIAS_VARIABLE_NAME 57_WEIGHTS_VARIABLE_NAME = rnn_cell_impl._WEIGHTS_VARIABLE_NAME 58# pylint:enable=protected-access 59 60 61class CudnnCompatibleLSTMCell(lstm_ops.LSTMBlockCell): 62 """Cudnn Compatible LSTMCell. 63 64 A simple wrapper around `tf.contrib.rnn.LSTMBlockCell` to use along with 65 `tf.contrib.cudnn_rnn.CudnnLSTM`. The latter's params can be used by 66 this cell seamlessly. 67 """ 68 69 def __init__(self, num_units, reuse=None): 70 super(CudnnCompatibleLSTMCell, self).__init__( 71 num_units, forget_bias=0, cell_clip=None, use_peephole=False, 72 reuse=reuse, name="cudnn_compatible_lstm_cell") 73 self._names.update({"scope": "cudnn_compatible_lstm_cell"}) 74 75 76class CudnnCompatibleGRUCell(rnn_cell_impl.GRUCell): 77 r"""Cudnn Compatible GRUCell. 78 79 A GRU impl akin to `tf.nn.rnn_cell.GRUCell` to use along with 80 `tf.contrib.cudnn_rnn.CudnnGRU`. The latter's params can be used by 81 it seamlessly. 82 83 It differs from platform-independent GRUs in how the new memory gate is 84 calculated. Nvidia picks this variant based on GRU author's[1] suggestion and 85 the fact it has no accuracy impact[2]. 86 [1] https://arxiv.org/abs/1406.1078 87 [2] http://svail.github.io/diff_graphs/ 88 89 Cudnn compatible GRU (from Cudnn library user guide): 90 ```python 91 # reset gate 92 $$r_t = \sigma(x_t * W_r + h_t-1 * R_h + b_{Wr} + b_{Rr})$$ 93 # update gate 94 $$u_t = \sigma(x_t * W_u + h_t-1 * R_u + b_{Wu} + b_{Ru})$$ 95 # new memory gate 96 $$h'_t = tanh(x_t * W_h + r_t .* (h_t-1 * R_h + b_{Rh}) + b_{Wh})$$ 97 $$h_t = (1 - u_t) .* h'_t + u_t .* h_t-1$$ 98 ``` 99 100 Other GRU (see `tf.nn.rnn_cell.GRUCell` and `tf.contrib.rnn.GRUBlockCell`): 101 ```python 102 # new memory gate 103 \\(h'_t = tanh(x_t * W_h + (r_t .* h_t-1) * R_h + b_{Wh})\\) 104 ``` 105 which is not equivalent to Cudnn GRU: in addition to the extra bias term b_Rh, 106 ```python 107 \\(r .* (h * R) != (r .* h) * R\\) 108 ``` 109 """ 110 111 def __init__(self, num_units, reuse=None, kernel_initializer=None): 112 super(CudnnCompatibleGRUCell, self).__init__( 113 num_units, 114 activation=None, 115 reuse=reuse, 116 kernel_initializer=kernel_initializer) 117 118 def build(self, inputs_shape): 119 if inputs_shape[1].value is None: 120 raise ValueError("Expected inputs.shape[-1] to be known, saw shape: %s" 121 % inputs_shape) 122 123 input_depth = inputs_shape[1].value 124 self._gate_kernel = self.add_variable( 125 "gates/%s" % _WEIGHTS_VARIABLE_NAME, 126 shape=[input_depth + self._num_units, 2 * self._num_units], 127 initializer=self._kernel_initializer) 128 self._gate_bias = self.add_variable( 129 "gates/%s" % _BIAS_VARIABLE_NAME, 130 shape=[2 * self._num_units], 131 initializer=( 132 self._bias_initializer 133 if self._bias_initializer is not None 134 else init_ops.constant_initializer(1.0, dtype=self.dtype))) 135 136 self._candidate_input_kernel = self.add_variable( 137 "candidate/input_projection/%s" % _WEIGHTS_VARIABLE_NAME, 138 shape=[input_depth, self._num_units], 139 initializer=self._kernel_initializer) 140 self._candidate_hidden_kernel = self.add_variable( 141 "candidate/hidden_projection/%s" % _WEIGHTS_VARIABLE_NAME, 142 shape=[self._num_units, self._num_units], 143 initializer=self._kernel_initializer) 144 145 self._candidate_input_bias = self.add_variable( 146 "candidate/input_projection/%s" % _BIAS_VARIABLE_NAME, 147 shape=[self._num_units], 148 initializer=( 149 self._bias_initializer 150 if self._bias_initializer is not None 151 else init_ops.zeros_initializer(dtype=self.dtype))) 152 self._candidate_hidden_bias = self.add_variable( 153 "candidate/hidden_projection/%s" % _BIAS_VARIABLE_NAME, 154 shape=[self._num_units], 155 initializer=( 156 self._bias_initializer 157 if self._bias_initializer is not None 158 else init_ops.zeros_initializer(dtype=self.dtype))) 159 160 def call(self, inputs, state): 161 """Gated recurrent unit (GRU) with nunits cells.""" 162 gate_inputs = math_ops.matmul( 163 array_ops.concat([inputs, state], 1), self._gate_kernel) 164 gate_inputs = nn_ops.bias_add(gate_inputs, self._gate_bias) 165 166 value = math_ops.sigmoid(gate_inputs) 167 r, u = array_ops.split(value=value, num_or_size_splits=2, axis=1) 168 169 candidate = nn_ops.bias_add( 170 math_ops.matmul(inputs, self._candidate_input_kernel), 171 self._candidate_input_bias) 172 candidate += r * nn_ops.bias_add( 173 math_ops.matmul(state, self._candidate_hidden_kernel), 174 self._candidate_hidden_bias) 175 candidate = self._activation(candidate) 176 new_h = (1-u) * candidate + u * state 177 return new_h, new_h 178 179 180class CudnnParamsFormatConverter(object): 181 """Abstract class that converts between params of Cudnn Rnn and TF Rnn.""" 182 183 def __init__(self, 184 num_layers, 185 num_units, 186 input_size, 187 input_mode=CUDNN_INPUT_LINEAR_MODE, 188 direction=CUDNN_RNN_UNIDIRECTION): 189 """Constructor. 190 191 Args: 192 num_layers: the number of layers for the RNN model. 193 num_units: the number of units within the RNN model. 194 input_size: the size of the input, it could be different from the 195 num_units. 196 input_mode: indicate whether there is a linear projection between the 197 input and the actual computation before the first layer. It could be one 198 of 'linear_input', 'skip_input' or 'auto_select'. * 'linear_input' 199 (default) always applies a linear projection of input onto RNN hidden 200 state. (standard RNN behavior). * 'skip_input' is only allowed when 201 input_size == num_units; * 'auto_select' implies 'skip_input' when 202 input_size == num_units; otherwise, it implies 'linear_input'. 203 direction: the direction model that the model operates. Could be either 204 'unidirectional' or 'bidirectional' 205 """ 206 self._num_layers = num_layers 207 self._input_size = input_size 208 self._num_units = num_units 209 self._input_mode = input_mode 210 self._direction = direction 211 self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2 212 self._num_params = ( 213 self._num_params_per_layer * self._num_layers * self._num_dirs) 214 215 def tf_canonical_to_opaque(self, tf_canonicals): 216 r"""Converts tf canonical weights to cudnn opaque param.""" 217 cu_weights, cu_biases = self._tf_canonical_to_cu_canonical(tf_canonicals) 218 cu_weights = [array_ops.reshape(w, [-1]) for w in cu_weights] 219 opaque_params = self._cu_canonical_to_opaque(cu_weights, cu_biases) 220 return opaque_params 221 222 def opaque_to_tf_canonical(self, opaque_param): 223 r"""Converts cudnn opaque param to tf canonical weights.""" 224 cu_weights, cu_biases = self._opaque_to_cu_canonical(opaque_param) 225 weights, biases = self._cu_canonical_to_tf_canonical(cu_weights, cu_biases) 226 return weights, biases 227 228 def _opaque_to_cu_canonical(self, opaque_param): 229 """Converts opaque params to Cudnn canonical format. 230 231 Args: 232 opaque_param: An opaque tensor storing cudnn rnn params (weights and 233 biases). 234 Returns: 235 2 list for weights and biases respectively. 236 """ 237 with ops.device("/gpu:0"): 238 weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical( 239 num_layers=self._num_layers, 240 num_units=self._num_units, 241 input_size=self._input_size, 242 params=opaque_param, 243 num_params=self._num_params, 244 rnn_mode=self._rnn_mode, 245 input_mode=self._input_mode, 246 direction=self._direction) 247 return (weights, biases) 248 249 def _cu_canonical_to_opaque(self, cu_weights, cu_biases): 250 """Converts from Cudnn canonical format to opaque params. 251 252 Args: 253 cu_weights: a list of tensors, Cudnn canonical weights. 254 cu_biases: a list of tensors, Cudnn canonical biases. 255 Returns: 256 a single opaque tensor. 257 """ 258 with ops.device("/gpu:0"): 259 return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params( 260 num_layers=self._num_layers, 261 num_units=self._num_units, 262 input_size=self._input_size, 263 weights=cu_weights, 264 biases=cu_biases, 265 rnn_mode=self._rnn_mode, 266 input_mode=self._input_mode, 267 direction=self._direction) 268 269 def _cu_canonical_to_tf_canonical(self, cu_weights, cu_biases): 270 r"""Transform from Cudnn canonical to tf canonical. 271 272 The elements of argument lists are laid out in the following format: 273 ------------------------------------------------------------ 274 | weights | biases | 275 ------------------------------------------------------------ 276 \ \ 277 \ \ 278 ------------------------------- 279 | layer1 |layer2 |... | 280 ------------------------------- 281 \ \ 282 --------------- 283 |fwd |bak | 284 --------------- 285 Args: 286 cu_weights: a list of tensors of Cudnn canonical weights. 287 cu_biases: a list of tensors of Cudnn canonical biases. 288 Returns: 289 1 tuple, tf canonical weights and biases. 290 """ 291 tf_weights, tf_biases = [], [] 292 293 layer_weights_num = self._num_params_per_layer * self._num_dirs 294 layer_biases_num = layer_weights_num 295 296 for i in range(self._num_layers): 297 layer_weights = cu_weights[i * layer_weights_num:(i + 1) * 298 layer_weights_num] 299 layer_biases = cu_biases[i * layer_biases_num:(i + 1) * layer_biases_num] 300 if self._direction == CUDNN_RNN_UNIDIRECTION: 301 self._cu_canonical_to_tf_canonical_single_layer( 302 layer_weights, layer_biases, tf_weights, tf_biases) 303 else: 304 fw_weights = layer_weights[:len(layer_weights) // 2] 305 bw_weights = layer_weights[len(layer_weights) // 2:] 306 fw_biases = layer_biases[:len(layer_biases) // 2] 307 bw_biases = layer_biases[len(layer_biases) // 2:] 308 309 self._cu_canonical_to_tf_canonical_single_layer( 310 fw_weights, 311 fw_biases, 312 tf_weights, 313 tf_biases, 314 ) 315 316 self._cu_canonical_to_tf_canonical_single_layer( 317 bw_weights, 318 bw_biases, 319 tf_weights, 320 tf_biases, 321 ) 322 return (tf_weights, tf_biases) 323 324 def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases, 325 tf_weights, tf_biases): 326 r"""Transform single layer Cudnn canonicals to tf canonicals. 327 328 The elements of cu_weights, cu_biases are laid out in the following format: 329 ------------------------------------------------------------------------- 330 | gate0 param on inputs | gate0 param on hidden state | gate1 ..........| 331 ------------------------------------------------------------------------- 332 Args: 333 cu_weights: a list of tensors, single layer weights. 334 cu_biases: a list of tensors, single layer biases. 335 tf_weights: a list where transformed weights are stored. 336 tf_biases: a list where transformed biases are stored. 337 """ 338 raise NotImplementedError("Abstract method") 339 340 def _tf_canonical_to_cu_canonical(self, tf_canonicals): 341 r"""Transform from tf canonical to Cudnn canonical. 342 343 This is the reverse routine of _TransformCanonical(). 344 Args: 345 tf_canonicals: a list of tensors of tf canonical params. The elements are 346 laid out in the following format: 347 ------------------------------------------------------------ 348 | weights | biases | 349 ------------------------------------------------------------ 350 \ \ 351 \ \ 352 ------------------------------- 353 | layer1 |layer2 |... | 354 ------------------------------- 355 \ \ 356 --------------- 357 |fwd |bak | 358 --------------- 359 Returns: 360 2 lists: the recovered cudnn canonical weights and biases. 361 """ 362 weights = tf_canonicals[:len(tf_canonicals) // 2] 363 biases = tf_canonicals[len(tf_canonicals) // 2:] 364 365 cu_weights, cu_biases = [], [] 366 layer_weights_num = len(weights) // self._num_layers 367 layer_biases_num = len(biases) // self._num_layers 368 for i in range(self._num_layers): 369 layer_weights = weights[i * layer_weights_num:(i + 1) * layer_weights_num] 370 layer_biases = biases[i * layer_biases_num:(i + 1) * layer_biases_num] 371 if self._direction == CUDNN_RNN_UNIDIRECTION: 372 cu_weights.extend(self._tf_to_cudnn_weights(i, *layer_weights)) 373 cu_biases.extend(self._tf_to_cudnn_biases(*layer_biases)) 374 else: 375 fw_weights, bw_weights = layer_weights[:len( 376 layer_weights) // 2], layer_weights[len(layer_weights) // 2:] 377 fw_biases, bw_biases = layer_biases[:len( 378 layer_biases) // 2], layer_biases[len(layer_biases) // 2:] 379 cu_weights.extend(self._tf_to_cudnn_weights(i, *fw_weights)) 380 cu_biases.extend(self._tf_to_cudnn_biases(*fw_biases)) 381 382 cu_weights.extend(self._tf_to_cudnn_weights(i, *bw_weights)) 383 cu_biases.extend(self._tf_to_cudnn_biases(*bw_biases)) 384 return cu_weights, cu_biases 385 386 def _cudnn_to_tf_weights(self, *cu_weights): 387 r"""Stitches cudnn canonical weights to generate tf canonical weights.""" 388 raise NotImplementedError("Abstract method") 389 390 def _tf_to_cudnn_weights(self, layer, *tf_weights): 391 r"""Reverses the operations in StitchWeights().""" 392 raise NotImplementedError("Abstract method") 393 394 def _cudnn_to_tf_biases(self, *biases): 395 r"""Stitches cudnn canonical biases to generate tf canonical biases.""" 396 raise NotImplementedError("Abstract method") 397 398 def _tf_to_cudnn_biases(self, *tf_biases): 399 r"""Reverses the operations in StitchBiases().""" 400 raise NotImplementedError("Abstract method") 401 402 403class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter): 404 """Helper class that converts between params of Cudnn and TF LSTM.""" 405 _rnn_mode = CUDNN_LSTM 406 _num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER 407 408 def _cudnn_to_tf_gate_params(self, *cu_gate_order): 409 i_g, f_g, c_g, o_g = cu_gate_order 410 return [i_g, c_g, f_g, o_g] 411 412 def _tf_to_cudnn_gate_params(self, *tf_gate_order): 413 i_g, c_g, f_g, o_g = tf_gate_order 414 return [i_g, f_g, c_g, o_g] 415 416 def _cudnn_to_tf_weights(self, *cu_weights): 417 r"""Stitching cudnn canonical weights to generate tf canonical weights.""" 418 w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o = cu_weights 419 420 # pylint: disable=invalid-name 421 W_i = array_ops.concat([w_i, r_i], axis=1) 422 W_f = array_ops.concat([w_f, r_f], axis=1) 423 W_c = array_ops.concat([w_c, r_c], axis=1) 424 W_o = array_ops.concat([w_o, r_o], axis=1) 425 # pylint: enable=invalid-name 426 # Cudnn LSTM weights are in ifco order, other tf LSTMs are in icfo order. 427 reordered = self._cudnn_to_tf_gate_params(* [W_i, W_f, W_c, W_o]) 428 return (array_ops.transpose(array_ops.concat(reordered, axis=0)),) 429 430 def _tf_to_cudnn_weights(self, layer, *tf_weights): 431 r"""Reverse the operations in StitchWeights().""" 432 input_size = self._input_size 433 num_units = self._num_units 434 if layer == 0: 435 input_weight_width = input_size 436 else: 437 input_weight_width = num_units 438 if self._direction == CUDNN_RNN_BIDIRECTION: 439 input_weight_width *= 2 440 441 (tf_weight,) = tf_weights 442 w = array_ops.transpose(tf_weight) 443 # pylint: disable=invalid-name 444 W_i, W_f, W_c, W_o = self._tf_to_cudnn_gate_params(*array_ops.split( 445 w, 4, axis=0)) 446 447 w_i, r_i = array_ops.split(W_i, [input_weight_width, num_units], axis=1) 448 w_c, r_c = array_ops.split(W_c, [input_weight_width, num_units], axis=1) 449 w_f, r_f = array_ops.split(W_f, [input_weight_width, num_units], axis=1) 450 w_o, r_o = array_ops.split(W_o, [input_weight_width, num_units], axis=1) 451 return w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o 452 # pylint: enable=invalid-name 453 454 def _cudnn_to_tf_biases(self, *cu_biases): 455 r"""Stitching cudnn canonical biases to generate tf canonical biases.""" 456 b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro = cu_biases 457 # Save only the sum instead of individual biases. When recovering, return 458 # two biases each with half the value. Since RNN does not regularize by 459 # weight decay, it has no side effect in training or inference. 460 # pylint: disable=invalid-name 461 B_i = b_wi + b_ri 462 B_f = b_wf + b_rf 463 B_c = b_wc + b_rc 464 B_o = b_wo + b_ro 465 # pylint: enable=invalid-name 466 reordered = self._cudnn_to_tf_gate_params(* [B_i, B_f, B_c, B_o]) 467 return (array_ops.concat(reordered, axis=0),) 468 469 def _tf_to_cudnn_biases(self, *tf_biases): 470 r"""Reverse the operations in StitchBiases().""" 471 (tf_bias,) = tf_biases 472 # pylint: disable=invalid-name 473 B_i, B_f, B_c, B_o = self._tf_to_cudnn_gate_params(*array_ops.split( 474 tf_bias, 4, axis=0)) 475 # pylint: enable=invalid-name 476 # pylint: disable=unbalanced-tuple-unpacking 477 b_wi, b_ri = (B_i * 0.5,) * 2 478 b_wf, b_rf = (B_f * 0.5,) * 2 479 b_wc, b_rc = (B_c * 0.5,) * 2 480 b_wo, b_ro = (B_o * 0.5,) * 2 481 # pylint: enable=unbalanced-tuple-unpacking 482 # Return ifco order for Cudnn LSTM. 483 return b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro 484 485 def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases, 486 tf_weights, tf_biases): 487 (w,) = self._cudnn_to_tf_weights(*cu_weights) 488 (b,) = self._cudnn_to_tf_biases(*cu_biases) 489 tf_weights.append(w) 490 tf_biases.append(b) 491 492 493class CudnnParamsFormatConverterGRU(CudnnParamsFormatConverter): 494 """Helper class that converts between params of Cudnn and TF GRU.""" 495 496 _rnn_mode = CUDNN_GRU 497 _num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER 498 499 _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__) 500 501 def _cudnn_to_tf_weights(self, *cu_weights): 502 r"""Stitching cudnn canonical weights to generate tf canonical weights.""" 503 w_i, w_r, w_h, r_i, r_r, r_h = cu_weights 504 505 # pylint: disable=invalid-name 506 W_i = array_ops.concat([w_i, r_i], axis=1) 507 W_r = array_ops.concat([w_r, r_r], axis=1) 508 # pylint: enable=invalid-name 509 return (array_ops.transpose(array_ops.concat([W_i, W_r], axis=0)), 510 array_ops.transpose(w_h), array_ops.transpose(r_h)) 511 512 def _tf_to_cudnn_weights(self, layer, *tf_weights): 513 r"""Reverse the operations in StitchWeights().""" 514 input_size = self._input_size 515 num_units = self._num_units 516 if layer == 0: 517 input_weight_width = input_size 518 else: 519 input_weight_width = num_units 520 if self._direction == CUDNN_RNN_BIDIRECTION: 521 input_weight_width *= 2 522 # pylint: disable=invalid-name 523 W_ir, w_h, r_h = tf_weights 524 W_ir = array_ops.transpose(W_ir) 525 w_h = array_ops.transpose(w_h) 526 r_h = array_ops.transpose(r_h) 527 528 W_i, W_r = array_ops.split(W_ir, 2, axis=0) 529 w_i, r_i = array_ops.split(W_i, [input_weight_width, num_units], axis=1) 530 w_r, r_r = array_ops.split(W_r, [input_weight_width, num_units], axis=1) 531 # pylint: enable=invalid-name 532 return w_i, w_r, w_h, r_i, r_r, r_h 533 534 def _cudnn_to_tf_biases(self, *biases): 535 r"""Stitching cudnn canonical biases to generate tf canonical biases.""" 536 b_wi, b_wr, b_wh, b_ri, b_rr, b_rh = biases 537 return ( 538 # Save only the sum instead of individual biases. When recovering, 539 # return two biases each with half the value. Since RNN does not 540 # regularize by weight decay, it has no side effect in training or 541 # inference. 542 array_ops.concat([b_wi, b_wr], axis=0) + array_ops.concat( 543 [b_ri, b_rr], axis=0), 544 b_wh, 545 b_rh) 546 547 def _tf_to_cudnn_biases(self, *tf_biases): 548 r"""Reverse the operations in StitchBiases().""" 549 # b_ir is the summed bias of reset and update gate. 550 b_ir, b_wh, b_rh = tf_biases 551 bi, br = b_ir * 0.5, b_ir * 0.5 552 b_wi, b_wr = array_ops.split(bi, 2, axis=0) 553 b_ri, b_rr = array_ops.split(br, 2, axis=0) 554 return b_wi, b_wr, b_wh, b_ri, b_rr, b_rh 555 556 def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases, 557 tf_weights, tf_biases): 558 # pylint: disable=invalid-name 559 W_ir, w_h, r_h = self._cudnn_to_tf_weights(*cu_weights) 560 b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*cu_biases) 561 # pylint: enable=invalid-name 562 tf_weights.extend([W_ir, w_h, r_h]) 563 tf_biases.extend([b_ir, b_wh, b_rh]) 564 565 566class CudnnParamsFormatConverterBasic(CudnnParamsFormatConverterLSTM): 567 """Helper class that converts between params of Cudnn and TF Relu/Tanh RNN.""" 568 569 def _cudnn_to_tf_weights(self, *cu_weights): 570 r"""Stitching cudnn canonical weights to generate tf canonical weights.""" 571 w_i, w_h = cu_weights 572 W = array_ops.concat([w_i, w_h], axis=1) # pylint: disable=invalid-name 573 return (array_ops.transpose(W),) 574 575 def _tf_to_cudnn_weights(self, layer, *tf_weights): 576 r"""Reverse the operations in StitchWeights().""" 577 input_size = self._input_size 578 num_units = self._num_units 579 if layer == 0: 580 input_weight_width = input_size 581 else: 582 input_weight_width = num_units 583 if self._direction == CUDNN_RNN_BIDIRECTION: 584 input_weight_width *= 2 585 586 (tf_weight,) = tf_weights 587 # pylint: disable=invalid-name 588 W = array_ops.transpose(tf_weight) 589 w_i, w_h = array_ops.split(W, [input_weight_width, num_units], axis=1) 590 return w_i, w_h 591 # pylint: enable=invalid-name 592 593 def _cudnn_to_tf_biases(self, *cu_biases): 594 r"""Stitching cudnn canonical biases to generate tf canonical biases.""" 595 # Save only the sum instead of individual biases. When recovering, return 596 # two biases each with half the value. Since RNN does not regularize by 597 # weight decay, it has no side effect in training or inference. 598 b_wi, b_wh = cu_biases 599 return (b_wi + b_wh,) 600 601 def _tf_to_cudnn_biases(self, *tf_biases): 602 r"""Reverse the operations in StitchBiases().""" 603 (tf_bias,) = tf_biases 604 b_i = tf_bias * 0.5 605 b_h = tf_bias * 0.5 606 return b_i, b_h 607 608 609class CudnnParamsFormatConverterTanh(CudnnParamsFormatConverterBasic): 610 """Helper class that converts between params of Cudnn and TF Tanh RNN.""" 611 _rnn_mode = CUDNN_RNN_TANH 612 _num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER 613 614 615class CudnnParamsFormatConverterRelu(CudnnParamsFormatConverterBasic): 616 """Helper class that converts between params of Cudnn and TF Relu RNN.""" 617 _rnn_mode = CUDNN_RNN_RELU 618 _num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER 619 620 621# TODO(yaozhang): make sure we only save the canonical version of params and 622# don't save the platform-specific version to avoid potential race 623# conditions where params is updated by both versions when being restored. 624# Currently, checkpointing will function properly, despite that we save both 625# versions, because Saver restores customized savables after Variables. 626# However, it is good to not rely on this restoring order of Saver and to 627# avoid unnecessary storage. Add a test to check only the canonical version is 628# saved. 629class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): 630 """Abstract SaveableObject implementation handling Cudnn opaque params.""" 631 632 def __init__(self, 633 opaque_params, 634 num_layers, 635 num_units, 636 input_size, 637 input_mode=CUDNN_INPUT_LINEAR_MODE, 638 direction=CUDNN_RNN_UNIDIRECTION, 639 scope=None, 640 name="cudnn_rnn_saveable"): 641 """Creates a CudnnOpaqueParamsSaveable object. 642 643 CudnnOpaqueParamsSaveable is saveable/restorable in a checkpoint file 644 and is used to save/restore the weights and biases parameters in a 645 canonical format which is directly consumable by platform-independent tf 646 RNN cells. Parameters are saved as tensors layer by layer with weight 647 tensors followed by bias tensors, and forward direction followed by 648 backward direction (if applicable). When restoring, a user could name 649 param_variables as desired, and restore weight and bias tensors to these 650 variables. 651 652 For CudnnRNNRelu or CudnnRNNTanh, there are 2 tensors per weight and per 653 bias for each layer: tensor 0 is applied to the input from the previous 654 layer and tensor 1 to the recurrent input. 655 656 For CudnnLSTM, there are 8 tensors per weight and per bias for each 657 layer: tensor 0-3 are applied to the input from the previous layer and 658 tensor 4-7 to the recurrent input. Tensor 0 and 4 are for the input gate; 659 tensor 1 and 5 the forget gate; tensor 2 and 6 the new memory gate; 660 tensor 3 and 7 the output gate. 661 662 For CudnnGRU, there are 6 tensors per weight and per bias for each layer: 663 tensor 0-2 are applied to the input from the previous layer and 664 tensor 3-5 to the recurrent input. Tensor 0 and 3 are for the reset gate; 665 tensor 1 and 4 the update gate; tensor 2 and 5 the new memory gate. 666 667 Args: 668 opaque_params: a variable, Cudnn RNN opaque params. 669 num_layers: the number of layers for the RNN model. 670 num_units: the number of units within the RNN model. 671 input_size: the size of the input, it could be different from the 672 num_units. 673 input_mode: indicate whether there is a linear projection between the 674 input and the actual computation before the first layer. It could be 675 'linear_input', 'skip_input' or 'auto_select'. 'linear_input' (default) 676 always applies a linear projection of input onto RNN hidden state. 677 (standard RNN behavior). 'skip_input' is only allowed when input_size == 678 num_units; 'auto_select' implies 'skip_input' when input_size == 679 num_units; otherwise, it implies 'linear_input'. 680 direction: the direction model that the model operates. Could be either 681 'unidirectional' or 'bidirectional' 682 scope: string of VariableScope, the scope of equivalent subgraph 683 consisting only platform-independent tf RNN cells. 684 name: the name of the CudnnOpaqueParamsSaveable object. 685 """ 686 # Define in subclasses. 687 self._num_layers = num_layers 688 self._input_size = input_size 689 self._num_units = num_units 690 self._input_mode = input_mode 691 self._direction = direction 692 if scope is not None: 693 scope_name = scope.name if isinstance(scope, vs.VariableScope) else scope 694 self._scope = scope_name or None 695 else: 696 self._scope = None 697 698 self._variables = opaque_params 699 self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2 700 # Defined in subclasses. 701 self._format_converter = None 702 703 tf_weights, tf_biases = ( 704 self.format_converter.opaque_to_tf_canonical(self._variables)) 705 tf_weight_names, tf_bias_names = self._tf_canonical_names() 706 # We currently don't use slice_spec. It might be useful in a distributed 707 # setting where each parameter server node stores a slice of variable, 708 # instead of having the master pull all slices and then save them. 709 slice_spec = "" 710 params = tf_weights + tf_biases 711 self._weight_names = tf_weight_names 712 self._bias_names = tf_bias_names 713 self._param_names = tf_weight_names + tf_bias_names 714 prefixed_param_names = tf_weight_names + tf_bias_names 715 if self._scope: 716 prefixed_param_names = [ 717 "%s/%s" % (self._scope, pn) for pn in prefixed_param_names 718 ] 719 specs = [ 720 saver.BaseSaverBuilder.SaveSpec(param, slice_spec, param_name) 721 for param, param_name in zip(params, prefixed_param_names) 722 ] 723 super(CudnnOpaqueParamsSaveable, self).__init__( 724 array_ops.identity(self._variables), specs, name) 725 726 @property 727 def format_converter(self): 728 if self._format_converter is None: 729 self._format_converter = self._format_converter_cls( 730 self._num_layers, self._num_units, self._input_size, self._input_mode, 731 self._direction) 732 return self._format_converter 733 734 def restore(self, restored_tensors, restored_shapes): 735 opaque_params = self.format_converter.tf_canonical_to_opaque( 736 restored_tensors) 737 return state_ops.assign( 738 self._variables, opaque_params, validate_shape=False) 739 740 def _trackable_save(self, save_buffer): 741 weights, biases = self.format_converter.opaque_to_tf_canonical( 742 self._variables) 743 for name, tensor in zip(self._param_names, weights + biases): 744 save_buffer[name] = array_ops.identity(tensor) 745 746 def _trackable_restore(self, restore_buffer): 747 tensors = [ 748 array_ops.identity(restore_buffer[name]) for name in self._param_names 749 ] 750 return self.restore( 751 restored_tensors=tensors, 752 restored_shapes=None # Unused 753 ) 754 755 def _add_trackable_dependencies(self, trackable, dtype): 756 """Add canonical weight dependencies to `trackable`. 757 758 When saving or restoring, converts to or from the opaque buffer 759 format. Weights are saved and loaded in the configuration expected by 760 cuDNN-compatible cells. 761 762 Args: 763 trackable: An object inheriting from `Trackable` to add 764 dependencies too (typically the cuDNN `Layer`). 765 dtype: The dtype for the canonical parameter Tensors. 766 """ 767 split_dependencies = split_dependency.split_dependency( 768 component_names=self._param_names, 769 component_dtypes=(dtype,) * len(self._param_names), 770 fill_save_buffer_fn=self._trackable_save, 771 consume_restore_buffer_fn=self._trackable_restore) 772 self._trackable_track_params(trackable, split_dependencies) 773 774 def _trackable_track_params(self, trackable, params): 775 """Tracks parameters in a canonical configuration.""" 776 return # NotImplementedError raised by the Layer. 777 778 def _tf_canonical_names(self): 779 tf_weights_names, tf_biases_names = [], [] 780 for i in range(self._num_layers): 781 if self._direction == CUDNN_RNN_UNIDIRECTION: 782 prefix = self._tf_canonical_name_prefix(i) 783 self._tf_canonical_names_single_layer(prefix, tf_weights_names, 784 tf_biases_names) 785 else: 786 fwd_prefix = self._tf_canonical_name_prefix(i, is_fwd=True) 787 bak_prefix = self._tf_canonical_name_prefix(i, is_fwd=False) 788 789 self._tf_canonical_names_single_layer(fwd_prefix, tf_weights_names, 790 tf_biases_names) 791 self._tf_canonical_names_single_layer(bak_prefix, tf_weights_names, 792 tf_biases_names) 793 return tf_weights_names, tf_biases_names 794 795 def _tf_canonical_name_prefix(self, layer, is_fwd=True): 796 if self._direction == CUDNN_RNN_UNIDIRECTION: 797 return "rnn/multi_rnn_cell/cell_%d/%s" % (layer, self._rnn_cell_name) 798 else: 799 if is_fwd: 800 return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/fw/%s" % 801 (layer, self._rnn_cell_name)) 802 else: 803 return ("stack_bidirectional_rnn/cell_%d/bidirectional_rnn/bw/%s" % 804 (layer, self._rnn_cell_name)) 805 806 def _tf_canonical_names_single_layer(self, prefix, tf_weights_names, 807 tf_biases_names): 808 raise NotImplementedError("Abstract method") 809 810 811class CudnnLSTMSaveable(CudnnOpaqueParamsSaveable): 812 """SaveableObject implementation handling Cudnn LSTM opaque params.""" 813 814 _format_converter_cls = CudnnParamsFormatConverterLSTM 815 _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleLSTMCell.__name__) 816 817 def _tf_canonical_names_single_layer(self, prefix, tf_weights_names, 818 tf_bias_names): 819 tf_weights_names.append(prefix + "/kernel") 820 tf_bias_names.append(prefix + "/bias") 821 822 def _trackable_track_params(self, trackable, params): 823 """Track parameters for compatibility with CudnnCompatibleLSTMCell.""" 824 biases = [] 825 weights = [] 826 for name in self._weight_names: 827 weights.append(params[name]) 828 for name in self._bias_names: 829 biases.append(params[name]) 830 assert len(params) == len(weights) + len(biases) 831 if len(weights) == 1 and len(biases) == 1: 832 # For single-layer cells, allow substituting a cell with no MultiRNNCell 833 # wrapping. 834 kernel, = weights # pylint: disable=unbalanced-tuple-unpacking 835 bias, = biases # pylint: disable=unbalanced-tuple-unpacking 836 trackable._track_trackable(kernel, name="kernel") # pylint: disable=protected-access 837 trackable._track_trackable(bias, name="bias") # pylint: disable=protected-access 838 assert len(biases) == len(weights) 839 for cell_index, (bias, kernel) in enumerate(zip(biases, weights)): 840 cell = trackable_lib.AutoTrackable() 841 trackable._track_trackable(cell, name="cell-%d" % cell_index) # pylint: disable=protected-access 842 cell.bias = bias 843 cell.kernel = kernel 844 845 846class CudnnGRUSaveable(CudnnOpaqueParamsSaveable): 847 """SaveableObject implementation handling Cudnn GRU opaque params.""" 848 849 _format_converter_cls = CudnnParamsFormatConverterGRU 850 _rnn_cell_name = base_layer.to_snake_case(CudnnCompatibleGRUCell.__name__) 851 852 def _tf_canonical_names_single_layer(self, prefix, tf_weights_names, 853 tf_bias_names): 854 tf_weights_names.append(prefix + "/gates/kernel") 855 tf_weights_names.append(prefix + "/candidate/input_projection/kernel") 856 tf_weights_names.append(prefix + "/candidate/hidden_projection/kernel") 857 858 tf_bias_names.append(prefix + "/gates/bias") 859 tf_bias_names.append(prefix + "/candidate/input_projection/bias") 860 tf_bias_names.append(prefix + "/candidate/hidden_projection/bias") 861 862 863class CudnnRNNTanhSaveable(CudnnLSTMSaveable): 864 _format_converter_cls = CudnnParamsFormatConverterTanh 865 _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__) 866 867 868class CudnnRNNReluSaveable(CudnnLSTMSaveable): 869 _format_converter_cls = CudnnParamsFormatConverterRelu 870 _rnn_cell_name = base_layer.to_snake_case(rnn_cell_impl.BasicRNNCell.__name__) 871 872 873_cudnn_rnn_common_doc_string = """ 874 Cudnn RNN has an opaque parameter buffer that can be used for inference and 875 training. But it is possible that the layout of the parameter buffers 876 changes between generations. So it is highly recommended to use 877 CudnnOpaqueParamsSaveable to save and restore weights and biases in a 878 canonical format. 879 880 This is a typical use case: 881 882 * The user creates a CudnnRNN model. 883 * The user query that parameter buffer size. 884 * The user creates a variable of that size that serves as the parameter 885 buffers. 886 * The user either initialize the parameter buffer, or load the canonical 887 weights into the parameter buffer. 888 * The user calls the model with the parameter buffer for inference, or 889 training. 890 * If training, the user creates a Saver object. 891 * If training, the user creates a CudnnOpaqueParamsSaveable object from the 892 parameter buffer for it to be later saved in the canonical format. When 893 creating a CudnnOpaqueParamsSaveable object, a name could be provided, 894 which is useful in distinguishing the names of multiple 895 CudnnOpaqueParamsSaveable objects (e.g. for an encoder-decoder model). 896 * Once a while, the user saves the parameter buffer into model checkpoints 897 with Saver.save(). 898 * When restoring, the user creates a CudnnOpaqueParamsSaveable object and 899 uses Saver.restore() to restore the parameter buffer from the canonical 900 format to a user-defined format, as well as to restore other savable 901 objects in the checkpoint file. 902""" 903 904 905def _check_rnn_mode(rnn_mode): 906 if rnn_mode not in (CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH, CUDNN_RNN_RELU): 907 raise ValueError("Invalid rnn_mode: %s, expect one of (%s, %s, %s, %s)" % 908 (rnn_mode, CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_TANH, 909 CUDNN_RNN_RELU)) 910 911 912def _get_seed(seed): 913 seed, seed2 = random_seed.get_seed(seed) 914 if seed is None and seed2 is None: 915 seed, seed2 = 0, 0 916 return seed, seed2 917 918 919def check_direction(direction): 920 """Check validity of direction.""" 921 if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION): 922 raise ValueError("Invalid direction: %s, expecting %s or %s" % 923 (direction, CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION)) 924 925 926def check_input_mode(input_mode): 927 if input_mode not in (CUDNN_INPUT_LINEAR_MODE, CUDNN_INPUT_SKIP_MODE, 928 CUDNN_INPUT_AUTO_MODE): 929 raise ValueError("Invalid input_mode: %s, expect one of (%s, %s, %s)" % 930 (input_mode, CUDNN_INPUT_LINEAR_MODE, 931 CUDNN_INPUT_SKIP_MODE, CUDNN_INPUT_AUTO_MODE)) 932 933 934def _get_num_params(rnn_mode, num_layers, direction): 935 """Return num params for given Cudnn config.""" 936 if rnn_mode == CUDNN_LSTM: 937 num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER 938 elif rnn_mode == CUDNN_GRU: 939 num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER 940 elif rnn_mode == CUDNN_RNN_RELU: 941 num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER 942 elif rnn_mode == CUDNN_RNN_TANH: 943 num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER 944 else: 945 raise ValueError("Invalid \'rnn_mode\': %s" % rnn_mode) 946 num_params = num_layers * num_params_per_layer 947 if direction != CUDNN_RNN_UNIDIRECTION: 948 num_params *= 2 949 return num_params 950 951 952def _cudnn_rnn(inputs, 953 input_h, 954 input_c, 955 params, 956 is_training, 957 rnn_mode, 958 sequence_lengths=None, 959 time_major=True, 960 input_mode=CUDNN_INPUT_LINEAR_MODE, 961 direction=CUDNN_RNN_UNIDIRECTION, 962 dropout=0., 963 seed=0, 964 name=None): 965 """Cudnn RNN. 966 967 Args: 968 inputs: the input sequence to the RNN model. If `time_major` is True 969 (default), the Tensor shape is [max_time, batch_size, input_size]. If 970 `time_major` is False, the shape is [batch_size, max_time, input_size]. 971 input_h: the initial hidden state for h. If `time_major` is True 972 (default), the Tensor shape is [num_layers, batch_size, num_units]. If 973 `time_major` is False, the shape is [batch_size, num_layers, num_units]. 974 input_c: the initial hidden state for c. This is only relevant for LSTM. 975 A Tensor of the same shape as input_h. 976 params: the parameter buffer created for this model. 977 is_training: whether this operation will be used in training or inference 978 rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh'). 979 sequence_lengths: an int32 array representing the variable sequence lengths 980 in a batch. The size of the array has to equal the batch_size. Default to 981 None, in which case sequences in the batch are assumed to have the same 982 length, which is inferred from inputs. 983 time_major: The shape format of the `inputs` and `outputs` Tensors. If true, 984 these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If 985 false, these Tensors must be shaped ['batch_size', 'max_time', 'depth']. 986 By default this function accepts input and emits output in time-major 987 form. This param is only effective when 'sequence_lengths' is used. 988 input_mode: indicate whether there is a linear projection between the 989 input and the actual computation before the first layer. It could be 990 'linear_input', 'skip_input' or 'auto_select'. 991 'linear_input' (default) always applies a linear projection of input 992 onto RNN hidden state. (standard RNN behavior). 993 'skip_input' is only allowed when input_size == num_units; 994 'auto_select' implies 'skip_input' when input_size == num_units; 995 otherwise, it implies 'linear_input'. 996 direction: the direction model that the model operates. Could be either 997 'unidirectional' or 'bidirectional' 998 dropout: whether to enable dropout. With it is 0, dropout is disabled. 999 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1000 for behavior. 1001 name: name of the operation. 1002 Returns: 1003 outputs, output_h, output_c 1004 """ 1005 _check_rnn_mode(rnn_mode) 1006 check_direction(direction) 1007 check_input_mode(input_mode) 1008 seed, seed2 = random_seed.get_seed(seed) 1009 # TODO(jamesqin): switch default value to "1" on May 25th 2018, and get rid 1010 # of V1 ops. 1011 use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") 1012 args = { 1013 "input": inputs, 1014 "input_h": input_h, 1015 "input_c": input_c, 1016 "params": params, 1017 "is_training": is_training, 1018 "rnn_mode": rnn_mode, 1019 "input_mode": input_mode, 1020 "direction": direction, 1021 "dropout": dropout, 1022 "seed": seed, 1023 "seed2": seed2, 1024 "name": name 1025 } 1026 if sequence_lengths is not None: 1027 args["sequence_lengths"] = sequence_lengths 1028 args["time_major"] = time_major 1029 outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args) 1030 elif time_major is False: 1031 batch_size = array_ops.shape(inputs)[0] 1032 max_time = array_ops.shape(inputs)[1] 1033 sequence_lengths = array_ops.fill([batch_size], max_time) 1034 args["sequence_lengths"] = sequence_lengths 1035 args["time_major"] = time_major 1036 outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args) 1037 elif use_cudnn_v2 != "1": 1038 outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args) 1039 else: 1040 outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args) 1041 return (outputs, output_h, output_c) 1042 1043 1044def cudnn_lstm(inputs, 1045 input_h, 1046 input_c, 1047 params, 1048 is_training, 1049 sequence_lengths=None, 1050 time_major=True, 1051 input_mode=CUDNN_INPUT_LINEAR_MODE, 1052 direction=CUDNN_RNN_UNIDIRECTION, 1053 dropout=0., 1054 seed=0, 1055 name=None): 1056 """Cudnn LSTM. 1057 1058 Args: 1059 inputs: the input sequence to the RNN model. If `time_major` is True 1060 (default), the Tensor shape is [max_time, batch_size, input_size]. If 1061 `time_major` is False, the shape is [batch_size, max_time, input_size]. 1062 input_h: the initial hidden state for h. If `time_major` is True 1063 (default), the Tensor shape is [num_layers, batch_size, num_units]. If 1064 `time_major` is False, the shape is [batch_size, num_layers, num_units]. 1065 input_c: the initial hidden state for c. This is only relevant for LSTM. 1066 A Tensor of the same shape as input_h. 1067 params: the parameter buffer created for this model. 1068 is_training: whether this operation will be used in training or inference 1069 sequence_lengths: an int32 array representing the variable sequence lengths 1070 in a batch. The size of the array has to equal the batch_size. Default to 1071 None, in which case sequences in the batch are assumed to have the same 1072 length, which is inferred from inputs. 1073 time_major: The shape format of the `inputs` and `outputs` Tensors. If true, 1074 these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If 1075 false, these Tensors must be shaped ['batch_size', 'max_time', 'depth']. 1076 By default this function accepts input and emits output in time-major 1077 form. This param is only effective when 'sequence_lengths' is used. 1078 input_mode: indicate whether there is a linear projection between the 1079 input and the actual computation before the first layer. It could be 1080 'linear_input', 'skip_input' or 'auto_select'. 1081 'linear_input' (default) always applies a linear projection of input 1082 onto RNN hidden state. (standard RNN behavior). 1083 'skip_input' is only allowed when input_size == num_units; 1084 'auto_select' implies 'skip_input' when input_size == num_units; 1085 otherwise, it implies 'linear_input'. 1086 direction: the direction model that the model operates. Could be either 1087 'unidirectional' or 'bidirectional' 1088 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1089 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1090 for behavior. 1091 name: name of the operation. 1092 Returns: 1093 outputs, output_h, output_c 1094 """ 1095 return _cudnn_rnn(inputs, input_h, input_c, params, is_training, CUDNN_LSTM, 1096 sequence_lengths, time_major, input_mode, direction, 1097 dropout, seed, name) 1098 1099 1100def _cudnn_rnn_no_input_c(inputs, 1101 input_h, 1102 params, 1103 is_training, 1104 rnn_mode, 1105 sequence_lengths=None, 1106 time_major=True, 1107 input_mode=CUDNN_INPUT_LINEAR_MODE, 1108 direction=CUDNN_RNN_UNIDIRECTION, 1109 dropout=0., 1110 seed=0, 1111 name=None): 1112 """Cudnn RNN w/o input_c. 1113 1114 Args: 1115 inputs: the input sequence to the RNN model. If `time_major` is True 1116 (default), the Tensor shape is [max_time, batch_size, input_size]. If 1117 `time_major` is False, the shape is [batch_size, max_time, input_size]. 1118 input_h: the initial hidden state for h. If `time_major` is True 1119 (default), the Tensor shape is [num_layers, batch_size, num_units]. If 1120 `time_major` is False, the shape is [batch_size, num_layers, num_units]. 1121 params: the parameter buffer created for this model. 1122 is_training: whether this operation will be used in training or inference 1123 rnn_mode: one of ('lstm', 'gru', 'rnn_relu', 'rnn_tanh'). 1124 sequence_lengths: an int32 array representing the variable sequence lengths 1125 in a batch. The size of the array has to equal the batch_size. Default to 1126 None, in which case sequences in the batch are assumed to have the same 1127 length, which is inferred from inputs. 1128 time_major: The shape format of the `inputs` and `outputs` Tensors. If true, 1129 these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If 1130 false, these Tensors must be shaped ['batch_size', 'max_time', 'depth']. 1131 By default this function accepts input and emits output in time-major 1132 form. This param is only effective when 'sequence_lengths' is used. 1133 input_mode: indicate whether there is a linear projection between the 1134 input and the actual computation before the first layer. It could be 1135 'linear_input', 'skip_input' or 'auto_select'. 1136 'linear_input' (default) always applies a linear projection of input 1137 onto RNN hidden state. (standard RNN behavior). 1138 'skip_input' is only allowed when input_size == num_units; 1139 'auto_select' implies 'skip_input' when input_size == num_units; 1140 otherwise, it implies 'linear_input'. 1141 direction: the direction model that the model operates. Could be either 1142 'unidirectional' or 'bidirectional' 1143 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1144 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1145 for behavior. 1146 name: name of the operation. 1147 Returns: 1148 outputs, output_h 1149 """ 1150 input_c = array_ops.constant([], dtype=input_h.dtype) 1151 outputs, output_h, _ = _cudnn_rnn( 1152 inputs, input_h, input_c, params, is_training, rnn_mode, sequence_lengths, 1153 time_major, input_mode, direction, dropout, seed, name) 1154 return outputs, output_h 1155 1156 1157def cudnn_gru(inputs, 1158 input_h, 1159 params, 1160 is_training, 1161 sequence_lengths=None, 1162 time_major=True, 1163 input_mode=CUDNN_INPUT_LINEAR_MODE, 1164 direction=CUDNN_RNN_UNIDIRECTION, 1165 dropout=0., 1166 seed=0, 1167 name=None): 1168 """Cudnn GRU. 1169 1170 Args: 1171 inputs: the input sequence to the RNN model. If `time_major` is True 1172 (default), the Tensor shape is [max_time, batch_size, input_size]. If 1173 `time_major` is False, the shape is [batch_size, max_time, input_size]. 1174 input_h: the initial hidden state for h. If `time_major` is True 1175 (default), the Tensor shape is [num_layers, batch_size, num_units]. If 1176 `time_major` is False, the shape is [batch_size, num_layers, num_units]. 1177 params: the parameter buffer created for this model. 1178 is_training: whether this operation will be used in training or inference 1179 input_mode: indicate whether there is a linear projection between the 1180 input and the actual computation before the first layer. It could be 1181 'linear_input', 'skip_input' or 'auto_select'. 1182 'linear_input' (default) always applies a linear projection of input 1183 onto RNN hidden state. (standard RNN behavior). 1184 'skip_input' is only allowed when input_size == num_units; 1185 'auto_select' implies 'skip_input' when input_size == num_units; 1186 otherwise, it implies 'linear_input'. 1187 sequence_lengths: an int32 array representing the variable sequence lengths 1188 in a batch. The size of the array has to equal the batch_size. Default to 1189 None, in which case sequences in the batch are assumed to have the same 1190 length, which is inferred from inputs. 1191 time_major: The shape format of the `inputs` and `outputs` Tensors. If true, 1192 these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If 1193 false, these Tensors must be shaped ['batch_size', 'max_time', 'depth']. 1194 By default this function accepts input and emits output in time-major 1195 form. This param is only effective when 'sequence_lengths' is used. 1196 direction: the direction model that the model operates. Could be either 1197 'unidirectional' or 'bidirectional' 1198 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1199 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1200 for behavior. 1201 name: name of the operation. 1202 Returns: 1203 outputs, output_h 1204 """ 1205 return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training, CUDNN_GRU, 1206 sequence_lengths, time_major, input_mode, 1207 direction, dropout, seed, name) 1208 1209 1210def cudnn_rnn_relu(inputs, 1211 input_h, 1212 params, 1213 is_training, 1214 input_mode=CUDNN_INPUT_LINEAR_MODE, 1215 direction=CUDNN_RNN_UNIDIRECTION, 1216 dropout=0., 1217 seed=0, 1218 sequence_lengths=None, 1219 time_major=True, 1220 name=None): 1221 """Cudnn RNN Relu. 1222 1223 Args: 1224 inputs: the input sequence to the RNN model. If `time_major` is True 1225 (default), the Tensor shape is [max_time, batch_size, input_size]. If 1226 `time_major` is False, the shape is [batch_size, max_time, input_size]. 1227 input_h: the initial hidden state for h. If `time_major` is True 1228 (default), the Tensor shape is [num_layers, batch_size, num_units]. If 1229 `time_major` is False, the shape is [batch_size, num_layers, num_units]. 1230 params: the parameter buffer created for this model. 1231 is_training: whether this operation will be used in training or inference 1232 input_mode: indicate whether there is a linear projection between the 1233 input and the actual computation before the first layer. It could be 1234 'linear_input', 'skip_input' or 'auto_select'. 'linear_input' (default) 1235 always applies a linear projection of input onto RNN hidden state. 1236 (standard RNN behavior). 'skip_input' is only allowed when input_size == 1237 num_units; 'auto_select' implies 'skip_input' when input_size == 1238 num_units; otherwise, it implies 'linear_input'. 1239 direction: the direction model that the model operates. Could be either 1240 'unidirectional' or 'bidirectional' 1241 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1242 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1243 for behavior. 1244 sequence_lengths: an int32 array representing the variable sequence lengths 1245 in a batch. The size of the array has to equal the batch_size. If not 1246 provided, the same sequence length will be assumed. 1247 time_major: The shape format of the `inputs` and `outputs` Tensors. If true, 1248 these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If 1249 false, these Tensors must be shaped ['batch_size', 'max_time', 'depth']. 1250 By default this function accepts input and emits output in time-major 1251 form. This param is only effective when 'sequence_lengths' is used. 1252 name: name of the operation. 1253 1254 Returns: 1255 outputs, output_h 1256 """ 1257 return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training, 1258 CUDNN_RNN_RELU, sequence_lengths, time_major, 1259 input_mode, direction, dropout, seed, name) 1260 1261 1262def cudnn_rnn_tanh(inputs, 1263 input_h, 1264 params, 1265 is_training, 1266 sequence_lengths=None, 1267 time_major=True, 1268 input_mode=CUDNN_INPUT_LINEAR_MODE, 1269 direction=CUDNN_RNN_UNIDIRECTION, 1270 dropout=0., 1271 seed=0, 1272 name=None): 1273 """Cudnn RNN Tanh. 1274 1275 Args: 1276 inputs: the input sequence to the RNN model. If `time_major` is True 1277 (default), the Tensor shape is [max_time, batch_size, input_size]. If 1278 `time_major` is False, the shape is [batch_size, max_time, input_size]. 1279 input_h: the initial hidden state for h. If `time_major` is True 1280 (default), the Tensor shape is [num_layers, batch_size, num_units]. If 1281 `time_major` is False, the shape is [batch_size, num_layers, num_units]. 1282 params: the parameter buffer created for this model. 1283 is_training: whether this operation will be used in training or inference 1284 input_mode: indicate whether there is a linear projection between the 1285 input and the actual computation before the first layer. It could be 1286 'linear_input', 'skip_input' or 'auto_select'. 1287 'linear_input' (default) always applies a linear projection of input 1288 onto RNN hidden state. (standard RNN behavior). 1289 'skip_input' is only allowed when input_size == num_units; 1290 'auto_select' implies 'skip_input' when input_size == num_units; 1291 otherwise, it implies 'linear_input'. 1292 sequence_lengths: an int32 array representing the variable sequence lengths 1293 in a batch. The size of the array has to equal the batch_size. Default to 1294 None, in which case sequences in the batch are assumed to have the same 1295 length, which is inferred from inputs. 1296 time_major: The shape format of the `inputs` and `outputs` Tensors. If true, 1297 these Tensors must be shaped ['max_time', 'batch_size', 'depth']. If 1298 false, these Tensors must be shaped ['batch_size', 'max_time', 'depth']. 1299 By default this function accepts input and emits output in time-major 1300 form. This param is only effective when 'sequence_lengths' is used. 1301 direction: the direction model that the model operates. Could be either 1302 'unidirectional' or 'bidirectional' 1303 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1304 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1305 for behavior. 1306 name: name of the operation. 1307 Returns: 1308 outputs, output_h 1309 """ 1310 return _cudnn_rnn_no_input_c(inputs, input_h, params, is_training, 1311 CUDNN_RNN_TANH, sequence_lengths, time_major, 1312 input_mode, direction, dropout, seed, name) 1313 1314 1315def cudnn_rnn_opaque_params_to_canonical(rnn_mode, 1316 num_layers, 1317 num_units, 1318 input_size, 1319 params, 1320 input_mode=CUDNN_INPUT_LINEAR_MODE, 1321 direction=CUDNN_RNN_UNIDIRECTION, 1322 dropout=0, 1323 seed=0, 1324 name=None): 1325 """Convert cudnn opaque params to canonical. 1326 1327 Args: 1328 rnn_mode: a string specifies the mode, under which this RNN model runs. 1329 Could be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'. 1330 num_layers: the number of layers for the RNN model. 1331 num_units: the number of units within the RNN model. 1332 input_size: the size of the input, it could be different from the 1333 num_units. 1334 params: opaque cudnn params var. 1335 input_mode: indicate whether there is a linear projection between the 1336 input and the actual computation before the first layer. It could be 1337 'linear_input', 'skip_input' or 'auto_select'. 1338 'linear_input' (default) always applies a linear projection of input 1339 onto RNN hidden state. (standard RNN behavior). 1340 'skip_input' is only allowed when input_size == num_units; 1341 'auto_select' implies 'skip_input' when input_size == num_units; 1342 otherwise, it implies 'linear_input'. 1343 direction: the direction model that the model operates. Could be either 1344 'unidirectional' or 'bidirectional' 1345 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1346 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1347 for behavior. 1348 name: name of the operation. 1349 Returns: 1350 weights list and bias list 1351 Raises: 1352 ValueError: if rnn_mode or direction is invalid. 1353 """ 1354 1355 _check_rnn_mode(rnn_mode) 1356 check_direction(direction) 1357 check_input_mode(input_mode) 1358 num_params = _get_num_params(rnn_mode, num_layers, direction) 1359 seed, seed2 = random_seed.get_seed(seed) 1360 weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical( 1361 rnn_mode=rnn_mode, 1362 num_layers=num_layers, 1363 num_units=num_units, 1364 input_size=input_size, 1365 params=params, 1366 input_mode=input_mode, 1367 direction=direction, 1368 dropout=dropout, 1369 seed=seed, 1370 seed2=seed2, 1371 num_params=num_params, 1372 name=name) 1373 return weights, biases 1374 1375 1376def cudnn_rnn_canonical_to_opaque_params(rnn_mode, 1377 num_layers, 1378 num_units, 1379 input_size, 1380 weights, 1381 biases, 1382 input_mode=CUDNN_INPUT_LINEAR_MODE, 1383 direction=CUDNN_RNN_UNIDIRECTION, 1384 dropout=0, 1385 seed=0, 1386 name=None): 1387 """Converts params from the canonical format to a specific format of cuDNN. 1388 1389 Args: 1390 rnn_mode: a string specifies the mode, under which this RNN model runs. 1391 Could be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'. 1392 num_layers: the number of layers for the RNN model. 1393 num_units: the number of units within the RNN model. 1394 input_size: the size of the input, it could be different from the 1395 num_units. 1396 weights: a Tensor for weight parameters. 1397 biases: a Tensor for bias parameters. 1398 input_mode: indicate whether there is a linear projection between the 1399 input and the actual computation before the first layer. It could be 1400 'linear_input', 'skip_input' or 'auto_select'. 1401 'linear_input' (default) always applies a linear projection of input 1402 onto RNN hidden state. (standard RNN behavior). 1403 'skip_input' is only allowed when input_size == num_units; 1404 'auto_select' implies 'skip_input' when input_size == num_units; 1405 otherwise, it implies 'linear_input'. 1406 direction: the direction model that the model operates. Could be either 1407 'unidirectional' or 'bidirectional' 1408 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1409 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1410 for behavior. 1411 name: name of the operation. 1412 Returns: 1413 an opaque Cudnn param. 1414 Raises: 1415 ValueError: if rnn_mode or direction is invalid. 1416 """ 1417 _check_rnn_mode(rnn_mode) 1418 check_direction(direction) 1419 check_input_mode(input_mode) 1420 seed, seed2 = random_seed.get_seed(seed) 1421 return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params( 1422 rnn_mode=rnn_mode, 1423 num_layers=num_layers, 1424 num_units=num_units, 1425 input_size=input_size, 1426 weights=weights, 1427 biases=biases, 1428 input_mode=input_mode, 1429 direction=direction, 1430 dropout=dropout, 1431 seed=seed, 1432 seed2=seed2, 1433 name=name) 1434 1435 1436def cudnn_rnn_opaque_params_size(rnn_mode, 1437 num_layers, 1438 num_units, 1439 input_size, 1440 input_mode=CUDNN_INPUT_LINEAR_MODE, 1441 direction=CUDNN_RNN_UNIDIRECTION, 1442 dtype=dtypes.float32, 1443 dropout=0, 1444 seed=0, 1445 name=None): 1446 """Returns opaque params size for specific Cudnn config. 1447 1448 Args: 1449 rnn_mode: a string specifies the mode, under which this RNN model runs. 1450 Could be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'. 1451 num_layers: the number of layers for the RNN model. 1452 num_units: the number of units within the RNN model. 1453 input_size: the size of the input, it could be different from the 1454 num_units. 1455 input_mode: indicate whether there is a linear projection between the 1456 input and the actual computation before the first layer. It could be 1457 'linear_input', 'skip_input' or 'auto_select'. 1458 'linear_input' (default) always applies a linear projection of input 1459 onto RNN hidden state. (standard RNN behavior). 1460 'skip_input' is only allowed when input_size == num_units; 1461 'auto_select' implies 'skip_input' when input_size == num_units; 1462 otherwise, it implies 'linear_input'. 1463 direction: the direction model that the model operates. Could be either 1464 'unidirectional' or 'bidirectional' 1465 dtype: one of tf.float32 or tf.float64. 1466 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1467 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1468 for behavior. 1469 name: name of the operation. 1470 Returns: 1471 a int, size of Cudnn opaque params. 1472 Raises: 1473 ValueError: if rnn_mode or direction is invalid. 1474 """ 1475 _check_rnn_mode(rnn_mode) 1476 check_direction(direction) 1477 check_input_mode(input_mode) 1478 seed, seed2 = random_seed.get_seed(seed) 1479 return gen_cudnn_rnn_ops.cudnn_rnn_params_size( 1480 rnn_mode=rnn_mode, 1481 num_layers=num_layers, 1482 num_units=num_units, 1483 input_size=input_size, 1484 T=dtype, 1485 S=dtypes.int32, 1486 dropout=dropout, 1487 seed=seed, 1488 seed2=seed2, 1489 input_mode=input_mode, 1490 direction=direction, 1491 name=name)[0] 1492 1493 1494class _CudnnRNN(object): 1495 """Creates an RNN model using the underlying Cudnn implementation. 1496 1497 Note that self._NUM_PARAMS_PER_LAYER is the number of parameter sets of 1498 weight and bias per layer. It needs to be defined in subclasses. 1499 """ 1500 __doc__ += _cudnn_rnn_common_doc_string 1501 1502 # TODO(jamesqin): support float16 CuDNN RNN 1503 def __init__(self, 1504 rnn_mode, 1505 num_layers, 1506 num_units, 1507 input_size, 1508 input_mode=CUDNN_INPUT_LINEAR_MODE, 1509 direction=CUDNN_RNN_UNIDIRECTION, 1510 dtype=dtypes.float32, 1511 dropout=0., 1512 seed=0): 1513 """Creates a CudnnRNN model from model spec. 1514 1515 Args: 1516 rnn_mode: a string specifies the mode, under which this RNN model runs. 1517 Could be either 'lstm', 'gru', 'rnn_tanh' or 'rnn_relu'. 1518 num_layers: the number of layers for the RNN model. 1519 num_units: the number of units within the RNN model. 1520 input_size: the size of the input, it could be different from the 1521 num_units. 1522 input_mode: indicate whether there is a linear projection between the 1523 input and the actual computation before the first layer. It could be 1524 'linear_input', 'skip_input' or 'auto_select'. 1525 'linear_input' (default) always applies a linear projection of input 1526 onto RNN hidden state. (standard RNN behavior). 1527 'skip_input' is only allowed when input_size == num_units; 1528 'auto_select' implies 'skip_input' when input_size == num_units; 1529 otherwise, it implies 'linear_input'. 1530 direction: the direction model that the model operates. Could be either 1531 'unidirectional' or 'bidirectional' 1532 dtype: dtype of params, tf.float32 or tf.float64. 1533 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1534 seed: the op seed used for initializing dropout. See `tf.set_random_seed` 1535 for behavior. 1536 Raises: 1537 ValueError: if direction is invalid. 1538 """ 1539 self._num_layers = num_layers 1540 self._num_units = num_units 1541 self._input_size = input_size 1542 self._rnn_mode = rnn_mode 1543 self._input_mode = input_mode 1544 self._direction = direction 1545 self._dtype = dtype 1546 self._dropout = dropout 1547 self._seed = seed 1548 1549 @property 1550 def input_mode(self): 1551 return self._input_mode 1552 1553 @property 1554 def input_size(self): 1555 return self._input_size 1556 1557 @property 1558 def num_units(self): 1559 return self._num_units 1560 1561 @property 1562 def num_layers(self): 1563 return self._num_layers 1564 1565 @property 1566 def rnn_mode(self): 1567 return self._rnn_mode 1568 1569 @property 1570 def direction(self): 1571 return self._direction 1572 1573 def params_size(self): 1574 """Calculates the size of the opaque parameter buffer needed for this model. 1575 1576 Returns: 1577 The calculated parameter buffer size. 1578 """ 1579 return cudnn_rnn_opaque_params_size( 1580 rnn_mode=self._rnn_mode, 1581 num_layers=self._num_layers, 1582 num_units=self._num_units, 1583 input_size=self._input_size, 1584 dtype=self._dtype, 1585 dropout=self._dropout, 1586 seed=self._seed, 1587 input_mode=self._input_mode, 1588 direction=self._direction) 1589 1590 def __call__(self, 1591 input_data, 1592 input_h, 1593 input_c, 1594 params, 1595 is_training=True, 1596 sequence_lengths=None, 1597 time_major=True): 1598 """Runs the forward step for the RNN model. 1599 1600 Args: 1601 input_data: the input sequence to the RNN model. If `time_major` is True 1602 (default), the Tensor shape is [max_time, batch_size, input_size]. If 1603 `time_major` is False, the shape is [batch_size, max_time, input_size]. 1604 input_h: the initial hidden state for h. If `time_major` is True 1605 (default), the Tensor shape is [num_layers, batch_size, num_units]. If 1606 `time_major` is False, the shape is [batch_size, num_layers, num_units]. 1607 input_c: the initial hidden state for c. This is only relevant for LSTM. A 1608 Tensor of the same shape as input_h. 1609 params: the parameter buffer created for this model. 1610 is_training: whether this operation will be used in training or inference. 1611 sequence_lengths: an int32 array representing the variable sequence 1612 lengths in a batch. The size of the array has to equal the batch_size. 1613 Default to None, in which case sequences in the batch are assumed to 1614 have the same length, which is inferred from inputs. 1615 time_major: The shape format of the `inputs` and `outputs` Tensors. If 1616 true, these Tensors must be shaped ['max_time', 'batch_size', 'depth']. 1617 If false, these Tensors must be shaped ['batch_size', 'max_time', 1618 'depth']. By default this function accepts input and emits output in 1619 time-major form. This param is only effective when 'sequence_lengths' is 1620 used. 1621 1622 Returns: 1623 output: the output sequence. 1624 output_h: the final state for h. 1625 output_c: the final state for c. This is only relevant for LSTM. 1626 """ 1627 return _cudnn_rnn( 1628 input_data, 1629 input_h, 1630 input_c, 1631 params, 1632 is_training, 1633 self._rnn_mode, 1634 sequence_lengths=sequence_lengths, 1635 time_major=time_major, 1636 input_mode=self._input_mode, 1637 direction=self._direction, 1638 dropout=self._dropout, 1639 seed=self._seed) 1640 1641 def params_to_canonical(self, params): 1642 """Converts params from a specific format of cuDNN to the canonical format. 1643 1644 Args: 1645 params: a Variable for weight and bias parameters. 1646 1647 Returns: 1648 A function for the specific-to-canonical conversion. 1649 """ 1650 return cudnn_rnn_opaque_params_to_canonical( 1651 rnn_mode=self._rnn_mode, 1652 num_layers=self._num_layers, 1653 num_units=self._num_units, 1654 input_size=self._input_size, 1655 params=params, 1656 input_mode=self._input_mode, 1657 direction=self._direction, 1658 dropout=self._dropout, 1659 seed=self._seed) 1660 1661 def canonical_to_params(self, weights, biases): 1662 """Converts params from the canonical format to a specific format of cuDNN. 1663 1664 Args: 1665 weights: a Tensor for weight parameters. 1666 biases: a Tensor for bias parameters. 1667 1668 Returns: 1669 A function for the canonical-to-params-to-specific conversion.. 1670 """ 1671 return cudnn_rnn_canonical_to_opaque_params( 1672 rnn_mode=self._rnn_mode, 1673 num_layers=self._num_layers, 1674 num_units=self._num_units, 1675 input_size=self._input_size, 1676 weights=weights, 1677 biases=biases, 1678 input_mode=self._input_mode, 1679 direction=self._direction, 1680 dropout=self._dropout, 1681 seed=self._seed) 1682 1683 1684class CudnnLSTM(_CudnnRNN): 1685 """Cudnn implementation of the LSTM model.""" 1686 __doc__ += _cudnn_rnn_common_doc_string 1687 # 4 sets of weight and bias parameters for the recurrent input, and 4 for the 1688 # previous layer input. 1689 _NUM_PARAMS_PER_LAYER = CUDNN_LSTM_PARAMS_PER_LAYER 1690 1691 def __init__(self, 1692 num_layers, 1693 num_units, 1694 input_size, 1695 input_mode=CUDNN_INPUT_LINEAR_MODE, 1696 direction=CUDNN_RNN_UNIDIRECTION, 1697 dtype=dtypes.float32, 1698 dropout=0., 1699 seed=0): 1700 """Creates a Cudnn LSTM model from model spec. 1701 1702 Args: 1703 num_layers: the number of layers for the RNN model. 1704 num_units: the number of units within the RNN model. 1705 input_size: the size of the input, it could be different from the 1706 num_units. 1707 input_mode: indicate whether there is a linear projection between the 1708 input and The actual computation before the first layer. It could be 1709 'skip_input', 'linear_input' or 'auto_select'. 1710 'skip_input' is only allowed when input_size == num_units; 1711 'auto_select' implies 'skip_input' when input_size == num_units; 1712 otherwise, it implies 'linear_input'. 1713 direction: the direction model that the model operates. Could be either 1714 'unidirectional' or 'bidirectional' 1715 dtype: dtype of params, tf.float32 or tf.float64. 1716 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1717 seed: the seed used for initializing dropout. 1718 """ 1719 super(CudnnLSTM, self).__init__( 1720 CUDNN_LSTM, 1721 num_layers, 1722 num_units, 1723 input_size, 1724 input_mode=input_mode, 1725 direction=direction, 1726 dtype=dtype, 1727 dropout=dropout, 1728 seed=seed) 1729 1730 def __call__(self, 1731 input_data, 1732 input_h, 1733 input_c, 1734 params, 1735 sequence_lengths=None, 1736 time_major=True, 1737 is_training=True): 1738 """Runs the forward step for the Cudnn LSTM model. 1739 1740 Args: 1741 input_data: the input sequence to the RNN model. If `time_major` is True 1742 (default), the Tensor shape is [max_time, batch_size, input_size]. If 1743 `time_major` is False, the shape is [batch_size, max_time, input_size]. 1744 input_h: the initial hidden state for h. If `time_major` is True 1745 (default), the Tensor shape is [num_layers, batch_size, num_units]. If 1746 `time_major` is False, the shape is [batch_size, num_layers, num_units]. 1747 input_c: the initial hidden state for c. A Tensor of the same shape as 1748 input_h. 1749 params: the parameter buffer created for this model. 1750 sequence_lengths: an int32 array representing the variable sequence 1751 lengths in a batch. The size of the array has to equal the batch_size. 1752 Default to None, in which case sequences in the batch are assumed to 1753 have the same length, which is inferred from inputs. 1754 time_major: The shape format of the `inputs` and `outputs` Tensors. If 1755 true, these Tensors must be shaped ['max_time', 'batch_size', 'depth']. 1756 If false, these Tensors must be shaped ['batch_size', 'max_time', 1757 'depth']. By default this function accepts input and emits output in 1758 time-major form. This param is only effective when 'sequence_lengths' 1759 is used. 1760 is_training: whether this operation will be used in training or inference. 1761 Returns: 1762 output: the output sequence. 1763 output_h: the final state for h. 1764 output_c: the final state for c. 1765 """ 1766 output, output_h, output_c = super(CudnnLSTM, self).__call__( 1767 input_data, 1768 input_h, 1769 input_c, 1770 params, 1771 sequence_lengths=sequence_lengths, 1772 time_major=time_major, 1773 is_training=is_training) 1774 return (output, output_h, output_c) 1775 1776 1777class _CudnnRNNNoInputC(_CudnnRNN): 1778 """Simple CudnnRNN models without input_c.""" 1779 __doc__ += _cudnn_rnn_common_doc_string 1780 1781 def __init__(self, 1782 num_layers, 1783 num_units, 1784 input_size, 1785 input_mode=CUDNN_INPUT_LINEAR_MODE, 1786 direction=CUDNN_RNN_UNIDIRECTION, 1787 dtype=dtypes.float32, 1788 dropout=0., 1789 seed=0): 1790 """Creates a Cudnn RNN model from model without hidden-state C. 1791 1792 Args: 1793 num_layers: the number of layers for the RNN model. 1794 num_units: the number of units within the RNN model. 1795 input_size: the size of the input, it could be different from the 1796 num_units. 1797 input_mode: indicate whether there is a linear projection between the 1798 input and The actual computation before the first layer. It could be 1799 'skip_input', 'linear_input' or 'auto_select'. 1800 'skip_input' is only allowed when input_size == num_units; 1801 'auto_select' implies 'skip_input' when input_size == num_units; 1802 otherwise, it implies 'linear_input'. 1803 direction: the direction model that the model operates. Could be either 1804 'unidirectional' or 'bidirectional' 1805 dtype: dtype of params, tf.float32 or tf.float64. 1806 dropout: whether to enable dropout. With it is 0, dropout is disabled. 1807 seed: the seed used for initializing dropout. 1808 1809 Raises: 1810 ValueError: if direction is not 'unidirectional' or 'bidirectional'. 1811 """ 1812 1813 if direction not in (CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION): 1814 raise ValueError("Invalid direction: %s" % direction) 1815 1816 super(_CudnnRNNNoInputC, self).__init__( 1817 self._rnn_mode, 1818 num_layers, 1819 num_units, 1820 input_size, 1821 input_mode=input_mode, 1822 direction=direction, 1823 dtype=dtype, 1824 dropout=dropout, 1825 seed=seed) 1826 1827 def __call__(self, 1828 input_data, 1829 input_h, 1830 params, 1831 sequence_lengths=None, 1832 time_major=True, 1833 is_training=True): 1834 """Runs the forward step for the Cudnn LSTM model. 1835 1836 Args: 1837 input_data: the input sequence to the RNN model. If `time_major` is True 1838 (default), the Tensor shape is [max_time, batch_size, input_size]. If 1839 `time_major` is False, the shape is [batch_size, max_time, input_size]. 1840 input_h: the initial hidden state for h. If `time_major` is True 1841 (default), the Tensor shape is [num_layers, batch_size, num_units]. If 1842 `time_major` is False, the shape is [batch_size, num_layers, num_units]. 1843 params: the parameter buffer created for this model. 1844 sequence_lengths: an int32 array representing the variable sequence 1845 lengths in a batch. The size of the array has to equal the batch_size. 1846 Default to None, in which case sequences in the batch are assumed to 1847 have the same length, which is inferred from inputs. 1848 time_major: The shape format of the `inputs` and `outputs` Tensors. If 1849 true, these Tensors must be shaped ['max_time', 'batch_size', 'depth']. 1850 If false, these Tensors must be shaped ['batch_size', 'max_time', 1851 'depth']. By default this function accepts input and emits output in 1852 time-major form. This param is only effective when 'sequence_lengths' 1853 is used. 1854 is_training: whether this operation will be used in training or inference. 1855 Returns: 1856 output: the output sequence. 1857 output_h: the final state for h. 1858 """ 1859 return _cudnn_rnn_no_input_c( 1860 input_data, 1861 input_h, 1862 params, 1863 is_training, 1864 self._rnn_mode, 1865 sequence_lengths=sequence_lengths, 1866 time_major=time_major, 1867 input_mode=self._input_mode, 1868 direction=self._direction, 1869 dropout=self._dropout, 1870 seed=self._seed) 1871 1872 1873class CudnnGRU(_CudnnRNNNoInputC): 1874 """Cudnn implementation of the GRU model.""" 1875 __doc__ += _cudnn_rnn_common_doc_string 1876 _rnn_mode = CUDNN_GRU 1877 # 3 sets of weight and bias parameters for the recurrent input, and 3 for the 1878 # previous layer input. 1879 _NUM_PARAMS_PER_LAYER = CUDNN_GRU_PARAMS_PER_LAYER 1880 1881 1882class CudnnRNNTanh(_CudnnRNNNoInputC): 1883 """Cudnn implementation of the RNN-tanh model.""" 1884 __doc__ += _cudnn_rnn_common_doc_string 1885 _rnn_mode = CUDNN_RNN_TANH 1886 # 1 set of weight and bias parameters for the recurrent input, and 1 for the 1887 # previous layer input. 1888 _NUM_PARAMS_PER_LAYER = CUDNN_RNN_TANH_PARAMS_PER_LAYER 1889 1890 1891class CudnnRNNRelu(_CudnnRNNNoInputC): 1892 """Cudnn implementation of the RNN-relu model.""" 1893 __doc__ += _cudnn_rnn_common_doc_string 1894 _rnn_mode = CUDNN_RNN_RELU 1895 # 1 set of weight and bias parameters for the recurrent input, and 1 for the 1896 # previous layer input. 1897 _NUM_PARAMS_PER_LAYER = CUDNN_RNN_RELU_PARAMS_PER_LAYER 1898