# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""lstm"""
import math
import numpy as np
import mindspore.context as context
import mindspore.common.dtype as mstype
from mindspore.ops.primitive import constexpr
from mindspore._checkparam import Validator as validator
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.common.tensor import Tensor
from mindspore.nn.cell import Cell
from mindspore import nn
from mindspore.ops import operations as P
from mindspore.ops import functional as F


__all__ = ['LSTM', 'LSTMCell']


@constexpr
def _create_sequence_length(shape):
    num_step, batch_size, _ = shape
    sequence_length = Tensor(np.ones(batch_size, np.int32) * num_step, mstype.int32)
    return sequence_length


@constexpr
def _check_input_dtype(input_dtype, param_name, allow_dtypes, cls_name):
    validator.check_type_name(param_name, input_dtype, allow_dtypes, cls_name)


@constexpr
def _check_input_3d(input_shape, param_name, func_name):
    if len(input_shape) != 3:
        raise ValueError(f"For '{func_name}', the '{param_name}' should be 3d, but got the length of input_shape:"
                         f" {len(input_shape)}.")


class LSTM(Cell):
    r"""
    Stacked LSTM (Long Short-Term Memory) layers.

    Applies the LSTM layer to the input.

    There are two pipelines connecting two consecutive cells in an LSTM model: one is the cell state pipeline
    and the other is the hidden state pipeline. Denote two consecutive time nodes as :math:`t-1` and :math:`t`.
    Given an input :math:`x_t` at time :math:`t`, a hidden state :math:`h_{t-1}` and a cell
    state :math:`c_{t-1}` of the layer at time :math:`{t-1}`, the cell state and hidden state at
    time :math:`t` are computed using a gating mechanism. Input gate :math:`i_t` is designed to protect the cell
    from perturbation by irrelevant inputs. Forget gate :math:`f_t` affords protection of the cell by forgetting
    some information in the past, which is stored in :math:`h_{t-1}`. Output gate :math:`o_t` protects other
    units from perturbation by currently irrelevant memory contents. Candidate cell state :math:`\tilde{c}_t` is
    calculated with the current input, on which the input gate will be applied. Finally, current cell state
    :math:`c_{t}` and hidden state :math:`h_{t}` are computed with the calculated gates and cell states. The complete
    formulation is as follows.

    .. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ix} x_t + b_{ix} + W_{ih} h_{(t-1)} + b_{ih}) \\
            f_t = \sigma(W_{fx} x_t + b_{fx} + W_{fh} h_{(t-1)} + b_{fh}) \\
            \tilde{c}_t = \tanh(W_{cx} x_t + b_{cx} + W_{ch} h_{(t-1)} + b_{ch}) \\
            o_t = \sigma(W_{ox} x_t + b_{ox} + W_{oh} h_{(t-1)} + b_{oh}) \\
            c_t = f_t * c_{(t-1)} + i_t * \tilde{c}_t \\
            h_t = o_t * \tanh(c_t) \\
        \end{array}

    Here :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. :math:`W, b`
    are learnable weights between the output and the input in the formula. For instance,
    :math:`W_{ix}, b_{ix}` are the weight and bias used to transform from input :math:`x` to :math:`i`.
    Details can be found in the paper `LONG SHORT-TERM MEMORY
    <https://www.bioinf.jku.at/publications/older/2604.pdf>`_ and
    `Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling
    <https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/43905.pdf>`_.

    Args:
        input_size (int): Number of features of the input.
        hidden_size (int): Number of features of the hidden layer.
        num_layers (int): Number of layers of stacked LSTM. Default: 1.
        has_bias (bool): Whether the cell has bias `b_ih` and `b_hh`. Default: True.
        batch_first (bool): Specifies whether the first dimension of input `x` is batch_size. Default: False.
        dropout (float, int): If not 0, appends a `Dropout` layer on the outputs of each
            LSTM layer except the last layer. Default: 0. The range of dropout is [0.0, 1.0].
        bidirectional (bool): Specifies whether it is a bidirectional LSTM. Default: False.

    Inputs:
        - **x** (Tensor) - Tensor of shape (seq_len, batch_size, `input_size`) or
          (batch_size, seq_len, `input_size`).
        - **hx** (tuple) - A tuple of two Tensors (h_0, c_0) both of data type mindspore.float32 or
          mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
          The data type of `hx` must be the same as that of `x`.

    Outputs:
        Tuple, a tuple containing (`output`, (`h_n`, `c_n`)).

        - **output** (Tensor) - Tensor of shape (seq_len, batch_size, num_directions * `hidden_size`).
        - **hx_n** (tuple) - A tuple of two Tensors (h_n, c_n) both of shape
          (num_directions * `num_layers`, batch_size, `hidden_size`).

    Raises:
        TypeError: If `input_size`, `hidden_size` or `num_layers` is not an int.
        TypeError: If `has_bias`, `batch_first` or `bidirectional` is not a bool.
        TypeError: If `dropout` is neither a float nor an int.
        ValueError: If `dropout` is not in range [0.0, 1.0].

    Supported Platforms:
        ``Ascend`` ``GPU``

    Examples:
        >>> net = nn.LSTM(10, 16, 2, has_bias=True, batch_first=True, bidirectional=False)
        >>> x = Tensor(np.ones([3, 5, 10]).astype(np.float32))
        >>> h0 = Tensor(np.ones([1 * 2, 3, 16]).astype(np.float32))
        >>> c0 = Tensor(np.ones([1 * 2, 3, 16]).astype(np.float32))
        >>> output, (hn, cn) = net(x, (h0, c0))
        >>> print(output.shape)
        (3, 5, 16)
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 has_bias=True,
                 batch_first=False,
                 dropout=0,
                 bidirectional=False):
        """Initialize LSTM."""
        super(LSTM, self).__init__()
        validator.check_value_type("batch_first", batch_first, [bool], self.cls_name)
        validator.check_positive_int(hidden_size, "hidden_size", self.cls_name)
        validator.check_positive_int(num_layers, "num_layers", self.cls_name)
        self.is_ascend = context.get_context("device_target") == "Ascend"

        self.batch_first = batch_first
        self.transpose = P.Transpose()
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout
        self.lstm = P.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           has_bias=has_bias,
                           bidirectional=bidirectional,
                           dropout=float(dropout))

        weight_size = 0
        gate_size = 4 * hidden_size
        stdv = 1 / math.sqrt(hidden_size)
        num_directions = 2 if bidirectional else 1
        if self.is_ascend:
            self.reverse_seq = P.ReverseSequence(batch_dim=1, seq_dim=0)
            self.concat = P.Concat(axis=0)
            self.concat_2dim = P.Concat(axis=2)
            self.cast = P.Cast()
            self.shape = P.Shape()
            if dropout < 0 or dropout > 1:
                raise ValueError(f"For '{self.cls_name}', the 'dropout' must be a number in range [0, 1], "
                                 f"but got {dropout}.")
            if dropout == 1:
                self.dropout_op = P.ZerosLike()
            else:
                self.dropout_op = nn.Dropout(float(1 - dropout))
            b0 = np.zeros(gate_size, dtype=np.float16)
            self.w_list = []
            self.b_list = []
            self.rnns_fw = P.DynamicRNN(forget_bias=0.0)
            self.rnns_bw = P.DynamicRNN(forget_bias=0.0)

            for layer in range(num_layers):
                w_shape = input_size if layer == 0 else (num_directions * hidden_size)
                w_np = np.random.uniform(-stdv, stdv, (w_shape + hidden_size, gate_size)).astype(np.float16)
                self.w_list.append(Parameter(
                    initializer(Tensor(w_np), [w_shape + hidden_size, gate_size]), name='weight_fw' + str(layer)))
                if has_bias:
                    b_np = np.random.uniform(-stdv, stdv, gate_size).astype(np.float16)
                    self.b_list.append(Parameter(initializer(Tensor(b_np), [gate_size]), name='bias_fw' + str(layer)))
                else:
                    self.b_list.append(Parameter(initializer(Tensor(b0), [gate_size]), name='bias_fw' + str(layer)))
                if bidirectional:
                    w_bw_np = np.random.uniform(-stdv, stdv, (w_shape + hidden_size, gate_size)).astype(np.float16)
                    self.w_list.append(Parameter(initializer(Tensor(w_bw_np), [w_shape + hidden_size, gate_size]),
                                                 name='weight_bw' + str(layer)))
                    b_bw_np = np.random.uniform(-stdv, stdv, (4 * hidden_size)).astype(np.float16) if has_bias else b0
                    self.b_list.append(Parameter(initializer(Tensor(b_bw_np), [gate_size]),
                                                 name='bias_bw' + str(layer)))
            self.w_list = ParameterTuple(self.w_list)
            self.b_list = ParameterTuple(self.b_list)
        else:
            for layer in range(num_layers):
                input_layer_size = input_size if layer == 0 else hidden_size * num_directions
                increment_size = gate_size * input_layer_size
                increment_size += gate_size * hidden_size
                if has_bias:
                    increment_size += 2 * gate_size
                weight_size += increment_size * num_directions
            w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32)
            self.weight = Parameter(initializer(Tensor(w_np), [weight_size, 1, 1]), name='weight')

    def _stacked_bi_dynamic_rnn(self, x, init_h, init_c, weight, bias):
        """stacked bidirectional dynamic_rnn"""
        x_shape = self.shape(x)
        sequence_length = _create_sequence_length(x_shape)
        pre_layer = x
        hn = ()
        cn = ()
        output = x
        for i in range(self.num_layers):
            offset = i * 2
            weight_fw, weight_bw = weight[offset], weight[offset + 1]
            bias_fw, bias_bw = bias[offset], bias[offset + 1]
            init_h_fw, init_h_bw = init_h[offset:offset + 1, :, :], init_h[offset + 1:offset + 2, :, :]
            init_c_fw, init_c_bw = init_c[offset:offset + 1, :, :], init_c[offset + 1:offset + 2, :, :]
            bw_x = self.reverse_seq(pre_layer, sequence_length)
            y, h, c, _, _, _, _, _ = self.rnns_fw(pre_layer, weight_fw, bias_fw, None, init_h_fw, init_c_fw)
            y_bw, h_bw, c_bw, _, _, _, _, _ = self.rnns_bw(bw_x, weight_bw, bias_bw, None, init_h_bw, init_c_bw)
            y_bw = self.reverse_seq(y_bw, sequence_length)
            output = self.concat_2dim((y, y_bw))
            pre_layer = self.dropout_op(output) if self.dropout else output
            hn += (h[-1:, :, :],)
            hn += (h_bw[-1:, :, :],)
            cn += (c[-1:, :, :],)
            cn += (c_bw[-1:, :, :],)
        status_h = self.concat(hn)
        status_c = self.concat(cn)
        return output, status_h, status_c

    def _stacked_dynamic_rnn(self, x, init_h, init_c, weight, bias):
        """stacked multi-layer dynamic_rnn"""
        pre_layer = x
        hn = ()
        cn = ()
        y = 0
        for i in range(self.num_layers):
            weight_fw, bias_fw = weight[i], bias[i]
            init_h_fw, init_c_fw = init_h[i:i + 1, :, :], init_c[i:i + 1, :, :]
            y, h, c, _, _, _, _, _ = self.rnns_fw(pre_layer, weight_fw, bias_fw, None, init_h_fw, init_c_fw)
            pre_layer = self.dropout_op(y) if self.dropout else y
            hn += (h[-1:, :, :],)
            cn += (c[-1:, :, :],)
        status_h = self.concat(hn)
        status_c = self.concat(cn)
        return y, status_h, status_c

    def construct(self, x, hx):
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        h, c = hx
        if self.is_ascend:
            x_dtype = F.dtype(x)
            h_dtype = F.dtype(h)
            c_dtype = F.dtype(c)
            _check_input_3d(F.shape(h), "h of hx", self.cls_name)
            _check_input_3d(F.shape(c), "c of hx", self.cls_name)
            _check_input_dtype(x_dtype, "x", [mstype.float32, mstype.float16], self.cls_name)
            _check_input_dtype(h_dtype, "h", [mstype.float32, mstype.float16], self.cls_name)
            _check_input_dtype(c_dtype, "c", [mstype.float32, mstype.float16], self.cls_name)
            x = self.cast(x, mstype.float16)
            h = self.cast(h, mstype.float16)
            c = self.cast(c, mstype.float16)
            if self.bidirectional:
                x, h, c = self._stacked_bi_dynamic_rnn(x, h, c, self.w_list, self.b_list)
            else:
                x, h, c = self._stacked_dynamic_rnn(x, h, c, self.w_list, self.b_list)
            x = self.cast(x, x_dtype)
            h = self.cast(h, h_dtype)
            c = self.cast(c, c_dtype)
        else:
            x, h, c, _, _ = self.lstm(x, h, c, self.weight)
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        return x, (h, c)


class LSTMCell(Cell):
    r"""
    LSTM (Long Short-Term Memory) layer.

    Applies the LSTM layer to the input.

    There are two pipelines connecting two consecutive cells in an LSTM model: one is the cell state pipeline
    and the other is the hidden state pipeline. Denote two consecutive time nodes as :math:`t-1` and :math:`t`.
    Given an input :math:`x_t` at time :math:`t`, a hidden state :math:`h_{t-1}` and a cell
    state :math:`c_{t-1}` of the layer at time :math:`{t-1}`, the cell state and hidden state at
    time :math:`t` are computed using a gating mechanism. Input gate :math:`i_t` is designed to protect the cell
    from perturbation by irrelevant inputs. Forget gate :math:`f_t` affords protection of the cell by forgetting
    some information in the past, which is stored in :math:`h_{t-1}`. Output gate :math:`o_t` protects other
    units from perturbation by currently irrelevant memory contents. Candidate cell state :math:`\tilde{c}_t` is
    calculated with the current input, on which the input gate will be applied. Finally, current cell state
    :math:`c_{t}` and hidden state :math:`h_{t}` are computed with the calculated gates and cell states. The complete
    formulation is as follows.

    .. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ix} x_t + b_{ix} + W_{ih} h_{(t-1)} + b_{ih}) \\
            f_t = \sigma(W_{fx} x_t + b_{fx} + W_{fh} h_{(t-1)} + b_{fh}) \\
            \tilde{c}_t = \tanh(W_{cx} x_t + b_{cx} + W_{ch} h_{(t-1)} + b_{ch}) \\
            o_t = \sigma(W_{ox} x_t + b_{ox} + W_{oh} h_{(t-1)} + b_{oh}) \\
            c_t = f_t * c_{(t-1)} + i_t * \tilde{c}_t \\
            h_t = o_t * \tanh(c_t) \\
        \end{array}

    Here :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. :math:`W, b`
    are learnable weights between the output and the input in the formula. For instance,
    :math:`W_{ix}, b_{ix}` are the weight and bias used to transform from input :math:`x` to :math:`i`.
    Details can be found in the paper `LONG SHORT-TERM MEMORY
    <https://www.bioinf.jku.at/publications/older/2604.pdf>`_ and
    `Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling
    <https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/43905.pdf>`_.

    Note:
        LSTMCell is a single-layer RNN; you can achieve a multi-layer RNN by stacking LSTMCell layers.

    Args:
        input_size (int): Number of features of the input.
        hidden_size (int): Number of features of the hidden layer.
        has_bias (bool): Whether the cell has bias `b_ih` and `b_hh`. Default: True.
        batch_first (bool): Specifies whether the first dimension of input `x` is batch_size. Default: False.
        dropout (float, int): If not 0, appends a `Dropout` layer on the outputs of each
            LSTM layer except the last layer. Default: 0. The range of dropout is [0.0, 1.0].
        bidirectional (bool): Specifies whether this is a bidirectional LSTM. If set True,
            the number of directions will be 2; otherwise the number of directions is 1. Default: False.

    Inputs:
        - **x** (Tensor) - Tensor of shape (seq_len, batch_size, `input_size`).
        - **h** (Tensor) - Tensor of data type mindspore.float32 or
          mindspore.float16 and shape (num_directions, batch_size, `hidden_size`).
        - **c** (Tensor) - Tensor of data type mindspore.float32 or
          mindspore.float16 and shape (num_directions, batch_size, `hidden_size`).
          The data type of `h` and `c` must be the same as that of `x`.
        - **w** (Tensor) - Tensor of data type mindspore.float32 or
          mindspore.float16 and shape (`weight_size`, 1, 1).
          The value of `weight_size` depends on `input_size`, `hidden_size` and `bidirectional`.

    Outputs:
        `output`, `h_n`, `c_n`, `reserve`, `state`.

        - **output** (Tensor) - Tensor of shape (seq_len, batch_size, num_directions * `hidden_size`).
        - **h_n** (Tensor) - Tensor of shape (num_directions, batch_size, `hidden_size`).
        - **c_n** (Tensor) - Tensor of shape (num_directions, batch_size, `hidden_size`).
        - **reserve** - Reserved.
        - **state** - Reserved.

    Raises:
        TypeError: If `input_size` or `hidden_size` is not an int.
        TypeError: If `has_bias` or `batch_first` or `bidirectional` is not a bool.
        TypeError: If `dropout` is neither a float nor an int.
        ValueError: If `dropout` is not in range [0.0, 1.0].

    Supported Platforms:
        ``GPU`` ``CPU``

    Examples:
        >>> net = nn.LSTMCell(10, 12, has_bias=True, batch_first=True, bidirectional=False)
        >>> x = Tensor(np.ones([3, 5, 10]).astype(np.float32))
        >>> h = Tensor(np.ones([1, 3, 12]).astype(np.float32))
        >>> c = Tensor(np.ones([1, 3, 12]).astype(np.float32))
        >>> w = Tensor(np.ones([1152, 1, 1]).astype(np.float32))
        >>> output, h, c, _, _ = net(x, h, c, w)
        >>> print(output.shape)
        (3, 5, 12)
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 has_bias=True,
                 batch_first=False,
                 dropout=0,
                 bidirectional=False):
        """Initialize LSTMCell."""
        super(LSTMCell, self).__init__()
        self.batch_first = validator.check_value_type("batch_first", batch_first, [bool], self.cls_name)
        self.transpose = P.Transpose()
        self.lstm = P.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=1,
                           has_bias=has_bias,
                           bidirectional=bidirectional,
                           dropout=float(dropout))

    def construct(self, x, h, c, w):
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        x, h, c, _, _ = self.lstm(x, h, c, w)
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        return x, h, c, _, _
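

# A minimal usage sketch mirroring the LSTM docstring example above. It assumes
# MindSpore is installed and the current device target supports nn.LSTM
# (``Ascend`` or ``GPU``); the demo_* names are illustrative only.
if __name__ == "__main__":
    demo_net = LSTM(10, 16, 2, has_bias=True, batch_first=True, bidirectional=False)
    demo_x = Tensor(np.ones([3, 5, 10]).astype(np.float32))
    # h0/c0 shape: (num_directions * num_layers, batch_size, hidden_size) = (2, 3, 16)
    demo_h0 = Tensor(np.ones([1 * 2, 3, 16]).astype(np.float32))
    demo_c0 = Tensor(np.ones([1 * 2, 3, 16]).astype(np.float32))
    demo_output, (demo_hn, demo_cn) = demo_net(demo_x, (demo_h0, demo_c0))
    print(demo_output.shape)  # expected: (3, 5, 16)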