# Copyright 2020-2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""lstm"""
import math
import numpy as np
import mindspore.context as context
import mindspore.common.dtype as mstype
from mindspore.ops.primitive import constexpr
from mindspore._checkparam import Validator as validator
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.common.tensor import Tensor
from mindspore.nn.cell import Cell
from mindspore import nn
from mindspore.ops import operations as P
from mindspore.ops import functional as F


__all__ = ['LSTM', 'LSTMCell']

@constexpr
def _create_sequence_length(shape):
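    # Every batch element is given the full num_step length; the resulting constant tensor
    # feeds ReverseSequence in the bidirectional Ascend path below.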
    num_step, batch_size, _ = shape
    sequence_length = Tensor(np.ones(batch_size, np.int32) * num_step, mstype.int32)
    return sequence_length


@constexpr
def _check_input_dtype(input_dtype, param_name, allow_dtypes, cls_name):
    validator.check_type_name(param_name, input_dtype, allow_dtypes, cls_name)


@constexpr
def _check_input_3d(input_shape, param_name, func_name):
    if len(input_shape) != 3:
        raise ValueError(f"For '{func_name}', the '{param_name}' should be 3d, but got the length of input_shape:"
                         f" {len(input_shape)}.")


class LSTM(Cell):
    r"""
    Stacked LSTM (Long Short-Term Memory) layers.

    Apply the LSTM layer to the input.

    There are two pipelines connecting two consecutive cells in an LSTM model: one is the cell state pipeline
    and the other is the hidden state pipeline. Denote two consecutive time steps as :math:`t-1` and :math:`t`.
    Given an input :math:`x_t` at time :math:`t`, a hidden state :math:`h_{t-1}` and a cell
    state :math:`c_{t-1}` of the layer at time :math:`{t-1}`, the cell state and hidden state at
    time :math:`t` are computed using a gating mechanism. The input gate :math:`i_t` is designed to protect the
    cell from perturbation by irrelevant inputs. The forget gate :math:`f_t` affords protection of the cell by
    forgetting some information in the past, which is stored in :math:`h_{t-1}`. The output gate :math:`o_t`
    protects other units from perturbation by currently irrelevant memory contents. The candidate cell state
    :math:`\tilde{c}_t` is calculated from the current input, and the input gate is then applied to it. Finally,
    the current cell state :math:`c_{t}` and hidden state :math:`h_{t}` are computed from the calculated gates
    and cell states. The complete formulation is as follows.

    .. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ix} x_t + b_{ix} + W_{ih} h_{(t-1)} + b_{ih}) \\
            f_t = \sigma(W_{fx} x_t + b_{fx} + W_{fh} h_{(t-1)} + b_{fh}) \\
            \tilde{c}_t = \tanh(W_{cx} x_t + b_{cx} + W_{ch} h_{(t-1)} + b_{ch}) \\
            o_t = \sigma(W_{ox} x_t + b_{ox} + W_{oh} h_{(t-1)} + b_{oh}) \\
            c_t = f_t * c_{(t-1)} + i_t * \tilde{c}_t \\
            h_t = o_t * \tanh(c_t) \\
        \end{array}

    Here :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. :math:`W, b`
    are learnable weights between the output and the input in the formula. For instance,
    :math:`W_{ix}, b_{ix}` are the weight and bias used to transform the input :math:`x` into the gate :math:`i`.
    Details can be found in the papers `LONG SHORT-TERM MEMORY
    <https://www.bioinf.jku.at/publications/older/2604.pdf>`_ and
    `Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling
    <https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/43905.pdf>`_.

    Args:
        input_size (int): Number of features of the input.
        hidden_size (int): Number of features of the hidden layer.
        num_layers (int): Number of stacked LSTM layers. Default: 1.
        has_bias (bool): Whether the cell has bias `b_ih` and `b_hh`. Default: True.
        batch_first (bool): Specifies whether the first dimension of input `x` is batch_size. Default: False.
        dropout (float, int): If not 0, appends a `Dropout` layer on the outputs of each
            LSTM layer except the last layer. Default: 0. The range of dropout is [0.0, 1.0].
        bidirectional (bool): Specifies whether it is a bidirectional LSTM. Default: False.

    Inputs:
        - **x** (Tensor) - Tensor of shape (seq_len, batch_size, `input_size`) or
          (batch_size, seq_len, `input_size`).
        - **hx** (tuple) - A tuple of two Tensors (h_0, c_0), both of data type mindspore.float32 or
          mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
          The data type of `hx` must be the same as that of `x`.

    Outputs:
        Tuple, a tuple containing (`output`, (`h_n`, `c_n`)).

        - **output** (Tensor) - Tensor of shape (seq_len, batch_size, num_directions * `hidden_size`).
        - **hx_n** (tuple) - A tuple of two Tensors (h_n, c_n), both of shape
          (num_directions * `num_layers`, batch_size, `hidden_size`).

    Raises:
        TypeError: If `input_size`, `hidden_size` or `num_layers` is not an int.
        TypeError: If `has_bias`, `batch_first` or `bidirectional` is not a bool.
        TypeError: If `dropout` is neither a float nor an int.
        ValueError: If `dropout` is not in range [0.0, 1.0].

    Supported Platforms:
        ``Ascend`` ``GPU``

    Examples:
        >>> net = nn.LSTM(10, 16, 2, has_bias=True, batch_first=True, bidirectional=False)
        >>> x = Tensor(np.ones([3, 5, 10]).astype(np.float32))
        >>> h0 = Tensor(np.ones([1 * 2, 3, 16]).astype(np.float32))
        >>> c0 = Tensor(np.ones([1 * 2, 3, 16]).astype(np.float32))
        >>> output, (hn, cn) = net(x, (h0, c0))
        >>> print(output.shape)
        (3, 5, 16)
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 has_bias=True,
                 batch_first=False,
                 dropout=0,
                 bidirectional=False):
        """Initialize LSTM."""
        super(LSTM, self).__init__()
        validator.check_value_type("batch_first", batch_first, [bool], self.cls_name)
        validator.check_positive_int(hidden_size, "hidden_size", self.cls_name)
        validator.check_positive_int(num_layers, "num_layers", self.cls_name)
        self.is_ascend = context.get_context("device_target") == "Ascend"

        self.batch_first = batch_first
        self.transpose = P.Transpose()
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.dropout = dropout
        self.lstm = P.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           has_bias=has_bias,
                           bidirectional=bidirectional,
                           dropout=float(dropout))

        weight_size = 0
        gate_size = 4 * hidden_size
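        # The input, forget, candidate and output gates share one packed weight matrix per direction,
        # so every weight/bias block below has 4 * hidden_size output columns.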
        stdv = 1 / math.sqrt(hidden_size)
        num_directions = 2 if bidirectional else 1
        if self.is_ascend:
            self.reverse_seq = P.ReverseSequence(batch_dim=1, seq_dim=0)
            self.concat = P.Concat(axis=0)
            self.concat_2dim = P.Concat(axis=2)
            self.cast = P.Cast()
            self.shape = P.Shape()
            if dropout < 0 or dropout > 1:
                raise ValueError(f"For '{self.cls_name}', the 'dropout' must be a number in range [0, 1], "
                                 f"but got {dropout}.")
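            # nn.Dropout takes keep_prob = 1 - dropout, which cannot be 0, so the dropout == 1
            # edge case is emulated by zeroing the activations with ZerosLike instead.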
            if dropout == 1:
                self.dropout_op = P.ZerosLike()
            else:
                self.dropout_op = nn.Dropout(float(1 - dropout))
            b0 = np.zeros(gate_size, dtype=np.float16)
            self.w_list = []
            self.b_list = []
            self.rnns_fw = P.DynamicRNN(forget_bias=0.0)
            self.rnns_bw = P.DynamicRNN(forget_bias=0.0)

            for layer in range(num_layers):
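                # Each direction gets a combined (input_layer_size + hidden_size, 4 * hidden_size) float16
                # weight and a (4 * hidden_size,) bias, matching what DynamicRNN consumes below.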
                w_shape = input_size if layer == 0 else (num_directions * hidden_size)
                w_np = np.random.uniform(-stdv, stdv, (w_shape + hidden_size, gate_size)).astype(np.float16)
                self.w_list.append(Parameter(
                    initializer(Tensor(w_np), [w_shape + hidden_size, gate_size]), name='weight_fw' + str(layer)))
                if has_bias:
                    b_np = np.random.uniform(-stdv, stdv, gate_size).astype(np.float16)
                    self.b_list.append(Parameter(initializer(Tensor(b_np), [gate_size]), name='bias_fw' + str(layer)))
                else:
                    self.b_list.append(Parameter(initializer(Tensor(b0), [gate_size]), name='bias_fw' + str(layer)))
                if bidirectional:
                    w_bw_np = np.random.uniform(-stdv, stdv, (w_shape + hidden_size, gate_size)).astype(np.float16)
                    self.w_list.append(Parameter(initializer(Tensor(w_bw_np), [w_shape + hidden_size, gate_size]),
                                                 name='weight_bw' + str(layer)))
                    b_bw_np = np.random.uniform(-stdv, stdv, (4 * hidden_size)).astype(np.float16) if has_bias else b0
                    self.b_list.append(Parameter(initializer(Tensor(b_bw_np), [gate_size]),
                                                 name='bias_bw' + str(layer)))
            self.w_list = ParameterTuple(self.w_list)
            self.b_list = ParameterTuple(self.b_list)
        else:
            for layer in range(num_layers):
                input_layer_size = input_size if layer == 0 else hidden_size * num_directions
                increment_size = gate_size * input_layer_size
                increment_size += gate_size * hidden_size
                if has_bias:
                    increment_size += 2 * gate_size
                weight_size += increment_size * num_directions
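            # The GPU/CPU path packs every layer's input weights, hidden weights and (optionally) the two
            # bias vectors per direction into one flat parameter. For example, a single unidirectional
            # layer with input_size=10, hidden_size=12 and has_bias=True needs
            # 48 * 10 + 48 * 12 + 2 * 48 = 1152 values, the `w` shape used in the LSTMCell example below.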
            w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32)
            self.weight = Parameter(initializer(Tensor(w_np), [weight_size, 1, 1]), name='weight')

    def _stacked_bi_dynamic_rnn(self, x, init_h, init_c, weight, bias):
        """stacked bidirectional dynamic_rnn"""
        x_shape = self.shape(x)
        sequence_length = _create_sequence_length(x_shape)
        pre_layer = x
        hn = ()
        cn = ()
        output = x
        for i in range(self.num_layers):
            offset = i * 2
            weight_fw, weight_bw = weight[offset], weight[offset + 1]
            bias_fw, bias_bw = bias[offset], bias[offset + 1]
            init_h_fw, init_h_bw = init_h[offset:offset + 1, :, :], init_h[offset + 1:offset + 2, :, :]
            init_c_fw, init_c_bw = init_c[offset:offset + 1, :, :], init_c[offset + 1:offset + 2, :, :]
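            # The backward direction reuses a forward DynamicRNN: reverse the input sequence, run it,
            # then reverse the outputs back so they line up with the forward pass before concatenating
            # along the feature axis.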
            bw_x = self.reverse_seq(pre_layer, sequence_length)
            y, h, c, _, _, _, _, _ = self.rnns_fw(pre_layer, weight_fw, bias_fw, None, init_h_fw, init_c_fw)
            y_bw, h_bw, c_bw, _, _, _, _, _ = self.rnns_bw(bw_x, weight_bw, bias_bw, None, init_h_bw, init_c_bw)
            y_bw = self.reverse_seq(y_bw, sequence_length)
            output = self.concat_2dim((y, y_bw))
            pre_layer = self.dropout_op(output) if self.dropout else output
            hn += (h[-1:, :, :],)
            hn += (h_bw[-1:, :, :],)
            cn += (c[-1:, :, :],)
            cn += (c_bw[-1:, :, :],)
        status_h = self.concat(hn)
        status_c = self.concat(cn)
        return output, status_h, status_c

    def _stacked_dynamic_rnn(self, x, init_h, init_c, weight, bias):
        """stacked multi-layer dynamic_rnn"""
        pre_layer = x
        hn = ()
        cn = ()
        y = 0
        for i in range(self.num_layers):
            weight_fw, bias_fw = weight[i], bias[i]
            init_h_fw, init_c_fw = init_h[i:i + 1, :, :], init_c[i:i + 1, :, :]
            y, h, c, _, _, _, _, _ = self.rnns_fw(pre_layer, weight_fw, bias_fw, None, init_h_fw, init_c_fw)
            pre_layer = self.dropout_op(y) if self.dropout else y
            hn += (h[-1:, :, :],)
            cn += (c[-1:, :, :],)
        status_h = self.concat(hn)
        status_c = self.concat(cn)
        return y, status_h, status_c

    def construct(self, x, hx):
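        # The underlying kernels are time-major, so batch-major input is transposed to
        # (seq_len, batch_size, input_size) here and the output is transposed back at the end.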
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        h, c = hx
        if self.is_ascend:
            x_dtype = F.dtype(x)
            h_dtype = F.dtype(h)
            c_dtype = F.dtype(c)
            _check_input_3d(F.shape(h), "h of hx", self.cls_name)
            _check_input_3d(F.shape(c), "c of hx", self.cls_name)
            _check_input_dtype(x_dtype, "x", [mstype.float32, mstype.float16], self.cls_name)
            _check_input_dtype(h_dtype, "h", [mstype.float32, mstype.float16], self.cls_name)
            _check_input_dtype(c_dtype, "c", [mstype.float32, mstype.float16], self.cls_name)
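            # The Ascend DynamicRNN path computes in float16, so inputs are cast down here and the
            # results are cast back to their original dtypes after the recurrent computation.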
            x = self.cast(x, mstype.float16)
            h = self.cast(h, mstype.float16)
            c = self.cast(c, mstype.float16)
            if self.bidirectional:
                x, h, c = self._stacked_bi_dynamic_rnn(x, h, c, self.w_list, self.b_list)
            else:
                x, h, c = self._stacked_dynamic_rnn(x, h, c, self.w_list, self.b_list)
            x = self.cast(x, x_dtype)
            h = self.cast(h, h_dtype)
            c = self.cast(c, c_dtype)
        else:
            x, h, c, _, _ = self.lstm(x, h, c, self.weight)
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        return x, (h, c)


class LSTMCell(Cell):
    r"""
    LSTM (Long Short-Term Memory) layer.

    Apply the LSTM layer to the input.

    There are two pipelines connecting two consecutive cells in an LSTM model: one is the cell state pipeline
    and the other is the hidden state pipeline. Denote two consecutive time steps as :math:`t-1` and :math:`t`.
    Given an input :math:`x_t` at time :math:`t`, a hidden state :math:`h_{t-1}` and a cell
    state :math:`c_{t-1}` of the layer at time :math:`{t-1}`, the cell state and hidden state at
    time :math:`t` are computed using a gating mechanism. The input gate :math:`i_t` is designed to protect the
    cell from perturbation by irrelevant inputs. The forget gate :math:`f_t` affords protection of the cell by
    forgetting some information in the past, which is stored in :math:`h_{t-1}`. The output gate :math:`o_t`
    protects other units from perturbation by currently irrelevant memory contents. The candidate cell state
    :math:`\tilde{c}_t` is calculated from the current input, and the input gate is then applied to it. Finally,
    the current cell state :math:`c_{t}` and hidden state :math:`h_{t}` are computed from the calculated gates
    and cell states. The complete formulation is as follows.

    .. math::
        \begin{array}{ll} \\
            i_t = \sigma(W_{ix} x_t + b_{ix} + W_{ih} h_{(t-1)} + b_{ih}) \\
            f_t = \sigma(W_{fx} x_t + b_{fx} + W_{fh} h_{(t-1)} + b_{fh}) \\
            \tilde{c}_t = \tanh(W_{cx} x_t + b_{cx} + W_{ch} h_{(t-1)} + b_{ch}) \\
            o_t = \sigma(W_{ox} x_t + b_{ox} + W_{oh} h_{(t-1)} + b_{oh}) \\
            c_t = f_t * c_{(t-1)} + i_t * \tilde{c}_t \\
            h_t = o_t * \tanh(c_t) \\
        \end{array}

    Here :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. :math:`W, b`
    are learnable weights between the output and the input in the formula. For instance,
    :math:`W_{ix}, b_{ix}` are the weight and bias used to transform the input :math:`x` into the gate :math:`i`.
    Details can be found in the papers `LONG SHORT-TERM MEMORY
    <https://www.bioinf.jku.at/publications/older/2604.pdf>`_ and
    `Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling
    <https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/43905.pdf>`_.

    Note:
        LSTMCell is a single-layer RNN; you can build a multi-layer RNN by stacking multiple LSTMCells.

    Args:
        input_size (int): Number of features of the input.
        hidden_size (int): Number of features of the hidden layer.
        has_bias (bool): Whether the cell has bias `b_ih` and `b_hh`. Default: True.
        batch_first (bool): Specifies whether the first dimension of input `x` is batch_size. Default: False.
        dropout (float, int): If not 0, appends a `Dropout` layer on the outputs of each
            LSTM layer except the last layer. Default: 0. The range of dropout is [0.0, 1.0].
        bidirectional (bool): Specifies whether this is a bidirectional LSTM. If set True,
            the number of directions will be 2, otherwise the number of directions is 1. Default: False.

    Inputs:
        - **x** (Tensor) - Tensor of shape (seq_len, batch_size, `input_size`).
        - **h** (Tensor) - Tensor of data type mindspore.float32 or
          mindspore.float16 and shape (num_directions, batch_size, `hidden_size`).
        - **c** (Tensor) - Tensor of data type mindspore.float32 or
          mindspore.float16 and shape (num_directions, batch_size, `hidden_size`).
          The data type of `h` and `c` must be the same as that of `x`.
        - **w** (Tensor) - Tensor of data type mindspore.float32 or
          mindspore.float16 and shape (`weight_size`, 1, 1).
          The value of `weight_size` depends on `input_size`, `hidden_size` and `bidirectional`.
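          With `has_bias=True`, this single-layer cell needs `weight_size = num_directions * 4 * hidden_size *
          (input_size + hidden_size + 2)`; the example below uses 1 * 4 * 12 * (10 + 12 + 2) = 1152.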

    Outputs:
        A tuple of five Tensors: (`output`, `h_n`, `c_n`, `reserve`, `state`).

        - **output** (Tensor) - Tensor of shape (seq_len, batch_size, num_directions * `hidden_size`).
        - **h_n** (Tensor) - Tensor of shape (num_directions, batch_size, `hidden_size`).
        - **c_n** (Tensor) - Tensor of shape (num_directions, batch_size, `hidden_size`).
        - **reserve** - Reserved.
        - **state** - Reserved.

    Raises:
        TypeError: If `input_size` or `hidden_size` is not an int.
        TypeError: If `has_bias` or `batch_first` or `bidirectional` is not a bool.
        TypeError: If `dropout` is neither a float nor an int.
        ValueError: If `dropout` is not in range [0.0, 1.0].

    Supported Platforms:
        ``GPU`` ``CPU``

    Examples:
        >>> net = nn.LSTMCell(10, 12, has_bias=True, batch_first=True, bidirectional=False)
        >>> x = Tensor(np.ones([3, 5, 10]).astype(np.float32))
        >>> h = Tensor(np.ones([1, 3, 12]).astype(np.float32))
        >>> c = Tensor(np.ones([1, 3, 12]).astype(np.float32))
        >>> w = Tensor(np.ones([1152, 1, 1]).astype(np.float32))
        >>> output, h, c, _, _ = net(x, h, c, w)
        >>> print(output.shape)
        (3, 5, 12)
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 has_bias=True,
                 batch_first=False,
                 dropout=0,
                 bidirectional=False):
        """Initialize LSTMCell."""
        super(LSTMCell, self).__init__()
        self.batch_first = validator.check_value_type("batch_first", batch_first, [bool], self.cls_name)
        self.transpose = P.Transpose()
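        # Unlike nn.LSTM, this cell does not create its own weight Parameter; the caller passes
        # the flat weight tensor `w` to construct() on every call.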
        self.lstm = P.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=1,
                           has_bias=has_bias,
                           bidirectional=bidirectional,
                           dropout=float(dropout))

    def construct(self, x, h, c, w):
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        x, h, c, reserve, state = self.lstm(x, h, c, w)
        if self.batch_first:
            x = self.transpose(x, (1, 0, 2))
        return x, h, c, reserve, state