# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""test transformer"""
import numpy as np
import pytest

from mindspore import Tensor
from mindspore.common import dtype
from mindspore.parallel.nn import MultiHeadAttention, FeedForward, TransformerEncoderLayer, TransformerEncoder, \
    TransformerDecoder, TransformerDecoderLayer, Transformer, CrossEntropyLoss, AttentionMask, FixedSparseAttention
from mindspore.common.api import _cell_graph_executor


def test_transformer_encoder_only():
    """Compile an encoder-only Transformer (decoder_layers=0) in graph mode."""
    model = Transformer(batch_size=2, src_seq_length=20, tgt_seq_length=10,
                        encoder_layers=2, decoder_layers=0,
                        hidden_size=64, ffn_hidden_size=64)
    encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
    encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
    _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask)


def test_transformer_encoder_log_softmax():
    """An unsupported hidden_act ('logsoftmax') should raise ValueError."""
    with pytest.raises(ValueError):
        model = Transformer(batch_size=2, src_seq_length=20, tgt_seq_length=10,
                            encoder_layers=2, decoder_layers=0, hidden_act='logsoftmax',
                            hidden_size=64, ffn_hidden_size=64)
        encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
        encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
        _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask)


def test_transformer_encoder_leakyrelu():
    """Compile an encoder-only Transformer with the 'leakyrelu' activation."""
    model = Transformer(batch_size=2, src_seq_length=20, tgt_seq_length=10,
                        encoder_layers=2, decoder_layers=0, hidden_act='leakyrelu',
                        hidden_size=64, ffn_hidden_size=64)
    encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
    encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
    _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask)


def test_transformer_encoder_logsigmoid():
    """Compile an encoder-only Transformer with the 'logsigmoid' activation."""
    model = Transformer(batch_size=2, src_seq_length=20, tgt_seq_length=10,
                        encoder_layers=2, decoder_layers=0, hidden_act='logsigmoid',
                        hidden_size=64, ffn_hidden_size=64)
    encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
    encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
    _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask)


def test_encoder_and_decoder():
    """Compile a Transformer with both encoder and decoder stacks."""
    model = Transformer(batch_size=2, src_seq_length=20, tgt_seq_length=10,
                        encoder_layers=1, decoder_layers=2,
                        hidden_size=64, ffn_hidden_size=64)
    encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
    encoder_input_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
    decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
    decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
    memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
    _cell_graph_executor.compile(model,
                                 encoder_input_value, encoder_input_mask,
                                 decoder_input_value, decoder_input_mask,
                                 memory_mask)


def test_transformer_encoder():
    """Compile a standalone TransformerEncoder stack."""
    model = TransformerEncoder(batch_size=2, seq_length=16, num_layers=2,
                               hidden_size=8, ffn_hidden_size=64, num_heads=2)
    encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
    encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
    _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask)


def test_transformer_encoder_layer():
    """Compile a single TransformerEncoderLayer."""
    model = TransformerEncoderLayer(batch_size=2, hidden_size=8, ffn_hidden_size=64,
                                    seq_length=16, num_heads=2)
    encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
    encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
    _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask)


def test_transformer_encoder_layer_post_true():
    """Compile a TransformerEncoderLayer with post_layernorm_residual=True."""
    model = TransformerEncoderLayer(batch_size=2, seq_length=16, hidden_size=8,
                                    ffn_hidden_size=64, num_heads=2,
                                    post_layernorm_residual=True)
    encoder_input_value = Tensor(np.ones((2, 16, 8)), dtype.float32)
    encoder_input_mask = Tensor(np.ones((2, 16, 16)), dtype.float16)
    _cell_graph_executor.compile(model, encoder_input_value, encoder_input_mask)


def test_transformer_decoder():
    """Compile a standalone TransformerDecoder stack."""
    model = TransformerDecoder(num_layers=1, batch_size=2,
                               src_seq_length=20, tgt_seq_length=10,
                               hidden_size=64, ffn_hidden_size=64, num_heads=2)
    encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
    decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
    decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
    memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
    _cell_graph_executor.compile(model, decoder_input_value, decoder_input_mask,
                                 encoder_input_value, memory_mask)


def test_transformer_decoder_layer():
    """Compile a single TransformerDecoderLayer."""
    model = TransformerDecoderLayer(batch_size=2, src_seq_length=20, tgt_seq_length=10,
                                    hidden_size=64, ffn_hidden_size=64, num_heads=2)
    encoder_input_value = Tensor(np.ones((2, 20, 64)), dtype.float32)
    decoder_input_value = Tensor(np.ones((2, 10, 64)), dtype.float32)
    decoder_input_mask = Tensor(np.ones((2, 10, 10)), dtype.float16)
    memory_mask = Tensor(np.ones((2, 10, 20)), dtype.float16)
    _cell_graph_executor.compile(model, decoder_input_value, decoder_input_mask,
                                 encoder_input_value, memory_mask)


def test_multihead_attention():
    """Compile MultiHeadAttention with query/key/value inputs and an attention mask."""
    model = MultiHeadAttention(hidden_size=15, src_seq_length=20, tgt_seq_length=20,
                               batch_size=2, num_heads=3)
    from_tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
    to_tensor = Tensor(np.ones((2, 20, 15)), dtype.float16)
    attention_mask = Tensor(np.ones((2, 20, 20)), dtype.float16)
    _cell_graph_executor.compile(model, from_tensor, to_tensor, to_tensor, attention_mask)


def test_multihead_attention_wrong_batch():
    """Inputs whose batch size mismatches the configured batch_size should raise ValueError."""
    model = MultiHeadAttention(hidden_size=15, src_seq_length=20, tgt_seq_length=20,
                               batch_size=2, num_heads=3)
    from_tensor = Tensor(np.ones((3, 20, 15)), dtype.float32)
    to_tensor = Tensor(np.ones((3, 20, 15)), dtype.float16)
    attention_mask = Tensor(np.ones((3, 20, 20)), dtype.float16)
    with pytest.raises(ValueError):
        _cell_graph_executor.compile(model, from_tensor, to_tensor, to_tensor, attention_mask)


def test_feedforward_layer():
    """Compile a FeedForward layer with the 'relu' activation."""
    model = FeedForward(hidden_size=15, ffn_hidden_size=30, dropout_rate=0.1, hidden_act='relu')
    tensor = Tensor(np.ones((2, 20, 15)), dtype.float32)
    _cell_graph_executor.compile(model, tensor)


def test_cross_entropy():
    """Compile CrossEntropyLoss with logits, labels, and an input mask."""
    model = CrossEntropyLoss()
    logits = Tensor(np.array([[3, 5, 6, 9, 12, 33, 42, 12, 32, 72]]), dtype.float32)
    labels_np = np.array([1]).astype(np.int32)
    input_mask = Tensor(np.ones(1).astype(np.float32))
    labels = Tensor(labels_np)
    _cell_graph_executor.compile(model, logits, labels, input_mask)


def test_attention_mask():
    """Compile AttentionMask from a 2-D input mask of shape (batch_size, seq_length)."""
    model = AttentionMask(seq_length=19)
    inputs = Tensor(np.ones((2, 19)), dtype.float32)
    _cell_graph_executor.compile(model, inputs)

def test_sparse_attention():
    """Compile FixedSparseAttention; q/k/v have hidden size num_heads * size_per_head = 512."""
    model = FixedSparseAttention(batch_size=2, seq_length=1024, size_per_head=64,
                                 num_heads=8, block_size=64)
    q = Tensor(np.ones((2, 1024, 512)), dtype.float16)
    k = Tensor(np.ones((2, 1024, 512)), dtype.float16)
    v = Tensor(np.ones((2, 1024, 512)), dtype.float16)
    mask = Tensor(np.ones((2, 1024, 1024)), dtype.float32)
    _cell_graph_executor.compile(model, q, k, v, mask)