# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""CusMatrixCombine"""
from __future__ import absolute_import

from te import tik
from tbe.tvm.topi.cce import util
from mindspore.ops.op_info_register import op_info_register, TBERegOp, DataType

# Mask handed to every vector_dup call; consecutive chunks are addressed in
# steps of this many elements, i.e. one repeat covers 64 float32 values.
_VECTOR_MASK = 64
# Largest repeat count issued in a single vector_dup call.
_MAX_REPEAT = 255

cus_matrix_combine_op_info = TBERegOp("CusMatrixCombine") \
    .fusion_type("OPAQUE") \
    .async_flag(False) \
    .binfile_name("matrixcombine.so") \
    .compute_cost(10) \
    .kernel_name("cus_matrix_combine") \
    .partial_flag(True) \
    .input(0, "x1", False, "required", "all") \
    .output(0, "y", False, "required", "all") \
    .dtype_format(DataType.F32_Default, DataType.F32_Default) \
    .get_op_info()


def _zero_fill_ub(tik_instance, ub_tensor, zero, repeat_total):
    """Emit the vector_dup calls that write `zero` over `repeat_total` repeats.

    The work is split into chunks of at most `_MAX_REPEAT` repeats. For any
    `repeat_total` up to 4 * 255 this emits exactly the call sequence of the
    former hand-unrolled if/elif ladder (including a single call when
    `repeat_total` is 0); larger totals simply keep chunking instead of
    issuing one oversized final call.

    Args:
        tik_instance: The TIK container the instructions are emitted into.
        ub_tensor: Destination tensor to be filled.
        zero: TIK scalar holding the fill value.
        repeat_total: Total number of `_VECTOR_MASK`-element repeats to fill.
    """
    # max(..., 1) keeps the single call the original made for repeat_total == 0.
    for start in range(0, max(repeat_total, 1), _MAX_REPEAT):
        chunk = min(_MAX_REPEAT, repeat_total - start)
        # The first chunk targets the tensor itself, later ones a flat element
        # offset into it -- the same addressing the unrolled code used.
        dst = ub_tensor if start == 0 else ub_tensor[start * _VECTOR_MASK]
        tik_instance.vector_dup(_VECTOR_MASK, dst, zero, chunk, 1, 8)


@op_info_register(cus_matrix_combine_op_info)
def cus_matrix_combine(input_x, output, kernel_name="cus_matrix_combine"):
    """Build the TIK kernel for CusMatrixCombine.

    Every tile `input_x[i]` is copied into the output at row offset `i * 128`
    and column offset `128 * i`, with the remainder of those rows written as
    zeros, so the tiles end up along the diagonal of the result.

    NOTE(review): the general path hard-codes 128 for the tile offsets and a
    16-burst move per row, so it presumably expects (batch, 128, 128) float32
    input and a (batch * 128, batch * 128) output -- confirm against callers.

    Args:
        input_x: Dict describing the input tensor; only its "shape" entry is read.
        output: Dict describing the output tensor; only its "shape" entry is read.
        kernel_name: Name passed to BuildCCE for the generated kernel.

    Returns:
        The TIK instance on which BuildCCE has been invoked.
    """
    input_x_shape = input_x.get("shape")
    output_shape = output.get("shape")

    # Choose the device profile that matches the product the op is built for.
    if util.get_product_version() == util.VERSION_MINI:
        tik_instance = tik.Tik(tik.Dprofile("v100", "mini"))
    else:
        tik_instance = tik.Tik(tik.Dprofile("v100", "cloud"))

    # Global-memory tensors. A separate local is used so the `input_x` dict
    # parameter is not rebound to a tensor halfway through the function.
    input_x_gm = tik_instance.Tensor("float32", input_x_shape, name="input_x", scope=tik.scope_gm)
    res = tik_instance.Tensor("float32", output_shape, name="res", scope=tik.scope_gm)

    blocks = 32
    matrix_dim = input_x_shape[0] * input_x_shape[1]
    if input_x_shape[0] == 1 and input_x_shape[1] == 64:
        # Single 64-row tile: each of the 32 blocks stages two rows through UB
        # and writes them straight back out, so no zero padding is needed.
        with tik_instance.for_range(0, blocks, block_num=blocks) as block_index:
            input_x_ub = tik_instance.Tensor("float32", (2, matrix_dim), name="input_x_ub",
                                             scope=tik.scope_ubuf)
            tik_instance.data_move(input_x_ub, input_x_gm[0, block_index * 2, 0], 0, 1, 16, 0, 0)
            tik_instance.data_move(res[block_index * 2, 0], input_x_ub, 0, 1, 16, 0, 0)
    else:
        tiling_dim = 4
        input_x_ub = tik_instance.Tensor("float32", (tiling_dim, matrix_dim), name="input_x_ub",
                                         scope=tik.scope_ubuf)
        zero = tik_instance.Scalar("float32")
        zero.set_as(0.0)
        # Number of vector repeats needed to cover the whole staging buffer;
        # it depends only on the static shape, so compute it once up front.
        repeat_real = tiling_dim * matrix_dim // _VECTOR_MASK
        with tik_instance.for_range(0, blocks, block_num=blocks) as block_index, \
                tik_instance.for_range(0, input_x_shape[0]) as i:
            # The staging buffer is reused for every tile, so clear it each
            # time before the tile's rows are dropped into their column slot.
            _zero_fill_ub(tik_instance, input_x_ub, zero, repeat_real)

            with tik_instance.for_range(0, tiling_dim) as j:
                # Row j of this block's slice of tile i lands at column 128 * i.
                tik_instance.data_move(input_x_ub[j, 128 * i],
                                       input_x_gm[i, block_index * tiling_dim + j, 0],
                                       0, 1, 16, 0, 0)
            # Write the tiling_dim padded rows back in one burst; the burst
            # length is the buffer size in bytes divided by 32.
            tik_instance.data_move(res[i * 128 + block_index * tiling_dim, 0], input_x_ub, 0, 1,
                                   tiling_dim * matrix_dim * 4 // 32, 0, 0)
    tik_instance.BuildCCE(kernel_name=kernel_name, inputs=[input_x_gm], outputs=[res])
    return tik_instance