/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"

#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "src/core/utils/helpers/float_ops.h"
#include "support/StringSupport.h"

#include <set>
#include <string>

namespace arm_compute
{
using namespace arm_compute::misc::shape_calculator;

namespace
{
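// Alias: holds the number of elements processed per work-item along x (n0) and y (m0),
// as filled in by validate_and_configure_window() below.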
using ElementsProcessed = Steps;

inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float beta,
                                 bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input0);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (input0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (input2 != nullptr)
                                    && (!reshape_info.broadcast_bias()),
                                    "Bias addition only supported with broadcast mode in case the input or output has to be reinterpreted as 3D");

    if(!is_interleaved_transposed)
    {
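        // Inner dimensions must match: dimension 0 of matrix A and dimension 1 of matrix B are both K.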
        ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != input1->dimension(1));

        if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
        {
            const unsigned int m           = reshape_info.reinterpret_input_as_3d() ? input0->dimension(1) * input0->dimension(2) : input0->dimension(1);
            const unsigned int n           = input1->dimension(0);
            const unsigned int input2_dim0 = input2->dimension(0);
            const unsigned int input2_dim1 = input2->dimension(1);

            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
            if(reshape_info.broadcast_bias())
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
            }
        }
    }
    else
    {
        GEMMRHSMatrixInfo rhs_info;
        GEMMLHSMatrixInfo lhs_info;
        const auto        m                         = static_cast<unsigned int>(reshape_info.m());
        const auto        n                         = static_cast<unsigned int>(reshape_info.n());
        const int         k                         = reshape_info.k();
        const int         mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
        const int         mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
        rhs_info.n0                                 = max_cl_vector_width / input1->element_size();
        rhs_info.k0                                 = 1;
        rhs_info.h0                                 = mult_transpose1xW_width;
        rhs_info.interleave                         = false;
        rhs_info.transpose                          = false;
        lhs_info.m0                                 = 4;
        lhs_info.k0                                 = 4;
        lhs_info.v0                                 = mult_interleave4x4_height;
        lhs_info.interleave                         = true;
        lhs_info.transpose                          = true;

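        // Rebuild the original M x K and K x N shapes, re-apply the expected interleave/transpose
        // layout, and check that the reshaped shapes match the tensors actually passed in.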
        TensorShape tensor_shape0{ input0->tensor_shape() };
        tensor_shape0.set(0, k);
        tensor_shape0.set(1, m);

        TensorShape tensor_shape1{ input1->tensor_shape() };
        tensor_shape1.set(0, n);
        tensor_shape1.set(1, k);

        const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0);
        const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1);

        const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
        const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));

        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1);

        if(input2 != nullptr && !(helpers::float_ops::is_zero(beta)))
        {
            const unsigned int input2_dim0 = input2->dimension(0);
            const unsigned int input2_dim1 = input2->dimension(1);

            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input2, input1);
            if(reshape_info.broadcast_bias())
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim1 != 1 || input2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG((input2_dim0 != n || input2_dim1 != m), "Incorrect dimension of bias matrix");
            }
        }
    }

    if(output->total_size() != 0)
    {
        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info));
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
    }

    return Status{};
}

inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
                                                               float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target,
                                                               ElementsProcessed &num_elements_processed)
{
    ARM_COMPUTE_UNUSED(beta);
    bool   window_changed = false;
    Window win{};
    Window win_out{};

    const DataType data_type                           = input0->data_type();
    unsigned int  &num_elems_processed_per_iteration_x = num_elements_processed[0];
    unsigned int  &num_elems_processed_per_iteration_y = num_elements_processed[1];
    bool           reinterpret_input_as_3d             = reshape_info.reinterpret_input_as_3d();
    bool           reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 0);

    // In case both input and output have to be reinterpreted as 3D tensors,
    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
    {
        reinterpret_input_as_3d  = false;
        reinterpret_output_as_3d = false;
    }
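    // (A batched GEMM is dispatched in that case, which simplifies the address calculation inside the OpenCL kernel.)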

    // Output tensor auto initialization if not yet initialized
    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));

    TensorInfo tmp_info(*output);

    if(reinterpret_output_as_3d)
    {
        // Since the output tensor has to be reinterpreted as 3D and the execution window is based on a 2D GEMM,
        // the window needs to be constructed on the 2D collapsed version of the tensor
        TensorShape tmp_shape(output->tensor_shape());
        tmp_shape.collapse(2U, 1U);
        tmp_info.set_tensor_shape(tmp_shape);
    }

    if(is_interleaved_transposed)
    {
        // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set
        ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());

        // Configure kernel window
        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
        num_elems_processed_per_iteration_y = 4;

        win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
        if(input2 != nullptr)
        {
            const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;

            const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;

            AccessWindowStatic input2_access(input2, 0, 0,
                                             ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
                                             ceil_to_multiple(input2->dimension(1), bias_processed_per_iteration_y));

            window_changed = update_window_and_padding(win, input2_access); // window used by the execute_window_loop
        }
    }
    else // The input tensors have not been reshaped
    {
        // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);

        // Create kernels according to the architecture, data type and input size.
        GPUTarget arch_target = get_arch_from_target(gpu_target);
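        // Vector-by-matrix case (matrix A has a single dimension): use a narrower x-step when
        // matrix B has at most 1000 columns, matching the kernel selection in configure().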
        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
        {
            num_elems_processed_per_iteration_x = (input1->dimension(0) <= 1000 && input0->num_dimensions() == 1) ? 2 : 4;
        }

        // Configure window
        win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
        win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
        AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1));
        AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
        AccessWindowStatic output_access(output, 0, 0,
                                         output->dimension(0),
                                         output->dimension(1));

        if(input2 != nullptr)
        {
            const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;

            AccessWindowStatic input2_access(input2, 0, 0,
                                             ceil_to_multiple(input2->dimension(0), bias_processed_per_iteration_x),
                                             input2->dimension(1));

            window_changed = update_window_and_padding(win, input0_access, input1_access, input2_access) || // window used by the execute_window_loop
                             update_window_and_padding(win_out, output_access);                             // window used to update the padding requirements of output tensor
        }
        else
        {
            window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
                             update_window_and_padding(win_out, output_access);              // window used to update the padding requirements of output tensor
        }

        Coordinates coord;
        coord.set_num_dimensions(output->num_dimensions());
        output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
    }

    // Collapse along the Z direction
    // This collapse needs to be here in order to tune the Z dimension of LWS
    Window             collapsed             = win;
    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
    collapsed                                = win.collapse(win, dimension_to_collapse);

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, collapsed);
}
} // namespace

CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel()
    : _input0(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false), _add_bias(false),
      _broadcast_bias(false)
{
}

void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
                                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
{
    configure(CLKernelLibrary::get().get_compile_context(), input0, input1, input2, output, alpha, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision, activation_info);
}

void CLGEMMMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha,
                                           float beta,
                                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);

    // Perform validate step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta,
                                                  is_interleaved_transposed, reshape_info, fp_mixed_precision));

    auto padding_info = is_interleaved_transposed ? get_padding_info({ input0, input1, output }) : get_padding_info({ input0, output });

    _input0                   = input0;
    _input1                   = input1;
    _input2                   = helpers::float_ops::is_zero(beta) ? nullptr : input2;
    _output                   = output;
    _reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
    _add_bias                 = _input2 != nullptr;
    _broadcast_bias           = reshape_info.broadcast_bias();

    // In case both input and output have to be reinterpreted as 3D tensors,
    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
    {
        _reinterpret_input_as_3d  = false;
        _reinterpret_output_as_3d = false;
    }

    // Check if we need to slide the matrix B
    const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();

    _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
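    // e.g. in a convolution-style GEMM, matrix B (the weights) is 2D while matrix A is batched,
    // so the same matrix B slice is reused for every slice of matrix A (see run()).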

    const DataType data_type = input0->info()->data_type();

    // Get target architecture
    GPUTarget gpu_target = get_target();

    ElementsProcessed num_elements_processed{};

    // Configure kernel window
    auto win_config = validate_and_configure_window(input0->info(), input1->info(), (input2 != nullptr) ? input2->info() : nullptr, output->info(), beta, is_interleaved_transposed, reshape_info,
                                                    gpu_target, num_elements_processed);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    ICLKernel::configure_internal(win_config.second);

    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false)
    // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
    // This means that the actual m used by the kernel is given by output->info()->dimension(1)
    const unsigned int internal_m = _reinterpret_output_as_3d ? output->info()->dimension(1) * output->info()->dimension(2) : output->info()->dimension(1);
    const unsigned int n          = output->info()->dimension(0);

    const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(1) : input0->info()->dimension(1);
    const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? output->info()->dimension(2) : input0->info()->dimension(2);

    const unsigned int m0 = num_elements_processed.y();
    const unsigned int n0 = num_elements_processed.x();

    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
    const unsigned int partial_store_m0 = internal_m % m0;
    const unsigned int partial_store_n0 = n % n0;
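    // e.g. internal_m = 10 with m0 = 4 gives partial_store_m0 = 2: the last block along y
    // stores only 2 rows instead of requiring the output to be padded to a multiple of 4.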

    // Create build options
    CLBuildOptions build_opts;

    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
    build_opts.add_option_if(_input2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
    build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS");
    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
    build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation())));
    build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a()));
    build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b()));

    const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST;

    std::string kernel_name;
    if(is_interleaved_transposed)
    {
        const int mult_transpose1xW_width   = reshape_info.mult_transpose1xW_width();
        const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();

        build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
        build_opts.add_option("-DN=" + support::cpp11::to_string(n));
        build_opts.add_option("-DK=" + support::cpp11::to_string(input1->info()->dimension(0) / (n0 * mult_transpose1xW_width)));
        build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width));
        build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height));
        build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
        build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));

        if(is_data_type_float(data_type) && is_bifrost)
        {
            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
        }
        else
        {
            kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type));
            if(fp_mixed_precision && data_type == DataType::F16)
            {
                // Currently, the wider accumulator is only supported for FP16 kernels.
                kernel_name += "_acc32";
            }
        }
    }
    else // The input tensors have not been reshaped
    {
        build_opts.add_option("-DN=" + support::cpp11::to_string(n));
        build_opts.add_option("-DK=" + support::cpp11::to_string(input0->info()->dimension(0)));
        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
        build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
        build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
        build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
        build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));

        // Create kernels according to the architecture, data type and input size.
        if(is_data_type_float(data_type) && is_bifrost)
        {
            kernel_name = "gemm_mm_floating_point";

            if(input0->info()->num_dimensions() != 1)
            {
                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
                if(fp_mixed_precision && data_type == DataType::F16)
                {
                    // Currently, the wider accumulator is only supported for FP16 kernels.
                    kernel_name += "_acc32";
                }
            }
            else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
            {
                // The first kernel is optimized for the case of 1000 or fewer output elements (e.g. FC8 of AlexNet and VGG-16, and
                // FC1 of Inception v3). The second kernel is optimized for the case of more than 1000 output elements (e.g.
                // FC6 and FC7 of AlexNet and VGG-16).
                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
            }

            // A work-group size equal to the Bifrost quad size has proven optimal for these kernels
            // via exhaustive autotuning over a range of representative layer configurations.
            set_lws_hint(cl::NDRange(4));
        }
        else // (MIDGARD and F32) or (F16)
        {
            kernel_name = "gemm_mm_floating_point";
        }
    }

    // Create kernel
    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());

    // Set config_id for enabling LWS tuning
    _config_id = "gemm_";
    _config_id += (is_interleaved_transposed ? "reshaped_" : "");
    _config_id += (_add_bias ? "add_bias_" : "");
    _config_id += (_broadcast_bias ? "broadcast_bias_" : "");
    _config_id += (fp_mixed_precision ? "fp_mixed_" : "");
    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
    _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(1));
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(0));
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(2));
    _config_id += "_";
    _config_id += support::cpp11::to_string(output->info()->dimension(3));
    _config_id += "_";
    _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));

    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
}

Status CLGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
                                            bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info)
{
    // Note: num_elements_processed will be set in validate_and_configure_window()
    ElementsProcessed num_elements_processed{};
    ARM_COMPUTE_UNUSED(alpha);
    ARM_COMPUTE_UNUSED(activation_info);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, input2, output, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
                                                              input1->clone().get(),
                                                              (input2 != nullptr) ? input2->clone().get() : nullptr,
                                                              output->clone().get(),
                                                              beta,
                                                              is_interleaved_transposed,
                                                              reshape_info,
                                                              gpu_target,
                                                              num_elements_processed)
                                .first);

    return Status{};
}

void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

    if(_input1->info()->num_dimensions() < 3)
    {
        // The stride_z for matrix B must be zero if we do not slice
        ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
    }

    Window slice          = window.first_slice_window_3D();
    Window slice_matrix_b = slice;

    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));

    const unsigned int num_arguments_bias = _add_bias ? num_arguments_per_2D_tensor() + 1 : 0;

    if(_reinterpret_input_as_3d)
    {
        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
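        // The index skips the 2D tensor arguments of input0, input1 and output (plus the bias,
        // if present) and one stride_z argument per tensor, matching the argument order set up
        // in the slice loop below.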
        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias;
        const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
    }

    if(_reinterpret_output_as_3d)
    {
        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias;
        const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
    }

    do
    {
        Window slice_b = slice;
        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
        if(!_slide_matrix_b)
        {
            slice_b = slice_matrix_b;
        }

        unsigned int idx = 0;
        add_2D_tensor_argument(idx, _input0, slice);
        add_2D_tensor_argument(idx, _input1, slice_b);
        if(_add_bias)
        {
            add_2D_tensor_argument(idx, _input2, slice);
        }
        add_2D_tensor_argument(idx, _output, slice);
        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
        if(_add_bias)
        {
            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input2->info()->strides_in_bytes()[2]));
        }
        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
        enqueue(queue, *this, slice, lws_hint());
    }
    while(window.slide_window_slice_3D(slice));
}
} // namespace arm_compute
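
// Minimal usage sketch (illustrative only; this kernel is an internal API, and the tensor
// names and the GEMMReshapeInfo defaults below are assumptions, not part of this file):
//
//   // dst = A * B, where A is M x K and B is K x N, with no reshaping, no bias and no activation
//   arm_compute::CLGEMMMatrixMultiplyKernel mm_kernel;
//   mm_kernel.configure(&a, &b, nullptr, &dst, 1.f /* alpha */, 0.f /* beta */,
//                       false /* is_interleaved_transposed */, arm_compute::GEMMReshapeInfo(M, N, K),
//                       false /* fp_mixed_precision */, arm_compute::ActivationLayerInfo());
//   arm_compute::CLScheduler::get().enqueue(mm_kernel);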