/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/core/helpers/AutoConfiguration.h"

#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"

#include "support/MemorySupport.h"

namespace arm_compute
{
namespace
{
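// Translate the function-level GEMMInfo into the metadata consumed by the assembly dispatch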
AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
{
    AsmGemmInfo asm_info;
    asm_info.method                  = AsmConvMethod::Im2Col;
    asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
    asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
    asm_info.activation_info         = info.activation_info();
    asm_info.output_stage            = info.gemmlowp_output_stage();

    return asm_info;
}
} // namespace

using namespace arm_compute::misc::shape_calculator;

NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;

NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
    : _memory_group(memory_manager), _weights_manager(weights_manager), _asm_glue(memory_manager, weights_manager), _mm_kernel(), _mtx_a_reshape_kernel(), _mtx_b_reshape_kernel(),
      _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(), _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(),
      _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0),
      _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false),
      _run_activation(false), _flip_signedness(false)
{
}

void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
    ARM_COMPUTE_UNUSED(c);
    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));

    const ITensor *matrix_a = a;
    const ITensor *matrix_b = b;
    GEMMInfo       info     = gemm_info;

    // Set internal variables
    _a_offset                         = a->info()->quantization_info().uniform().offset;
    _b_offset                         = b->info()->quantization_info().uniform().offset;
    _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
    _reshape_b_only_on_first_run      = info.reshape_b_only_on_first_run();
    _is_prepared                      = false;
    _fused_assembly_path              = false;
    _flip_signedness                  = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
    _original_b                       = b;

    const ITensor *a_to_use = a;

    // Convert to QASYMM8 -> QASYMM8_SIGNED and back
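    // (Used when B is per-channel quantized and A is QASYMM8: A is shifted into the signed domain so the
    //  core kernels operate on QASYMM8_SIGNED data, and the output stage offset/bounds are corrected by 128.)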
    if(_flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->info()->quantization_info().uniform();

        _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
        _memory_group.manage(&_signed_a);
        _convert_to_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
        _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
        a_to_use  = &_signed_a;
        _a_offset = _signed_a.info()->quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
        _memory_group.manage(&_signed_output);
        _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = _signed_output.info()->quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a = &_signed_a;
    }

    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
    if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
    {
        _fuse_output_stage = true;
        _memory_group.manage(&_mm_result_s32);
        TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
        _mm_result_s32.allocator()->init(info_mm_result_s32);
    }

    // Initialize assembly kernel meta-data
    const AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
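    // The optimized assembly path is only available on AArch64; on other architectures the
    // NEON reference kernels configured below are used instead.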
#ifdef __aarch64__
    switch(a->info()->data_type())
    {
        case DataType::QASYMM8:
        case DataType::QASYMM8_SIGNED:
        case DataType::U8:
        case DataType::S8:
        {
            if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
            {
                _asm_glue.configure(a_to_use, b, c, output, asm_info);
                _fused_assembly_path = _asm_glue.is_configured();
            }
            else
            {
                _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, asm_info);
            }
            _assembly_path = _asm_glue.is_configured();
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Datatype not supported");
            break;
        }
    }
#endif /* __aarch64__ */
    if(!(_assembly_path || _run_vector_matrix_multiplication))
    {
        matrix_a = &_tmp_a;
        matrix_b = &_tmp_b;

        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
        TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
        TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
        _tmp_a.allocator()->init(a_info);
        _tmp_b.allocator()->init(b_info);
        _memory_group.manage(&_tmp_a);
        if(!_reshape_b_only_on_first_run)
        {
            _memory_group.manage(&_tmp_b);
        }

        // Configure interleave kernel
        _mtx_a_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
        _mtx_a_reshape_kernel->configure(a_to_use, &_tmp_a);

        // Configure transpose kernel
        _mtx_b_reshape_kernel = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
        _mtx_b_reshape_kernel->configure(b, &_tmp_b);
    }

    if(!_fused_assembly_path)
    {
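        // When requantization is not fused in the assembly kernel, the non-zero quantization offsets of A and B
        // must be compensated explicitly: the offset contribution uses the column sums of B (scaled by a_offset)
        // and the row sums of A (scaled by b_offset), computed by the reduction kernels below.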
        // Build reduction info
        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->info()->dimension(0), false, 0, false);

        // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0)
        {
            TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);

            _vector_sum_col.allocator()->init(info_vector_sum_col);
            if(!_reshape_b_only_on_first_run)
            {
                _memory_group.manage(&_vector_sum_col);
            }

            // Configure Matrix B reduction kernel
            _mtx_b_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixBReductionKernel>();
            _mtx_b_reduction_kernel->configure(b, &_vector_sum_col, reduction_info);
        }

        // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);

            _vector_sum_row.allocator()->init(info_vector_sum_row);
            _memory_group.manage(&_vector_sum_row);

            // Configure matrix A reduction kernel
            _mtx_a_reduction_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixAReductionKernel>();
            _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
        }

        if(_fuse_output_stage)
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
                _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
            }

            _offset_contribution_output_stage_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionOutputStageKernel>();
            _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
                                                                _a_offset == 0 ? nullptr : &_vector_sum_col,
                                                                _b_offset == 0 ? nullptr : &_vector_sum_row, c,
                                                                _flip_signedness ? &_signed_output : output,
                                                                a->info()->dimension(0),
                                                                _a_offset, _b_offset, info.gemmlowp_output_stage());

            if(_flip_signedness)
            {
                _convert_from_signed_asymm = arm_compute::support::cpp14::make_unique<NEConvertQuantizedSignednessKernel>();
                _convert_from_signed_asymm->configure(&_signed_output, output);
            }
        }
        else
        {
            // Configure matrix multiply kernel
            if(!_assembly_path)
            {
                _mm_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
                _mm_kernel->configure(matrix_a, matrix_b, output);
            }
            // Configure offset contribution kernel
            _offset_contribution_kernel = arm_compute::support::cpp14::make_unique<NEGEMMLowpOffsetContributionKernel>();
            _offset_contribution_kernel->configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
        }

        // Configure activation
        const ActivationLayerInfo &activation = gemm_info.activation_info();
        _run_activation                       = activation.enabled() && (!_assembly_path || (_assembly_path && !NEGEMMAssemblyDispatch::is_activation_supported(activation)));
        if(_run_activation)
        {
            _activation_func.configure(output, nullptr, activation);
        }
    }

    // Allocate tensors
    if(!_assembly_path && !_run_vector_matrix_multiplication)
    {
        _tmp_a.allocator()->allocate();
        if(!_reshape_b_only_on_first_run)
        {
            _tmp_b.allocator()->allocate();
        }
    }

    if(!_fused_assembly_path)
    {
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
        }

        if(_b_offset != 0)
        {
            _vector_sum_row.allocator()->allocate();
        }
    }

    if(_fuse_output_stage)
    {
        _mm_result_s32.allocator()->allocate();
    }

    if(_flip_signedness)
    {
        _signed_a.allocator()->allocate();
        _signed_output.allocator()->allocate();
    }
}

Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
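    // The checks below mirror the configuration steps performed in configure()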
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");

    GEMMInfo           info          = gemm_info;
    const ITensorInfo *matrix_a_info = a;
    const ITensorInfo *matrix_b_info = b;

    const ITensorInfo *a_to_use = a;

    TensorInfo tmp_a_info{};
    TensorInfo tmp_b_info{};
    TensorInfo mm_result_s32_info{};

    int32_t a_offset = a->quantization_info().uniform().offset;
    int32_t b_offset = b->quantization_info().uniform().offset;

    bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
    if(fuse_output_stage)
    {
        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
    }

    // Convert QASYMM8->QASYMM8_SIGNED
    TensorInfo signed_a{};
    TensorInfo signed_output{};
    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
    if(flip_signedness)
    {
        const int32_t                 offset_correction = 128;
        const DataType                dt                = DataType::QASYMM8_SIGNED;
        const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();

        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
        ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
        a_to_use = &signed_a;
        a_offset = signed_a.quantization_info().uniform().offset;

        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));

        // Output stage correction
        GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
        output_stage_corr.gemmlowp_offset         = signed_output.quantization_info().uniform().offset;
        output_stage_corr.gemmlowp_min_bound -= offset_correction;
        output_stage_corr.gemmlowp_max_bound -= offset_correction;
        info.set_gemmlowp_output_stage(output_stage_corr);

        // Update matrix a
        matrix_a_info = &signed_a;
    }

    // Initialize assembly kernel meta-data
    const AsmGemmInfo asm_info = init_assembly_metadata(info);

    // Check if we need to run the optimized assembly kernel
    bool run_optimised             = false;
    bool run_optimised_requantized = false;
    if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
    {
        run_optimised             = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
        run_optimised_requantized = run_optimised;
    }
    else
    {
        run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
    }

    if(run_optimised)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
        if(info.depth_output_gemm3d() != 0)
        {
            if(info.reinterpret_input_as_3d())
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
            }
            else
            {
                ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
            }
        }
        else
        {
            ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
        }
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");

        const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
        if(!run_vector_matrix_multiplication)
        {
            matrix_a_info = &tmp_a_info;
            matrix_b_info = &tmp_b_info;

            // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
            TensorShape shape_tmp_a = a->tensor_shape();
            shape_tmp_a.set(0, a->dimension(0) * 4);
            shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));

            // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
            TensorShape shape_tmp_b = b->tensor_shape();
            shape_tmp_b.set(0, b->dimension(1) * 16);
            shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));

            // Validate interleave kernel
            auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));

            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
        }
    }

    if(!run_optimised_requantized)
    {
        TensorInfo info_vector_sum_col{};
        TensorInfo info_vector_sum_row{};

        const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);

        // Validate matrix B reduction kernel only if _a_offset is not equal to 0
        if(a_offset != 0)
        {
            info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);

            // Configure Matrix B reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
        }

        // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
        if(b_offset != 0)
        {
            info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);

            // Configure matrix A reduction kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
        }

        if(fuse_output_stage)
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
            }

            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                                c,
                                                                                                flip_signedness ? &signed_output : output,
                                                                                                a_offset, b_offset,
                                                                                                info.gemmlowp_output_stage()));
        }
        else
        {
            if(!run_optimised)
            {
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");

                ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
            }
            // Validate offset contribution kernel
            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
                                                                                     a_offset, b_offset));
        }
    }

    // Validate activation
    const ActivationLayerInfo &activation = gemm_info.activation_info();
    if(activation.enabled())
    {
        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, activation));
    }

    return Status{};
}

void NEGEMMLowpMatrixMultiplyCore::run()
{
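    // Execution order: optional signedness conversion, GEMM (assembly glue or NEON kernels),
    // reductions and offset contribution / output stage, then conversion back and the fused activation if needed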
    prepare();

    MemoryGroupResourceScope scope_mg(_memory_group);

    // Convert QASYMM8->QASYMM8_SIGNED
    if(_flip_signedness)
    {
        NEScheduler::get().schedule(_convert_to_signed_asymm.get(), Window::DimY);
    }

    // Run GEMM
    if(_asm_glue.is_configured())
    {
        _asm_glue.run();
    }
    else
    {
        if(!_run_vector_matrix_multiplication)
        {
            // Run interleave kernel
            NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);

            if(!_reshape_b_only_on_first_run)
            {
                // Run transpose kernel
                NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
            }
        }
        NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
    }

    if(!_fused_assembly_path)
    {
        // Run matrix A reduction kernel only if _b_offset is not equal to 0
        if(_b_offset != 0)
        {
            NEScheduler::get().schedule(_mtx_a_reduction_kernel.get(), Window::DimX);
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
        {
            NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
        }

        if(_fuse_output_stage)
        {
            // Run offset contribution kernel
            NEScheduler::get().schedule(_offset_contribution_output_stage_kernel.get(), Window::DimY);
        }
        else
        {
            // Run offset contribution kernel
            NEScheduler::get().schedule(_offset_contribution_kernel.get(), Window::DimY);
        }
    }

    // Convert QASYMM8_SIGNED->QASYMM8
    if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
    {
        NEScheduler::get().schedule(_convert_from_signed_asymm.get(), Window::DimY);
    }

    // Run fused activation unless already run in the fused assembly
    if(_run_activation && !_fused_assembly_path)
    {
        _activation_func.run();
    }
}

void NEGEMMLowpMatrixMultiplyCore::prepare()
{
    if(!_is_prepared)
    {
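        // One-time work for constant weights: reshape B (assembly or NEON path) and, if needed, compute its
        // column-sum reduction, then release the original B tensor when it is not managed by the weights manager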
        const bool original_b_managed_by_weights_manager = _weights_manager && _weights_manager->are_weights_managed(_original_b);
        // Run assembly reshape
        if(_asm_glue.is_configured())
        {
            if(!original_b_managed_by_weights_manager)
            {
                ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
            }

            _asm_glue.prepare();
            if(!original_b_managed_by_weights_manager)
            {
                _original_b->mark_as_unused();
            }
        }
        // Run non-assembly reshape
        else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue.is_configured())
        {
            if(!original_b_managed_by_weights_manager)
            {
                ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
            }

            // Run reshape kernel and mark original weights tensor as unused
            _tmp_b.allocator()->allocate();
            NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
            if(!original_b_managed_by_weights_manager)
            {
                _original_b->mark_as_unused();
            }
        }

        // Run matrix B reduction kernel only if _a_offset is not equal to 0
        if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
        {
            _vector_sum_col.allocator()->allocate();
            NEScheduler::get().schedule(_mtx_b_reduction_kernel.get(), Window::DimX);
        }

        _is_prepared = true;
    }
}
} // namespace arm_compute