/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#pragma once

#include <alloca.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <memory>
#include <tuple>
#include <vector>

#include "arm_gemm.hpp"
#include "bias_adder.hpp"
#include "convolver.hpp"
#include "ndrange.hpp"
#include "performance_parameters.hpp"
#include "transform.hpp"
#include "utils.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

#ifndef UNUSED
#define __I_DEFINED_UNUSED
#define UNUSED(x)  ((void)(x))
#endif

namespace arm_gemm {

namespace {

// We need to invoke the kernel differently for quantizing and non-quantizing cases, so here is a shim class to do
// that.
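// The primary template below only declares the interface; three specializations follow:
//   - <Nothing, false>:      plain (non-quantized) path, which passes bias/activation/accumulate straight to the kernel.
//   - <Requantize32, false>: quantized path with requantization handled inside the kernel (it is passed the
//                            Requantize32 parameters and column sums directly).
//   - <Requantize32, true>:  quantized path with a separate quantize step - GEMM into an intermediate buffer, then
//                            row sums and requantize_block_32() to produce the final output.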

template<typename OutputStage, bool SeparateQuantize = false>
class run_hybrid_kernel {
public:
    template<typename strategy, typename To, typename Tr>
    static void run (
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
        const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
};

template<>
template<typename strategy, typename To, typename Tr>
void run_hybrid_kernel<Nothing, false>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
        const Nothing &, const int32_t *, unsigned int) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
    UNUSED(kern_k);

    strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, bias_ptr, act, accumulate);
}

template<>
template<typename strategy, typename To, typename Tr>
void run_hybrid_kernel<Requantize32, false>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
#ifdef CYCLE_PROFILING
    auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
    UNUSED(kern_k);

    strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, &os, col_bias + n_0, n_0);
}

template<>
template<typename strategy, typename To, typename Tr>
void run_hybrid_kernel<Requantize32, true>::run(
#ifdef CYCLE_PROFILING
        profiler &prof,
#endif
        const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
        unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
        const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
    UNUSED(kern_k);
    // On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
    assert(M <= strategy::out_height());
    // We don't yet support indirect output (as the quantizer can't do it).
    assert(output_arg.is_indirect == false);

    // We need a row sum buffer and intermediate output buffer.
    // These go on the stack as they are not too large, using an automatic array and alloca() respectively.
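    // Sizes, in elements: 'row_sums' holds strategy::out_height() entries; 'result_buffer' holds
    // roundup(N, strategy::out_width()) * strategy::out_height() intermediate results.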
    int32_t row_sums[strategy::out_height()];
    typename strategy::result_type *result_buffer;

    unsigned int output_width = roundup(N, strategy::out_width());

    result_buffer = reinterpret_cast<typename strategy::result_type *>(alloca(output_width * strategy::out_height() * sizeof(typename strategy::result_type)));

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
#endif
        // Perform the GEMM, into the output buffer.
        strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, IndirectOutputArg<typename strategy::result_type>(result_buffer, output_width), nullptr, Activation(), false);
    }

    if (os.b_offset != 0) {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (unsigned long)M * kern_k);
#endif
        row_sums_indirect(num_strings, string_ptr, A_arg, M, row_sums, &os);
    } else {
        memset(row_sums, 0, sizeof(int32_t) * strategy::out_height());
    }

    {
#ifdef CYCLE_PROFILING
        auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (unsigned long)M * N);
#endif
        // Quantize
        requantize_block_32(os, N, M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0);
    }
}

} // anonymous namespace

// Implementation of the GemmCommon abstract class.
template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
class GemmHybridIndirect : public GemmCommon<To, Tr> {
    typedef typename strategy::operand_type Toi;
    typedef typename strategy::result_type Tri;

    GemmArgs           _args;
    OutputStage        _os = {};

    /* Quantized support (in addition to 'output stage' above) */
    int32_t *_col_bias = nullptr;

    const unsigned int _Ktotal;
    const unsigned int _rounded_Ksize;

    /* Blocking info */
    const unsigned int _k_block;
    const unsigned int _n_block;
    const unsigned int _Mround;

    /* Pretransposed buffer. */
    const Toi *_B_transposed=nullptr;

    /* Indirect parameters.  _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
    const To * const * const * _indirect_buf = nullptr;

    /* Convolver - only set up for convolution problems, so also doubles as a flag. */
    std::unique_ptr<convolver<To>>  _convolver = nullptr;

    // Array of pointers to output rows
//    Tr * const *        _output_ptrs;

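    // 4D iteration space: (M blocks of out_height()) x (batches) x (N blocks of _n_block) x (multis).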
    const NDRange<4> _window_range;

    unsigned int get_col_sum_size() const {
        if (std::is_same<OutputStage, Requantize32>::value) {
            return _args._Nsize * _args._nmulti * sizeof(int32_t);
        } else {
            return 0;
        }
    }

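    // Total K is the sum over all K sections, each padded up to a multiple of k_unroll().
    // Illustrative example (values assumed): Ksize=3, Ksections=9, k_unroll()=4 gives 9 * roundup(3,4) = 36,
    // rather than the unpadded 27.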
    static unsigned int get_ktotal(const GemmArgs &args) {
        return args._Ksections * roundup(args._Ksize, strategy::k_unroll());
    }

    static unsigned int compute_k_block(const GemmArgs &args) {
        // Some kernels don't support accumulate mode - these can't do K blocking at all.
        if (!strategy::supports_accumulate() || std::is_same<OutputStage, Requantize32>::value) {
            return get_ktotal(args);
        }

        if (args._cfg && args._cfg->inner_block_size) {
            return args._cfg->inner_block_size;
        }

        // Experimental data suggests an optimal block size of 512 for FP32 (scaling accordingly for other
        // datatypes); but don't divide into blocks until we hit 1.5X this size.
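        // Illustrative example (values assumed): for FP32, sizeof(To)==4 gives target_block_size==512 and a
        // threshold of 768.  With ktotal==2000: target_blocks==4, block_size==500, then rounded up to a multiple
        // of k_unroll().  With ktotal==700 we stay below the threshold and return ktotal unchanged.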
        unsigned int target_block_size = 2048 / sizeof(To);
        auto ktotal = get_ktotal(args);

        if (ktotal > ((target_block_size*3)/2)) {
            unsigned int target_blocks = iceildiv(ktotal, target_block_size);

            unsigned int block_size = iceildiv(ktotal, target_blocks);

            block_size = roundup(block_size, strategy::k_unroll());

            return block_size;
        }

        return ktotal;
    }

    // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width.  Otherwise do a
    // single block.
    static unsigned int compute_n_block(const GemmArgs &args, const OutputStage os = {}) {
        if (args._cfg && args._cfg->outer_block_size) {
            return args._cfg->outer_block_size;
        }

        if (args._Nsize <= 64) {
            return args._Nsize;
        }

        if ((args._Msize / args._Nsize) > 155) {
            return args._Nsize;
        }

        // "Asymmetric" quantizing GEMMs require a different approach - the tall skinny blocks we would otherwise
        // use imply a great deal of repeated work performing the row sums.  If row sums are involved, work out how
        // much "column" parallelism is going to be required and set the block size accordingly.
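        // Illustrative example (values assumed): nmulti=1, nbatches=1, Msize=12, out_height()=6 gives
        // multi_row_parallelism=2.  With maxthreads=8, columns_needed=4, so for Nsize=1000 the block size is
        // iceildiv(1000, 4)=250, rounded up to a multiple of out_width().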
        if (std::is_same<OutputStage, Requantize32>::value) {
            const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os);

            // Row sums only needed if b_offset isn't 0
            if (qp->b_offset != 0) {
                // We can already parallelize across batches, multis and rows (in units of 'out_height')
                int multi_row_parallelism = args._nmulti * args._nbatches * iceildiv(args._Msize, strategy::out_height());

                // If this isn't enough, we will need to split up the columns too.
                if (multi_row_parallelism < args._maxthreads) {
                    unsigned int columns_needed = iceildiv(args._maxthreads, multi_row_parallelism);

                    unsigned int n_block = iceildiv(args._Nsize, columns_needed);

                    return roundup(n_block, strategy::out_width());
                }

                // Multi/Batch/Row parallelism is enough - don't split up the columns.
                return args._Nsize;
            }
        }

        if (args._Ksize <= 128 && args._maxthreads <= 16) {
            return strategy::out_width() * 3;
        }

        return strategy::out_width();
    }

public:
    GemmHybridIndirect(GemmHybridIndirect &) = delete;
    GemmHybridIndirect & operator= (GemmHybridIndirect &) = delete;

    /* Constructor */
    GemmHybridIndirect(const GemmArgs &args, const OutputStage &os)
              : _args(args), _os(os), _Ktotal(get_ktotal(args)),
                _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
                _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)),
                _Mround(roundup(args._Msize, strategy::out_height())),
                _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                              iceildiv(args._Nsize, _n_block), args._nmulti)
    {
        // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
        // GemmConfig.  Clear out the pointer to avoid accidents.
        _args._cfg = nullptr;
    }

    /* Constructor without OutputStage */
    GemmHybridIndirect(const GemmArgs &args)
              : _args(args), _Ktotal(get_ktotal(args)),
                _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
                _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
                _Mround(roundup(args._Msize, strategy::out_height())),
                _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
                              iceildiv(args._Nsize, _n_block), args._nmulti)
    {
        // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
        // GemmConfig.  Clear out the pointer to avoid accidents.
        _args._cfg = nullptr;
    }

    // Interface implementation - Compulsory functions
    ndrange_t get_window_size() const override {
        return { _window_range.total_size() };
    }

    // This kernel can always be dynamically scheduled.
    bool supports_dynamic_scheduling() const override {
        return true;
    }

    // Execute
    void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
#ifdef CYCLE_PROFILING
        profiler prof;
#endif
        strategy strat(_args._ci);

        std::vector<const To *>         in_row_ptrs;
        std::vector<const To * const *> in_row_strings;
        std::vector<unsigned int>       string_lengths;

        // In convolution mode, we need input pointers.
        if (_convolver) {
            in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
            in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);

            for (unsigned int i=0; i<_args._Ksections; i++) {
                in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
            }
        }

        // In any indirect mode, we need the string lengths.
        if (_args._indirect_input) {
            string_lengths = std::vector<unsigned int>(_args._Ksections, 0);
        }

        /* Make sure we've been set up correctly. */
        assert(_B_transposed);
        static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
//        static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");

        /* For now, each work item implies all the K for a given output
         * pixel (so we don't need to synchronize access to the output
         * array).  So separate the loop over K blocks here.  */
        for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
            unsigned int kmax   = std::min(k0 + _k_block, _Ktotal);
            unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());

            const bool first_pass = (k0 == 0);
            const bool last_pass = (kmax == _Ktotal);

            unsigned int first_section = (k0 / _rounded_Ksize);
            unsigned int first_offset  = (k0 % _rounded_Ksize);
            unsigned int kleft = kern_k;
            unsigned int sections=0;
            unsigned int offset = first_offset;

            if (_args._indirect_input) {
                while (kleft) {
                    // When chopping into sections: the amount that goes into 'string_lengths' is the amount to be
                    // processed (excluding padding).  But the amount we subtract from 'kleft' takes account of any
                    // padding applied.
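                    // Illustrative example (values assumed): Ksize=3, k_unroll()=4 so _rounded_Ksize=4.  Starting
                    // at first_offset=0 with kern_k=8 gives string_lengths={3,3} and sections=2; the 8 removed from
                    // 'kleft' includes one padding element per section.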
                    string_lengths[sections] = std::min(kleft, _args._Ksize - offset);
                    kleft -= std::min(kleft, _rounded_Ksize - offset);
                    sections++;
                    offset=0;
                }
            }

            auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));

            if (p.done()) {
                return;
            }

            // Process rows either 'out_height' rows at a time, or do all valid rows at once with a single kernel call.
            // The separate quantizer path only handles one block of rows at a time (as it has to store sums and intermediate results).
            // The convolution path only generates the pointers for one block of rows at a time.
            const bool process_all_rows = (!SeparateQuantize && !_convolver);

            do {
                const unsigned int m_start = p.dim(0) * strategy::out_height();
                const unsigned int m_end   = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize) : std::min(m_start + strategy::out_height(), _args._Msize);
//                const unsigned int m_end   = std::min(m_start + strategy::out_height(), _args._Msize);
                const unsigned int batch   = p.dim(1);
                const unsigned int n0      = p.dim(2) * _n_block;
                const unsigned int nmax    = std::min(n0 + _n_block, _args._Nsize);
                const unsigned int multi   = p.dim(3);

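                // Locate the pretransposed panel for this work item: each multi occupies
                // roundup(Nsize, out_width()) * _Ktotal elements, this K block starts a further
                // k0 * roundup(Nsize, out_width()) in, and the columns from n0 onwards a further n0 * kern_k in.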
                const Toi *b_panel = _B_transposed +
                                     (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
                                     (k0 * roundup(_args._Nsize, strategy::out_width())) +
                                     (n0 * kern_k);

                IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);

#ifdef CYCLE_PROFILING
                auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
#endif
                if (_indirect_buf) {
                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                                 prof,
#endif
                                 strat, sections, string_lengths.data(),
                                 IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
                                 (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                                 (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                                 last_pass ? _args._act : Activation(),
                                 !first_pass,
                                 // Quantization parameters
                                 _os, _col_bias+(multi * _args._Nsize), n0);
                } else if (_convolver) {
                    auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize);

                    unsigned int pos=0;
                    auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start);

                    while (!conv_rows.finished()) {
                        unsigned int width, conv_offset;

                        assert(pos < sections);

                        std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()]));

                        if (pos==0) {
                            assert(conv_offset == first_offset);
                        }
                        assert(width == string_lengths[pos]);
                        pos++;
                    }
                    assert(pos == sections);

                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                                 prof,
#endif
                                 strat, sections, string_lengths.data(),
                                 IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
                                 (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                                 (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                                 last_pass ? _args._act : Activation(),
                                 !first_pass,
                                 // Quantization parameters
                                 _os, _col_bias+(multi * _args._Nsize), n0);
                } else {
                    // Length to process.  This needs to exclude padding, but 'kmax' potentially includes it.
                    const unsigned int len = (std::min(_args._Ksize, kmax) - k0);

                    run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
#ifdef CYCLE_PROFILING
                                 prof,
#endif
                                 strat, 1, &len,
                                 IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
                                 (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
                                 (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
                                 last_pass ? _args._act : Activation(),
                                 !first_pass,
                                 // Quantization parameters
                                 _os, _col_bias+(multi * _args._Nsize), n0);
                }
            } while (process_all_rows ? p.next_dim1() : p.next_dim0());
        }
    }

    // Interface implementation - pretransposed
    bool B_is_pretransposed() const override {
        return true;
    }

    bool B_pretranspose_required() const override {
        return (_B_transposed==nullptr);
    }

    size_t get_B_pretransposed_array_size() const override {
        // Start with actual pretransposed buffer...
        size_t size = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi);

        // Space for result row pointers (not strictly needed any more but retained for indirect output testing)
        size += _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);

        if (std::is_same<OutputStage, Requantize32>::value) {
            size += get_col_sum_size();
        }

        return size;
    }

    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            _col_bias = reinterpret_cast<int32_t *>(in_buffer);

            Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);

            for (unsigned int i=0; i<_args._nmulti; i++) {
                // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size.
                compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
            }
        }

        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _B_transposed = buffer;

        strategy strat(_args._ci);

        for (unsigned int multi=0; multi<_args._nmulti; multi++) {
            for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
                const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);

                /* Figure out the size of each block. */
                unsigned int k_size = kmax - k0;

                // We need to insert padding at the end of each K section.
                // The computation needed is a little delicate - the coordinates from the block walker are expressed in
                // terms of the full, padded, _Ktotal.
                // But we need to transform each section with reference to the original, unpadded, input, letting the
                // transform pad each section as needed.
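                // Illustrative example (values assumed): Ksize=3, k_unroll()=4, so rounded_section_size=4.  A block
                // position kpos=4 maps to k_section_base=1, k_offset=0, i.e. row 3 of the unpadded input; the
                // transform then re-pads that 3-row section back up to 4 rows on output.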

                // This is needed for computations below.
                const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());

                // The expected output format is also an entire <out_width> columns interleaved, then the next set of
                // columns, and so on.  This means, as we are breaking it up vertically, we have to do it one column at
                // a time.
                for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
                    unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);

                    // Track where we are and how much work is left.
                    unsigned int kpos  = k0;
                    unsigned int kleft = k_size;

                    while (kleft) {
                        // Which section are we in?  Based on the rounded-up section size.
                        unsigned int k_section_base = kpos / rounded_section_size;
                        // How far into the section are we?
                        unsigned int k_offset = kpos - (k_section_base * rounded_section_size);

                        // We will either copy the rest of this section, or up to the end of the requested length.
                        unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);

                        strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
                                                  x0, xmax,
                                                  (k_section_base * _args._Ksize) + k_offset,               // K starting point - compute row to read based on our section and the true section length.
                                                  (k_section_base * _args._Ksize) + k_offset + k_length);   // K end point - starting point plus length computed above.

                        // We need to modify our position based on the ROUNDED version of what we just did.
                        unsigned int padded_length = roundup(k_length, strategy::k_unroll());

                        buffer += strategy::out_width() * padded_length;

                        kpos  += padded_length;
                        kleft -= padded_length;
                    }
                }
            }
        }
    }

    void set_pretransposed_B_data(void *in_buffer) override {
        // Put the transposed data after the column sums - in non-quantized cases get_col_sum_size() == 0
        uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
        _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
        _col_bias = reinterpret_cast<int32_t *>(in_buffer);
    }

    // Estimate cycles for given problem given provided parameters
    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
        // Note: Current hybrid kernels don't actually round up height (they
        // have paths for each possible height).  Might need to make this
        // configurable in future.
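        // Illustrative example (values assumed): nbatches=nmulti=1, Msize=64, Nsize rounded to 96 and Ksize
        // rounded to 64 gives total_macs=393216; with kernel_macs_cycle=16 that is roughly 24576 cycles before
        // any narrow-N penalty.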
        uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * roundup(args._Nsize, strategy::out_width()) * roundup(args._Ksize, strategy::k_unroll());

        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;

        // TODO: A bit of a kludge here: current hybrid kernels incur extra
        // overhead where the width is not a multiple of kernel width.  It's
        // most noticeable where the overall width is quite low, so add 15%
        // penalty for such widths.
        if ((args._Nsize < strategy::out_width()) || (args._Nsize > strategy::out_width() && args._Nsize < 2*strategy::out_width())) {
            mac_cycles *= 1.15f;
        }

        uint64_t total_cycles = mac_cycles;

        return total_cycles;
    }

    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
        if (std::is_same<OutputStage, Requantize32>::value) {
            Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os);

            qp->bias = bias;
            qp->bias_multi_stride = bias_multi_stride;
        }
    }

    void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
        assert(string_len == _args._Ksize);
        _indirect_buf = ptr;
    }

    void set_convolution_parameters(ConvolutionParameters parms) override {
        assert(parms.input_channels == _args._Ksize);
        _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms));
    }
};

} // namespace arm_gemm

#ifdef __I_DEFINED_UNUSED
#undef UNUSED
#endif