• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "asmlib.hpp"
26 #include "convolution_parameters.hpp"
27 #include "convolver.hpp"
28 #include "interleave_indirect.hpp"
29 #include "bfloat.hpp"
30 
31 #include <alloca.h>
32 
33 #include <algorithm>
34 #include <cstddef>
35 #include <cstdint>
36 #include <cstdio>
37 #include <cstring>
38 #include <tuple>
39 #include <type_traits>
40 #include <vector>
41 
42 #include <arm_neon.h>
43 
44 #include "utils.hpp"
45 
46 namespace arm_gemm {
47 
/*
 * Core function that does the heavy lifting - interleave 'int_by' rows of width 'width' together.
 *
 * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
 * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
 * with a particular value).
 *
 * Note that it is not expected for this templated version to ever be used - all cases that matter should be
 * explicitly specialized with an optimized implementation.
 */
58 template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
interleave_block(TOut * & out,const TIn * const * in,size_t width,size_t height,size_t row_offset,bool first)59 void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
60     const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
61 
62     std::vector<int32_t> the_sums;
63 
64     if (integrate_sums) {
65         the_sums = std::vector<int32_t>(int_by, 0);
66 
67         if (!first) {
68             // In 'integrate sums' mode, we dump the sums at the end on each pass.
69 
70             // On the last pass this is correct, but on other passes it is not -
71             // so on the subsequent pass we need to take the output written by
72             // the previous pass as starting point for the sums, and then
73             // overwrite them with new interleaved data.
74             int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
75 
76             // Rewind pointer to where we wrote out the sums last time.
77             out_int32 -= int_by;
78 
79             // Restore the running sums.
80             memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));
81 
82             // Update the "real" pointer so that the next output will clobber the old sums.
83             out = reinterpret_cast<TOut *>(out_int32);
84         }
85     }
86 
87     for (unsigned int pos=0; pos<width; pos+=block) {
88         for (unsigned int row=0; row<int_by; row++) {
89             // Row out of range - pad 'block' entries.
90             if (row >= height) {
91                 for (unsigned int col=0; col<block; col++) {
92                     *out++ = 0;
93                 }
94                 continue;
95             }
96 
97             for (unsigned int col=0; col<block; col++) {
98                 // Column out of range - pad a single entry
99                 if (pos + col >= width) {
100                     *out++ = 0;
101                     continue;
102                 }
103 
104                 if (integrate_sums) {
105                     the_sums[row] += in[row][row_offset + pos + col];
106                 }
107 
108                 *out++ = in[row][row_offset + pos + col];
109             }
110         }
111     }
112 
113     if (integrate_sums) {
114         int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
115 
116         memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));
117 
118         out = reinterpret_cast<TOut *>(out_int32 + int_by);
119     }
120 }
121 
122 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
FixupRowSums(TOut * & out,const int32_t row_sum_multiplier)123 inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
124     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
125 
126     // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
127     if (row_sum_multiplier) {
128         // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
129         // next block (post sums).
130         // We need to go back and apply the multiplier to the computed sums.  We don't need to change 'out'.
131         int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
132 
133         out_int32 -= height;
134         for (unsigned int i=0; i<height; i++) {
135             out_int32[i] *= row_sum_multiplier;
136         }
137     } else {
138         // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
139         // sum block.  We need to insert the (zero) sums, and advance 'out'.
140         int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
141 
142         for (unsigned int i=0; i<height; i++) {
143             out_int32[i] = 0;
144         }
145 
146         out_int32 += height;
147 
148         out = reinterpret_cast<TOut *>(out_int32);
149     }
150 }
151 
152 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
IndirectInterleave(TOut * out,const TIn * const * const * ptr,unsigned int stringlen,unsigned int rounded_stringlen,const unsigned int y0,const unsigned int ymax,const unsigned int k0,const unsigned int kmax,bool integrate_sums,const int32_t row_sum_multiplier)153 void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
154                         unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
155                         const unsigned int k0, const unsigned int kmax, bool integrate_sums,
156                         const int32_t row_sum_multiplier) {
157     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
158 
159     // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
160     // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
161     // out of range rows).  This allows interleave_block to use techniques like row predication, or loading all
162     // pointers and conditionally overriding the out of range ones.
163 
164     // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
165     // range reads.  Avoid this with a local buffer to use in last-rows cases.  Use alloca as a std::vector can be
166     // expensive in highly threaded scenarios.
167     const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
168 
169     // Figure out the starting position based on k0 (with rounded length)
170     unsigned int start_string      = k0 / rounded_stringlen;
171     unsigned int start_stringpos   = k0 % rounded_stringlen;
172 
173     // Process blocks of 'height' height...
174     for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
175         // Height to process
176         unsigned int active_height = std::min(ymax - ybase, height);
177 
178         // Track our progress through the various strings
179         unsigned int k_left    = (kmax - k0);
180         unsigned int string    = start_string;
181         unsigned int stringpos = start_stringpos;
182 
183         bool first = true;
184 
185         // Prepare to call 'interleave_block' above for each string encompassed by K range
186         while (k_left > 0) {
187             // Width to process - and the width we will generate (with padding)
188             unsigned int in_width   = std::min(k_left, stringlen - stringpos);
189             unsigned int out_width  = std::min(k_left, rounded_stringlen - stringpos);
190 
191             const TIn * const *row_base = ptr[string] + ybase;
192 
193             // If not all rows are valid, copy the ones that are into local array (see above comment).
194             if (active_height < height) {
195                 for (unsigned int i=0; i<active_height; i++) {
196                     row_ptrs[i] = ptr[string][ybase + i];
197                 }
198 
199                 row_base = row_ptrs;
200             }
201 
202             // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
203             // much code.  However, integrated sums make no sense for non-integral types and won't ever be
204             // requested.  So put a type trait check here to avoid generating pointless code.
205             if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
206                 interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
207             } else {
208                 interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
209             }
210 
211             k_left -= out_width;
212             string++;
213             stringpos=0;
214             first=false;
215         }
216 
217         if (std::is_integral<TOut>::value && integrate_sums) {
218             FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
219         }
220     }
221 }
222 
223 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
ConvolutionInterleave(TOut * out,const TIn * in,size_t in_stride,const convolver<TIn> & conv,const unsigned int rounded_stringlen,const unsigned int y0,const unsigned int ymax,const unsigned int k0,const unsigned int kmax,bool integrate_sums,const int32_t row_sum_multiplier)224 void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
225         const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
226     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
227 
228     auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
229 
230     // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
231     const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
232 
233     for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
234         // How many of the rows are active - the rest will get padded in interleave_block.
235         unsigned int active_height   = std::min(ymax - ybase, height);
236         bool first = true;
237 
238         auto conv_rows = conv_cols.process_rows(ybase, active_height);
239 
240         while (!conv_rows.finished()) {
241             unsigned int width, offset;
242 
243             // Get next set of parameters
244             std::tie(width, offset) = conv_rows.next_block(row_ptrs);
245 
246             // Perform the interleave
247             if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
248                 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
249             } else {
250                 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
251             }
252 
253             first=false;
254         }
255 
256         if (std::is_integral<TOut>::value && integrate_sums) {
257             FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
258         }
259     }
260 }
261 
262 template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
Interleave(TOut * out,const TIn * in,size_t in_stride,const unsigned int y0,const unsigned int ymax,const unsigned int k0,const unsigned int kmax,bool integrate_sums,const int32_t row_sum_multiplier)263 void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
264     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
265 
266     // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
267     const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
268 
269     const unsigned int width=kmax-k0;
270 
271     for (unsigned int y=y0; y<ymax; y+=height) {
272         for (unsigned int r=0; r<height; r++) {
273             row_ptrs[r] = in + ((y + r) * in_stride);
274         }
275 
276         if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
277             interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
278         } else {
279             interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
280         }
281 
282         if (std::is_integral<TOut>::value && integrate_sums) {
283             FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
284         }
285     }
286 }
287 
288 #include "indirect-interleaves/list.hpp"
289 
290 /**** Instantiate needed implementations ****/
291 
292 /* AArch32 */
293 #ifdef __arm__
294 /* FP32 */
295 /* NEON implementation (height 6) */
296 template void IndirectInterleave<6, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
297 template void ConvolutionInterleave<6, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
298 template void Interleave<6, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
299 
300 /* FP16 */
301 #if __ARM_FP16_ARGS
302 /* NEON implementation using FP32 kernel (height 6) */
303 template void IndirectInterleave<6, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
304 template void ConvolutionInterleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
305 template void Interleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
306 #endif /* __ARM_FP16_ARGS */
307 
308 /* BF16 */
309 /* NEON implementation using FP32 kernel */
310 template void IndirectInterleave<6, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
311 template void ConvolutionInterleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
312 template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
313 #endif
314 
315 /* AArch64 */
316 #ifdef __aarch64__
317 /* FP64 */
318 /* NEON/SVE implementation (height 8) */
319 template void IndirectInterleave<8, 1, VLType::None>(double *, const double * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
320 template void ConvolutionInterleave<8, 1, VLType::None>(double *, const double *, size_t, const convolver<double> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
321 template void Interleave<8, 1, VLType::None>(double *, const double *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
322 
323 /* FP32 */
324 /* NEON/SVE implementation (height 8) */
325 template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
326 template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
327 template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
328 
329 /* FMMLA */
330 template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
331 template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
332 template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
333 
334 /* FP16 */
335 template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
336 template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
337 template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
338 
339 template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
340 template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
341 template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
342 
343 /* BF16 */
344 /* NEON/SVE BFDOT */
345 template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
346 template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
347 template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
348 
349 template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
350 template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
351 template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
352 
353 /* NEON/SVE using FP32 kernel */
354 template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
355 template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
356 template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
357 
358 /* INT16 */
359 template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
360 template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, const convolver<int16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
361 template void Interleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
362 
363 template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
364 template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, const convolver<uint16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
365 template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
366 
367 /* INT8 */
368 /* NEON SMLA/SMLAL (height 4, block 16) */
369 template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
370 template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
371 template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
372 
373 /* NEON SDOT (height 8, block 4) */
374 template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
375 template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
376 template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
377 
378 /* MMLA SMMLA (height 8, block 8) */
379 template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
380 template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
381 template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
382 
383 /* NEON SDOT (height 8, block 1) */
384 template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
385 template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
386 template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
387 
388 /* NEON SMLA/SMLAL (height 4, block 16) */
389 template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
390 template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
391 template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
392 
393 /* NEON SDOT (height 8, block 4) */
394 template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
395 template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
396 template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
397 
398 /* MMLA SMMLA (height 8, block 8) */
399 template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
400 template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
401 template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
402 
403 /* NEON 16-bit (height 8, block 1) */
404 template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
405 template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
406 template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
407 #endif // __aarch64__
408 
409 } // namespace arm_gemm
410