• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 Google LLC. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <cstdint>
17 
18 #include "ruy/asm_helpers.h"
19 #include "ruy/check_macros.h"
20 #include "ruy/kernel_arm.h"
21 #include "ruy/opt_set.h"
22 #include "ruy/platform.h"
23 #include "ruy/profiler/instrumentation.h"
24 
25 namespace ruy {
26 
27 #if RUY_PLATFORM_NEON_64 && RUY_OPT(ASM)
28 
29 #define RUY_ASM_LABEL_STORE_UINT8 91
30 #define RUY_ASM_LABEL_STORE_INT8 92
31 #define RUY_ASM_LABEL_STORE_INT16 93
32 #define RUY_ASM_LABEL_STORE_INT32 94
33 #define RUY_ASM_LABEL_AFTER_STORE 99
34 
35 #define RUY_OFFSET_BIAS 0
36 #define RUY_OFFSET_LHS_SUMS 8
37 #define RUY_OFFSET_RHS_SUMS 16
38 #define RUY_OFFSET_LHS_BASE_PTR 24
39 #define RUY_OFFSET_MULTIPLIER_FIXEDPOINT 32
40 #define RUY_OFFSET_MULTIPLIER_EXPONENT 40
41 #define RUY_OFFSET_RHS_BASE_PTR 48
42 #define RUY_OFFSET_DST_BASE_PTR 56
43 #define RUY_OFFSET_LHS_ZERO_POINT 64
44 #define RUY_OFFSET_RHS_ZERO_POINT 68
45 #define RUY_OFFSET_DST_ZERO_POINT 72
46 #define RUY_OFFSET_PROD_ZP_DEPTH 76
47 #define RUY_OFFSET_START_ROW 80
48 #define RUY_OFFSET_START_COL 84
49 #define RUY_OFFSET_LAST_ROW 88
50 #define RUY_OFFSET_LAST_COL 92
51 #define RUY_OFFSET_DST_ROWS 96
52 #define RUY_OFFSET_DST_COLS 100
53 #define RUY_OFFSET_LHS_STRIDE 104
54 #define RUY_OFFSET_RHS_STRIDE 108
55 #define RUY_OFFSET_DST_STRIDE 112
56 #define RUY_OFFSET_DEPTH 116
57 #define RUY_OFFSET_CLAMP_MIN 120
58 #define RUY_OFFSET_CLAMP_MAX 124
59 #define RUY_OFFSET_FLAGS 128
60 
61 template <typename Params>
CheckOffsetsInKernelParams8bit(const Params &)62 void CheckOffsetsInKernelParams8bit(const Params&) {
63   static_assert(offsetof(Params, lhs_zero_point) == RUY_OFFSET_LHS_ZERO_POINT,
64                 "");
65   static_assert(offsetof(Params, rhs_zero_point) == RUY_OFFSET_RHS_ZERO_POINT,
66                 "");
67   static_assert(offsetof(Params, dst_zero_point) == RUY_OFFSET_DST_ZERO_POINT,
68                 "");
69   static_assert(offsetof(Params, prod_zp_depth) == RUY_OFFSET_PROD_ZP_DEPTH,
70                 "");
71   static_assert(offsetof(Params, multiplier_fixedpoint) ==
72                     RUY_OFFSET_MULTIPLIER_FIXEDPOINT,
73                 "");
74   static_assert(
75       offsetof(Params, multiplier_exponent) == RUY_OFFSET_MULTIPLIER_EXPONENT,
76       "");
77   static_assert(offsetof(Params, clamp_min) == RUY_OFFSET_CLAMP_MIN, "");
78   static_assert(offsetof(Params, clamp_max) == RUY_OFFSET_CLAMP_MAX, "");
79   static_assert(offsetof(Params, bias) == RUY_OFFSET_BIAS, "");
80   static_assert(offsetof(Params, lhs_sums) == RUY_OFFSET_LHS_SUMS, "");
81   static_assert(offsetof(Params, rhs_sums) == RUY_OFFSET_RHS_SUMS, "");
82   static_assert(offsetof(Params, flags) == RUY_OFFSET_FLAGS, "");
83   static_assert(offsetof(Params, lhs_base_ptr) == RUY_OFFSET_LHS_BASE_PTR, "");
84   static_assert(offsetof(Params, start_row) == RUY_OFFSET_START_ROW, "");
85   static_assert(offsetof(Params, last_row) == RUY_OFFSET_LAST_ROW, "");
86   static_assert(offsetof(Params, last_col) == RUY_OFFSET_LAST_COL, "");
87   static_assert(offsetof(Params, lhs_stride) == RUY_OFFSET_LHS_STRIDE, "");
88   static_assert(offsetof(Params, rhs_stride) == RUY_OFFSET_RHS_STRIDE, "");
89   static_assert(offsetof(Params, dst_stride) == RUY_OFFSET_DST_STRIDE, "");
90   static_assert(offsetof(Params, depth) == RUY_OFFSET_DEPTH, "");
91 }
92 
93 // Fast-int8-trick kernel, similar to this production gemmlowp kernel:
94 // NEON_64bit_GEMM_Int8Operands_AccumTwoWithin16Bits
95 // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L2296
96 //
97 // Relevant target CPUs for this kernel include ARM Cortex-A73 and Cortex-A75,
98 // since these are 64-bit, out-of-order and without dotprod support.
Kernel8bitNeon(const KernelParams8bit<4,4> & params)99 void Kernel8bitNeon(const KernelParams8bit<4, 4>& params) {
100   profiler::ScopeLabel label("Kernel (kNeon)");
101   CheckOffsetsInKernelParams8bit(params);
102 
103   const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
104   const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
105   const std::int8_t* lhs_ptr = lhs_col_ptr;
106   const std::int8_t* rhs_ptr = rhs_col_ptr;
107   void* dst_col_ptr = params.dst_base_ptr;
108   void* dst_ptr = dst_col_ptr;
109   int row = params.start_row;
110   int col = params.start_col;
111 
112   // The asm kernel below has the following NEON register allocation:
113   //
114   // v16 -- v31 are int32 accumulators.
115   // During accumulation, v0 -- v3 are used to load int8 data from LHS and
116   // v4 -- v7 from RHS:
117   //
118   //                                      int8 RHS 16x4 block
119   //                           /-----------------------------------------|
120   //                           |v4.b[0]          ...           v7.b[0]   |
121   //                           |  ...                            ...     |
122   //                           |v4.b[15]         ...           v7.b[15]  |
123   //                           \-----------------------------------------/
124   //    int8 LHS 4x16 block
125   //  /---------------------\  /-----------------------------------------|
126   //  |v0.b[0] ... v0.b[15] |  |v16.4s           ...           v28.4s    |
127   //  |v1.b[0] ... v1.b[15] |  |v17.4s           ...           v29.4s    |
128   //  |v2.b[0] ... v2.b[15] |  |v18.4s           ...           v30.4s    |
129   //  |v3.b[0] ... v3.b[15] |  |v19.4s           ...           v31.4s    |
130   //  \---------------------/  \-----------------------------------------/
131   //                                  int32 accumulators 4x4 block
132   //
133   // No attempt had been made so far at implementing the RUY_OPT_MAX_STREAMING
134   // optimization for this kernel.
135   asm volatile(
136 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
137 
138         // clang-format off
139 
140         // Load some parameters into registers.
141         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
142         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
143         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
144         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
145         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
146         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
147         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
148         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
149 
150         // Load the first 64 bytes of LHS and RHS data.
151         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
152         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
153         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
154         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
155         "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
156         "ld1 {v5.16b}, [%[rhs_ptr]], #16\n"
157         "ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
158         "ld1 {v7.16b}, [%[rhs_ptr]], #16\n"
159 
160         // Clear accumulators.
161         RUY_MAKE_ZERO(v16)
162         RUY_MAKE_ZERO(v17)
163         RUY_MAKE_ZERO(v18)
164         RUY_MAKE_ZERO(v19)
165         RUY_MAKE_ZERO(v20)
166         RUY_MAKE_ZERO(v21)
167         RUY_MAKE_ZERO(v22)
168         RUY_MAKE_ZERO(v23)
169         RUY_MAKE_ZERO(v24)
170         RUY_MAKE_ZERO(v25)
171         RUY_MAKE_ZERO(v26)
172         RUY_MAKE_ZERO(v27)
173         RUY_MAKE_ZERO(v28)
174         RUY_MAKE_ZERO(v29)
175         RUY_MAKE_ZERO(v30)
176         RUY_MAKE_ZERO(v31)
177 
178         // w1 is the number of levels of depth that we have already loaded
179         // LHS and RHS data for. Corresponding to the initial ld1 instructions
180         // above, this is currently 16.
181         "mov w1, #16\n"
182 
183         // Perform the first few multiply-adds on the data that we have already
184         // loaded.
185         "smull    v8.8h,  v0.8b,  v4.8b\n"
186         "smull    v9.8h,  v1.8b,  v4.8b\n"
187         "smull    v10.8h,  v2.8b,  v4.8b\n"
188         "smull    v11.8h,  v3.8b,  v4.8b\n"
189         "smull    v12.8h,  v0.8b,  v5.8b\n"
190         "smull    v13.8h,  v1.8b,  v5.8b\n"
191         "smull    v14.8h,  v2.8b,  v5.8b\n"
192         "smull    v15.8h,  v3.8b,  v5.8b\n"
193 
194         // Multiply-accumulate second-half, again into the same
195         // 16bit local accumulator registers. This is where we
196         // take advantage of having int8 instead of uint8 and therefore
197         // being able to accumulate two products into int16.
198         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
199         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
200         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
201         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
202         "smlal2   v12.8h,  v0.16b,  v5.16b\n"
203         "smlal2   v13.8h,  v1.16b,  v5.16b\n"
204         "smlal2   v14.8h,  v2.16b,  v5.16b\n"
205         "smlal2   v15.8h,  v3.16b,  v5.16b\n"
206 
207 
208         // Main loop of the whole GEMM, over rows and columns of the
209         // destination matrix.
210         "1:\n"
211 
212         // Reminder - w1 is how many levels of depth we have already loaded
213         // data for, w12 is the total depth.
214         "cmp w1, w12\n"
215         "beq 79f\n"
216 
217         "2:\n"
218 
219         // Some multiplications and 16-bit accumulation were already done above,
220         // so we start right away in the middle.
221         "sadalp  v16.4s, v8.8h\n"
222         "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
223         "smull    v8.8h,  v0.8b,  v6.8b\n"
224         "sadalp  v17.4s, v9.8h\n"
225         "ld1 {v5.16b}, [%[rhs_ptr]], #16\n"
226         "smull    v9.8h,  v1.8b,  v6.8b\n"
227         "sadalp  v18.4s, v10.8h\n"
228         "smull    v10.8h,  v2.8b,  v6.8b\n"
229         "sadalp  v19.4s, v11.8h\n"
230         "smull    v11.8h,  v3.8b,  v6.8b\n"
231         "sadalp  v20.4s, v12.8h\n"
232         "smull    v12.8h,  v0.8b,  v7.8b\n"
233         "sadalp  v21.4s, v13.8h\n"
234         "smull    v13.8h,  v1.8b,  v7.8b\n"
235         "sadalp  v22.4s, v14.8h\n"
236         "smull    v14.8h,  v2.8b,  v7.8b\n"
237         "sadalp  v23.4s, v15.8h\n"
238         "smull    v15.8h,  v3.8b,  v7.8b\n"
239 
240         // Multiply-accumulate second-half, again into the same
241         // 16bit local accumulator registers. This is where we
242         // take advantage of having int8 instead of uint8 and therefore
243         // being able to accumulate two products into int16.
244         "smlal2   v8.8h,  v0.16b,  v6.16b\n"
245         "smlal2   v9.8h,  v1.16b,  v6.16b\n"
246         "smlal2   v10.8h,  v2.16b,  v6.16b\n"
247         "smlal2   v11.8h,  v3.16b,  v6.16b\n"
248 
249         "ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
250 
251         "smlal2   v12.8h,  v0.16b,  v7.16b\n"
252         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
253         "smlal2   v13.8h,  v1.16b,  v7.16b\n"
254         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
255         "smlal2   v14.8h,  v2.16b,  v7.16b\n"
256         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
257         "smlal2   v15.8h,  v3.16b,  v7.16b\n"
258         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
259 
260         "sadalp  v24.4s, v8.8h\n"
261         "smull    v8.8h,  v0.8b,  v4.8b\n"
262         "sadalp  v25.4s, v9.8h\n"
263         "ld1 {v7.16b}, [%[rhs_ptr]], #16\n"
264         "smull    v9.8h,  v1.8b,  v4.8b\n"
265         "sadalp  v26.4s, v10.8h\n"
266         "smull    v10.8h,  v2.8b,  v4.8b\n"
267         "sadalp  v27.4s, v11.8h\n"
268         "smull    v11.8h,  v3.8b,  v4.8b\n"
269         "sadalp  v28.4s, v12.8h\n"
270         "smull    v12.8h,  v0.8b,  v5.8b\n"
271         "sadalp  v29.4s, v13.8h\n"
272         "smull    v13.8h,  v1.8b,  v5.8b\n"
273         "sadalp  v30.4s, v14.8h\n"
274         "smull    v14.8h,  v2.8b,  v5.8b\n"
275         "sadalp  v31.4s, v15.8h\n"
276         "smull    v15.8h,  v3.8b,  v5.8b\n"
277 
278         // Multiply-accumulate second-half, again into the same
279         // 16bit local accumulator registers. This is where we
280         // take advantage of having int8 instead of uint8 and therefore
281         // being able to accumulate two products into int16.
282         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
283         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
284         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
285         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
286 
287         "smlal2   v12.8h,  v0.16b,  v5.16b\n"
288         "smlal2   v13.8h,  v1.16b,  v5.16b\n"
289         "smlal2   v14.8h,  v2.16b,  v5.16b\n"
290         "smlal2   v15.8h,  v3.16b,  v5.16b\n"
291 
292 
293 
294         // Each iteration of this loop advances by 16 levels of depth.
295         "add w1, w1, #16\n"
296 
297         // Loop termination condition
298         "cmp w1, w12\n"
299 
300         "blt 2b\n"
301 
302         "79:\n"
303 
304         "sadalp  v16.4s, v8.8h\n"
305         "smull    v8.8h,  v0.8b,  v6.8b\n"
306         "sadalp  v17.4s, v9.8h\n"
307         "smull    v9.8h,  v1.8b,  v6.8b\n"
308         "sadalp  v18.4s, v10.8h\n"
309         "smull    v10.8h,  v2.8b,  v6.8b\n"
310         "sadalp  v19.4s, v11.8h\n"
311         "smull    v11.8h,  v3.8b,  v6.8b\n"
312         "sadalp  v20.4s, v12.8h\n"
313         "smull    v12.8h,  v0.8b,  v7.8b\n"
314         "sadalp  v21.4s, v13.8h\n"
315         "smull    v13.8h,  v1.8b,  v7.8b\n"
316         "sadalp  v22.4s, v14.8h\n"
317         "smull    v14.8h,  v2.8b,  v7.8b\n"
318         "sadalp  v23.4s, v15.8h\n"
319         "smull    v15.8h,  v3.8b,  v7.8b\n"
320 
321         // Multiply-accumulate second-half, again into the same
322         // 16bit local accumulator registers. This is where we
323         // take advantage of having int8 instead of uint8 and therefore
324         // being able to accumulate two products into int16.
325         "smlal2   v8.8h,  v0.16b,  v6.16b\n"
326         "smlal2   v9.8h,  v1.16b,  v6.16b\n"
327         "smlal2   v10.8h,  v2.16b,  v6.16b\n"
328         "smlal2   v11.8h,  v3.16b,  v6.16b\n"
329 
330         "smlal2   v12.8h,  v0.16b,  v7.16b\n"
331         "smlal2   v13.8h,  v1.16b,  v7.16b\n"
332         "smlal2   v14.8h,  v2.16b,  v7.16b\n"
333         "smlal2   v15.8h,  v3.16b,  v7.16b\n"
334 
335         "sadalp  v24.4s, v8.8h\n"
336         "sadalp  v25.4s, v9.8h\n"
337         "sadalp  v26.4s, v10.8h\n"
338         "sadalp  v27.4s, v11.8h\n"
339         "sadalp  v28.4s, v12.8h\n"
340         "sadalp  v29.4s, v13.8h\n"
341         "sadalp  v30.4s, v14.8h\n"
342         "sadalp  v31.4s, v15.8h\n"
343 
344         // End of accumulation. The registers v16 -- v31 contain the final
345         // int32 accumulator values of the current 4x4 destination block.
346         // We now have to compute the final 8-bit values from these int32
347         // accumulators, and advance to the next 4x4 block. We intertwine
348         // these two aspects whenever possible for optimal pipelining, both
349         // at the data flow level (prefetch data for next block as early as
350         // possible) and instruction pipelining level (some of the next-block
351         // work can dual-issue with some of the final work on the current
352         // block).
353 
354         // Reduce 32bit accumulators horizontally.
355         "addp v16.4s, v16.4s, v17.4s\n"
356         "addp v18.4s, v18.4s, v19.4s\n"
357         "addp v20.4s, v20.4s, v21.4s\n"
358         "addp v22.4s, v22.4s, v23.4s\n"
359         "addp v24.4s, v24.4s, v25.4s\n"
360         "addp v26.4s, v26.4s, v27.4s\n"
361         "addp v28.4s, v28.4s, v29.4s\n"
362         "addp v30.4s, v30.4s, v31.4s\n"
363 
364         // Reduce 32bit accumulators horizontally, second pass
365         // (each pass adds pairwise. we need to add 4-wise).
366         "addp v16.4s, v16.4s, v18.4s\n"
367         "addp v17.4s, v20.4s, v22.4s\n"
368         "addp v18.4s, v24.4s, v26.4s\n"
369         "addp v19.4s, v28.4s, v30.4s\n"
370 
371         // Logic to advance to the next block in preparation for the next
372         // iteration of the main loop. For now, we only want to compute
373         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
374         // not yet ready to update the values of row and col, as we still need
375         // the current values for the rest of the work on the current block.
376 
377         "cmp %w[row], w7\n"  // Have we finished the last row?
378         "bge 4f\n"           // If finished last row, go to 4
379         // Not finished last row: then advance to next row.
380         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #2\n"
381         "b 5f\n"
382         "4:\n"  // Finished last row...
383         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
384         // Now we need to advance to the next column. If we already
385         // finished the last column, then in principle we are done, however
386         // we can't just return here, as we need to allow the end work of the
387         // current block to complete. The good news is that at this point it
388         // doesn't matter what data we load for the next column, since
389         // we will exit from the main loop below before actually storing
390         // anything computed from that data.
391         "cmp %w[col], w8\n"  // Have we finished the last column?
392         "bge 5f\n" // If yes, just carry on without updating the column pointer.
393         // Not finished last column: then advance to next column.
394         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #2\n"
395         "5:\n"
396 
397         // Set the LHS and RHS data pointers to the start of the columns just
398         // computed.
399         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
400         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
401 
402         // Load some parameters needed for the end work on current block.
403         "mvni v8.4s, #0\n"
404         "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
405         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
406         "ins v13.h[4], w4\n" // dst_zero_point
407         "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
408         "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
409         "dup v9.4s, w3\n"   // create prod_zp_depth_vec
410 
411         // Now we load: bias data, LHS sums data, RHS sums data.
412 
413         // First, load the base pointers from the params.
414         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
415 
416         // Determine the channel index.
417         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
418         "csel w3, %w[row], %w[col], eq\n"
419 
420         // Offset the bias pointer as needed given the current row, col.
421         "add x5, x1, x3, lsl #2\n"
422 
423         // If there is no bias, use no offset, just address the passed zero
424         // data.
425         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
426         "csel x1, x1, x5, eq\n"
427 
428         // Load 4 bias values.
429         "ld1 {v14.4s}, [x1]\n"
430 
431         // Load the multiplier_fixedpoint values.
432         "add x5, x4, x3, lsl #2\n"
433         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
434         "csel x4, x4, x5, eq\n"
435         "ld1 {v15.4s}, [x4]\n" // multiplier_fixedpoint
436 
437         // Now that we know what LHS and RHS data the next iteration of the
438         // main loop will need to load, we start loading the first 32 bytes of
439         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
440         // in the rest of the work on the current block.
441         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
442         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
443         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
444         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
445         "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
446         "ld1 {v5.16b}, [%[rhs_ptr]], #16\n"
447         "ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
448         "ld1 {v7.16b}, [%[rhs_ptr]], #16\n"
449 
450         // Add to the bias values the product (depth * lhs_zero_point * rhs_zero_point),
451         // See the term NZ1Z2 in equation (7) in https://arxiv.org/pdf/1712.05877.pdf
452         "add v14.4s, v14.4s, v9.4s\n"
453 
454         // Perform the bias-addition (per the above, we have just folded into
455         // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
456         // Jump based on channel dimension.
457         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
458         "bne 6f\n"
459         // Case where channels are rows
460         "add v16.4s, v16.4s, v14.4s\n"
461         "add v17.4s, v17.4s, v14.4s\n"
462         "add v18.4s, v18.4s, v14.4s\n"
463         "add v19.4s, v19.4s, v14.4s\n"
464         "b 7f\n"
465 
466         "6:\n"
467         // Case where channels are columns
468         "dup v20.4s, v14.s[0]\n"
469         "dup v21.4s, v14.s[1]\n"
470         "dup v22.4s, v14.s[2]\n"
471         "dup v23.4s, v14.s[3]\n"
472         "add v16.4s, v16.4s, v20.4s\n"
473         "add v17.4s, v17.4s, v21.4s\n"
474         "add v18.4s, v18.4s, v22.4s\n"
475         "add v19.4s, v19.4s, v23.4s\n"
476         "7:\n"
477 
478         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
479         "beq 401f\n"
480         "ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
481         "add x3, x3, %x[col], lsl #2\n"
482         "ld1 {v14.4s}, [x3]\n"
483         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
484         "dup v10.4s, w5\n"  // create lhs_zero_point_vec
485         // Subtract rhs_sums * lhs_zero_point, per
486         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
487         "mls v16.4s, v10.4s, v14.s[0]\n"
488         "mls v17.4s, v10.4s, v14.s[1]\n"
489         "mls v18.4s, v10.4s, v14.s[2]\n"
490         "mls v19.4s, v10.4s, v14.s[3]\n"
491         "401:\n"
492 
493         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
494         "beq 402f\n"
495         "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
496         "add x2, x2, %x[row], lsl #2\n"
497         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
498         // Load 4 lhs_sums values.
499         "ld1 {v11.4s}, [x2]\n"
500         "ins v13.s[1], w5\n" // rhs_zero_point
501         // Compute lhs_sums * rhs_zero_point.
502         "mul v11.4s, v11.4s, v13.s[1]\n"
503         // Subtract lhs_sums * rhs_zero_point, per
504         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
505         "sub v16.4s, v16.4s, v11.4s\n"
506         "sub v17.4s, v17.4s, v11.4s\n"
507         "sub v18.4s, v18.4s, v11.4s\n"
508         "sub v19.4s, v19.4s, v11.4s\n"
509 
510         // If the destination is int32, it means the user asks for the raw
511         // accumulators, no need for us to downquantize the value.
512         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
513         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
514 
515         "402:\n"
516 
517         // At this point we have computed the final int32 values. Now we
518         // start down-quantizing them to obtain the final 8bit values from them.
519 
520         // As part of this down-quantization, our int32 values will be
521         // multiplied by a multiplier that has a fixed-point component and an
522         // exponent component.
523 
524         //Load the exponent part of the multiplier.
525         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
526         // Determine the channel index.
527         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
528         "csel w3, %w[row], %w[col], eq\n"
529 
530         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
531         "add x5, x1, x3, lsl #2\n"
532         "csel x1, x1, x5, eq\n"
533 
534         "ld1 {v14.4s}, [x1]\n"
535 
536         "smin v11.4s, v8.4s, v14.4s\n"
537         "sub v12.4s, v14.4s, v11.4s\n"
538 
539         // Jump based on channel dimension.
540         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
541         "bne 8f\n"
542         // Case where channels are rows
543 
544         // Apply the positive exponent part of the multiplier.
545         "sshl v16.4s, v16.4s, v12.4s\n"
546         "sshl v17.4s, v17.4s, v12.4s\n"
547         "sshl v18.4s, v18.4s, v12.4s\n"
548         "sshl v19.4s, v19.4s, v12.4s\n"
549 
550         // Apply the fixed-point part of the multiplier.
551         "sqdmulh v16.4s, v16.4s, v15.4s\n"
552         "sqdmulh v17.4s, v17.4s, v15.4s\n"
553         "sqdmulh v18.4s, v18.4s, v15.4s\n"
554         "sqdmulh v19.4s, v19.4s, v15.4s\n"
555 
556         // Apply the negative exponent part of the multiplier.
557         "srshl v16.4s, v16.4s, v11.4s\n"
558         "srshl v17.4s, v17.4s, v11.4s\n"
559         "srshl v18.4s, v18.4s, v11.4s\n"
560         "srshl v19.4s, v19.4s, v11.4s\n"
561         "b 9f\n"
562 
563         "8:\n"
564         // Case where channels are columns
565 
566         // Apply the positive exponent part of the multiplier.
567         "dup v20.4s, v12.s[0]\n"
568         "dup v21.4s, v12.s[1]\n"
569         "dup v22.4s, v12.s[2]\n"
570         "dup v23.4s, v12.s[3]\n"
571         "sshl v16.4s, v16.4s, v20.4s\n"
572         "sshl v17.4s, v17.4s, v21.4s\n"
573         "sshl v18.4s, v18.4s, v22.4s\n"
574         "sshl v19.4s, v19.4s, v23.4s\n"
575 
576         // Apply the fixed-point part of the multiplier.
577         "sqdmulh v16.4s, v16.4s, v15.s[0]\n"
578         "sqdmulh v17.4s, v17.4s, v15.s[1]\n"
579         "sqdmulh v18.4s, v18.4s, v15.s[2]\n"
580         "sqdmulh v19.4s, v19.4s, v15.s[3]\n"
581 
582         // Apply the negative exponent part of the multiplier.
583         "dup v20.4s, v11.s[0]\n"
584         "dup v21.4s, v11.s[1]\n"
585         "dup v22.4s, v11.s[2]\n"
586         "dup v23.4s, v11.s[3]\n"
587         "srshl v16.4s, v16.4s, v20.4s\n"
588         "srshl v17.4s, v17.4s, v21.4s\n"
589         "srshl v18.4s, v18.4s, v22.4s\n"
590         "srshl v19.4s, v19.4s, v23.4s\n"
591         "9:\n"
592 
593         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
594         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
595         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
596         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
597 
598         RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
599 
600         // Cast-and-saturate from int32 to int16
601         "sqxtn v16.4h, v16.4s\n"
602         "sqxtn2 v16.8h, v17.4s\n"
603         "sqxtn v17.4h, v18.4s\n"
604         "sqxtn2 v17.8h, v19.4s\n"
605 
606         // At this point, v18 -- v31 aren't used anymore for the current block,
607         // so we can start clearing these accumulators for the next block
608         // (next iteration of the main loop).
609         RUY_MAKE_ZERO(v18)
610         RUY_MAKE_ZERO(v19)
611         RUY_MAKE_ZERO(v20)
612         RUY_MAKE_ZERO(v21)
613         RUY_MAKE_ZERO(v22)
614         RUY_MAKE_ZERO(v23)
615         RUY_MAKE_ZERO(v24)
616         RUY_MAKE_ZERO(v25)
617         RUY_MAKE_ZERO(v26)
618         RUY_MAKE_ZERO(v27)
619         RUY_MAKE_ZERO(v28)
620         RUY_MAKE_ZERO(v29)
621         RUY_MAKE_ZERO(v30)
622         RUY_MAKE_ZERO(v31)
623 
624         // Add the destination zero point
625         "dup v14.8h, v13.h[4]\n"
626         "sqadd v16.8h, v16.8h, v14.8h\n"
627         "sqadd v17.8h, v17.8h, v14.8h\n"
628 
629         // Cast-and-saturate from int16 to uint8
630         "sqxtun v16.8b, v16.8h\n"
631         "sqxtun2 v16.16b, v17.8h\n"
632 
633         // Load the clamp_min, clamp_max bounds
634         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
635         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
636         "dup v14.16b, w2\n"  // clamp_min
637         "dup v15.16b, w3\n"  // clamp_max
638 
639         // Apply the clamp_min bound
640         "umax v16.16b, v16.16b, v14.16b\n"
641         // Apply the clamp_max bound
642         "umin v16.16b, v16.16b, v15.16b\n"
643 
644         // Compute how much of the 4x4 block of destination 8bit values that
645         // we have computed, fit in the destination matrix. Typically, all of
646         // it fits, but when the destination matrix shape is not a multiple
647         // of 4x4, there are some 4x4 blocks along the boundaries that do
648         // not fit entirely.
649         "sub w1, %w[dst_rows], %w[row]\n"
650         "sub w2, %w[dst_cols], %w[col]\n"
651         "mov w3, #4\n"
652         "cmp w1, #4\n"
653         // Compute w1 = how many rows of the 4x4 block fit
654         "csel w1, w1, w3, le\n"
655         "cmp w2, #4\n"
656         // Compute w2 = how many cols of the 4x4 block fit
657         "csel w2, w2, w3, le\n"
658 
659         // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
660         "cmp w1, w3\n"
661         "ccmp w2, w3, 0, eq\n"
662         "mov x4, %[dst_ptr]\n"
663         // Yes, all of the 4x4 block fits, go to fast path.
664         "beq 30f\n"
665         // Not all of the 4x4 block fits.
666         // Store to dst_tmp_buf
667         "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
668         // Slow loop copying from dst_tmp_buf to dst.
669         "mov x3, %[dst_tmp_buf]\n"
670         "mov w6, #0\n"
671         "50:\n"
672         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
673         "mov w5, #0\n"
674         "51:\n"
675         "ldrb w7, [x3, w5, uxtw]\n"
676         "strb w7, [x4, w5, uxtw]\n"
677         "add w5, w5, #1\n"
678         "cmp w5, w1\n"
679         "blt 51b\n"
680         "add w6, w6, #1\n"
681         "add x3, x3, #4\n"
682         "add x4, x4, x11\n"
683         "cmp w6, w2\n"
684         "blt 50b\n"
685         "b 31f\n"
686         "30:\n"
687         // Yes, all of the 4x4 block fits.
688         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
689         "mov x3, x4\n"
690         "st1 {v16.b}[0], [x3], #1\n"
691         "add x4, x4, x11\n"
692         "st1 {v16.b}[1], [x3], #1\n"
693         "st1 {v16.b}[2], [x3], #1\n"
694         "st1 {v16.b}[3], [x3], #1\n"
695         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
696         "mov x3, x4\n"
697         "st1 {v16.b}[4], [x3], #1\n"
698         "add x4, x4, x11\n"
699         "st1 {v16.b}[5], [x3], #1\n"
700         "st1 {v16.b}[6], [x3], #1\n"
701         "st1 {v16.b}[7], [x3], #1\n"
702         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
703         "mov x3, x4\n"
704         "st1 {v16.b}[8], [x3], #1\n"
705         "add x4, x4, x11\n"
706         "st1 {v16.b}[9], [x3], #1\n"
707         "st1 {v16.b}[10], [x3], #1\n"
708         "st1 {v16.b}[11], [x3], #1\n"
709         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
710         "mov x3, x4\n"
711         "st1 {v16.b}[12], [x3], #1\n"
712         "add x4, x4, x11\n"
713         "st1 {v16.b}[13], [x3], #1\n"
714         "st1 {v16.b}[14], [x3], #1\n"
715         "st1 {v16.b}[15], [x3], #1\n"
716         "31:\n"
717 
718         "add %[dst_ptr], %[dst_ptr], #4\n"
719 
720         RUY_MAKE_ZERO(v16)
721         RUY_MAKE_ZERO(v17)
722 
723         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
724 
725         RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
726 
727         // Cast-and-saturate from int32 to int16
728         "sqxtn v16.4h, v16.4s\n"
729         "sqxtn2 v16.8h, v17.4s\n"
730         "sqxtn v17.4h, v18.4s\n"
731         "sqxtn2 v17.8h, v19.4s\n"
732 
733         // At this point, v18 -- v31 aren't used anymore for the current block,
734         // so we can start clearing these accumulators for the next block
735         // (next iteration of the main loop).
736         RUY_MAKE_ZERO(v18)
737         RUY_MAKE_ZERO(v19)
738         RUY_MAKE_ZERO(v20)
739         RUY_MAKE_ZERO(v21)
740         RUY_MAKE_ZERO(v22)
741         RUY_MAKE_ZERO(v23)
742         RUY_MAKE_ZERO(v24)
743         RUY_MAKE_ZERO(v25)
744         RUY_MAKE_ZERO(v26)
745         RUY_MAKE_ZERO(v27)
746         RUY_MAKE_ZERO(v28)
747         RUY_MAKE_ZERO(v29)
748         RUY_MAKE_ZERO(v30)
749         RUY_MAKE_ZERO(v31)
750 
751         // Add the destination zero point
752         "dup v14.8h, v13.h[4]\n"
753         "sqadd v16.8h, v16.8h, v14.8h\n"
754         "sqadd v17.8h, v17.8h, v14.8h\n"
755 
756         // Cast-and-saturate from int16 to int8
757         "sqxtn v16.8b, v16.8h\n"
758         "sqxtn2 v16.16b, v17.8h\n"
759 
760         // Load the clamp_min, clamp_max bounds
761         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
762         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
763         "dup v14.16b, w2\n"  // clamp_min
764         "dup v15.16b, w3\n"  // clamp_max
765 
766         // Apply the clamp_min bound
767         "smax v16.16b, v16.16b, v14.16b\n"
768         // Apply the clamp_max bound
769         "smin v16.16b, v16.16b, v15.16b\n"
770 
771         // Compute how much of the 4x4 block of destination 8bit values that
772         // we have computed, fit in the destination matrix. Typically, all of
773         // it fits, but when the destination matrix shape is not a multiple
774         // of 4x4, there are some 4x4 blocks along the boundaries that do
775         // not fit entirely.
776         "sub w1, %w[dst_rows], %w[row]\n"
777         "sub w2, %w[dst_cols], %w[col]\n"
778         "mov w3, #4\n"
779         "cmp w1, #4\n"
780         // Compute w1 = how many rows of the 4x4 block fit
781         "csel w1, w1, w3, le\n"
782         "cmp w2, #4\n"
783         // Compute w2 = how many cols of the 4x4 block fit
784         "csel w2, w2, w3, le\n"
785 
786         // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
787         "cmp w1, w3\n"
788         "ccmp w2, w3, 0, eq\n"
789         "mov x4, %[dst_ptr]\n"
790         // Yes, all of the 4x4 block fits, go to fast path.
791         "beq 30f\n"
792         // Not all of the 4x4 block fits.
793         // Store to dst_tmp_buf
794         "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
795         // Slow loop copying from dst_tmp_buf to dst.
796         "mov x3, %[dst_tmp_buf]\n"
797         "mov w6, #0\n"
798         "50:\n"
799         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
800         "mov w5, #0\n"
801         "51:\n"
802         "ldrb w7, [x3, w5, uxtw]\n"
803         "strb w7, [x4, w5, uxtw]\n"
804         "add w5, w5, #1\n"
805         "cmp w5, w1\n"
806         "blt 51b\n"
807         "add w6, w6, #1\n"
808         "add x3, x3, #4\n"
809         "add x4, x4, x11\n"
810         "cmp w6, w2\n"
811         "blt 50b\n"
812         "b 31f\n"
813         "30:\n"
814         // Yes, all of the 4x4 block fits.
815         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
816         "mov x3, x4\n"
817         "st1 {v16.b}[0], [x3], #1\n"
818         "add x4, x4, x11\n"
819         "st1 {v16.b}[1], [x3], #1\n"
820         "st1 {v16.b}[2], [x3], #1\n"
821         "st1 {v16.b}[3], [x3], #1\n"
822         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
823         "mov x3, x4\n"
824         "st1 {v16.b}[4], [x3], #1\n"
825         "add x4, x4, x11\n"
826         "st1 {v16.b}[5], [x3], #1\n"
827         "st1 {v16.b}[6], [x3], #1\n"
828         "st1 {v16.b}[7], [x3], #1\n"
829         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
830         "mov x3, x4\n"
831         "st1 {v16.b}[8], [x3], #1\n"
832         "add x4, x4, x11\n"
833         "st1 {v16.b}[9], [x3], #1\n"
834         "st1 {v16.b}[10], [x3], #1\n"
835         "st1 {v16.b}[11], [x3], #1\n"
836         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
837         "mov x3, x4\n"
838         "st1 {v16.b}[12], [x3], #1\n"
839         "add x4, x4, x11\n"
840         "st1 {v16.b}[13], [x3], #1\n"
841         "st1 {v16.b}[14], [x3], #1\n"
842         "st1 {v16.b}[15], [x3], #1\n"
843         "31:\n"
844 
845         "add %[dst_ptr], %[dst_ptr], #4\n"
846 
847         RUY_MAKE_ZERO(v16)
848         RUY_MAKE_ZERO(v17)
849 
850         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
851 
852         RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
853 
854         // Add the destination zero point
855         "dup v14.4h, v13.h[4]\n"
856         "saddw v16.4s, v16.4s, v14.4h\n"
857         "saddw v17.4s, v17.4s, v14.4h\n"
858         "saddw v18.4s, v18.4s, v14.4h\n"
859         "saddw v19.4s, v19.4s, v14.4h\n"
860 
861         // Cast-and-saturate from int32 to int16
862         "sqxtn v16.4h, v16.4s\n"
863         "sqxtn2 v16.8h, v17.4s\n"
864         "sqxtn v17.4h, v18.4s\n"
865         "sqxtn2 v17.8h, v19.4s\n"
866 
867         // At this point, v18 -- v31 aren't used anymore for the current block,
868         // so we can start clearing these accumulators for the next block
869         // (next iteration of the main loop).
870         RUY_MAKE_ZERO(v18)
871         RUY_MAKE_ZERO(v19)
872         RUY_MAKE_ZERO(v20)
873         RUY_MAKE_ZERO(v21)
874         RUY_MAKE_ZERO(v22)
875         RUY_MAKE_ZERO(v23)
876         RUY_MAKE_ZERO(v24)
877         RUY_MAKE_ZERO(v25)
878         RUY_MAKE_ZERO(v26)
879         RUY_MAKE_ZERO(v27)
880         RUY_MAKE_ZERO(v28)
881         RUY_MAKE_ZERO(v29)
882         RUY_MAKE_ZERO(v30)
883         RUY_MAKE_ZERO(v31)
884 
885         // Load the clamp_min, clamp_max bounds
886         "ldrh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
887         "ldrh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
888         "dup v14.8h, w2\n"  // clamp_min
889         "dup v15.8h, w3\n"  // clamp_max
890 
891         // Apply the clamp_min bound
892         "smax v16.8h, v16.8h, v14.8h\n"
893         "smax v17.8h, v17.8h, v14.8h\n"
894         // Apply the clamp_max bound
895         "smin v16.8h, v16.8h, v15.8h\n"
896         "smin v17.8h, v17.8h, v15.8h\n"
897 
898         // Compute how much of the 4x4 block of destination 16bit values that
899         // we have computed, fit in the destination matrix. Typically, all of
900         // it fits, but when the destination matrix shape is not a multiple
901         // of 4x4, there are some 4x4 blocks along the boundaries that do
902         // not fit entirely.
903         "sub w1, %w[dst_rows], %w[row]\n"
904         "sub w2, %w[dst_cols], %w[col]\n"
905         "mov w3, #4\n"
906         "cmp w1, #4\n"
907         // Compute w1 = how many rows of the 4x4 block fit
908         "csel w1, w1, w3, le\n"
909         "cmp w2, #4\n"
910         // Compute w2 = how many cols of the 4x4 block fit
911         "csel w2, w2, w3, le\n"
912 
913         // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
914         "cmp w1, w3\n"
915         "ccmp w2, w3, 0, eq\n"
916         "mov x4, %[dst_ptr]\n"
917         // Yes, all of the 4x4 block fits, go to fast path.
918         "beq 30f\n"
919         // Not all of the 4x4 block fits.
920         // Store to dst_tmp_buf
921         "str q16, [%[dst_tmp_buf], #0]\n"
922         "str q17, [%[dst_tmp_buf], #16]\n"
923         // Slow loop copying from dst_tmp_buf to dst.
924         "mov x3, %[dst_tmp_buf]\n"
925         "mov w6, #0\n"
926         "50:\n"
927         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
928         "mov w5, #0\n"
929         "51:\n"
930         "ldrh w7, [x3, x5, lsl #1]\n"
931         "strh w7, [x4, x5, lsl #1]\n"
932         "add w5, w5, #1\n"
933         "cmp w5, w1\n"
934         "blt 51b\n"
935         "add w6, w6, #1\n"
936         "add x3, x3, #8\n"
937         "add x4, x4, x11\n"
938         "cmp w6, w2\n"
939         "blt 50b\n"
940         "b 31f\n"
941         "30:\n"
942         // Yes, all of the 4x4 block fits.
943         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
944         "mov x3, x4\n"
945         "st1 {v16.h}[0], [x3], #2\n"
946         "add x4, x4, x11\n"
947         "st1 {v16.h}[1], [x3], #2\n"
948         "st1 {v16.h}[2], [x3], #2\n"
949         "st1 {v16.h}[3], [x3], #2\n"
950         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
951         "mov x3, x4\n"
952         "st1 {v16.h}[4], [x3], #2\n"
953         "add x4, x4, x11\n"
954         "st1 {v16.h}[5], [x3], #2\n"
955         "st1 {v16.h}[6], [x3], #2\n"
956         "st1 {v16.h}[7], [x3], #2\n"
957         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
958         "mov x3, x4\n"
959         "st1 {v17.h}[0], [x3], #2\n"
960         "add x4, x4, x11\n"
961         "st1 {v17.h}[1], [x3], #2\n"
962         "st1 {v17.h}[2], [x3], #2\n"
963         "st1 {v17.h}[3], [x3], #2\n"
964         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
965         "mov x3, x4\n"
966         "st1 {v17.h}[4], [x3], #2\n"
967         "add x4, x4, x11\n"
968         "st1 {v17.h}[5], [x3], #2\n"
969         "st1 {v17.h}[6], [x3], #2\n"
970         "st1 {v17.h}[7], [x3], #2\n"
971         "31:\n"
972 
973         "add %[dst_ptr], %[dst_ptr], #8\n"
974 
975         RUY_MAKE_ZERO(v16)
976         RUY_MAKE_ZERO(v17)
977 
978         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
979 
980         RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
981 
982         // Since the store type is the same as the accum type, no need for
983         // downcast. There's also no need for clamp by min/max.
984 
985         // At this point, v20 -- v31 aren't used anymore for the current block,
986         // so we can start clearing these accumulators for the next block
987         // (next iteration of the main loop).
988         RUY_MAKE_ZERO(v20)
989         RUY_MAKE_ZERO(v21)
990         RUY_MAKE_ZERO(v22)
991         RUY_MAKE_ZERO(v23)
992         RUY_MAKE_ZERO(v24)
993         RUY_MAKE_ZERO(v25)
994         RUY_MAKE_ZERO(v26)
995         RUY_MAKE_ZERO(v27)
996         RUY_MAKE_ZERO(v28)
997         RUY_MAKE_ZERO(v29)
998         RUY_MAKE_ZERO(v30)
999         RUY_MAKE_ZERO(v31)
1000 
1001         // Compute how much of the 4x4 block of destination 32bit values that
1002         // we have computed, fit in the destination matrix. Typically, all of
1003         // it fits, but when the destination matrix shape is not a multiple
1004         // of 4x4, there are some 4x4 blocks along the boundaries that do
1005         // not fit entirely.
1006         "sub w1, %w[dst_rows], %w[row]\n"
1007         "sub w2, %w[dst_cols], %w[col]\n"
1008         "mov w3, #4\n"
1009         "cmp w1, #4\n"
1010         // Compute w1 = how many rows of the 4x4 block fit
1011         "csel w1, w1, w3, le\n"
1012         "cmp w2, #4\n"
1013         // Compute w2 = how many cols of the 4x4 block fit
1014         "csel w2, w2, w3, le\n"
1015 
1016         // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
1017         "cmp w1, w3\n"
1018         "ccmp w2, w3, 0, eq\n"
1019         "mov x4, %[dst_ptr]\n"
1020         // Yes, all of the 4x4 block fits, go to fast path.
1021         "beq 30f\n"
1022         // Not all of the 4x4 block fits.
1023         // Store to dst_tmp_buf
1024         "str q16, [%[dst_tmp_buf], #0]\n"
1025         "str q17, [%[dst_tmp_buf], #16]\n"
1026         "str q18, [%[dst_tmp_buf], #32]\n"
1027         "str q19, [%[dst_tmp_buf], #48]\n"
1028         // Slow loop copying from dst_tmp_buf to dst.
1029         "mov x3, %[dst_tmp_buf]\n"
1030         "mov w6, #0\n"
1031         "50:\n"
1032         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1033         "mov w5, #0\n"
1034         "51:\n"
1035         "ldr w7, [x3, x5, lsl #2]\n"
1036         "str w7, [x4, x5, lsl #2]\n"
1037         "add w5, w5, #1\n"
1038         "cmp w5, w1\n"
1039         "blt 51b\n"
1040         "add w6, w6, #1\n"
1041         "add x3, x3, #16\n"
1042         "add x4, x4, x11\n"
1043         "cmp w6, w2\n"
1044         "blt 50b\n"
1045         "b 31f\n"
1046         "30:\n"
1047         // Yes, all of the 4x4 block fits.
1048         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1049         "mov x3, x4\n"
1050         "st1 {v16.s}[0], [x3], #4\n"
1051         "add x4, x4, x11\n"
1052         "st1 {v16.s}[1], [x3], #4\n"
1053         "st1 {v16.s}[2], [x3], #4\n"
1054         "st1 {v16.s}[3], [x3], #4\n"
1055         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1056         "mov x3, x4\n"
1057         "st1 {v17.s}[0], [x3], #4\n"
1058         "add x4, x4, x11\n"
1059         "st1 {v17.s}[1], [x3], #4\n"
1060         "st1 {v17.s}[2], [x3], #4\n"
1061         "st1 {v17.s}[3], [x3], #4\n"
1062         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1063         "mov x3, x4\n"
1064         "st1 {v18.s}[0], [x3], #4\n"
1065         "add x4, x4, x11\n"
1066         "st1 {v18.s}[1], [x3], #4\n"
1067         "st1 {v18.s}[2], [x3], #4\n"
1068         "st1 {v18.s}[3], [x3], #4\n"
1069         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1070         "mov x3, x4\n"
1071         "st1 {v19.s}[0], [x3], #4\n"
1072         "add x4, x4, x11\n"
1073         "st1 {v19.s}[1], [x3], #4\n"
1074         "st1 {v19.s}[2], [x3], #4\n"
1075         "st1 {v19.s}[3], [x3], #4\n"
1076         "31:\n"
1077 
1078         "add %[dst_ptr], %[dst_ptr], #16\n"
1079 
1080         RUY_MAKE_ZERO(v16)
1081         RUY_MAKE_ZERO(v17)
1082         RUY_MAKE_ZERO(v18)
1083         RUY_MAKE_ZERO(v19)
1084 
1085         RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
1086 
1087         // For the next block: perform the first few multiply-adds on the data
1088         // that we have already loaded.
1089         "smull    v8.8h,  v0.8b,  v4.8b\n"
1090         "smull    v9.8h,  v1.8b,  v4.8b\n"
1091         "smull    v10.8h,  v2.8b,  v4.8b\n"
1092         "smull    v11.8h,  v3.8b,  v4.8b\n"
1093         "smull    v12.8h,  v0.8b,  v5.8b\n"
1094         "smull    v13.8h,  v1.8b,  v5.8b\n"
1095         "smull    v14.8h,  v2.8b,  v5.8b\n"
1096         "smull    v15.8h,  v3.8b,  v5.8b\n"
1097         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
1098         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
1099         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
1100         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
1101         "smlal2   v12.8h,  v0.16b,  v5.16b\n"
1102         "smlal2   v13.8h,  v1.16b,  v5.16b\n"
1103         "smlal2   v14.8h,  v2.16b,  v5.16b\n"
1104         "smlal2   v15.8h,  v3.16b,  v5.16b\n"
1105 
1106         // Reload some params --- we had used x5 -- x7 for a few other things
1107         // since the last time we had loaded them.
1108         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
1109         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
1110         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
1111 
1112         // Move to the next block of the destination matrix, for the next iter
1113         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
1114         // been updated earlier.
1115         // Have we reached the end row?
1116         "cmp %w[row], w7\n"
1117         "beq 20f\n"  // yes, end row.
1118         // Not end row. Move to the next row.
1119         "add %w[row], %w[row], #4\n"
1120         "b 21f\n"
1121         "20:\n"
1122         // Was already at end row.
1123         "mov %w[row], w6\n"  // Move back to first row.
1124         "add %w[col], %w[col], #4\n"  // Move to the next column.
1125         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #2\n"
1126         "mov %[dst_ptr], %[dst_col_ptr]\n"
1127         "21:\n"
1128 
1129         // Main loop exit condition: have we hit the end column?
1130         "cmp %w[col], w8\n"
1131 
1132         // w1 is the number of levels of depth that we have already loaded
1133         // LHS and RHS data for. Corresponding to the initial ld1 instructions
1134         // above, this is currently 16.
1135         "mov w1, #16\n"
1136 
1137         "ble 1b\n"
1138 
1139         // clang-format on
1140 
1141         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
1142           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1143           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
1144         : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
1145           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
1146           [dst_type_id] "r"(params.dst_type_id)
1147         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
1148           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
1149           "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1150           "v26", "v27", "v28", "v29", "v30", "v31");
1151 }
1152 
1153 // Similar to existing Kernel8bitNeon but specialized for the case of
1154 // RHS cols == 1.
1155 // Relevant target CPUs for this kernel include ARM Cortex-A73 and Cortex-A75,
1156 // since these are 64-bit, out-of-order and without dotprod support.
Kernel8bitNeon1Col(const KernelParams8bit<4,4> & params)1157 void Kernel8bitNeon1Col(const KernelParams8bit<4, 4>& params) {
1158   profiler::ScopeLabel label("Kernel (kNeon)");
1159 
1160   CheckOffsetsInKernelParams8bit(params);
1161 
1162   const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
1163   const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
1164   const std::int8_t* lhs_ptr = lhs_col_ptr;
1165   const std::int8_t* rhs_ptr = rhs_col_ptr;
1166   void* dst_col_ptr = params.dst_base_ptr;
1167   void* dst_ptr = dst_col_ptr;
1168   int row = params.start_row;
1169   int col = params.start_col;
1170 
1171   RUY_DCHECK(!(params.flags & RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL));
1172 
1173   // The asm kernel below has the following NEON register allocation:
1174   //
1175   // v16 -- v19 are int32 accumulators.
1176   // During accumulation, v0 -- v3 are used to load int8 data from LHS and
1177   // v4 from RHS:
1178   //
1179   //                         int8 RHS 16x1 block
1180   //                           /-----------|
1181   //                           |v4.b[0]    |
1182   //                           |  ...      |
1183   //                           |v4.b[15]   |
1184   //                           \-----------/
1185   //    int8 LHS 4x16 block
1186   //  /---------------------\  /-----------|
1187   //  |v0.b[0] ... v0.b[15] |  |v16.4s     |
1188   //  |v1.b[0] ... v1.b[15] |  |v17.4s     |
1189   //  |v2.b[0] ... v2.b[15] |  |v18.4s     |
1190   //  |v3.b[0] ... v3.b[15] |  |v19.4s     |
1191   //  \---------------------/  \-----------/
1192   //                         int32 accumulators 4x1 block
1193   //
1194   // No attempt had been made so far at implementing the RUY_OPT_MAX_STREAMING
1195   // optimization for this kernel.
1196   asm volatile(
1197 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
1198 
1199         // clang-format off
1200 
1201         // Load some parameters into registers.
1202         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
1203         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
1204         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
1205         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
1206         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
1207         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
1208         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
1209         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
1210 
1211         // Load the first 64 bytes of LHS and RHS data.
1212         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
1213         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
1214         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
1215         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
1216         "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
1217         "add %[rhs_ptr], %[rhs_ptr], #48\n"
1218 
1219         // Clear accumulators.
1220         RUY_MAKE_ZERO(v16)
1221         RUY_MAKE_ZERO(v17)
1222         RUY_MAKE_ZERO(v18)
1223         RUY_MAKE_ZERO(v19)
1224 
1225         // w1 is the number of levels of depth that we have already loaded
1226         // LHS and RHS data for. Corresponding to the initial ld1 instructions
1227         // above, this is currently 16.
1228         "mov w1, #16\n"
1229 
1230         // Perform the first few multiply-adds on the data that we have already
1231         // loaded.
1232         "smull    v8.8h,  v0.8b,  v4.8b\n"
1233         "smull    v9.8h,  v1.8b,  v4.8b\n"
1234         "smull    v10.8h,  v2.8b,  v4.8b\n"
1235         "smull    v11.8h,  v3.8b,  v4.8b\n"
1236 
1237         // Multiply-accumulate second-half, again into the same
1238         // 16bit local accumulator registers. This is where we
1239         // take advantage of having int8 instead of uint8 and therefore
1240         // being able to accumulate two products into int16.
1241         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
1242         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
1243         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
1244         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
1245 
1246         // Main loop of the whole GEMM, over rows and columns of the
1247         // destination matrix.
1248         "1:\n"
1249 
1250         // Reminder - w1 is how many levels of depth we have already loaded
1251         // data for, w12 is the total depth.
1252         "cmp w1, w12\n"
1253         "beq 79f\n"
1254 
1255         "2:\n"
1256 
1257         // Some multiplications and 16-bit accumulation were already done above,
1258         // so we start right away in the middle.
1259         "sadalp  v16.4s, v8.8h\n"
1260         "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
1261         "add %[rhs_ptr], %[rhs_ptr], #48\n"
1262         "sadalp  v17.4s, v9.8h\n"
1263         "sadalp  v18.4s, v10.8h\n"
1264         "sadalp  v19.4s, v11.8h\n"
1265 
1266         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
1267         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
1268         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
1269         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
1270 
1271         "smull    v8.8h,  v0.8b,  v4.8b\n"
1272         "smull    v9.8h,  v1.8b,  v4.8b\n"
1273         "smull    v10.8h,  v2.8b,  v4.8b\n"
1274         "smull    v11.8h,  v3.8b,  v4.8b\n"
1275 
1276         // Multiply-accumulate second-half, again into the same
1277         // 16bit local accumulator registers. This is where we
1278         // take advantage of having int8 instead of uint8 and therefore
1279         // being able to accumulate two products into int16.
1280         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
1281         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
1282         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
1283         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
1284 
1285         // Each iteration of this loop advances by 16 levels of depth.
1286         "add w1, w1, #16\n"
1287 
1288         // Loop termination condition
1289         "cmp w1, w12\n"
1290 
1291         "blt 2b\n"
1292 
1293         "79:\n"
1294 
1295         "sadalp  v16.4s, v8.8h\n"
1296         "sadalp  v17.4s, v9.8h\n"
1297         "sadalp  v18.4s, v10.8h\n"
1298         "sadalp  v19.4s, v11.8h\n"
1299 
1300         // End of accumulation. The registers v16 -- v19 contain the final
1301         // int32 accumulator values of the current 4x1 destination block.
1302         // We now have to compute the final 8-bit values from these int32
1303         // accumulators, and advance to the next 4x1 block. We intertwine
1304         // these two aspects whenever possible for optimal pipelining, both
1305         // at the data flow level (prefetch data for next block as early as
1306         // possible) and instruction pipelining level (some of the next-block
1307         // work can dual-issue with some of the final work on the current
1308         // block).
1309 
1310         // Reduce 32bit accumulators horizontally.
1311         "addp v16.4s, v16.4s, v17.4s\n"
1312         "addp v18.4s, v18.4s, v19.4s\n"
1313 
1314         // Reduce 32bit accumulators horizontally, second pass
1315         // (each pass adds pairwise. we need to add 4-wise).
1316         "addp v16.4s, v16.4s, v18.4s\n"
1317 
1318         // Logic to advance to the next block in preparation for the next
1319         // iteration of the main loop. For now, we only want to compute
1320         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
1321         // not yet ready to update the values of row and col, as we still need
1322         // the current values for the rest of the work on the current block.
1323 
1324         "cmp %w[row], w7\n"  // Have we finished the last row?
1325         "bge 4f\n"           // If finished last row, go to 4
1326         // Not finished last row: then advance to next row.
1327         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #2\n"
1328         "b 5f\n"
1329         "4:\n"  // Finished last row...
1330         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
1331         // Now we need to advance to the next column. If we already
1332         // finished the last column, then in principle we are done, however
1333         // we can't just return here, as we need to allow the end work of the
1334         // current block to complete. The good news is that at this point it
1335         // doesn't matter what data we load for the next column, since
1336         // we will exit from the main loop below before actually storing
1337         // anything computed from that data.
1338         "cmp %w[col], w8\n"  // Have we finished the last column?
1339         "bge 5f\n" // If yes, just carry on without updating the column pointer.
1340         // Not finished last column: then advance to next column.
1341         // (still multiply column stride by 4 due to packing)
1342         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #2\n"
1343         "5:\n"
1344 
1345         // Set the LHS and RHS data pointers to the start of the columns just
1346         // computed.
1347         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
1348         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
1349 
1350         // Load some parameters needed for the end work on current block.
1351         "mvni v8.4s, #0\n"
1352         "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
1353         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
1354         "ins v13.h[4], w4\n" // dst_zero_point
1355         "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
1356         "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
1357         "dup v9.4s, w3\n"   // create prod_zp_depth_vec
1358         "add x5, x4, %x[row], lsl #2\n"
1359         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
1360         "csel x4, x4, x5, eq\n"
1361 
1362         "ld1 {v15.4s}, [x4]\n" // multiplier_fixedpoint
1363 
1364         // Now we load: bias data, LHS sums data, RHS sums data.
1365 
1366         // First, load the base pointers from the params.
1367         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
1368 
1369         "add x5, x1, %x[row], lsl #2\n"
1370         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
1371         "csel x1, x1, x5, eq\n"
1372 
1373         // Load 4 bias values.
1374         "ld1 {v14.4s}, [x1]\n"
1375 
1376         // Now that we know what LHS and RHS data the next iteration of the
1377         // main loop will need to load, we start loading the first 32 bytes of
1378         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
1379         // in the rest of the work on the current block.
1380         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
1381         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
1382         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
1383         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
1384         "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
1385         "add %[rhs_ptr], %[rhs_ptr], #48\n"
1386 
1387         // Add to the bias values the product (depth * lhs_zero_point * rhs_zero_point),
1388         // See the term NZ1Z2 in equation (7) in https://arxiv.org/pdf/1712.05877.pdf
1389         "add v14.4s, v14.4s, v9.4s\n"
1390 
1391         // Perform the bias-addition (per the above, we have just folded into
1392         // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
1393         // (all four 32-bit accumulators are in v16 at this point)
1394         "add v16.4s, v16.4s, v14.4s\n"
1395 
1396         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
1397         "beq 401f\n"
1398         "ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
1399         "add x3, x3, %x[col], lsl #2\n"
1400         "ld1 {v14.4s}, [x3]\n"
1401         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
1402         "dup v10.4s, w5\n"  // create lhs_zero_point_vec
1403         // Subtract rhs_sums * lhs_zero_point, per
1404         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
1405         "mls v16.4s, v10.4s, v14.s[0]\n"
1406         "401:\n"
1407 
1408         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
1409         "beq 402f\n"
1410         "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
1411         "add x2, x2, %x[row], lsl #2\n"
1412         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
1413         // Load 4 lhs_sums values.
1414         "ld1 {v11.4s}, [x2]\n"
1415         "ins v13.s[1], w5\n" // rhs_zero_point
1416         // Compute lhs_sums * rhs_zero_point.
1417         "mul v11.4s, v11.4s, v13.s[1]\n"
1418         // Subtract lhs_sums * rhs_zero_point, per
1419         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
1420         "sub v16.4s, v16.4s, v11.4s\n"
1421 
1422         // If the destination is int32, it means the user asks for the raw
1423         // accumulators, no need for us to downquantize the value.
1424         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
1425         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
1426 
1427         "402:\n"
1428 
1429         // At this point we have computed the final int32 values. Now we
1430         // start down-quantizing them to obtain the final 8bit values from them.
1431 
1432         // As part of this down-quantization, our int32 values will be
1433         // multiplied by a multiplier that has a fixed-point component and an
1434         // exponent component.
1435 
1436         // Load the exponent part of the multiplier.
1437         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
1438         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
1439         "add x5, x1, %x[row], lsl #2\n"
1440         "csel x1, x1, x5, eq\n"
1441 
1442         "ld1 {v14.4s}, [x1]\n"
1443 
1444         "smin v11.4s, v8.4s, v14.4s\n"
1445         "sub v12.4s, v14.4s, v11.4s\n"
1446 
1447         // Apply the positive exponent part of the multiplier.
1448         "sshl v16.4s, v16.4s, v12.4s\n"
1449 
1450         // Apply the fixed-point part of the multiplier.
1451         "sqdmulh v16.4s, v16.4s, v15.4s\n"
1452 
1453         // Apply the negative exponent part of the multiplier.
1454         "srshl v16.4s, v16.4s, v11.4s\n"
1455 
1456         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
1457         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
1458         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
1459         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
1460 
1461         RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
1462 
1463         // Cast-and-saturate from int32 to int16
1464         // After this instruction, all data is in lower half (64-bits) of v16
1465         "sqxtn v16.4h, v16.4s\n"
1466 
1467         // At this point, v18 -- v31 aren't used anymore for the current block,
1468         // so we can start clearing these accumulators for the next block
1469         // (next iteration of the main loop).
1470         RUY_MAKE_ZERO(v18)
1471         RUY_MAKE_ZERO(v19)
1472 
1473         // Add the destination zero point
1474         "dup v14.8h, v13.h[4]\n"
1475         "sqadd v16.8h, v16.8h, v14.8h\n"
1476 
1477         // Cast-and-saturate from int16 to uint8
1478         // Now all data is in the first 32-bits of v16
1479         "sqxtun v16.8b, v16.8h\n"
1480 
1481         // Load the clamp_min, clamp_max bounds
1482         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
1483         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
1484         "dup v14.16b, w2\n"  // clamp_min
1485         "dup v15.16b, w3\n"  // clamp_max
1486 
1487         // Apply the clamp_min bound
1488         "umax v16.16b, v16.16b, v14.16b\n"
1489         // Apply the clamp_max bound
1490         "umin v16.16b, v16.16b, v15.16b\n"
1491 
1492         // Compute how much of the 4x1 block of destination 8bit values that
1493         // we have computed, fit in the destination matrix. Typically, all of
1494         // it fits, but when the destination matrix shape is not a multiple
1495         // of 4x1, there are some 4x1 blocks along the boundaries that do
1496         // not fit entirely.
1497         "sub w1, %w[dst_rows], %w[row]\n"
1498         "mov w3, #4\n"
1499         "cmp w1, #4\n"
1500         // Compute w1 = how many rows of the 4x1 block fit
1501         "csel w1, w1, w3, le\n"
1502 
1503         // Test if w1==4, i.e. if all of the 4x1 block fits.
1504         "cmp w1, w3\n"
1505 
1506         "mov x4, %[dst_ptr]\n"
1507         // Yes, all of the 4x1 block fits, go to fast path.
1508         "beq 30f\n"
1509         // Not all of the 4x1 block fits.
1510         // Store to dst_tmp_buf
1511         "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
1512         // Slow loop copying from dst_tmp_buf to dst.
1513         "mov x3, %[dst_tmp_buf]\n"
1514         "mov w6, #0\n"
1515         "50:\n"
1516         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1517         "mov w5, #0\n"
1518         "51:\n"
1519         "ldrb w7, [x3, w5, uxtw]\n"
1520         "strb w7, [x4, w5, uxtw]\n"
1521         "add w5, w5, #1\n"
1522         "cmp w5, w1\n"
1523         "blt 51b\n"
1524         "b 31f\n"
1525         "30:\n"
1526         // Yes, all of the 4x1 block fits.
1527         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1528         "mov x3, x4\n"
1529         "st1 {v16.b}[0], [x3], #1\n"
1530         "st1 {v16.b}[1], [x3], #1\n"
1531         "st1 {v16.b}[2], [x3], #1\n"
1532         "st1 {v16.b}[3], [x3], #1\n"
1533         "31:\n"
1534 
1535         "add %[dst_ptr], %[dst_ptr], #4\n"
1536 
1537         RUY_MAKE_ZERO(v16)
1538         RUY_MAKE_ZERO(v17)
1539 
1540         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
1541 
1542         RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
1543 
1544         // Cast-and-saturate from int32 to int16
1545         // After this, all values for output are in the lower half (64 bits) of v16.
1546         "sqxtn v16.4h, v16.4s\n"
1547 
1548         // At this point, v18 -- v31 aren't used anymore for the current block,
1549         // so we can start clearing these accumulators for the next block
1550         // (next iteration of the main loop).
1551         RUY_MAKE_ZERO(v18)
1552         RUY_MAKE_ZERO(v19)
1553 
1554         // Add the destination zero point
1555         "dup v14.8h, v13.h[4]\n"
1556         "sqadd v16.8h, v16.8h, v14.8h\n"
1557 
1558         // Cast-and-saturate from int16 to int8
1559         "sqxtn v16.8b, v16.8h\n"
1560         // At this point, we only need 4 lowest 8-bit values in v16.
1561 
1562         // Load the clamp_min, clamp_max bounds
1563         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
1564         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
1565         "dup v14.16b, w2\n"  // clamp_min
1566         "dup v15.16b, w3\n"  // clamp_max
1567 
1568         // Apply the clamp_min bound
1569         "smax v16.16b, v16.16b, v14.16b\n"
1570         // Apply the clamp_max bound
1571         "smin v16.16b, v16.16b, v15.16b\n"
1572 
1573         // Compute how much of the 4x4 block of destination 8bit values that
1574         // we have computed, fit in the destination matrix. Typically, all of
1575         // it fits, but when the destination matrix shape is not a multiple
1576         // of 4x4, there are some 4x4 blocks along the boundaries that do
1577         // not fit entirely.
1578         "sub w1, %w[dst_rows], %w[row]\n"
1579         "sub w2, %w[dst_cols], %w[col]\n"
1580         "mov w3, #4\n"
1581         "cmp w1, #4\n"
1582         // Compute w1 = how many rows of the 4x1 block fit
1583         "csel w1, w1, w3, le\n"
1584         "cmp w2, #4\n"
1585 
        // Test if w1==4 && w2==4, i.e. if the block fits fully along both rows and cols.
1587         "cmp w1, w3\n"
1588         "ccmp w2, w3, 0, eq\n"
1589         "mov x4, %[dst_ptr]\n"
1590         // Yes, all of the 4x1 block fits, go to fast path.
1591         "beq 30f\n"
1592         // Not all of the 4x4 block fits.
1593         // Store to dst_tmp_buf
1594         "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
1595         // Slow loop copying from dst_tmp_buf to dst.
1596         "mov x3, %[dst_tmp_buf]\n"
1597         "mov w6, #0\n"
1598         "50:\n"
1599         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1600         "mov w5, #0\n"
1601         "51:\n"
1602         "ldrb w7, [x3, w5, uxtw]\n"
1603         "strb w7, [x4, w5, uxtw]\n"
1604         "add w5, w5, #1\n"
1605         "cmp w5, w1\n"
1606         "blt 51b\n"
1607         "b 31f\n"
1608         "30:\n"
1609         // Yes, all of the 4x4 block fits.
1610         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1611         "mov x3, x4\n"
1612         "st1 {v16.b}[0], [x3], #1\n"
1613         "st1 {v16.b}[1], [x3], #1\n"
1614         "st1 {v16.b}[2], [x3], #1\n"
1615         "st1 {v16.b}[3], [x3], #1\n"
1616         "31:\n"
1617 
1618         "add %[dst_ptr], %[dst_ptr], #4\n"
1619 
1620         RUY_MAKE_ZERO(v16)
1621         RUY_MAKE_ZERO(v17)
1622 
1623         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
1624 
1625         RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
1626 
1627         // Add the destination zero point
1628         "dup v14.4h, v13.h[4]\n"
1629         "saddw v16.4s, v16.4s, v14.4h\n"
1630 
1631         // Cast-and-saturate from int32 to int16
1632         // After this instruction, all data is in lower half of v16.
1633         "sqxtn v16.4h, v16.4s\n"
1634 
1635         // At this point, v18 -- v31 aren't used anymore for the current block,
1636         // so we can start clearing these accumulators for the next block
1637         // (next iteration of the main loop).
1638         RUY_MAKE_ZERO(v18)
1639         RUY_MAKE_ZERO(v19)
1640 
1641         // Load the clamp_min, clamp_max bounds
1642         "ldrh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
1643         "ldrh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
1644         "dup v14.8h, w2\n"  // clamp_min
1645         "dup v15.8h, w3\n"  // clamp_max
1646 
1647         // Apply the clamp_min bound
1648         "smax v16.8h, v16.8h, v14.8h\n"
1649         // Apply the clamp_max bound
1650         "smin v16.8h, v16.8h, v15.8h\n"
1651 
        // Compute how much of the 4x4 block of destination 16bit values that
1653         // we have computed, fit in the destination matrix. Typically, all of
1654         // it fits, but when the destination matrix shape is not a multiple
1655         // of 4x4, there are some 4x4 blocks along the boundaries that do
1656         // not fit entirely.
1657         "sub w1, %w[dst_rows], %w[row]\n"
1658         "sub w2, %w[dst_cols], %w[col]\n"
1659         "mov w3, #4\n"
1660         "cmp w1, #4\n"
1661         // Compute w1 = how many rows of the 4x4 block fit
1662         "csel w1, w1, w3, le\n"
1663         "cmp w2, #4\n"
1664 
        // Test if w1==4, i.e. if all rows of the block fit (only w1 is tested here).
1666         "cmp w1, w3\n"
1667         "mov x4, %[dst_ptr]\n"
1668         // Yes, all of the 4x4 block fits, go to fast path.
1669         "beq 30f\n"
1670         // Not all of the 4x4 block fits.
1671         // Store to dst_tmp_buf
1672         "str q16, [%[dst_tmp_buf], #0]\n"
1673         // Slow loop copying from dst_tmp_buf to dst.
1674         "mov x3, %[dst_tmp_buf]\n"
1675         "mov w6, #0\n"
1676         "50:\n"
1677         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1678         "mov w5, #0\n"
1679         "51:\n"
1680         "ldrh w7, [x3, x5, lsl #1]\n"
1681         "strh w7, [x4, x5, lsl #1]\n"
1682         "add w5, w5, #1\n"
1683         "cmp w5, w1\n"
1684         "blt 51b\n"
1685         "blt 50b\n"
1686         "b 31f\n"
1687         "30:\n"
1688         // Yes, all of the 4x4 block fits.
1689         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1690         "mov x3, x4\n"
1691         "st1 {v16.h}[0], [x3], #2\n"
1692         "st1 {v16.h}[1], [x3], #2\n"
1693         "st1 {v16.h}[2], [x3], #2\n"
1694         "st1 {v16.h}[3], [x3], #2\n"
1695         "31:\n"
1696 
1697         "add %[dst_ptr], %[dst_ptr], #8\n"
1698 
1699         RUY_MAKE_ZERO(v16)
1700         RUY_MAKE_ZERO(v17)
1701 
1702         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
1703 
1704         RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
1705 
1706         // Since the store type is the same as the accum type, no need for
1707         // downcast. There's also no need for clamp by min/max.
1708 
        // Compute how much of the 4x4 block of destination 32bit values that
1710         // we have computed, fit in the destination matrix. Typically, all of
1711         // it fits, but when the destination matrix shape is not a multiple
1712         // of 4x4, there are some 4x4 blocks along the boundaries that do
1713         // not fit entirely.
1714         "sub w1, %w[dst_rows], %w[row]\n"
1715         "sub w2, %w[dst_cols], %w[col]\n"
1716         "mov w3, #4\n"
1717         "cmp w1, #4\n"
1718         // Compute w1 = how many rows of the 4x4 block fit
1719         "csel w1, w1, w3, le\n"
1720         "cmp w2, #4\n"
1721 
        // Test if w1==4 && w2==4, i.e. if the block fits fully along both rows and cols.
1723         "cmp w1, w3\n"
1724         "ccmp w2, w3, 0, eq\n"
1725         "mov x4, %[dst_ptr]\n"
1726         // Yes, all of the 4x1 block fits, go to fast path.
1727         "beq 30f\n"
1728         // Not all of the 4x4 block fits.
1729         // Store to dst_tmp_buf
1730         "str q16, [%[dst_tmp_buf], #0]\n"
1731         // Slow loop copying from dst_tmp_buf to dst.
1732         "mov x3, %[dst_tmp_buf]\n"
1733         "mov w6, #0\n"
1734         "50:\n"
1735         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1736         "mov w5, #0\n"
1737         "51:\n"
1738         "ldr w7, [x3, x5, lsl #2]\n"
1739         "str w7, [x4, x5, lsl #2]\n"
1740         "add w5, w5, #1\n"
1741         "cmp w5, w1\n"
1742         "blt 51b\n"
1743         "b 31f\n"
1744         "30:\n"
1745         // Yes, all of the 4x4 block fits.
1746         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
1747         "mov x3, x4\n"
1748         "st1 {v16.s}[0], [x3], #4\n"
1749         "st1 {v16.s}[1], [x3], #4\n"
1750         "st1 {v16.s}[2], [x3], #4\n"
1751         "st1 {v16.s}[3], [x3], #4\n"
1752         "31:\n"
1753 
1754         "add %[dst_ptr], %[dst_ptr], #16\n"
1755 
1756         RUY_MAKE_ZERO(v16)
1757         RUY_MAKE_ZERO(v17)
1758         RUY_MAKE_ZERO(v18)
1759         RUY_MAKE_ZERO(v19)
1760 
1761         RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
1762 
1763         // For the next block: perform the first few multiply-adds on the data
1764         // that we have already loaded.
1765         "smull    v8.8h,  v0.8b,  v4.8b\n"
1766         "smull    v9.8h,  v1.8b,  v4.8b\n"
1767         "smull    v10.8h,  v2.8b,  v4.8b\n"
1768         "smull    v11.8h,  v3.8b,  v4.8b\n"
1769         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
1770         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
1771         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
1772         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
1773 
1774         // Reload some params --- we had used x5 -- x7 for a few other things
1775         // since the last time we had loaded them.
1776         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
1777         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
1778         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
1779 
1780         // Move to the next block of the destination matrix, for the next iter
1781         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
1782         // been updated earlier.
1783         // Have we reached the end row?
1784         "cmp %w[row], w7\n"
1785         "beq 20f\n"  // yes, end row.
1786         // Not end row. Move to the next row.
1787         "add %w[row], %w[row], #4\n"
1788         "b 21f\n"
1789         "20:\n"
1790         // Was already at end row.
1791         "mov %w[row], w6\n"  // Move back to first row.
1792         "add %w[col], %w[col], #4\n"  // Move to the next column.
1793         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #2\n"
1794         "mov %[dst_ptr], %[dst_col_ptr]\n"
1795         "21:\n"
1796 
1797         // Main loop exit condition: have we hit the end column?
1798         "cmp %w[col], w8\n"
1799 
1800         // w1 is the number of levels of depth that we have already loaded
1801         // LHS and RHS data for. Corresponding to the initial ld1 instructions
1802         // above, this is currently 16.
1803         "mov w1, #16\n"
1804 
1805         "ble 1b\n"
1806 
1807         // clang-format on
1808 
1809         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
1810           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
1811           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
1812         : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
1813           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
1814           [dst_type_id] "r"(params.dst_type_id)
1815         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
1816           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
1817           "v13", "v14", "v15", "v16", "v17", "v18", "v19");
1818 }
1819 
1820 // Variant of the above Kernel8bitNeon, tuned for A55-ish CPUs.
1821 // Specifically here, the relevant in-order CPUs are ARM Cortex-A53 and
1822 // the original Cortex-A55, since these are 64-bit and do not support dotprod.
1823 //
1824 // While this kernel does not have a direct equivalent in gemmlowp, it was
1825 // developed based on insights that David Mansell at ARM shared with their
1826 // contribution of gemmlowp kernels tuned for Cortex-A53, with very helpful
1827 // comments. Specifically, see this comment about tuning for Cortex-A53:
1828 // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4215
Kernel8bitNeonA55ish(const KernelParams8bit<4,4> & params)1829 void Kernel8bitNeonA55ish(const KernelParams8bit<4, 4>& params) {
1830   profiler::ScopeLabel label("Kernel (kNeon, optimized for in-order cores)");
1831 
1832   CheckOffsetsInKernelParams8bit(params);
1833 
1834   const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
1835   const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
1836   const std::int8_t* lhs_ptr = lhs_col_ptr;
1837   const std::int8_t* rhs_ptr = rhs_col_ptr;
1838   void* dst_col_ptr = params.dst_base_ptr;
1839   void* dst_ptr = dst_col_ptr;
1840   int row = params.start_row;
1841   int col = params.start_col;
1842 
1843   // The asm kernel below has the following NEON register allocation:
1844   //
1845   // v16 -- v31 are int32 accumulators.
1846   // During accumulation, v0 -- v3 are used to load int8 data from LHS and
1847   // v4 -- v7 from RHS:
1848   //
1849   //                                      int8 RHS 16x4 block
1850   //                           /-----------------------------------------|
1851   //                           |v4.b[0]          ...           v7.b[0]   |
1852   //                           |  ...                            ...     |
1853   //                           |v4.b[15]         ...           v7.b[15]  |
1854   //                           \-----------------------------------------/
1855   //    int8 LHS 4x16 block
1856   //  /---------------------\  /-----------------------------------------|
1857   //  |v0.b[0] ... v0.b[15] |  |v16.4s           ...           v28.4s    |
1858   //  |v1.b[0] ... v1.b[15] |  |v17.4s           ...           v29.4s    |
1859   //  |v2.b[0] ... v2.b[15] |  |v18.4s           ...           v30.4s    |
1860   //  |v3.b[0] ... v3.b[15] |  |v19.4s           ...           v31.4s    |
1861   //  \---------------------/  \-----------------------------------------/
1862   //                                  int32 accumulators 4x4 block
1863   asm volatile(
1864 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
1865 
1866         // clang-format off
1867 
1868         // Load some parameters into registers.
1869         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
1870         RUY_MAKE_ZERO(v16)
1871         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
1872         RUY_MAKE_ZERO(v17)
1873         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
1874         RUY_MAKE_ZERO(v18)
1875         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
1876         RUY_MAKE_ZERO(v19)
1877         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
1878         RUY_MAKE_ZERO(v20)
1879         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
1880         RUY_MAKE_ZERO(v21)
1881         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
1882         RUY_MAKE_ZERO(v22)
1883         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
1884         RUY_MAKE_ZERO(v23)
1885 
1886         // Load the first 64 bytes of LHS and RHS data.
1887         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
1888         RUY_MAKE_ZERO(v24)
1889         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
1890         RUY_MAKE_ZERO(v25)
1891         "ld1 {v2.16b}, [%[lhs_ptr]], #16\n"
1892         RUY_MAKE_ZERO(v26)
1893         "ld1 {v3.16b}, [%[lhs_ptr]], #16\n"
1894         RUY_MAKE_ZERO(v27)
1895         "ld1 {v4.16b}, [%[rhs_ptr]], #16\n"
1896         RUY_MAKE_ZERO(v28)
1897         "ld1 {v5.16b}, [%[rhs_ptr]], #16\n"
1898         RUY_MAKE_ZERO(v29)
1899         "ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
1900         RUY_MAKE_ZERO(v30)
1901         "ld1 {v7.16b}, [%[rhs_ptr]], #16\n"
1902         RUY_MAKE_ZERO(v31)
1903 
1904 
1905         // w1 is the number of levels of depth that we have already loaded
1906         // LHS and RHS data for. Corresponding to the initial ld1 instructions
1907         // above, this is currently 16.
1908         "mov w1, #16\n"
1909 
1910         // Perform the first few multiply-adds on the data that we have already
1911         // loaded.
1912         "smull    v8.8h,  v0.8b,  v4.8b\n"
1913         "smull    v9.8h,  v1.8b,  v4.8b\n"
1914         "smull    v10.8h,  v2.8b,  v4.8b\n"
1915         "smull    v11.8h,  v3.8b,  v4.8b\n"
1916         "smull    v12.8h,  v0.8b,  v5.8b\n"
1917         "smull    v13.8h,  v1.8b,  v5.8b\n"
1918         "smull    v14.8h,  v2.8b,  v5.8b\n"
1919         "smull    v15.8h,  v3.8b,  v5.8b\n"
1920 
1921         // Multiply-accumulate second-half, again into the same
1922         // 16bit local accumulator registers. This is where we
1923         // take advantage of having int8 instead of uint8 and therefore
1924         // being able to accumulate two products into int16.
1925         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
1926         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
1927         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
1928         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
1929         "smlal2   v12.8h,  v0.16b,  v5.16b\n"
1930         "smlal2   v13.8h,  v1.16b,  v5.16b\n"
1931         "smlal2   v14.8h,  v2.16b,  v5.16b\n"
1932         "smlal2   v15.8h,  v3.16b,  v5.16b\n"
1933 
1934 
1935         // Main loop of the whole GEMM, over rows and columns of the
1936         // destination matrix.
1937         "1:\n"
1938 
1939         // Reminder - w1 is how many levels of depth we have already loaded
1940         // data for, w12 is the total depth.
1941         "cmp w1, w12\n"
1942         "beq 79f\n"
1943 
1944         "2:\n"
1945 
1946         // Some multiplications and 16-bit accumulation were already done above,
1947         // so we start right away in the middle.
1948         "sadalp  v16.4s, v8.8h\n"
1949         "ldr d4, [%[rhs_ptr], #0]\n"
1950         "smull    v8.8h,  v0.8b,  v6.8b\n"
1951         "ldr x7, [%[rhs_ptr], #8]\n"
1952         "sadalp  v17.4s, v9.8h\n"
1953         "ldr d5, [%[rhs_ptr], #16]\n"
1954         "smull    v9.8h,  v1.8b,  v6.8b\n"
1955         "ldr x8, [%[rhs_ptr], #24]\n"
1956         "sadalp  v18.4s, v10.8h\n"
1957         "smull    v10.8h,  v2.8b,  v6.8b\n"
1958         "sadalp  v19.4s, v11.8h\n"
1959         "add %[lhs_ptr], %[lhs_ptr], #64\n"
1960         "smull    v11.8h,  v3.8b,  v6.8b\n"
1961         "add %[rhs_ptr], %[rhs_ptr], #64\n"
1962         "sadalp  v20.4s, v12.8h\n"
1963         // Each iteration of this loop advances by 16 levels of depth.
1964         "add w1, w1, #16\n"
1965         "smull    v12.8h,  v0.8b,  v7.8b\n"
1966         // Loop termination condition
1967         "cmp w1, w12\n"
1968         "sadalp  v21.4s, v13.8h\n"
1969         "ldr x3, [%[lhs_ptr], #-56]\n"
1970         "smull    v13.8h,  v1.8b,  v7.8b\n"
1971         "ldr x4, [%[lhs_ptr], #-40]\n"
1972         "sadalp  v22.4s, v14.8h\n"
1973         "ldr x5, [%[lhs_ptr], #-24]\n"
1974         "smull    v14.8h,  v2.8b,  v7.8b\n"
1975         "ldr x6, [%[lhs_ptr], #-8]\n"
1976         "sadalp  v23.4s, v15.8h\n"
1977         "smull    v15.8h,  v3.8b,  v7.8b\n"
1978 
1979         // Multiply-accumulate second-half, again into the same
1980         // 16bit local accumulator registers. This is where we
1981         // take advantage of having int8 instead of uint8 and therefore
1982         // being able to accumulate two products into int16.
1983         "smlal2   v8.8h,  v0.16b,  v6.16b\n"
1984         "smlal2   v9.8h,  v1.16b,  v6.16b\n"
1985         "smlal2   v10.8h,  v2.16b,  v6.16b\n"
1986         "ldr x9, [%[rhs_ptr], #-24]\n"
1987         "smlal2   v11.8h,  v3.16b,  v6.16b\n"
1988         "ldr d6, [%[rhs_ptr], #-32]\n"
1989         "smlal2   v12.8h,  v0.16b,  v7.16b\n"
1990         "ldr d0, [%[lhs_ptr], #-64]\n"
1991         "smlal2   v13.8h,  v1.16b,  v7.16b\n"
1992         "ldr d1, [%[lhs_ptr], #-48]\n"
1993         "smlal2   v14.8h,  v2.16b,  v7.16b\n"
1994         "ins v4.d[1], x7\n"
1995         "smlal2   v15.8h,  v3.16b,  v7.16b\n"
1996         "ins v5.d[1], x8\n"
1997 
1998         "ldr d2, [%[lhs_ptr], #-32]\n"
1999         "ins v0.d[1], x3\n"
2000         "sadalp  v24.4s, v8.8h\n"
2001         "ldr d3, [%[lhs_ptr], #-16]\n"
2002         "ins v1.d[1], x4\n"
2003         "smull    v8.8h,  v0.8b,  v4.8b\n"
2004         "ins v2.d[1], x5\n"
2005         "sadalp  v25.4s, v9.8h\n"
2006         "ins v3.d[1], x6\n"
2007         "smull    v9.8h,  v1.8b,  v4.8b\n"
2008         "ldr d7, [%[rhs_ptr], #-16]\n"
2009         "sadalp  v26.4s, v10.8h\n"
2010         "ldr x10, [%[rhs_ptr], #-8]\n"
2011         "smull    v10.8h,  v2.8b,  v4.8b\n"
2012         "sadalp  v27.4s, v11.8h\n"
2013         "smull    v11.8h,  v3.8b,  v4.8b\n"
2014         "sadalp  v28.4s, v12.8h\n"
2015         "smull    v12.8h,  v0.8b,  v5.8b\n"
2016         "sadalp  v29.4s, v13.8h\n"
2017         "smull    v13.8h,  v1.8b,  v5.8b\n"
2018         "sadalp  v30.4s, v14.8h\n"
2019         "smull    v14.8h,  v2.8b,  v5.8b\n"
2020         "sadalp  v31.4s, v15.8h\n"
2021         "smull    v15.8h,  v3.8b,  v5.8b\n"
2022 
2023         // Multiply-accumulate second-half, again into the same
2024         // 16bit local accumulator registers. This is where we
2025         // take advantage of having int8 instead of uint8 and therefore
2026         // being able to accumulate two products into int16.
2027         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
2028         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
2029         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
2030         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
2031 
2032         "smlal2   v12.8h,  v0.16b,  v5.16b\n"
2033         "smlal2   v13.8h,  v1.16b,  v5.16b\n"
2034         "ins v6.d[1], x9\n"
2035         "smlal2   v14.8h,  v2.16b,  v5.16b\n"
2036         "ins v7.d[1], x10\n"
2037         "smlal2   v15.8h,  v3.16b,  v5.16b\n"
2038 
2039         "blt 2b\n"
2040 
2041         "79:\n"
2042 
2043         "sadalp  v16.4s, v8.8h\n"
2044         "smull    v8.8h,  v0.8b,  v6.8b\n"
2045         "sadalp  v17.4s, v9.8h\n"
2046         "smull    v9.8h,  v1.8b,  v6.8b\n"
2047         "sadalp  v18.4s, v10.8h\n"
2048         "smull    v10.8h,  v2.8b,  v6.8b\n"
2049         "sadalp  v19.4s, v11.8h\n"
2050         "smull    v11.8h,  v3.8b,  v6.8b\n"
2051         "sadalp  v20.4s, v12.8h\n"
2052         "smull    v12.8h,  v0.8b,  v7.8b\n"
2053         "sadalp  v21.4s, v13.8h\n"
2054         "smull    v13.8h,  v1.8b,  v7.8b\n"
2055         "sadalp  v22.4s, v14.8h\n"
2056         "smull    v14.8h,  v2.8b,  v7.8b\n"
2057         "sadalp  v23.4s, v15.8h\n"
2058         "smull    v15.8h,  v3.8b,  v7.8b\n"
2059 
2060         // Multiply-accumulate second-half, again into the same
2061         // 16bit local accumulator registers. This is where we
2062         // take advantage of having int8 instead of uint8 and therefore
2063         // being able to accumulate two products into int16.
2064         "smlal2   v8.8h,  v0.16b,  v6.16b\n"
2065         "smlal2   v9.8h,  v1.16b,  v6.16b\n"
2066         "smlal2   v10.8h,  v2.16b,  v6.16b\n"
2067         "smlal2   v11.8h,  v3.16b,  v6.16b\n"
2068 
2069         "smlal2   v12.8h,  v0.16b,  v7.16b\n"
2070         "smlal2   v13.8h,  v1.16b,  v7.16b\n"
2071         "smlal2   v14.8h,  v2.16b,  v7.16b\n"
2072         "smlal2   v15.8h,  v3.16b,  v7.16b\n"
2073 
2074         "sadalp  v24.4s, v8.8h\n"
2075         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
2076         "sadalp  v25.4s, v9.8h\n"
2077         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
2078         "sadalp  v26.4s, v10.8h\n"
2079         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
2080         "sadalp  v27.4s, v11.8h\n"
2081         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
2082         "sadalp  v28.4s, v12.8h\n"
2083         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
2084         "sadalp  v29.4s, v13.8h\n"
2085         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
2086         "sadalp  v30.4s, v14.8h\n"
2087         "sadalp  v31.4s, v15.8h\n"
2088 
2089         // End of accumulation. The registers v16 -- v31 contain the final
2090         // int32 accumulator values of the current 4x4 destination block.
2091         // We now have to compute the final 8-bit values from these int32
2092         // accumulators, and advance to the next 4x4 block. We intertwine
2093         // these two aspects whenever possible for optimal pipelining, both
2094         // at the data flow level (prefetch data for next block as early as
2095         // possible) and instruction pipelining level (some of the next-block
2096         // work can dual-issue with some of the final work on the current
2097         // block).
2098 
2099         // Reduce 32bit accumulators horizontally.
2100         "addp v16.4s, v16.4s, v17.4s\n"
2101         "addp v18.4s, v18.4s, v19.4s\n"
2102         "addp v20.4s, v20.4s, v21.4s\n"
2103         "addp v22.4s, v22.4s, v23.4s\n"
2104         "addp v24.4s, v24.4s, v25.4s\n"
2105         "addp v26.4s, v26.4s, v27.4s\n"
2106         "addp v28.4s, v28.4s, v29.4s\n"
2107         "addp v30.4s, v30.4s, v31.4s\n"
2108 
2109         // Reduce 32bit accumulators horizontally, second pass
2110         // (each pass adds pairwise. we need to add 4-wise).
2111         "addp v16.4s, v16.4s, v18.4s\n"
2112         "addp v17.4s, v20.4s, v22.4s\n"
2113         "addp v18.4s, v24.4s, v26.4s\n"
2114         "addp v19.4s, v28.4s, v30.4s\n"
2115 
2116         // Logic to advance to the next block in preparation for the next
2117         // iteration of the main loop. For now, we only want to compute
2118         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
2119         // not yet ready to update the values of row and col, as we still need
2120         // the current values for the rest of the work on the current block.
2121 
2122         "cmp %w[row], w7\n"  // Have we finished the last row?
2123         "bge 4f\n"           // If finished last row, go to 4
2124         // Not finished last row: then advance to next row.
2125         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #2\n"
2126         "b 5f\n"
2127         "4:\n"  // Finished last row...
2128         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
2129         // Now we need to advance to the next column. If we already
2130         // finished the last column, then in principle we are done, however
2131         // we can't just return here, as we need to allow the end work of the
2132         // current block to complete. The good news is that at this point it
2133         // doesn't matter what data we load for the next column, since
2134         // we will exit from the main loop below before actually storing
2135         // anything computed from that data.
2136         "cmp %w[col], w8\n"  // Have we finished the last column?
2137         "bge 5f\n" // If yes, just carry on without updating the column pointer.
2138         // Not finished last column: then advance to next column.
2139         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #2\n"
2140         "5:\n"
2141 
2142         // Set the LHS and RHS data pointers to the start of the columns just
2143         // computed.
2144         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
2145         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
2146 
2147         // Load some parameters needed for the end work on current block.
2148         "mvni v8.4s, #0\n"
2149         "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
2150         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
2151         "ins v13.h[4], w4\n" // dst_zero_point
2152         "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
2153         "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
2154         "dup v9.4s, w3\n"   // create prod_zp_depth_vec
2155 
2156         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
2157 
2158         // Determine the channel index.
2159         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
2160         "csel w3, %w[row], %w[col], eq\n"
2161 
2162         // Offset the bias pointer as needed given the current row, col.
2163         "add x5, x1, x3, lsl #2\n"
2164 
2165         // If there is no bias, use no offset, just address the passed zero
2166         // data.
2167         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
2168         "csel x1, x1, x5, eq\n"
2169 
2170         // Load 4 bias values.
2171         "ld1 {v14.4s}, [x1]\n"
2172 
2173         // Load the multiplier_fixedpoint values.
2174         "add x5, x4, x3, lsl #2\n"
2175         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
2176         "csel x4, x4, x5, eq\n"
2177         "ld1 {v15.4s}, [x4]\n" // multiplier_fixedpoint
2178 
2179         // Now that we know what LHS and RHS data the next iteration of the
2180         // main loop will need to load, we start loading the first 32 bytes of
2181         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
2182         // in the rest of the work on the current block.
2183 
2184         // Add to the bias values the product (depth * lhs_zero_point * rhs_zero_point),
2185         // See the term NZ1Z2 in equation (7) in https://arxiv.org/pdf/1712.05877.pdf
2186         "add v14.4s, v14.4s, v9.4s\n"
2187         "ldr d0, [%[lhs_ptr], #0]\n"
2188 
2189         // Perform the bias-addition (per the above, we have just folded into
2190         // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
2191         // Jump based on channel dimension.
2192         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
2193         "bne 6f\n"
2194         // Case where channels are rows
2195 
2196         "add v16.4s, v16.4s, v14.4s\n"
2197         "ldr d1, [%[lhs_ptr], #16]\n"
2198         "add v17.4s, v17.4s, v14.4s\n"
2199         "ldr d2, [%[lhs_ptr], #32]\n"
2200         "add v18.4s, v18.4s, v14.4s\n"
2201         "ldr d3, [%[lhs_ptr], #48]\n"
2202         "add v19.4s, v19.4s, v14.4s\n"
2203         "ldr d4, [%[rhs_ptr], #0]\n"
2204         "ldr d5, [%[rhs_ptr], #16]\n"
2205         "ldr d6, [%[rhs_ptr], #32]\n"
2206         "ldr d7, [%[rhs_ptr], #48]\n"
2207 
2208         "b 7f\n"
2209 
2210         "6:\n"
2211         // Case where channels are columns
2212         "dup v20.4s, v14.s[0]\n"
2213         "ldr d1, [%[lhs_ptr], #16]\n"
2214         "dup v21.4s, v14.s[1]\n"
2215         "ldr d2, [%[lhs_ptr], #32]\n"
2216         "dup v22.4s, v14.s[2]\n"
2217         "ldr d3, [%[lhs_ptr], #48]\n"
2218         "dup v23.4s, v14.s[3]\n"
2219         "ldr d4, [%[rhs_ptr], #0]\n"
2220         "add v16.4s, v16.4s, v20.4s\n"
2221         "ldr d5, [%[rhs_ptr], #16]\n"
2222         "add v17.4s, v17.4s, v21.4s\n"
2223         "ldr d6, [%[rhs_ptr], #32]\n"
2224         "add v18.4s, v18.4s, v22.4s\n"
2225         "ldr d7, [%[rhs_ptr], #48]\n"
2226         "add v19.4s, v19.4s, v23.4s\n"
2227         "7:\n"
2228 
2229         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
2230         "beq 401f\n"
2231         "ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
2232         "add x3, x3, %x[col], lsl #2\n"
2233         "ld1 {v14.4s}, [x3]\n"
2234         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
2235         "dup v10.4s, w5\n"  // create lhs_zero_point_vec
2236         // Subtract rhs_sums * lhs_zero_point, per
2237         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
2238         "mls v16.4s, v10.4s, v14.s[0]\n"
2239         "mls v17.4s, v10.4s, v14.s[1]\n"
2240         "mls v18.4s, v10.4s, v14.s[2]\n"
2241         "mls v19.4s, v10.4s, v14.s[3]\n"
2242         "401:\n"
2243 
2244         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
2245         "beq 402f\n"
2246         "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
2247         "add x2, x2, %x[row], lsl #2\n"
2248         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
2249         // Load 4 lhs_sums values.
2250         "ld1 {v11.4s}, [x2]\n"
2251         "ins v13.s[1], w5\n" // rhs_zero_point
2252         // Compute lhs_sums * rhs_zero_point.
2253         "mul v11.4s, v11.4s, v13.s[1]\n"
2254         // Subtract lhs_sums * rhs_zero_point, per
2255         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
2256         "sub v16.4s, v16.4s, v11.4s\n"
2257         "sub v17.4s, v17.4s, v11.4s\n"
2258         "sub v18.4s, v18.4s, v11.4s\n"
2259         "sub v19.4s, v19.4s, v11.4s\n"
2260 
2261         // If the destination is int32, it means the user asks for the raw
2262         // accumulators, no need for us to downquantize the value.
2263         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
2264         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
2265 
2266         "402:\n"
2267 
2268         // At this point we have computed the final int32 values. Now we
2269         // start down-quantizing them to obtain the final 8bit values from them.
2270 
2271         // As part of this down-quantization, our int32 values will be
2272         // multiplied by a multiplier that has a fixed-point component and an
2273         // exponent component.
2274 
2275         // Determine the channel index.
2276         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
2277         "csel w3, %w[row], %w[col], eq\n"
2278 
2279         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
2280         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
2281         "add x5, x1, x3, lsl #2\n"
2282         "csel x1, x1, x5, eq\n"
2283 
2284         "ld1 {v14.4s}, [x1]\n"
2285 
2286         "smin v11.4s, v8.4s, v14.4s\n"
2287         "ldr x1, [%[lhs_ptr], #8]\n"
2288         "sub v12.4s, v14.4s, v11.4s\n"
2289 
2290         // Jump based on channel dimension.
2291         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
2292         "bne 8f\n"
2293         // Case where channels are rows
2294 
2295 
2296         // Apply the positive exponent part of the multiplier.
2297         "sshl v16.4s, v16.4s, v12.4s\n"
2298         "ldr x2, [%[lhs_ptr], #24]\n"
2299         "sshl v17.4s, v17.4s, v12.4s\n"
2300         "ldr x3, [%[lhs_ptr], #40]\n"
2301         "sshl v18.4s, v18.4s, v12.4s\n"
2302         "ldr x4, [%[lhs_ptr], #56]\n"
2303         "sshl v19.4s, v19.4s, v12.4s\n"
2304 
2305 
2306         // Apply the fixed-point part of the multiplier.
2307         "ins v0.d[1], x1\n"
2308         "ldr x1, [%[rhs_ptr], #8]\n"
2309         "sqdmulh v16.4s, v16.4s, v15.4s\n"
2310         "ins v1.d[1], x2\n"
2311         "ldr x2, [%[rhs_ptr], #24]\n"
2312         "sqdmulh v17.4s, v17.4s, v15.4s\n"
2313         "ins v2.d[1], x3\n"
2314         "ldr x3, [%[rhs_ptr], #40]\n"
2315         "sqdmulh v18.4s, v18.4s, v15.4s\n"
2316         "ins v3.d[1], x4\n"
2317         "ldr x4, [%[rhs_ptr], #56]\n"
2318         "sqdmulh v19.4s, v19.4s, v15.4s\n"
2319 
2320         // Apply the negative exponent part of the multiplier.
2321         "srshl v16.4s, v16.4s, v11.4s\n"
2322         "srshl v17.4s, v17.4s, v11.4s\n"
2323         "srshl v18.4s, v18.4s, v11.4s\n"
2324         "srshl v19.4s, v19.4s, v11.4s\n"
2325 
2326         "b 9f\n"
2327 
2328         "8:\n"
2329         // Case where channels are columns
2330 
2331         // Apply the positive exponent part of the multiplier.
2332         "dup v20.4s, v12.s[0]\n"
2333         "ldr x2, [%[lhs_ptr], #24]\n"
2334         "ldr x3, [%[lhs_ptr], #40]\n"
2335         "dup v21.4s, v12.s[1]\n"
2336         "ldr x4, [%[lhs_ptr], #56]\n"
2337         "dup v22.4s, v12.s[2]\n"
2338         "ins v0.d[1], x1\n"
2339         "dup v23.4s, v12.s[3]\n"
2340         "ldr x1, [%[rhs_ptr], #8]\n"
2341         "sshl v16.4s, v16.4s, v20.4s\n"
2342         "ins v1.d[1], x2\n"
2343         "sshl v17.4s, v17.4s, v21.4s\n"
2344         "ldr x2, [%[rhs_ptr], #24]\n"
2345         "sshl v18.4s, v18.4s, v22.4s\n"
2346         "ins v2.d[1], x3\n"
2347         "sshl v19.4s, v19.4s, v23.4s\n"
2348         "ldr x3, [%[rhs_ptr], #40]\n"
2349 
2350         // Apply the fixed-point part of the multiplier.
2351         "sqdmulh v16.4s, v16.4s, v15.s[0]\n"
2352         "ins v3.d[1], x4\n"
2353         "sqdmulh v17.4s, v17.4s, v15.s[1]\n"
2354         "ldr x4, [%[rhs_ptr], #56]\n"
2355         "sqdmulh v18.4s, v18.4s, v15.s[2]\n"
2356         "dup v20.4s, v11.s[0]\n"
2357         "sqdmulh v19.4s, v19.4s, v15.s[3]\n"
2358 
2359         // Apply the negative exponent part of the multiplier.
2360         "dup v21.4s, v11.s[1]\n"
2361         "srshl v16.4s, v16.4s, v20.4s\n"
2362         "dup v22.4s, v11.s[2]\n"
2363         "srshl v17.4s, v17.4s, v21.4s\n"
2364         "dup v23.4s, v11.s[3]\n"
2365         "srshl v18.4s, v18.4s, v22.4s\n"
2366         "srshl v19.4s, v19.4s, v23.4s\n"
2367 
2368         "9:\n"
2369 
2370         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
2371         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
2372         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
2373         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
2374 
2375         RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
2376 
2377         "ins v4.d[1], x1\n"
2378         "sqxtn v16.4h, v16.4s\n"
2379         "ins v5.d[1], x2\n"
2380         "sqxtn2 v16.8h, v17.4s\n"
2381         "ins v6.d[1], x3\n"
2382         "sqxtn v17.4h, v18.4s\n"
2383         "ins v7.d[1], x4\n"
2384         RUY_MAKE_ZERO(v18)
2385         "sqxtn2 v17.8h, v19.4s\n"
2386 
2387         // At this point, v18 -- v31 aren't used anymore for the current block,
2388         // so we can start clearing these accumulators for the next block
2389         // (next iteration of the main loop).
2390         RUY_MAKE_ZERO(v19)
2391 
2392         // Add the destination zero point
2393         "add %[lhs_ptr], %[lhs_ptr], #64\n"
2394         "dup v14.8h, v13.h[4]\n"
2395         RUY_MAKE_ZERO(v20)
2396         "add %[rhs_ptr], %[rhs_ptr], #64\n"
2397         "sqadd v16.8h, v16.8h, v14.8h\n"
2398         RUY_MAKE_ZERO(v21)
2399         "sqadd v17.8h, v17.8h, v14.8h\n"
2400         RUY_MAKE_ZERO(v22)
2401 
2402         // Cast-and-saturate from int16 to uint8
2403         "sqxtun v16.8b, v16.8h\n"
2404         RUY_MAKE_ZERO(v23)
2405         "sqxtun2 v16.16b, v17.8h\n"
2406         RUY_MAKE_ZERO(v24)
2407 
2408         // Load the clamp_min, clamp_max bounds
2409         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
2410         RUY_MAKE_ZERO(v25)
2411         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
2412         RUY_MAKE_ZERO(v26)
2413         "dup v14.16b, w2\n"  // clamp_min
2414         RUY_MAKE_ZERO(v27)
2415         "dup v15.16b, w3\n"  // clamp_max
2416         RUY_MAKE_ZERO(v28)
2417 
2418         // Apply the clamp_min bound
2419         "umax v16.16b, v16.16b, v14.16b\n"
2420         RUY_MAKE_ZERO(v29)
2421         // Apply the clamp_max bound
2422         "umin v16.16b, v16.16b, v15.16b\n"
2423         RUY_MAKE_ZERO(v30)
2424 
2425         // Compute how much of the 4x4 block of destination 8bit values that
2426         // we have computed, fit in the destination matrix. Typically, all of
2427         // it fits, but when the destination matrix shape is not a multiple
2428         // of 4x4, there are some 4x4 blocks along the boundaries that do
2429         // not fit entirely.
2430         "sub w1, %w[dst_rows], %w[row]\n"
2431         RUY_MAKE_ZERO(v31)
2432         "sub w2, %w[dst_cols], %w[col]\n"
2433         "mov w3, #4\n"
2434         "cmp w1, #4\n"
2435         // Compute w1 = how many rows of the 4x4 block fit
2436         "csel w1, w1, w3, le\n"
2437         "cmp w2, #4\n"
2438         // Compute w2 = how many cols of the 4x4 block fit
2439         "csel w2, w2, w3, le\n"
2440 
2441        // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
2442         "cmp w1, w3\n"
2443         "ccmp w2, w3, 0, eq\n"
2444         "mov x4, %[dst_ptr]\n"
2445         // Yes, all of the 4x4 block fits, go to fast path.
2446         "beq 30f\n"
2447         // Not all of the 4x4 block fits.
2448         // Store to dst_tmp_buf
2449         "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
2450         // Slow loop copying from dst_tmp_buf to dst.
2451         "mov x3, %[dst_tmp_buf]\n"
2452         "mov w6, #0\n"
2453         "50:\n"
2454         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2455         "mov w5, #0\n"
2456         "51:\n"
2457         "ldrb w7, [x3, w5, uxtw]\n"
2458         "strb w7, [x4, w5, uxtw]\n"
2459         "add w5, w5, #1\n"
2460         "cmp w5, w1\n"
2461         "blt 51b\n"
2462         "add w6, w6, #1\n"
2463         "add x3, x3, #4\n"
2464         "add x4, x4, x11\n"
2465         "cmp w6, w2\n"
2466         "blt 50b\n"
2467         "b 31f\n"
2468         "30:\n"
2469         // Yes, all of the 4x4 block fits.
2470         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2471         "mov x3, x4\n"
2472         "st1 {v16.b}[0], [x3], #1\n"
2473         "add x4, x4, x11\n"
2474         "st1 {v16.b}[1], [x3], #1\n"
2475         "st1 {v16.b}[2], [x3], #1\n"
2476         "st1 {v16.b}[3], [x3], #1\n"
2477         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2478         "mov x3, x4\n"
2479         "st1 {v16.b}[4], [x3], #1\n"
2480         "add x4, x4, x11\n"
2481         "st1 {v16.b}[5], [x3], #1\n"
2482         "st1 {v16.b}[6], [x3], #1\n"
2483         "st1 {v16.b}[7], [x3], #1\n"
2484         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2485         "mov x3, x4\n"
2486         "st1 {v16.b}[8], [x3], #1\n"
2487         "add x4, x4, x11\n"
2488         "st1 {v16.b}[9], [x3], #1\n"
2489         "st1 {v16.b}[10], [x3], #1\n"
2490         "st1 {v16.b}[11], [x3], #1\n"
2491         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2492         "mov x3, x4\n"
2493         "st1 {v16.b}[12], [x3], #1\n"
2494         "add x4, x4, x11\n"
2495         "st1 {v16.b}[13], [x3], #1\n"
2496         "st1 {v16.b}[14], [x3], #1\n"
2497         "st1 {v16.b}[15], [x3], #1\n"
2498         "31:\n"
2499 
2500         "add %[dst_ptr], %[dst_ptr], #4\n"
2501 
2502         RUY_MAKE_ZERO(v16)
2503         RUY_MAKE_ZERO(v17)
2504 
2505         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
2506 
2507         RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
2508 
2509         "ins v4.d[1], x1\n"
2510         "sqxtn v16.4h, v16.4s\n"
2511         "ins v5.d[1], x2\n"
2512         "sqxtn2 v16.8h, v17.4s\n"
2513         "ins v6.d[1], x3\n"
2514         "sqxtn v17.4h, v18.4s\n"
2515         "ins v7.d[1], x4\n"
2516         RUY_MAKE_ZERO(v18)
2517         "sqxtn2 v17.8h, v19.4s\n"
2518 
2519         // At this point, v18 -- v31 aren't used anymore for the current block,
2520         // so we can start clearing these accumulators for the next block
2521         // (next iteration of the main loop).
2522         RUY_MAKE_ZERO(v19)
2523 
2524         // Add the destination zero point
2525         "add %[lhs_ptr], %[lhs_ptr], #64\n"
2526         "dup v14.8h, v13.h[4]\n"
2527         RUY_MAKE_ZERO(v20)
2528         "add %[rhs_ptr], %[rhs_ptr], #64\n"
2529         "sqadd v16.8h, v16.8h, v14.8h\n"
2530         RUY_MAKE_ZERO(v21)
2531         "sqadd v17.8h, v17.8h, v14.8h\n"
2532         RUY_MAKE_ZERO(v22)
2533 
2534         // Cast-and-saturate from int16 to int8
2535         "sqxtn v16.8b, v16.8h\n"
2536         RUY_MAKE_ZERO(v23)
2537         "sqxtn2 v16.16b, v17.8h\n"
2538         RUY_MAKE_ZERO(v24)
2539 
2540         // Load the clamp_min, clamp_max bounds
2541         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
2542         RUY_MAKE_ZERO(v25)
2543         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
2544         RUY_MAKE_ZERO(v26)
2545         "dup v14.16b, w2\n"  // clamp_min
2546         RUY_MAKE_ZERO(v27)
2547         "dup v15.16b, w3\n"  // clamp_max
2548         RUY_MAKE_ZERO(v28)
2549 
2550         // Apply the clamp_min bound
2551         "smax v16.16b, v16.16b, v14.16b\n"
2552         RUY_MAKE_ZERO(v29)
2553         // Apply the clamp_max bound
2554         "smin v16.16b, v16.16b, v15.16b\n"
2555         RUY_MAKE_ZERO(v30)
2556 
2557         // Compute how much of the 4x4 block of destination 8bit values that
2558         // we have computed, fit in the destination matrix. Typically, all of
2559         // it fits, but when the destination matrix shape is not a multiple
2560         // of 4x4, there are some 4x4 blocks along the boundaries that do
2561         // not fit entirely.
2562         "sub w1, %w[dst_rows], %w[row]\n"
2563         RUY_MAKE_ZERO(v31)
2564         "sub w2, %w[dst_cols], %w[col]\n"
2565         "mov w3, #4\n"
2566         "cmp w1, #4\n"
2567         // Compute w1 = how many rows of the 4x4 block fit
2568         "csel w1, w1, w3, le\n"
2569         "cmp w2, #4\n"
2570         // Compute w2 = how many cols of the 4x4 block fit
2571         "csel w2, w2, w3, le\n"
2572 
2573       // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
2574         "cmp w1, w3\n"
2575         "ccmp w2, w3, 0, eq\n"
2576         "mov x4, %[dst_ptr]\n"
2577         // Yes, all of the 4x4 block fits, go to fast path.
2578         "beq 30f\n"
2579         // Not all of the 4x4 block fits.
2580         // Store to dst_tmp_buf
2581         "st1 {v16.16b}, [%[dst_tmp_buf]]\n"
2582         // Slow loop copying from dst_tmp_buf to dst.
2583         "mov x3, %[dst_tmp_buf]\n"
2584         "mov w6, #0\n"
2585         "50:\n"
2586         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2587         "mov w5, #0\n"
2588         "51:\n"
2589         "ldrb w7, [x3, w5, uxtw]\n"
2590         "strb w7, [x4, w5, uxtw]\n"
2591         "add w5, w5, #1\n"
2592         "cmp w5, w1\n"
2593         "blt 51b\n"
2594         "add w6, w6, #1\n"
2595         "add x3, x3, #4\n"
2596         "add x4, x4, x11\n"
2597         "cmp w6, w2\n"
2598         "blt 50b\n"
2599         "b 31f\n"
2600         "30:\n"
2601         // Yes, all of the 4x4 block fits.
2602         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2603         "mov x3, x4\n"
2604         "st1 {v16.b}[0], [x3], #1\n"
2605         "add x4, x4, x11\n"
2606         "st1 {v16.b}[1], [x3], #1\n"
2607         "st1 {v16.b}[2], [x3], #1\n"
2608         "st1 {v16.b}[3], [x3], #1\n"
2609         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2610         "mov x3, x4\n"
2611         "st1 {v16.b}[4], [x3], #1\n"
2612         "add x4, x4, x11\n"
2613         "st1 {v16.b}[5], [x3], #1\n"
2614         "st1 {v16.b}[6], [x3], #1\n"
2615         "st1 {v16.b}[7], [x3], #1\n"
2616         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2617         "mov x3, x4\n"
2618         "st1 {v16.b}[8], [x3], #1\n"
2619         "add x4, x4, x11\n"
2620         "st1 {v16.b}[9], [x3], #1\n"
2621         "st1 {v16.b}[10], [x3], #1\n"
2622         "st1 {v16.b}[11], [x3], #1\n"
2623         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2624         "mov x3, x4\n"
2625         "st1 {v16.b}[12], [x3], #1\n"
2626         "add x4, x4, x11\n"
2627         "st1 {v16.b}[13], [x3], #1\n"
2628         "st1 {v16.b}[14], [x3], #1\n"
2629         "st1 {v16.b}[15], [x3], #1\n"
2630         "31:\n"
2631 
2632         "add %[dst_ptr], %[dst_ptr], #4\n"
2633 
2634         RUY_MAKE_ZERO(v16)
2635         RUY_MAKE_ZERO(v17)
2636 
2637         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
2638 
2639         RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
2640 
2641         // Add the destination zero point
2642         "dup v14.4h, v13.h[4]\n"
2643         "saddw v16.4s, v16.4s, v14.4h\n"
2644         "saddw v17.4s, v17.4s, v14.4h\n"
2645         "saddw v18.4s, v18.4s, v14.4h\n"
2646         "saddw v19.4s, v19.4s, v14.4h\n"
2647 
2648         // Cast-and-saturate from int32 to int16
2649         "ins v4.d[1], x1\n"
2650         "sqxtn v16.4h, v16.4s\n"
2651         "ins v5.d[1], x2\n"
2652         "sqxtn2 v16.8h, v17.4s\n"
2653         "ins v6.d[1], x3\n"
2654         "sqxtn v17.4h, v18.4s\n"
2655         "ins v7.d[1], x4\n"
2656         RUY_MAKE_ZERO(v18)
2657         "sqxtn2 v17.8h, v19.4s\n"
2658 
2659         // At this point, v18 -- v31 aren't used anymore for the current block,
2660         // so we can start clearing these accumulators for the next block
2661         // (next iteration of the main loop).
2662         RUY_MAKE_ZERO(v19)
2663 
2664         "add %[lhs_ptr], %[lhs_ptr], #64\n"
2665         RUY_MAKE_ZERO(v20)
2666         "add %[rhs_ptr], %[rhs_ptr], #64\n"
2667         RUY_MAKE_ZERO(v21)
2668         RUY_MAKE_ZERO(v22)
2669 
2670         RUY_MAKE_ZERO(v23)
2671         RUY_MAKE_ZERO(v24)
2672 
2673         // Load the clamp_min, clamp_max bounds
2674         "ldrh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
2675         RUY_MAKE_ZERO(v25)
2676         "ldrh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
2677         RUY_MAKE_ZERO(v26)
2678         "dup v14.8h, w2\n"  // clamp_min
2679         RUY_MAKE_ZERO(v27)
2680         "dup v15.8h, w3\n"  // clamp_max
2681         RUY_MAKE_ZERO(v28)
2682 
2683         // Apply the clamp_min bound
2684         "smax v16.8h, v16.8h, v14.8h\n"
2685         "smax v17.8h, v17.8h, v14.8h\n"
2686         RUY_MAKE_ZERO(v29)
2687         // Apply the clamp_max bound
2688         "smin v16.8h, v16.8h, v15.8h\n"
2689         "smin v17.8h, v17.8h, v15.8h\n"
2690         RUY_MAKE_ZERO(v30)
2691 
2692         // Compute how much of the 4x4 block of destination 16bit values that
2693         // we have computed, fit in the destination matrix. Typically, all of
2694         // it fits, but when the destination matrix shape is not a multiple
2695         // of 4x4, there are some 4x4 blocks along the boundaries that do
2696         // not fit entirely.
2697         "sub w1, %w[dst_rows], %w[row]\n"
2698         RUY_MAKE_ZERO(v31)
2699         "sub w2, %w[dst_cols], %w[col]\n"
2700         "mov w3, #4\n"
2701         "cmp w1, #4\n"
2702         // Compute w1 = how many rows of the 4x4 block fit
2703         "csel w1, w1, w3, le\n"
2704         "cmp w2, #4\n"
2705         // Compute w2 = how many cols of the 4x4 block fit
2706         "csel w2, w2, w3, le\n"
2707 
2708        // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
2709         "cmp w1, w3\n"
2710         "ccmp w2, w3, 0, eq\n"
2711         "mov x4, %[dst_ptr]\n"
2712         // Yes, all of the 4x4 block fits, go to fast path.
2713         "beq 30f\n"
2714         // Not all of the 4x4 block fits.
2715         // Store to dst_tmp_buf
2716         "str q16, [%[dst_tmp_buf], #0]\n"
2717         "str q17, [%[dst_tmp_buf], #16]\n"
2718         // Slow loop copying from dst_tmp_buf to dst.
2719         "mov x3, %[dst_tmp_buf]\n"
2720         "mov w6, #0\n"
2721         "50:\n"
2722         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2723         "mov w5, #0\n"
2724         "51:\n"
2725         "ldrh w7, [x3, x5, lsl #1]\n"
2726         "strh w7, [x4, x5, lsl #1]\n"
2727         "add w5, w5, #1\n"
2728         "cmp w5, w1\n"
2729         "blt 51b\n"
2730         "add w6, w6, #1\n"
2731         "add x3, x3, #8\n"
2732         "add x4, x4, x11\n"
2733         "cmp w6, w2\n"
2734         "blt 50b\n"
2735         "b 31f\n"
2736         "30:\n"
2737         // Yes, all of the 4x4 block fits.
2738         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2739         "mov x3, x4\n"
2740         "st1 {v16.h}[0], [x3], #2\n"
2741         "add x4, x4, x11\n"
2742         "st1 {v16.h}[1], [x3], #2\n"
2743         "st1 {v16.h}[2], [x3], #2\n"
2744         "st1 {v16.h}[3], [x3], #2\n"
2745         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2746         "mov x3, x4\n"
2747         "st1 {v16.h}[4], [x3], #2\n"
2748         "add x4, x4, x11\n"
2749         "st1 {v16.h}[5], [x3], #2\n"
2750         "st1 {v16.h}[6], [x3], #2\n"
2751         "st1 {v16.h}[7], [x3], #2\n"
2752         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2753         "mov x3, x4\n"
2754         "st1 {v17.h}[0], [x3], #2\n"
2755         "add x4, x4, x11\n"
2756         "st1 {v17.h}[1], [x3], #2\n"
2757         "st1 {v17.h}[2], [x3], #2\n"
2758         "st1 {v17.h}[3], [x3], #2\n"
2759         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2760         "mov x3, x4\n"
2761         "st1 {v17.h}[4], [x3], #2\n"
2762         "add x4, x4, x11\n"
2763         "st1 {v17.h}[5], [x3], #2\n"
2764         "st1 {v17.h}[6], [x3], #2\n"
2765         "st1 {v17.h}[7], [x3], #2\n"
2766         "31:\n"
2767 
2768         "add %[dst_ptr], %[dst_ptr], #8\n"
2769 
2770         RUY_MAKE_ZERO(v16)
2771         RUY_MAKE_ZERO(v17)
2772 
2773         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
2774 
2775         RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
2776 
2777         "ldr x1, [%[lhs_ptr], #8]\n"
2778         "ldr x2, [%[lhs_ptr], #24]\n"
2779         "ldr x3, [%[lhs_ptr], #40]\n"
2780         "ldr x4, [%[lhs_ptr], #56]\n"
2781 
2782         "ins v0.d[1], x1\n"
2783         "ldr x1, [%[rhs_ptr], #8]\n"
2784         "ins v1.d[1], x2\n"
2785         "ldr x2, [%[rhs_ptr], #24]\n"
2786         "ins v2.d[1], x3\n"
2787         "ldr x3, [%[rhs_ptr], #40]\n"
2788         "ins v3.d[1], x4\n"
2789         "ldr x4, [%[rhs_ptr], #56]\n"
2790         "ins v4.d[1], x1\n"
2791         "ins v5.d[1], x2\n"
2792         "ins v6.d[1], x3\n"
2793         "ins v7.d[1], x4\n"
2794 
2795         // Since the store type is the same as the accum type, no need for
2796         // downcast. There's also no need for clamp by min/max.
2797 
2798         // At this point, v20 -- v31 aren't used anymore for the current block,
2799         // so we can start clearing these accumulators for the next block
2800         // (next iteration of the main loop).
2801 
2802         RUY_MAKE_ZERO(v20)
2803         "add %[lhs_ptr], %[lhs_ptr], #64\n"
2804         RUY_MAKE_ZERO(v21)
2805         "add %[rhs_ptr], %[rhs_ptr], #64\n"
2806         RUY_MAKE_ZERO(v22)
2807 
2808         RUY_MAKE_ZERO(v23)
2809         RUY_MAKE_ZERO(v24)
2810         RUY_MAKE_ZERO(v25)
2811         RUY_MAKE_ZERO(v26)
2812         RUY_MAKE_ZERO(v27)
2813         RUY_MAKE_ZERO(v28)
2814         RUY_MAKE_ZERO(v29)
2815         RUY_MAKE_ZERO(v30)
2816 
2817         // Compute how much of the 4x4 block of destination 32bit values that
2818         // we have computed, fit in the destination matrix. Typically, all of
2819         // it fits, but when the destination matrix shape is not a multiple
2820         // of 4x4, there are some 4x4 blocks along the boundaries that do
2821         // not fit entirely.
2822         "sub w1, %w[dst_rows], %w[row]\n"
2823         RUY_MAKE_ZERO(v31)
2824         "sub w2, %w[dst_cols], %w[col]\n"
2825         "mov w3, #4\n"
2826         "cmp w1, #4\n"
2827         // Compute w1 = how many rows of the 4x4 block fit
2828         "csel w1, w1, w3, le\n"
2829         "cmp w2, #4\n"
2830         // Compute w2 = how many cols of the 4x4 block fit
2831         "csel w2, w2, w3, le\n"
2832 
2833         // Test if w1==4 && w2 == 4, i.e. if all of the 4x4 block fits.
2834         "cmp w1, w3\n"
2835         "ccmp w2, w3, 0, eq\n"
2836         "mov x4, %[dst_ptr]\n"
2837         // Yes, all of the 4x4 block fits, go to fast path.
2838         "beq 30f\n"
2839         // Not all of the 4x4 block fits.
2840         // Store to dst_tmp_buf
2841         "str q16, [%[dst_tmp_buf], #0]\n"
2842         "str q17, [%[dst_tmp_buf], #16]\n"
2843         "str q18, [%[dst_tmp_buf], #32]\n"
2844         "str q19, [%[dst_tmp_buf], #48]\n"
2845         // Slow loop copying from dst_tmp_buf to dst.
2846         "mov x3, %[dst_tmp_buf]\n"
2847         "mov w6, #0\n"
2848         "50:\n"
2849         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2850         "mov w5, #0\n"
2851         "51:\n"
2852         "ldr w7, [x3, x5, lsl #2]\n"
2853         "str w7, [x4, x5, lsl #2]\n"
2854         "add w5, w5, #1\n"
2855         "cmp w5, w1\n"
2856         "blt 51b\n"
2857         "add w6, w6, #1\n"
2858         "add x3, x3, #16\n"
2859         "add x4, x4, x11\n"
2860         "cmp w6, w2\n"
2861         "blt 50b\n"
2862         "b 31f\n"
2863         "30:\n"
2864         // Yes, all of the 4x4 block fits.
2865         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2866         "mov x3, x4\n"
2867         "st1 {v16.s}[0], [x3], #4\n"
2868         "add x4, x4, x11\n"
2869         "st1 {v16.s}[1], [x3], #4\n"
2870         "st1 {v16.s}[2], [x3], #4\n"
2871         "st1 {v16.s}[3], [x3], #4\n"
2872         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2873         "mov x3, x4\n"
2874         "st1 {v17.s}[0], [x3], #4\n"
2875         "add x4, x4, x11\n"
2876         "st1 {v17.s}[1], [x3], #4\n"
2877         "st1 {v17.s}[2], [x3], #4\n"
2878         "st1 {v17.s}[3], [x3], #4\n"
2879         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2880         "mov x3, x4\n"
2881         "st1 {v18.s}[0], [x3], #4\n"
2882         "add x4, x4, x11\n"
2883         "st1 {v18.s}[1], [x3], #4\n"
2884         "st1 {v18.s}[2], [x3], #4\n"
2885         "st1 {v18.s}[3], [x3], #4\n"
2886         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
2887         "mov x3, x4\n"
2888         "st1 {v19.s}[0], [x3], #4\n"
2889         "add x4, x4, x11\n"
2890         "st1 {v19.s}[1], [x3], #4\n"
2891         "st1 {v19.s}[2], [x3], #4\n"
2892         "st1 {v19.s}[3], [x3], #4\n"
2893         "31:\n"
2894 
2895         "add %[dst_ptr], %[dst_ptr], #16\n"
2896 
2897         RUY_MAKE_ZERO(v16)
2898         RUY_MAKE_ZERO(v17)
2899         RUY_MAKE_ZERO(v18)
2900         RUY_MAKE_ZERO(v19)
2901 
2902         RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
2903 
2904         // For the next block: perform the first few multiply-adds on the data
2905         // that we have already loaded.
2906         "smull    v8.8h,  v0.8b,  v4.8b\n"
2907         "smull    v9.8h,  v1.8b,  v4.8b\n"
2908         "smull    v10.8h,  v2.8b,  v4.8b\n"
2909         // Reload some params --- we had used x5 -- x7 for a few other things
2910         // since the last time we had loaded them.
2911         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
2912         "smull    v11.8h,  v3.8b,  v4.8b\n"
2913         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
2914         "smull    v12.8h,  v0.8b,  v5.8b\n"
2915         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
2916         "smull    v13.8h,  v1.8b,  v5.8b\n"
2917         "smull    v14.8h,  v2.8b,  v5.8b\n"
2918         "smull    v15.8h,  v3.8b,  v5.8b\n"
2919         // Move to the next block of the destination matrix, for the next iter
2920         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
2921         // been updated earlier.
2922         // Have we reached the end row?
2923         "cmp %w[row], w7\n"
2924         "smlal2   v8.8h,  v0.16b,  v4.16b\n"
2925         "smlal2   v9.8h,  v1.16b,  v4.16b\n"
2926         "smlal2   v10.8h,  v2.16b,  v4.16b\n"
2927         "smlal2   v11.8h,  v3.16b,  v4.16b\n"
2928         "smlal2   v12.8h,  v0.16b,  v5.16b\n"
2929         "smlal2   v13.8h,  v1.16b,  v5.16b\n"
2930         "smlal2   v14.8h,  v2.16b,  v5.16b\n"
2931         "smlal2   v15.8h,  v3.16b,  v5.16b\n"
2932 
2933 
2934         "beq 20f\n"  // yes, end row.
2935         // Not end row. Move to the next row.
2936         "add %w[row], %w[row], #4\n"
2937         "b 21f\n"
2938         "20:\n"
2939         // Was already at end row.
2940         "mov %w[row], w6\n"  // Move back to first row.
2941         "add %w[col], %w[col], #4\n"  // Move to the next column.
2942         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #2\n"
2943         "mov %[dst_ptr], %[dst_col_ptr]\n"
2944         "21:\n"
2945 
2946         // Main loop exit condition: have we hit the end column?
2947         "cmp %w[col], w8\n"
2948 
2949         // w1 is the number of levels of depth that we have already loaded
2950         // LHS and RHS data for. Corresponding to the initial ld1 instructions
2951         // above, this is currently 4.
2952         "mov w1, #16\n"
2953 
2954         "ble 1b\n"
2955 
2956         // clang-format on
2957 
2958         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
2959           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
2960           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
2961         : [ params ] "r"(&params),[dst_rows] "r"(params.dst_rows),
2962           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
2963           [dst_type_id] "r"(params.dst_type_id)
2964         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
2965           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
2966           "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
2967           "v26", "v27", "v28", "v29", "v30", "v31");
2968 }
2969 
2970 // Kernel taking advantage of the optional dotprod instruction.
2971 // This is very similar to (and directly inspired by) this gemmlowp kernel
2972 // which was contributed by David Mansell at ARM:
2973 // NEON_64bit_GEMM_Uint8Operands_Uint32Accumulators_dotproduct
2974 // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L3391
2975 //
2976 // Besides the ruy-ification, the main difference here is that we use a 8x8
2977 // instead of 12x8 width, so as to stick to power-of-two widths. This slightly
2978 // narrower kernel layout is still wide enough to achieve high performance
2979 // although we haven't actually performed a real comparison to know exactly
2980 // how this compares to ARM's aforementioned kernel.
2981 //
2982 // Relevant target CPUs for this kernel include the ARM Cortex-A76,
2983 // which is 64-bit, out-of-order and has dotprod support.
Kernel8bitNeonDotprod(const KernelParams8bit<8,8> & params)2984 void Kernel8bitNeonDotprod(const KernelParams8bit<8, 8>& params) {
2985   profiler::ScopeLabel label("Kernel (kNeonDotprod)");
2986 
2987   CheckOffsetsInKernelParams8bit(params);
2988 
2989   const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
2990   const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
2991   const std::int8_t* lhs_ptr = lhs_col_ptr;
2992   const std::int8_t* rhs_ptr = rhs_col_ptr;
2993   void* dst_col_ptr = params.dst_base_ptr;
2994   void* dst_ptr = dst_col_ptr;
2995   int row = params.start_row;
2996   int col = params.start_col;
2997 
2998   // The asm kernel below has the following NEON register allocation:
2999   //
3000   // v16 -- v31 are int32 accumulators.
3001   // During accumulation, v0 -- v15 are used to load int8 data from LHS and
3002   // RHS. At least v0 and v1 are used to load a 8x4 block of LHS, and v2 and
3003   // v3 are used to load a 4x8 block of RHS, like this:
3004   //
3005   //                                      int8 RHS 4x8 block
3006   //                           /-----------------------------------------|
3007   //                           |v2.b[0] ... v2.b[12] v3.b[0] ... v3.b[12]|
3008   //                           |  ...                              ...   |
3009   //                           |v2.b[3] ... v2.b[15] v3.b[3] ... v3.b[15]|
3010   //                           \-----------------------------------------/
3011   //    int8 LHS 8x4 block
3012   //  /---------------------\  /-----------------------------------------|
3013   //  |v0.b[0]  ... v0.b[3] |  |v16.s[0]           ...           v30.s[0]|
3014   //  |  ...          ...   |  |  ...                              ...   |
3015   //  |v0.b[12] ... v0.b[15]|  |v16.s[3]           ...           v30.s[3]|
3016   //  |v1.b[0]  ... v1.b[3] |  |v17.s[0]           ...           v31.s[0]|
3017   //  |  ...         ...    |  |  ...                              ...   |
3018   //  |v1.b[12] ... v1.b[15]|  |v17.s[3]           ...           v31.s[3]|
3019   //  \---------------------/  \-----------------------------------------/
3020   //                                  int32 accumulators 8x8 block
3021   //
3022   // In the RUY_OPT_MAX_STREAMING part of the kernel, this elementary step
3023   // is repeated 4 times, using 4x more registers for LHS and RHS, so that
3024   // is where instead of using v0 -- v3 for LHS and RHS, we use v0 -- v15.
3025   //
3026   // Outside of the RUY_OPT_MAX_STREAMING part of the kernel, v4 -- v7 are
3027   // unused, and v8 -- v15 are used for loading parameters used for the
3028   // post-accumulation part of the kernel.
3029   asm volatile(
3030 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
3031 
3032         // clang-format off
3033 
3034         // Load some parameters into registers.
3035         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
3036         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
3037         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
3038         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
3039         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
3040         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
3041         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
3042         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
3043 
3044         // Load the first 32 bytes of LHS and RHS data.
3045         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
3046         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
3047         "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
3048         "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
3049 
3050         // Clear accumulators.
3051         RUY_MAKE_ZERO(v16)
3052         RUY_MAKE_ZERO(v17)
3053         RUY_MAKE_ZERO(v18)
3054         RUY_MAKE_ZERO(v19)
3055         RUY_MAKE_ZERO(v20)
3056         RUY_MAKE_ZERO(v21)
3057         RUY_MAKE_ZERO(v22)
3058         RUY_MAKE_ZERO(v23)
3059         RUY_MAKE_ZERO(v24)
3060         RUY_MAKE_ZERO(v25)
3061         RUY_MAKE_ZERO(v26)
3062         RUY_MAKE_ZERO(v27)
3063         RUY_MAKE_ZERO(v28)
3064         RUY_MAKE_ZERO(v29)
3065         RUY_MAKE_ZERO(v30)
3066         RUY_MAKE_ZERO(v31)
3067 
3068         // w1 is the number of levels of depth that we have already loaded
3069         // LHS and RHS data for. Corresponding to the initial ld1 instructions
3070         // above, this is currently 4.
3071         "mov w1, #4\n"
3072 
3073         // Perform the first few multiply-adds on the data that we have already
3074         // loaded.
3075         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
3076         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
3077         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
3078         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
3079 
3080         // Main loop of the whole GEMM, over rows and columns of the
3081         // destination matrix.
3082         "1:\n"
3083 
3084         // Optional, maximally-streaming, partial-unrolling (4x unrolled)
3085         // optimization of the kernel inner loop (over depth). For more
3086         // comments, see the non-unrolled loop below after the #endif.
3087 #if RUY_OPT(MAX_STREAMING)
3088         "cmp w12, #32\n"
3089         "blt 78f\n"
3090 
3091         "ld1 {v4.16b}, [%[lhs_ptr]], #16\n"
3092         "ld1 {v5.16b}, [%[lhs_ptr]], #16\n"
3093         "ld1 {v6.16b}, [%[rhs_ptr]], #16\n"
3094         "ld1 {v7.16b}, [%[rhs_ptr]], #16\n"
3095         "ld1 {v8.16b}, [%[lhs_ptr]], #16\n"
3096         "ld1 {v9.16b}, [%[lhs_ptr]], #16\n"
3097         "ld1 {v10.16b}, [%[rhs_ptr]], #16\n"
3098         "ld1 {v11.16b}, [%[rhs_ptr]], #16\n"
3099         "ld1 {v12.16b}, [%[lhs_ptr]], #16\n"
3100         "ld1 {v13.16b}, [%[lhs_ptr]], #16\n"
3101         "ld1 {v14.16b}, [%[rhs_ptr]], #16\n"
3102         "ld1 {v15.16b}, [%[rhs_ptr]], #16\n"
3103         "mov w1, #16\n"
3104 
3105         "and w3, w12, #-16\n"
3106         "81:\n"
3107         "add w1, w1, #16\n"
3108 
3109         ".word 0x4f83e018  // sdot v24.4s, v0.16b, v3.4b[0]\n"
3110         ".word 0x4fa3e01a  // sdot v26.4s, v0.16b, v3.4b[1]\n"
3111         ".word 0x4f83e81c  // sdot v28.4s, v0.16b, v3.4b[2]\n"
3112         ".word 0x4fa3e81e  // sdot v30.4s, v0.16b, v3.4b[3]\n"
3113         "ldr q0, [%[lhs_ptr], #0]\n"
3114         ".word 0x4f82e031  // sdot v17.4s, v1.16b, v2.4b[0]\n"
3115         ".word 0x4fa2e033  // sdot v19.4s, v1.16b, v2.4b[1]\n"
3116         ".word 0x4f82e835  // sdot v21.4s, v1.16b, v2.4b[2]\n"
3117         ".word 0x4fa2e837  // sdot v23.4s, v1.16b, v2.4b[3]\n"
3118         "ldr q2, [%[rhs_ptr], #0]\n"
3119         ".word 0x4f83e039  // sdot v25.4s, v1.16b, v3.4b[0]\n"
3120         ".word 0x4fa3e03b  // sdot v27.4s, v1.16b, v3.4b[1]\n"
3121         ".word 0x4f83e83d  // sdot v29.4s, v1.16b, v3.4b[2]\n"
3122         ".word 0x4fa3e83f  // sdot v31.4s, v1.16b, v3.4b[3]\n"
3123         "ldr q1, [%[lhs_ptr], #16]\n"
3124 
3125         ".word 0x4f87e098  // sdot v24.4s, v4.16b, v7.4b[0]\n"
3126         ".word 0x4fa7e09a  // sdot v26.4s, v4.16b, v7.4b[1]\n"
3127         "ldr q3, [%[rhs_ptr], #16]\n"
3128         ".word 0x4f87e89c  // sdot v28.4s, v4.16b, v7.4b[2]\n"
3129         ".word 0x4fa7e89e  // sdot v30.4s, v4.16b, v7.4b[3]\n"
3130         ".word 0x4f86e0b1  // sdot v17.4s, v5.16b, v6.4b[0]\n"
3131         ".word 0x4fa6e0b3  // sdot v19.4s, v5.16b, v6.4b[1]\n"
3132         ".word 0x4f86e8b5  // sdot v21.4s, v5.16b, v6.4b[2]\n"
3133         ".word 0x4fa6e8b7  // sdot v23.4s, v5.16b, v6.4b[3]\n"
3134         ".word 0x4f87e0b9  // sdot v25.4s, v5.16b, v7.4b[0]\n"
3135         ".word 0x4fa7e0bb  // sdot v27.4s, v5.16b, v7.4b[1]\n"
3136         ".word 0x4f87e8bd  // sdot v29.4s, v5.16b, v7.4b[2]\n"
3137         ".word 0x4fa7e8bf  // sdot v31.4s, v5.16b, v7.4b[3]\n"
3138         "ldr q5, [%[lhs_ptr], #48]\n"
3139         ".word 0x4f86e090  // sdot v16.4s, v4.16b, v6.4b[0]\n"
3140         ".word 0x4fa6e092  // sdot v18.4s, v4.16b, v6.4b[1]\n"
3141         "ldr q7, [%[rhs_ptr], #48]\n"
3142         ".word 0x4f86e894  // sdot v20.4s, v4.16b, v6.4b[2]\n"
3143         ".word 0x4fa6e896  // sdot v22.4s, v4.16b, v6.4b[3]\n"
3144         "ldr q4, [%[lhs_ptr], #32]\n"
3145 
3146         ".word 0x4f8be118  // sdot v24.4s, v8.16b, v11.4b[0]\n"
3147         ".word 0x4fabe11a  // sdot v26.4s, v8.16b, v11.4b[1]\n"
3148         "ldr q6, [%[rhs_ptr], #32]\n"
3149         ".word 0x4f8be91c  // sdot v28.4s, v8.16b, v11.4b[2]\n"
3150         ".word 0x4fabe91e  // sdot v30.4s, v8.16b, v11.4b[3]\n"
3151         ".word 0x4f8ae131  // sdot v17.4s, v9.16b, v10.4b[0]\n"
3152         ".word 0x4faae133  // sdot v19.4s, v9.16b, v10.4b[1]\n"
3153         ".word 0x4f8ae935  // sdot v21.4s, v9.16b, v10.4b[2]\n"
3154         ".word 0x4faae937  // sdot v23.4s, v9.16b, v10.4b[3]\n"
3155         ".word 0x4f8be139  // sdot v25.4s, v9.16b, v11.4b[0]\n"
3156         ".word 0x4fabe13b  // sdot v27.4s, v9.16b, v11.4b[1]\n"
3157         ".word 0x4f8be93d  // sdot v29.4s, v9.16b, v11.4b[2]\n"
3158         ".word 0x4fabe93f  // sdot v31.4s, v9.16b, v11.4b[3]\n"
3159         "ldr q9, [%[lhs_ptr], #80]\n"
3160         ".word 0x4f8ae110  // sdot v16.4s, v8.16b, v10.4b[0]\n"
3161         ".word 0x4faae112  // sdot v18.4s, v8.16b, v10.4b[1]\n"
3162         "ldr q11, [%[rhs_ptr], #80]\n"
3163         ".word 0x4f8ae914  // sdot v20.4s, v8.16b, v10.4b[2]\n"
3164         ".word 0x4faae916  // sdot v22.4s, v8.16b, v10.4b[3]\n"
3165         "ldr q8, [%[lhs_ptr], #64]\n"
3166 
3167         ".word 0x4f8fe198  // sdot v24.4s, v12.16b, v15.4b[0]\n"
3168         ".word 0x4fafe19a  // sdot v26.4s, v12.16b, v15.4b[1]\n"
3169         "ldr q10, [%[rhs_ptr], #64]\n"
3170         ".word 0x4f8fe99c  // sdot v28.4s, v12.16b, v15.4b[2]\n"
3171         ".word 0x4fafe99e  // sdot v30.4s, v12.16b, v15.4b[3]\n"
3172         "add %[lhs_ptr], %[lhs_ptr], #128\n"
3173         ".word 0x4f8ee1b1  // sdot v17.4s, v13.16b, v14.4b[0]\n"
3174         ".word 0x4faee1b3  // sdot v19.4s, v13.16b, v14.4b[1]\n"
3175         "add %[rhs_ptr], %[rhs_ptr], #128\n"
3176         ".word 0x4f8ee9b5  // sdot v21.4s, v13.16b, v14.4b[2]\n"
3177         ".word 0x4faee9b7  // sdot v23.4s, v13.16b, v14.4b[3]\n"
3178         ".word 0x4f8fe1b9  // sdot v25.4s, v13.16b, v15.4b[0]\n"
3179         ".word 0x4fafe1bb  // sdot v27.4s, v13.16b, v15.4b[1]\n"
3180         "cmp w1, w3\n"
3181         ".word 0x4f8fe9bd  // sdot v29.4s, v13.16b, v15.4b[2]\n"
3182         ".word 0x4fafe9bf  // sdot v31.4s, v13.16b, v15.4b[3]\n"
3183         "ldr q13, [%[lhs_ptr], #-16]\n"
3184         ".word 0x4f8ee190  // sdot v16.4s, v12.16b, v14.4b[0]\n"
3185         ".word 0x4faee192  // sdot v18.4s, v12.16b, v14.4b[1]\n"
3186         "ldr q15, [%[rhs_ptr], #-16]\n"
3187         ".word 0x4f8ee994  // sdot v20.4s, v12.16b, v14.4b[2]\n"
3188         ".word 0x4faee996  // sdot v22.4s, v12.16b, v14.4b[3]\n"
3189         "ldr q12, [%[lhs_ptr], #-32]\n"
3190 
3191         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
3192         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
3193         "ldr q14, [%[rhs_ptr], #-32]\n"
3194         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
3195         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
3196 
3197         "blt 81b\n"
3198 
3199         ".word 0x4f87e098  // sdot v24.4s, v4.16b, v7.4b[0]\n"
3200         ".word 0x4fa7e09a  // sdot v26.4s, v4.16b, v7.4b[1]\n"
3201         ".word 0x4f87e89c  // sdot v28.4s, v4.16b, v7.4b[2]\n"
3202         ".word 0x4fa7e89e  // sdot v30.4s, v4.16b, v7.4b[3]\n"
3203         ".word 0x4f86e0b1  // sdot v17.4s, v5.16b, v6.4b[0]\n"
3204         ".word 0x4fa6e0b3  // sdot v19.4s, v5.16b, v6.4b[1]\n"
3205         ".word 0x4f86e8b5  // sdot v21.4s, v5.16b, v6.4b[2]\n"
3206         ".word 0x4fa6e8b7  // sdot v23.4s, v5.16b, v6.4b[3]\n"
3207         ".word 0x4f87e0b9  // sdot v25.4s, v5.16b, v7.4b[0]\n"
3208         ".word 0x4fa7e0bb  // sdot v27.4s, v5.16b, v7.4b[1]\n"
3209         ".word 0x4f87e8bd  // sdot v29.4s, v5.16b, v7.4b[2]\n"
3210         ".word 0x4fa7e8bf  // sdot v31.4s, v5.16b, v7.4b[3]\n"
3211         ".word 0x4f86e090  // sdot v16.4s, v4.16b, v6.4b[0]\n"
3212         ".word 0x4fa6e092  // sdot v18.4s, v4.16b, v6.4b[1]\n"
3213         ".word 0x4f86e894  // sdot v20.4s, v4.16b, v6.4b[2]\n"
3214         ".word 0x4fa6e896  // sdot v22.4s, v4.16b, v6.4b[3]\n"
3215 
3216         ".word 0x4f8be118  // sdot v24.4s, v8.16b, v11.4b[0]\n"
3217         ".word 0x4fabe11a  // sdot v26.4s, v8.16b, v11.4b[1]\n"
3218         ".word 0x4f8be91c  // sdot v28.4s, v8.16b, v11.4b[2]\n"
3219         ".word 0x4fabe91e  // sdot v30.4s, v8.16b, v11.4b[3]\n"
3220         ".word 0x4f8ae131  // sdot v17.4s, v9.16b, v10.4b[0]\n"
3221         ".word 0x4faae133  // sdot v19.4s, v9.16b, v10.4b[1]\n"
3222         ".word 0x4f8ae935  // sdot v21.4s, v9.16b, v10.4b[2]\n"
3223         ".word 0x4faae937  // sdot v23.4s, v9.16b, v10.4b[3]\n"
3224         ".word 0x4f8be139  // sdot v25.4s, v9.16b, v11.4b[0]\n"
3225         ".word 0x4fabe13b  // sdot v27.4s, v9.16b, v11.4b[1]\n"
3226         ".word 0x4f8be93d  // sdot v29.4s, v9.16b, v11.4b[2]\n"
3227         ".word 0x4fabe93f  // sdot v31.4s, v9.16b, v11.4b[3]\n"
3228         ".word 0x4f8ae110  // sdot v16.4s, v8.16b, v10.4b[0]\n"
3229         ".word 0x4faae112  // sdot v18.4s, v8.16b, v10.4b[1]\n"
3230         ".word 0x4f8ae914  // sdot v20.4s, v8.16b, v10.4b[2]\n"
3231         ".word 0x4faae916  // sdot v22.4s, v8.16b, v10.4b[3]\n"
3232 
3233         ".word 0x4f8fe198  // sdot v24.4s, v12.16b, v15.4b[0]\n"
3234         ".word 0x4fafe19a  // sdot v26.4s, v12.16b, v15.4b[1]\n"
3235         ".word 0x4f8fe99c  // sdot v28.4s, v12.16b, v15.4b[2]\n"
3236         ".word 0x4fafe99e  // sdot v30.4s, v12.16b, v15.4b[3]\n"
3237         ".word 0x4f8ee1b1  // sdot v17.4s, v13.16b, v14.4b[0]\n"
3238         ".word 0x4faee1b3  // sdot v19.4s, v13.16b, v14.4b[1]\n"
3239         ".word 0x4f8ee9b5  // sdot v21.4s, v13.16b, v14.4b[2]\n"
3240         ".word 0x4faee9b7  // sdot v23.4s, v13.16b, v14.4b[3]\n"
3241         ".word 0x4f8fe1b9  // sdot v25.4s, v13.16b, v15.4b[0]\n"
3242         ".word 0x4fafe1bb  // sdot v27.4s, v13.16b, v15.4b[1]\n"
3243         ".word 0x4f8fe9bd  // sdot v29.4s, v13.16b, v15.4b[2]\n"
3244         ".word 0x4fafe9bf  // sdot v31.4s, v13.16b, v15.4b[3]\n"
3245         ".word 0x4f8ee190  // sdot v16.4s, v12.16b, v14.4b[0]\n"
3246         ".word 0x4faee192  // sdot v18.4s, v12.16b, v14.4b[1]\n"
3247         ".word 0x4f8ee994  // sdot v20.4s, v12.16b, v14.4b[2]\n"
3248         ".word 0x4faee996  // sdot v22.4s, v12.16b, v14.4b[3]\n"
3249 
3250         "78:\n"
3251 
3252 #endif  // #if RUY_OPT(MAX_STREAMING)
3253 
3254         // Ordinary kernel inner loop (over depth), the simpler loop that the
3255         // above was an equivalent 4x-partially-unrolled version of.
3256 
3257         // Reminder - w1 is how many levels of depth we have already loaded
3258         // data for, w12 is the total depth.
3259         "cmp w1, w12\n"
3260         "beq 79f\n"
3261 
3262         "2:\n"
3263 
3264         // Because of the data that we have already loaded, we can start the
3265         // loop body right away with some multiply-adds.
3266         ".word 0x4f83e018  // sdot v24.4s, v0.16b, v3.4b[0]\n"
3267         ".word 0x4fa3e01a  // sdot v26.4s, v0.16b, v3.4b[1]\n"
3268         // Each iteration of this loop advances by 4 levels of depth.
3269         "add w1, w1, #4\n"
3270         ".word 0x4f83e81c  // sdot v28.4s, v0.16b, v3.4b[2]\n"
3271         ".word 0x4fa3e81e  // sdot v30.4s, v0.16b, v3.4b[3]\n"
3272         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
3273         ".word 0x4f82e031  // sdot v17.4s, v1.16b, v2.4b[0]\n"
3274         ".word 0x4fa2e033  // sdot v19.4s, v1.16b, v2.4b[1]\n"
3275         // Loop termination condition.
3276         "cmp w1, w12\n"
3277         ".word 0x4f82e835  // sdot v21.4s, v1.16b, v2.4b[2]\n"
3278         ".word 0x4fa2e837  // sdot v23.4s, v1.16b, v2.4b[3]\n"
3279         "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
3280         ".word 0x4f83e039  // sdot v25.4s, v1.16b, v3.4b[0]\n"
3281         ".word 0x4fa3e03b  // sdot v27.4s, v1.16b, v3.4b[1]\n"
3282         ".word 0x4f83e83d  // sdot v29.4s, v1.16b, v3.4b[2]\n"
3283         ".word 0x4fa3e83f  // sdot v31.4s, v1.16b, v3.4b[3]\n"
3284         "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
3285         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
3286         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
3287         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
3288         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
3289         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
3290 
3291         "blt 2b\n"
3292 
3293         "79:\n"
3294         // End of the inner loop on depth. Now perform the remaining
3295         // multiply-adds of the last 4 levels of depth, for which the LHS
3296         // and RHS data is already loaded.
3297 
3298         ".word 0x4f83e018  // sdot v24.4s, v0.16b, v3.4b[0]\n"
3299         ".word 0x4fa3e01a  // sdot v26.4s, v0.16b, v3.4b[1]\n"
3300         ".word 0x4f83e81c  // sdot v28.4s, v0.16b, v3.4b[2]\n"
3301         ".word 0x4fa3e81e  // sdot v30.4s, v0.16b, v3.4b[3]\n"
3302         ".word 0x4f82e031  // sdot v17.4s, v1.16b, v2.4b[0]\n"
3303         ".word 0x4fa2e033  // sdot v19.4s, v1.16b, v2.4b[1]\n"
3304         ".word 0x4f82e835  // sdot v21.4s, v1.16b, v2.4b[2]\n"
3305         ".word 0x4fa2e837  // sdot v23.4s, v1.16b, v2.4b[3]\n"
3306         ".word 0x4f83e039  // sdot v25.4s, v1.16b, v3.4b[0]\n"
3307         ".word 0x4fa3e03b  // sdot v27.4s, v1.16b, v3.4b[1]\n"
3308         ".word 0x4f83e83d  // sdot v29.4s, v1.16b, v3.4b[2]\n"
3309         ".word 0x4fa3e83f  // sdot v31.4s, v1.16b, v3.4b[3]\n"
3310 
3311         // End of accumulation. The registers v16 -- v31 contain the final
3312         // int32 accumulator values of the current 8x8 destination block.
3313         // We now have to compute the final 8-bit values from these int32
3314         // accumulators, and advance to the next 8x8 block. We intertwine
3315         // these two aspects whenever possible for optimal pipelining, both
3316         // at the data flow level (prefetch data for next block as early as
3317         // possible) and instruction pipelining level (some of the next-block
3318         // work can dual-issue with some of the final work on the current
3319         // block).
3320 
3321         // Logic to advance to the next block in preparation for the next
3322         // iteration of the main loop. For now, we only want to compute
3323         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
3324         // not yet ready to update the values of row and col, as we still need
3325         // the current values for the rest of the work on the current block.
3326 
3327         "cmp %w[row], w7\n"  // Have we finished the last row?
3328         "bge 4f\n"           // If finished last row, go to 4
3329         // Not finished last row: then advance to next row.
3330         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
3331         "b 5f\n"
3332         "4:\n"  // Finished last row...
3333         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
3334         // Now we need to advance to the next column. If we already
3335         // finished the last column, then in principle we are done, however
3336         // we can't just return here, as we need to allow the end work of the
3337         // current block to complete. The good news is that at this point it
3338         // doesn't matter what data we load for the next column, since
3339         // we will exit from the main loop below before actually storing
3340         // anything computed from that data.
3341         "cmp %w[col], w8\n"  // Have we finished the last column?
3342         "bge 5f\n" // If yes, just carry on without updating the column pointer.
3343         // Not finished last column: then advance to next column.
3344         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
3345         "5:\n"
3346 
3347         // Set the LHS and RHS data pointers to the start of the columns just
3348         // computed.
3349         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
3350         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
3351 
3352         // Load some parameters needed for the end work on current block.
3353         "mvni v8.4s, #0\n"
3354         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
3355         "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
3356         "dup v9.4s, w3\n"   // create prod_zp_depth_vec
3357 
3358         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
3359         // Determine the channel index.
3360         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
3361         "csel w3, %w[row], %w[col], eq\n"
3362 
3363         // Offset the bias pointer as needed given the current row, col.
3364         "add x5, x1, x3, lsl #2\n"
3365 
3366         // If there is no bias, use no offset, just address the passed zero
3367         // data.
3368         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
3369         "csel x1, x1, x5, eq\n"
3370 
3371         // Load 8 bias values.
3372         "ld1 {v14.4s}, [x1], #16\n"
3373         "ld1 {v15.4s}, [x1]\n"
3374 
3375         // Now that we know what LHS and RHS data the next iteration of the
3376         // main loop will need to load, we start loading the first 32 bytes of
3377         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
3378         // in the rest of the work on the current block.
3379         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
3380         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
3381         "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
3382         "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
3383 
3384         // Add to the bias values the product (depth * lhs_zero_point * rhs_zero_point),
3385         // See the term NZ1Z2 in equation (7) in https://arxiv.org/pdf/1712.05877.pdf
3386         "add v14.4s, v14.4s, v9.4s\n"
3387         "add v15.4s, v15.4s, v9.4s\n"
3388 
3389         // Perform the bias-addition (per the above, we have just folded into
3390         // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
3391         // Jump based on channel dimension.
3392         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
3393         "bne 6f\n"
3394         // Case where channels are rows
3395         "add v16.4s, v16.4s, v14.4s\n"
3396         "add v17.4s, v17.4s, v15.4s\n"
3397         "add v18.4s, v18.4s, v14.4s\n"
3398         "add v19.4s, v19.4s, v15.4s\n"
3399         "add v20.4s, v20.4s, v14.4s\n"
3400         "add v21.4s, v21.4s, v15.4s\n"
3401         "add v22.4s, v22.4s, v14.4s\n"
3402         "add v23.4s, v23.4s, v15.4s\n"
3403         "add v24.4s, v24.4s, v14.4s\n"
3404         "add v25.4s, v25.4s, v15.4s\n"
3405         "add v26.4s, v26.4s, v14.4s\n"
3406         "add v27.4s, v27.4s, v15.4s\n"
3407         "add v28.4s, v28.4s, v14.4s\n"
3408         "add v29.4s, v29.4s, v15.4s\n"
3409         "add v30.4s, v30.4s, v14.4s\n"
3410         "add v31.4s, v31.4s, v15.4s\n"
3411         "b 7f\n"
3412 
3413         "6:\n"
3414         // Case where channels are columns
3415         "dup v10.4s, v14.s[0]\n"
3416         "dup v11.4s, v14.s[1]\n"
3417         "dup v12.4s, v14.s[2]\n"
3418         "dup v13.4s, v14.s[3]\n"
3419         "add v16.4s, v16.4s, v10.4s\n"
3420         "add v17.4s, v17.4s, v10.4s\n"
3421         "add v18.4s, v18.4s, v11.4s\n"
3422         "add v19.4s, v19.4s, v11.4s\n"
3423         "add v20.4s, v20.4s, v12.4s\n"
3424         "add v21.4s, v21.4s, v12.4s\n"
3425         "add v22.4s, v22.4s, v13.4s\n"
3426         "add v23.4s, v23.4s, v13.4s\n"
3427         "dup v10.4s, v15.s[0]\n"
3428         "dup v11.4s, v15.s[1]\n"
3429         "dup v12.4s, v15.s[2]\n"
3430         "dup v13.4s, v15.s[3]\n"
3431         "add v24.4s, v24.4s, v10.4s\n"
3432         "add v25.4s, v25.4s, v10.4s\n"
3433         "add v26.4s, v26.4s, v11.4s\n"
3434         "add v27.4s, v27.4s, v11.4s\n"
3435         "add v28.4s, v28.4s, v12.4s\n"
3436         "add v29.4s, v29.4s, v12.4s\n"
3437         "add v30.4s, v30.4s, v13.4s\n"
3438         "add v31.4s, v31.4s, v13.4s\n"
3439         "7:\n"
3440 
3441         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
3442         "beq 401f\n"
3443         "ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
3444         "add x3, x3, %x[col], lsl #2\n"
3445         "ld1 {v14.4s}, [x3], #16\n"
3446         "ld1 {v15.4s}, [x3]\n"
3447         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
3448         "dup v10.4s, w5\n"  // create lhs_zero_point_vec
3449         // Subtract rhs_sums * lhs_zero_point, per
3450         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
3451         "mls v16.4s, v10.4s, v14.s[0]\n"
3452         "mls v17.4s, v10.4s, v14.s[0]\n"
3453         "mls v18.4s, v10.4s, v14.s[1]\n"
3454         "mls v19.4s, v10.4s, v14.s[1]\n"
3455         "mls v20.4s, v10.4s, v14.s[2]\n"
3456         "mls v21.4s, v10.4s, v14.s[2]\n"
3457         "mls v22.4s, v10.4s, v14.s[3]\n"
3458         "mls v23.4s, v10.4s, v14.s[3]\n"
3459         "mls v24.4s, v10.4s, v15.s[0]\n"
3460         "mls v25.4s, v10.4s, v15.s[0]\n"
3461         "mls v26.4s, v10.4s, v15.s[1]\n"
3462         "mls v27.4s, v10.4s, v15.s[1]\n"
3463         "mls v28.4s, v10.4s, v15.s[2]\n"
3464         "mls v29.4s, v10.4s, v15.s[2]\n"
3465         "mls v30.4s, v10.4s, v15.s[3]\n"
3466         "mls v31.4s, v10.4s, v15.s[3]\n"
3467         "401:\n"
3468 
3469         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
3470         "beq 402f\n"
3471         "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
3472         "add x2, x2, %x[row], lsl #2\n"
3473         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
3474         // Load 4 lhs_sums values.
3475         "ld1 {v11.4s}, [x2], #16\n"
3476         "ld1 {v12.4s}, [x2]\n"
3477         "ins v13.s[1], w5\n" // rhs_zero_point
3478         // Compute lhs_sums * rhs_zero_point.
3479         "mul v11.4s, v11.4s, v13.s[1]\n"
3480         "mul v12.4s, v12.4s, v13.s[1]\n"
3481         // Subtract lhs_sums * rhs_zero_point, per
3482         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
3483         "sub v16.4s, v16.4s, v11.4s\n"
3484         "sub v17.4s, v17.4s, v12.4s\n"
3485         "sub v18.4s, v18.4s, v11.4s\n"
3486         "sub v19.4s, v19.4s, v12.4s\n"
3487         "sub v20.4s, v20.4s, v11.4s\n"
3488         "sub v21.4s, v21.4s, v12.4s\n"
3489         "sub v22.4s, v22.4s, v11.4s\n"
3490         "sub v23.4s, v23.4s, v12.4s\n"
3491         "sub v24.4s, v24.4s, v11.4s\n"
3492         "sub v25.4s, v25.4s, v12.4s\n"
3493         "sub v26.4s, v26.4s, v11.4s\n"
3494         "sub v27.4s, v27.4s, v12.4s\n"
3495         "sub v28.4s, v28.4s, v11.4s\n"
3496         "sub v29.4s, v29.4s, v12.4s\n"
3497         "sub v30.4s, v30.4s, v11.4s\n"
3498         "sub v31.4s, v31.4s, v12.4s\n"
3499 
3500         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
3501         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
3502 
3503         "402:\n"
3504 
3505         // At this point we have computed the final int32 values. Now we
3506         // start down-quantizing them to obtain the final 8bit values from them.
3507 
3508         // As part of this down-quantization, our int32 values will be
3509         // multiplied by a multiplier that has a fixed-point component and an
3510         // exponent component.
3511 
3512         // Load the exponent part of the multiplier.
3513         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
3514         // Determine the channel index.
3515         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
3516         "csel w3, %w[row], %w[col], eq\n"
3517         // Compute the multiplier_exponent pointer
3518         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
3519         "add x5, x1, x3, lsl #2\n"
3520         "csel x1, x1, x5, eq\n"
3521         // Load multiplier_exponent
3522         "ldr q9, [x1]\n"
3523         "ldr q10, [x1, #16]\n"
3524         // Separate positive and negative exponents
3525         "smin v11.4s, v8.4s, v9.4s\n"
3526         "smin v12.4s, v8.4s, v10.4s\n"
3527         "sub v9.4s, v9.4s, v11.4s\n"
3528         "sub v10.4s, v10.4s, v12.4s\n"
3529 
3530         // Compute the multiplier_fixedpoint pointer
3531         "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
3532         "add x5, x4, x3, lsl #2\n"
3533         "csel x4, x4, x5, eq\n"
3534         // Load multiplier_fixedpoint
3535         "ldr q14, [x4]\n"
3536         "ldr q15, [x4, #16]\n"
3537 
3538         // Jump based on channel dimension.
3539         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
3540         "bne 8f\n"
3541         // Case where channels are rows
3542 
3543         // Apply the positive exponent part of the multiplier.
3544         "sshl v16.4s, v16.4s, v9.4s\n"
3545         "sshl v17.4s, v17.4s, v10.4s\n"
3546         "sshl v18.4s, v18.4s, v9.4s\n"
3547         "sshl v19.4s, v19.4s, v10.4s\n"
3548         "sshl v20.4s, v20.4s, v9.4s\n"
3549         "sshl v21.4s, v21.4s, v10.4s\n"
3550         "sshl v22.4s, v22.4s, v9.4s\n"
3551         "sshl v23.4s, v23.4s, v10.4s\n"
3552         "sshl v24.4s, v24.4s, v9.4s\n"
3553         "sshl v25.4s, v25.4s, v10.4s\n"
3554         "sshl v26.4s, v26.4s, v9.4s\n"
3555         "sshl v27.4s, v27.4s, v10.4s\n"
3556         "sshl v28.4s, v28.4s, v9.4s\n"
3557         "sshl v29.4s, v29.4s, v10.4s\n"
3558         "sshl v30.4s, v30.4s, v9.4s\n"
3559         "sshl v31.4s, v31.4s, v10.4s\n"
3560         "10:\n"
3561 
3562         // Apply the fixed-point part of the multiplier.
3563         "sqdmulh v16.4s, v16.4s, v14.4s\n"
3564         "sqdmulh v17.4s, v17.4s, v15.4s\n"
3565         "sqdmulh v18.4s, v18.4s, v14.4s\n"
3566         "sqdmulh v19.4s, v19.4s, v15.4s\n"
3567         "sqdmulh v20.4s, v20.4s, v14.4s\n"
3568         "sqdmulh v21.4s, v21.4s, v15.4s\n"
3569         "sqdmulh v22.4s, v22.4s, v14.4s\n"
3570         "sqdmulh v23.4s, v23.4s, v15.4s\n"
3571         "sqdmulh v24.4s, v24.4s, v14.4s\n"
3572         "sqdmulh v25.4s, v25.4s, v15.4s\n"
3573         "sqdmulh v26.4s, v26.4s, v14.4s\n"
3574         "sqdmulh v27.4s, v27.4s, v15.4s\n"
3575         "sqdmulh v28.4s, v28.4s, v14.4s\n"
3576         "sqdmulh v29.4s, v29.4s, v15.4s\n"
3577         "sqdmulh v30.4s, v30.4s, v14.4s\n"
3578         "sqdmulh v31.4s, v31.4s, v15.4s\n"
3579 
3580         // Apply the negative exponent part of the multiplier.
3581         "srshl v16.4s, v16.4s, v11.4s\n"
3582         "srshl v17.4s, v17.4s, v12.4s\n"
3583         "srshl v18.4s, v18.4s, v11.4s\n"
3584         "srshl v19.4s, v19.4s, v12.4s\n"
3585         "srshl v20.4s, v20.4s, v11.4s\n"
3586         "srshl v21.4s, v21.4s, v12.4s\n"
3587         "srshl v22.4s, v22.4s, v11.4s\n"
3588         "srshl v23.4s, v23.4s, v12.4s\n"
3589         "srshl v24.4s, v24.4s, v11.4s\n"
3590         "srshl v25.4s, v25.4s, v12.4s\n"
3591         "srshl v26.4s, v26.4s, v11.4s\n"
3592         "srshl v27.4s, v27.4s, v12.4s\n"
3593         "srshl v28.4s, v28.4s, v11.4s\n"
3594         "srshl v29.4s, v29.4s, v12.4s\n"
3595         "srshl v30.4s, v30.4s, v11.4s\n"
3596         "srshl v31.4s, v31.4s, v12.4s\n"
3597         "b 9f\n"
3598 
3599         "8:\n"
3600         // Case where channels are columns
3601 
3602         // Apply the positive exponent part of the multiplier.
3603         "dup v4.4s, v9.s[0]\n"
3604         "dup v5.4s, v9.s[1]\n"
3605         "dup v6.4s, v9.s[2]\n"
3606         "dup v7.4s, v9.s[3]\n"
3607         "sshl v16.4s, v16.4s, v4.4s\n"
3608         "sshl v17.4s, v17.4s, v4.4s\n"
3609         "sshl v18.4s, v18.4s, v5.4s\n"
3610         "sshl v19.4s, v19.4s, v5.4s\n"
3611         "sshl v20.4s, v20.4s, v6.4s\n"
3612         "sshl v21.4s, v21.4s, v6.4s\n"
3613         "sshl v22.4s, v22.4s, v7.4s\n"
3614         "sshl v23.4s, v23.4s, v7.4s\n"
3615         "dup v4.4s, v10.s[0]\n"
3616         "dup v5.4s, v10.s[1]\n"
3617         "dup v6.4s, v10.s[2]\n"
3618         "dup v7.4s, v10.s[3]\n"
3619         "sshl v24.4s, v24.4s, v4.4s\n"
3620         "sshl v25.4s, v25.4s, v4.4s\n"
3621         "sshl v26.4s, v26.4s, v5.4s\n"
3622         "sshl v27.4s, v27.4s, v5.4s\n"
3623         "sshl v28.4s, v28.4s, v6.4s\n"
3624         "sshl v29.4s, v29.4s, v6.4s\n"
3625         "sshl v30.4s, v30.4s, v7.4s\n"
3626         "sshl v31.4s, v31.4s, v7.4s\n"
3627         "11:\n"
3628 
3629         // Apply the fixed-point part of the multiplier.
3630         "sqdmulh v16.4s, v16.4s, v14.s[0]\n"
3631         "sqdmulh v17.4s, v17.4s, v14.s[0]\n"
3632         "sqdmulh v18.4s, v18.4s, v14.s[1]\n"
3633         "sqdmulh v19.4s, v19.4s, v14.s[1]\n"
3634         "sqdmulh v20.4s, v20.4s, v14.s[2]\n"
3635         "sqdmulh v21.4s, v21.4s, v14.s[2]\n"
3636         "sqdmulh v22.4s, v22.4s, v14.s[3]\n"
3637         "sqdmulh v23.4s, v23.4s, v14.s[3]\n"
3638         "sqdmulh v24.4s, v24.4s, v15.s[0]\n"
3639         "sqdmulh v25.4s, v25.4s, v15.s[0]\n"
3640         "sqdmulh v26.4s, v26.4s, v15.s[1]\n"
3641         "sqdmulh v27.4s, v27.4s, v15.s[1]\n"
3642         "sqdmulh v28.4s, v28.4s, v15.s[2]\n"
3643         "sqdmulh v29.4s, v29.4s, v15.s[2]\n"
3644         "sqdmulh v30.4s, v30.4s, v15.s[3]\n"
3645         "sqdmulh v31.4s, v31.4s, v15.s[3]\n"
3646 
3647         // Apply the negative exponent part of the multiplier.
3648         "dup v4.4s, v11.s[0]\n"
3649         "dup v5.4s, v11.s[1]\n"
3650         "dup v6.4s, v11.s[2]\n"
3651         "dup v7.4s, v11.s[3]\n"
3652         "srshl v16.4s, v16.4s, v4.4s\n"
3653         "srshl v17.4s, v17.4s, v4.4s\n"
3654         "srshl v18.4s, v18.4s, v5.4s\n"
3655         "srshl v19.4s, v19.4s, v5.4s\n"
3656         "srshl v20.4s, v20.4s, v6.4s\n"
3657         "srshl v21.4s, v21.4s, v6.4s\n"
3658         "srshl v22.4s, v22.4s, v7.4s\n"
3659         "srshl v23.4s, v23.4s, v7.4s\n"
3660         "dup v4.4s, v12.s[0]\n"
3661         "dup v5.4s, v12.s[1]\n"
3662         "dup v6.4s, v12.s[2]\n"
3663         "dup v7.4s, v12.s[3]\n"
3664         "srshl v24.4s, v24.4s, v4.4s\n"
3665         "srshl v25.4s, v25.4s, v4.4s\n"
3666         "srshl v26.4s, v26.4s, v5.4s\n"
3667         "srshl v27.4s, v27.4s, v5.4s\n"
3668         "srshl v28.4s, v28.4s, v6.4s\n"
3669         "srshl v29.4s, v29.4s, v6.4s\n"
3670         "srshl v30.4s, v30.4s, v7.4s\n"
3671         "srshl v31.4s, v31.4s, v7.4s\n"
3672         "9:\n"
3673 
3674         "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
3675         "ins v13.h[4], w4\n" // dst_zero_point
3676 
3677         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
3678         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
3679         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
3680         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
3681 
3682         RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
3683 
3684         // Cast-and-saturate from int32 to int16
3685         "sqxtn v16.4h, v16.4s\n"
3686         "sqxtn2 v16.8h, v17.4s\n"
3687         "sqxtn v17.4h, v18.4s\n"
3688         "sqxtn2 v17.8h, v19.4s\n"
3689         "sqxtn v18.4h, v20.4s\n"
3690         "sqxtn2 v18.8h, v21.4s\n"
3691         "sqxtn v19.4h, v22.4s\n"
3692         "sqxtn2 v19.8h, v23.4s\n"
3693         "sqxtn v20.4h, v24.4s\n"
3694         "sqxtn2 v20.8h, v25.4s\n"
3695         "sqxtn v21.4h, v26.4s\n"
3696         "sqxtn2 v21.8h, v27.4s\n"
3697         "sqxtn v22.4h, v28.4s\n"
3698         "sqxtn2 v22.8h, v29.4s\n"
3699         "sqxtn v23.4h, v30.4s\n"
3700         "sqxtn2 v23.8h, v31.4s\n"
3701 
3702         // At this point, v24 -- v31 aren't used anymore for the current block,
3703         // so we can start clearing these accumulators for the next block
3704         // (next iteration of the main loop).
3705         RUY_MAKE_ZERO(v24)
3706         RUY_MAKE_ZERO(v25)
3707         RUY_MAKE_ZERO(v26)
3708         RUY_MAKE_ZERO(v27)
3709         RUY_MAKE_ZERO(v28)
3710         RUY_MAKE_ZERO(v29)
3711         RUY_MAKE_ZERO(v30)
3712         RUY_MAKE_ZERO(v31)
3713 
3714         // Add the destination zero point
3715         "dup v14.8h, v13.h[4]\n"
3716         "sqadd v16.8h, v16.8h, v14.8h\n"
3717         "sqadd v17.8h, v17.8h, v14.8h\n"
3718         "sqadd v18.8h, v18.8h, v14.8h\n"
3719         "sqadd v19.8h, v19.8h, v14.8h\n"
3720         "sqadd v20.8h, v20.8h, v14.8h\n"
3721         "sqadd v21.8h, v21.8h, v14.8h\n"
3722         "sqadd v22.8h, v22.8h, v14.8h\n"
3723         "sqadd v23.8h, v23.8h, v14.8h\n"
3724 
3725         // Cast-and-saturate from int16 to uint8
3726         "sqxtun v16.8b, v16.8h\n"
3727         "sqxtun2 v16.16b, v17.8h\n"
3728         "sqxtun v17.8b, v18.8h\n"
3729         "sqxtun2 v17.16b, v19.8h\n"
3730         "sqxtun v18.8b, v20.8h\n"
3731         "sqxtun2 v18.16b, v21.8h\n"
3732         "sqxtun v19.8b, v22.8h\n"
3733         "sqxtun2 v19.16b, v23.8h\n"
3734 
3735         // Load the clamp_min, clamp_max bounds
3736         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
3737         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
3738         "dup v14.16b, w2\n"  // clamp_min
3739         "dup v15.16b, w3\n"  // clamp_max
3740 
3741         // Apply the clamp_min bound
3742         "umax v16.16b, v16.16b, v14.16b\n"
3743         "umax v17.16b, v17.16b, v14.16b\n"
3744         "umax v18.16b, v18.16b, v14.16b\n"
3745         "umax v19.16b, v19.16b, v14.16b\n"
3746 
3747         // Apply the clamp_max bound
3748         "umin v16.16b, v16.16b, v15.16b\n"
3749         "umin v17.16b, v17.16b, v15.16b\n"
3750         "umin v18.16b, v18.16b, v15.16b\n"
3751         "umin v19.16b, v19.16b, v15.16b\n"
3752 
3753         // Make it so that all of the final 8bit values are stored in the
3754         // first 64bits of 128bit NEON registers, so they can be stored
3755         // by 64bit st1 store instructions with byte alignment.
3756         "dup d20, v16.d[1]\n"
3757         "dup d21, v17.d[1]\n"
3758         "dup d22, v18.d[1]\n"
3759         "dup d23, v19.d[1]\n"
3760 
3761         // Compute how much of the 8x8 block of destination 8bit values that
3762         // we have computed, fit in the destination matrix. Typically, all of
3763         // it fits, but when the destination matrix shape is not a multiple
3764         // of 8x8, there are some 8x8 blocks along the boundaries that do
3765         // not fit entirely.
3766         "sub w1, %w[dst_rows], %w[row]\n"
3767         "sub w2, %w[dst_cols], %w[col]\n"
3768         "mov w3, #8\n"
3769         "cmp w1, #8\n"
3770         // Compute w1 = how many rows of the 8x8 block fit
3771         "csel w1, w1, w3, le\n"
3772         "cmp w2, #8\n"
3773         // Compute w2 = how many cols of the 8x8 block fit
3774         "csel w2, w2, w3, le\n"
3775 
3776         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
3777         "cmp w1, w3\n"
3778         "ccmp w2, w3, 0, eq\n"
3779         // Yes, all of the 8x8 block fits, go to fast path.
3780         "beq 30f\n"
3781         // Not all of the 8x8 block fits.
3782         // Set (x3 address, x4 stride) to write to dst_tmp_buf
3783         "mov x3, %[dst_tmp_buf]\n"
3784         "mov x4, #8\n"
3785         "b 31f\n"
3786         "30:\n"
3787         // Yes, all of the 8x8 block fits.
3788         // Set (x3 address, x4 stride) to write directly to destination matrix.
3789         "mov x3, %[dst_ptr]\n"
3790         "mov x4, x11\n"
3791         "31:\n"
3792 
3793         // Write our 8bit values to the destination described by
3794         // (x3 address, x4 stride).
3795         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3796         "st1 {v16.8b}, [x3], x4\n"
3797         RUY_MAKE_ZERO(v16)
3798         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3799         "st1 {v20.8b}, [x3], x4\n"
3800         RUY_MAKE_ZERO(v20)
3801         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3802         "st1 {v17.8b}, [x3], x4\n"
3803         RUY_MAKE_ZERO(v17)
3804         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3805         "st1 {v21.8b}, [x3], x4\n"
3806         RUY_MAKE_ZERO(v21)
3807         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3808         "st1 {v18.8b}, [x3], x4\n"
3809         RUY_MAKE_ZERO(v18)
3810         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3811         "st1 {v22.8b}, [x3], x4\n"
3812         RUY_MAKE_ZERO(v22)
3813         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3814         "st1 {v19.8b}, [x3], x4\n"
3815         RUY_MAKE_ZERO(v19)
3816         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3817         "st1 {v23.8b}, [x3], x4\n"
3818         RUY_MAKE_ZERO(v23)
3819 
3820         // For the next block: perform the first few multiply-adds on the data
3821         // that we have already loaded.
3822         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
3823         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
3824         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
3825         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
3826 
3827         // If all of the 8x8 block fits, we just finished writing it to the
3828         // destination, so we skip the next part.
3829         "beq 41f\n"
3830         // Not all of the 8x8 block fits in the destination matrix.  We just
3831         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
3832         // it to copy into the destination matrix the part that fits.
3833         "mov x3, %[dst_tmp_buf]\n"
3834         "mov x4, %[dst_ptr]\n"
3835         "mov w6, #0\n"
3836         "50:\n"
3837         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
3838         "mov w5, #0\n"
3839         "51:\n"
3840         "ldrb w7, [x3, w5, uxtw]\n"
3841         "strb w7, [x4, w5, uxtw]\n"
3842         "add w5, w5, #1\n"
3843         "cmp w5, w1\n"
3844         "blt 51b\n"
3845         "add w6, w6, #1\n"
3846         "add x3, x3, #8\n"
3847         "add x4, x4, x11\n"
3848         "cmp w6, w2\n"
3849         "blt 50b\n"
3850         "41:\n"
3851         "add %[dst_ptr], %[dst_ptr], #8\n"
3852         // At this point we have completely finished writing values to the
3853         // destination matrix for the current block.
3854 
3855         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
3856 
3857         RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
3858 
3859         // Cast-and-saturate from int32 to int16
3860         "sqxtn v16.4h, v16.4s\n"
3861         "sqxtn2 v16.8h, v17.4s\n"
3862         "sqxtn v17.4h, v18.4s\n"
3863         "sqxtn2 v17.8h, v19.4s\n"
3864         "sqxtn v18.4h, v20.4s\n"
3865         "sqxtn2 v18.8h, v21.4s\n"
3866         "sqxtn v19.4h, v22.4s\n"
3867         "sqxtn2 v19.8h, v23.4s\n"
3868         "sqxtn v20.4h, v24.4s\n"
3869         "sqxtn2 v20.8h, v25.4s\n"
3870         "sqxtn v21.4h, v26.4s\n"
3871         "sqxtn2 v21.8h, v27.4s\n"
3872         "sqxtn v22.4h, v28.4s\n"
3873         "sqxtn2 v22.8h, v29.4s\n"
3874         "sqxtn v23.4h, v30.4s\n"
3875         "sqxtn2 v23.8h, v31.4s\n"
3876 
3877         // At this point, v24 -- v31 aren't used anymore for the current block,
3878         // so we can start clearing these accumulators for the next block
3879         // (next iteration of the main loop).
3880         RUY_MAKE_ZERO(v24)
3881         RUY_MAKE_ZERO(v25)
3882         RUY_MAKE_ZERO(v26)
3883         RUY_MAKE_ZERO(v27)
3884         RUY_MAKE_ZERO(v28)
3885         RUY_MAKE_ZERO(v29)
3886         RUY_MAKE_ZERO(v30)
3887         RUY_MAKE_ZERO(v31)
3888 
3889         // Add the destination zero point
3890         "dup v14.8h, v13.h[4]\n"
3891         "sqadd v16.8h, v16.8h, v14.8h\n"
3892         "sqadd v17.8h, v17.8h, v14.8h\n"
3893         "sqadd v18.8h, v18.8h, v14.8h\n"
3894         "sqadd v19.8h, v19.8h, v14.8h\n"
3895         "sqadd v20.8h, v20.8h, v14.8h\n"
3896         "sqadd v21.8h, v21.8h, v14.8h\n"
3897         "sqadd v22.8h, v22.8h, v14.8h\n"
3898         "sqadd v23.8h, v23.8h, v14.8h\n"
3899 
3900         // Cast-and-saturate from int16 to int8
3901         "sqxtn v16.8b, v16.8h\n"
3902         "sqxtn2 v16.16b, v17.8h\n"
3903         "sqxtn v17.8b, v18.8h\n"
3904         "sqxtn2 v17.16b, v19.8h\n"
3905         "sqxtn v18.8b, v20.8h\n"
3906         "sqxtn2 v18.16b, v21.8h\n"
3907         "sqxtn v19.8b, v22.8h\n"
3908         "sqxtn2 v19.16b, v23.8h\n"
3909 
3910         // Load the clamp_min, clamp_max bounds
3911         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
3912         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
3913         "dup v14.16b, w2\n"  // clamp_min
3914         "dup v15.16b, w3\n"  // clamp_max
3915 
3916         // Apply the clamp_min bound
3917         "smax v16.16b, v16.16b, v14.16b\n"
3918         "smax v17.16b, v17.16b, v14.16b\n"
3919         "smax v18.16b, v18.16b, v14.16b\n"
3920         "smax v19.16b, v19.16b, v14.16b\n"
3921 
3922         // Apply the clamp_max bound
3923         "smin v16.16b, v16.16b, v15.16b\n"
3924         "smin v17.16b, v17.16b, v15.16b\n"
3925         "smin v18.16b, v18.16b, v15.16b\n"
3926         "smin v19.16b, v19.16b, v15.16b\n"
3927 
3928         // Make it so that all of the final 8bit values are stored in the
3929         // first 64bits of 128bit NEON registers, so they can be stored
3930         // by 64bit st1 store instructions with byte alignment.
3931         "dup d20, v16.d[1]\n"
3932         "dup d21, v17.d[1]\n"
3933         "dup d22, v18.d[1]\n"
3934         "dup d23, v19.d[1]\n"
3935 
3936         // Compute how much of the 8x8 block of destination 8bit values that
3937         // we have computed, fit in the destination matrix. Typically, all of
3938         // it fits, but when the destination matrix shape is not a multiple
3939         // of 8x8, there are some 8x8 blocks along the boundaries that do
3940         // not fit entirely.
3941         "sub w1, %w[dst_rows], %w[row]\n"
3942         "sub w2, %w[dst_cols], %w[col]\n"
3943         "mov w3, #8\n"
3944         "cmp w1, #8\n"
3945         // Compute w1 = how many rows of the 8x8 block fit
3946         "csel w1, w1, w3, le\n"
3947         "cmp w2, #8\n"
3948         // Compute w2 = how many cols of the 8x8 block fit
3949         "csel w2, w2, w3, le\n"
3950 
3951         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
3952         "cmp w1, w3\n"
3953         "ccmp w2, w3, 0, eq\n"
3954         // Yes, all of the 8x8 block fits, go to fast path.
3955         "beq 130f\n"
3956         // Not all of the 8x8 block fits.
3957         // Set (x3 address, x4 stride) to write to dst_tmp_buf
3958         "mov x3, %[dst_tmp_buf]\n"
3959         "mov x4, #8\n"
3960         "b 131f\n"
3961         "130:\n"
3962         // Yes, all of the 8x8 block fits.
3963         // Set (x3 address, x4 stride) to write directly to destination matrix.
3964         "mov x3, %[dst_ptr]\n"
3965         "mov x4, x11\n"
3966         "131:\n"
3967 
3968         // Write our 8bit values to the destination described by
3969         // (x3 address, x4 stride).
3970         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3971         "st1 {v16.8b}, [x3], x4\n"
3972         RUY_MAKE_ZERO(v16)
3973         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3974         "st1 {v20.8b}, [x3], x4\n"
3975         RUY_MAKE_ZERO(v20)
3976         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3977         "st1 {v17.8b}, [x3], x4\n"
3978         RUY_MAKE_ZERO(v17)
3979         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3980         "st1 {v21.8b}, [x3], x4\n"
3981         RUY_MAKE_ZERO(v21)
3982         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3983         "st1 {v18.8b}, [x3], x4\n"
3984         RUY_MAKE_ZERO(v18)
3985         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3986         "st1 {v22.8b}, [x3], x4\n"
3987         RUY_MAKE_ZERO(v22)
3988         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3989         "st1 {v19.8b}, [x3], x4\n"
3990         RUY_MAKE_ZERO(v19)
3991         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
3992         "st1 {v23.8b}, [x3], x4\n"
3993         RUY_MAKE_ZERO(v23)
3994 
3995         // For the next block: perform the first few multiply-adds on the data
3996         // that we have already loaded.
3997         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
3998         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
3999         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
4000         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
4001 
4002         // If all of the 8x8 block fits, we just finished writing it to the
4003         // destination, so we skip the next part.
4004         "beq 141f\n"
4005         // Not all of the 8x8 block fits in the destination matrix.  We just
4006         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
4007         // it to copy into the destination matrix the part that fits.
4008         "mov x3, %[dst_tmp_buf]\n"
4009         "mov x4, %[dst_ptr]\n"
4010         "mov w6, #0\n"
4011         "150:\n"
4012         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4013         "mov w5, #0\n"
4014         "151:\n"
4015         "ldrb w7, [x3, w5, uxtw]\n"
4016         "strb w7, [x4, w5, uxtw]\n"
4017         "add w5, w5, #1\n"
4018         "cmp w5, w1\n"
4019         "blt 151b\n"
4020         "add w6, w6, #1\n"
4021         "add x3, x3, #8\n"
4022         "add x4, x4, x11\n"
4023         "cmp w6, w2\n"
4024         "blt 150b\n"
4025         "141:\n"
4026         "add %[dst_ptr], %[dst_ptr], #8\n"
4027         // At this point we have completely finished writing values to the
4028         // destination matrix for the current block.
4029 
4030         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
4031 
4032         RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
4033 
4034         // Add the destination zero point
4035         "dup v14.8h, v13.h[4]\n"
4036         "saddw v16.4s, v16.4s, v14.4h\n"
4037         "saddw v17.4s, v17.4s, v14.4h\n"
4038         "saddw v18.4s, v18.4s, v14.4h\n"
4039         "saddw v19.4s, v19.4s, v14.4h\n"
4040         "saddw v20.4s, v20.4s, v14.4h\n"
4041         "saddw v21.4s, v21.4s, v14.4h\n"
4042         "saddw v22.4s, v22.4s, v14.4h\n"
4043         "saddw v23.4s, v23.4s, v14.4h\n"
4044         "saddw v24.4s, v24.4s, v14.4h\n"
4045         "saddw v25.4s, v25.4s, v14.4h\n"
4046         "saddw v26.4s, v26.4s, v14.4h\n"
4047         "saddw v27.4s, v27.4s, v14.4h\n"
4048         "saddw v28.4s, v28.4s, v14.4h\n"
4049         "saddw v29.4s, v29.4s, v14.4h\n"
4050         "saddw v30.4s, v30.4s, v14.4h\n"
4051         "saddw v31.4s, v31.4s, v14.4h\n"
4052 
4053         // Cast-and-saturate from int32 to int16
4054         "sqxtn v16.4h, v16.4s\n"
4055         "sqxtn2 v16.8h, v17.4s\n"
4056         "sqxtn v17.4h, v18.4s\n"
4057         "sqxtn2 v17.8h, v19.4s\n"
4058         "sqxtn v18.4h, v20.4s\n"
4059         "sqxtn2 v18.8h, v21.4s\n"
4060         "sqxtn v19.4h, v22.4s\n"
4061         "sqxtn2 v19.8h, v23.4s\n"
4062         "sqxtn v20.4h, v24.4s\n"
4063         "sqxtn2 v20.8h, v25.4s\n"
4064         "sqxtn v21.4h, v26.4s\n"
4065         "sqxtn2 v21.8h, v27.4s\n"
4066         "sqxtn v22.4h, v28.4s\n"
4067         "sqxtn2 v22.8h, v29.4s\n"
4068         "sqxtn v23.4h, v30.4s\n"
4069         "sqxtn2 v23.8h, v31.4s\n"
4070 
4071         // At this point, v24 -- v31 aren't used anymore for the current block,
4072         // so we can start clearing these accumulators for the next block
4073         // (next iteration of the main loop).
4074         RUY_MAKE_ZERO(v24)
4075         RUY_MAKE_ZERO(v25)
4076         RUY_MAKE_ZERO(v26)
4077         RUY_MAKE_ZERO(v27)
4078         RUY_MAKE_ZERO(v28)
4079         RUY_MAKE_ZERO(v29)
4080         RUY_MAKE_ZERO(v30)
4081         RUY_MAKE_ZERO(v31)
4082 
4083         // Load the clamp_min, clamp_max bounds
4084         "ldrsh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
4085         "ldrsh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
4086         "dup v14.8h, w2\n"  // clamp_min
4087         "dup v15.8h, w3\n"  // clamp_max
4088 
4089         // Apply the clamp_min bound
4090         "smax v16.8h, v16.8h, v14.8h\n"
4091         "smax v17.8h, v17.8h, v14.8h\n"
4092         "smax v18.8h, v18.8h, v14.8h\n"
4093         "smax v19.8h, v19.8h, v14.8h\n"
4094         "smax v20.8h, v20.8h, v14.8h\n"
4095         "smax v21.8h, v21.8h, v14.8h\n"
4096         "smax v22.8h, v22.8h, v14.8h\n"
4097         "smax v23.8h, v23.8h, v14.8h\n"
4098         // Apply the clamp_max bound
4099         "smin v16.8h, v16.8h, v15.8h\n"
4100         "smin v17.8h, v17.8h, v15.8h\n"
4101         "smin v18.8h, v18.8h, v15.8h\n"
4102         "smin v19.8h, v19.8h, v15.8h\n"
4103         "smin v20.8h, v20.8h, v15.8h\n"
4104         "smin v21.8h, v21.8h, v15.8h\n"
4105         "smin v22.8h, v22.8h, v15.8h\n"
4106         "smin v23.8h, v23.8h, v15.8h\n"
4107 
4108         // Compute how much of the 8x8 block of destination 16bit values that
4109         // we have computed, fit in the destination matrix. Typically, all of
4110         // it fits, but when the destination matrix shape is not a multiple
4111         // of 8x8, there are some 8x8 blocks along the boundaries that do
4112         // not fit entirely.
4113         "sub w1, %w[dst_rows], %w[row]\n"
4114         "sub w2, %w[dst_cols], %w[col]\n"
4115         "mov w3, #8\n"
4116         "cmp w1, #8\n"
4117         // Compute w1 = how many rows of the 8x8 block fit
4118         "csel w1, w1, w3, le\n"
4119         "cmp w2, #8\n"
4120         // Compute w2 = how many cols of the 8x8 block fit
4121         "csel w2, w2, w3, le\n"
4122 
4123         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
4124         "cmp w1, w3\n"
4125         "ccmp w2, w3, 0, eq\n"
4126         // Yes, all of the 8x8 block fits, go to fast path.
4127         "beq 230f\n"
4128         // Not all of the 8x8 block fits.
4129         // Set (x3 address, x4 stride) to write to dst_tmp_buf
4130         "mov x3, %[dst_tmp_buf]\n"
4131         "mov x4, #16\n"
4132         "b 231f\n"
4133         "230:\n"
4134         // Yes, all of the 8x8 block fits.
4135         // Set (x3 address, x4 stride) to write directly to destination matrix.
4136         "mov x3, %[dst_ptr]\n"
4137         "mov x4, x11\n"
4138         "231:\n"
4139 
4140         // Write our 16bit values to the destination described by
4141         // (x3 address, x4 stride).
4142         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
4143         "st1 {v16.8h}, [x3], x4\n"
4144         RUY_MAKE_ZERO(v16)
4145         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
4146         "st1 {v17.8h}, [x3], x4\n"
4147         RUY_MAKE_ZERO(v17)
4148         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
4149         "st1 {v18.8h}, [x3], x4\n"
4150         RUY_MAKE_ZERO(v18)
4151         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
4152         "st1 {v19.8h}, [x3], x4\n"
4153         RUY_MAKE_ZERO(v19)
4154         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
4155         "st1 {v20.8h}, [x3], x4\n"
4156         RUY_MAKE_ZERO(v20)
4157         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
4158         "st1 {v21.8h}, [x3], x4\n"
4159         RUY_MAKE_ZERO(v21)
4160         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
4161         "st1 {v22.8h}, [x3], x4\n"
4162         RUY_MAKE_ZERO(v22)
4163         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
4164         "st1 {v23.8h}, [x3], x4\n"
4165         RUY_MAKE_ZERO(v23)
4166 
4167         // For the next block: perform the first few multiply-adds on the data
4168         // that we have already loaded.
4169         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
4170         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
4171         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
4172         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
4173 
4174         // If all of the 8x8 block fits, we just finished writing it to the
4175         // destination, so we skip the next part.
4176         "beq 241f\n"
4177         // Not all of the 8x8 block fits in the destination matrix.  We just
4178         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
4179         // it to copy into the destination matrix the part that fits.
4180         "mov x3, %[dst_tmp_buf]\n"
4181         "mov x4, %[dst_ptr]\n"
4182         "mov w6, #0\n"
4183         "250:\n"
4184         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4185         "mov w5, #0\n"
4186         "251:\n"
4187         "ldrsh w7, [x3, x5, lsl #1]\n"
4188         "strh w7, [x4, x5, lsl #1]\n"
4189         "add w5, w5, #1\n"
4190         "cmp w5, w1\n"
4191         "blt 251b\n"
4192         "add w6, w6, #1\n"
4193         "add x3, x3, #16\n"
4194         "add x4, x4, x11\n"
4195         "cmp w6, w2\n"
4196         "blt 250b\n"
4197         "241:\n"
4198         "add %[dst_ptr], %[dst_ptr], #16\n"
4199         // At this point we have completely finished writing values to the
4200         // destination matrix for the current block.
4201 
4202         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
4203 
4204         RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
4205 
4206         // Since the store type is the same as the accum type, no need for
4207         // downcast. There's also no need for clamp by min/max.
4208 
4209         // Compute how much of the 8x8 block of destination 32bit values that
4210         // we have computed, fit in the destination matrix. Typically, all of
4211         // it fits, but when the destination matrix shape is not a multiple
4212         // of 8x8, there are some 8x8 blocks along the boundaries that do
4213         // not fit entirely.
4214         "sub w1, %w[dst_rows], %w[row]\n"
4215         "sub w2, %w[dst_cols], %w[col]\n"
4216         "mov w3, #8\n"
4217         "cmp w1, #8\n"
4218         // Compute w1 = how many rows of the 8x8 block fit
4219         "csel w1, w1, w3, le\n"
4220         "cmp w2, #8\n"
4221         // Compute w2 = how many cols of the 8x8 block fit
4222         "csel w2, w2, w3, le\n"
4223 
4224         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
4225         "cmp w1, w3\n"
4226         "ccmp w2, w3, 0, eq\n"
4227         // Yes, all of the 8x8 block fits, go to fast path.
4228         "beq 330f\n"
4229         // Not all of the 8x8 block fits.
4230         // Write to dst_tmp_buf
4231         "mov x3, %[dst_tmp_buf]\n"
4232         "st1 {v16.4s}, [x3], #16\n"
4233         RUY_MAKE_ZERO(v16)
4234         "st1 {v17.4s}, [x3], #16\n"
4235         RUY_MAKE_ZERO(v17)
4236         "st1 {v18.4s}, [x3], #16\n"
4237         RUY_MAKE_ZERO(v18)
4238         "st1 {v19.4s}, [x3], #16\n"
4239         RUY_MAKE_ZERO(v19)
4240         "st1 {v20.4s}, [x3], #16\n"
4241         RUY_MAKE_ZERO(v20)
4242         "st1 {v21.4s}, [x3], #16\n"
4243         RUY_MAKE_ZERO(v21)
4244         "st1 {v22.4s}, [x3], #16\n"
4245         RUY_MAKE_ZERO(v22)
4246         "st1 {v23.4s}, [x3], #16\n"
4247         RUY_MAKE_ZERO(v23)
4248         "st1 {v24.4s}, [x3], #16\n"
4249         RUY_MAKE_ZERO(v24)
4250         "st1 {v25.4s}, [x3], #16\n"
4251         RUY_MAKE_ZERO(v25)
4252         "st1 {v26.4s}, [x3], #16\n"
4253         RUY_MAKE_ZERO(v26)
4254         "st1 {v27.4s}, [x3], #16\n"
4255         RUY_MAKE_ZERO(v27)
4256         "st1 {v28.4s}, [x3], #16\n"
4257         RUY_MAKE_ZERO(v28)
4258         "st1 {v29.4s}, [x3], #16\n"
4259         RUY_MAKE_ZERO(v29)
4260         "st1 {v30.4s}, [x3], #16\n"
4261         RUY_MAKE_ZERO(v30)
4262         "st1 {v31.4s}, [x3], #16\n"
4263         RUY_MAKE_ZERO(v31)
4264 
4265         "b 331f\n"
4266 
4267         "330:\n"
4268         // Yes, all of the 8x8 block fits.
4269         "mov x4, %[dst_ptr]\n"
4270         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4271         "mov x3, x4\n"
4272         "st1 {v16.4s, v17.4s}, [x3], #32\n"
4273         RUY_MAKE_ZERO(v16)
4274         RUY_MAKE_ZERO(v17)
4275         "add x4, x4, x11\n"
4276         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4277         "mov x3, x4\n"
4278         "st1 {v18.4s, v19.4s}, [x3], #32\n"
4279         RUY_MAKE_ZERO(v18)
4280         RUY_MAKE_ZERO(v19)
4281         "add x4, x4, x11\n"
4282         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4283         "mov x3, x4\n"
4284         "st1 {v20.4s, v21.4s}, [x3], #32\n"
4285         RUY_MAKE_ZERO(v20)
4286         RUY_MAKE_ZERO(v21)
4287         "add x4, x4, x11\n"
4288         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4289         "mov x3, x4\n"
4290         "st1 {v22.4s, v23.4s}, [x3], #32\n"
4291         RUY_MAKE_ZERO(v22)
4292         RUY_MAKE_ZERO(v23)
4293         "add x4, x4, x11\n"
4294         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4295         "mov x3, x4\n"
4296         "st1 {v24.4s, v25.4s}, [x3], #32\n"
4297         RUY_MAKE_ZERO(v24)
4298         RUY_MAKE_ZERO(v25)
4299         "add x4, x4, x11\n"
4300         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4301         "mov x3, x4\n"
4302         "st1 {v26.4s, v27.4s}, [x3], #32\n"
4303         RUY_MAKE_ZERO(v26)
4304         RUY_MAKE_ZERO(v27)
4305         "add x4, x4, x11\n"
4306         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4307         "mov x3, x4\n"
4308         "st1 {v28.4s, v29.4s}, [x3], #32\n"
4309         RUY_MAKE_ZERO(v28)
4310         RUY_MAKE_ZERO(v29)
4311         "add x4, x4, x11\n"
4312         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4313         "mov x3, x4\n"
4314         "st1 {v30.4s, v31.4s}, [x3], #32\n"
4315         RUY_MAKE_ZERO(v30)
4316         RUY_MAKE_ZERO(v31)
4317 
4318         "331:\n"
4319 
4320         // For the next block: perform the first few multiply-adds on the data
4321         // that we have already loaded.
4322         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
4323         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
4324         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
4325         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
4326 
4327         // If all of the 8x8 block fits, we just finished writing it to the
4328         // destination, so we skip the next part.
4329         "beq 341f\n"
4330 
4331         // Not all of the 8x8 block fits in the destination matrix.  We just
4332         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
4333         // it to copy into the destination matrix the part that fits.
4334         "mov x3, %[dst_tmp_buf]\n"
4335         "mov x4, %[dst_ptr]\n"
4336         "mov w6, #0\n"
4337         "350:\n"
4338         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
4339         "mov w5, #0\n"
4340         "351:\n"
4341         "ldr w7, [x3, x5, lsl #2]\n"
4342         "str w7, [x4, x5, lsl #2]\n"
4343         "add w5, w5, #1\n"
4344         "cmp w5, w1\n"
4345         "blt 351b\n"
4346         "add w6, w6, #1\n"
4347         "add x3, x3, #32\n"
4348         "add x4, x4, x11\n"
4349         "cmp w6, w2\n"
4350         "blt 350b\n"
4351         "341:\n"
4352         "add %[dst_ptr], %[dst_ptr], #32\n"
4353         // At this point we have completely finished writing values to the
4354         // destination matrix for the current block.
4355 
4356         RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
4357 
4358         // Reload some params --- we had used x5 -- x7 for a few other things
4359         // since the last time we had loaded them.
4360         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
4361         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
4362         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
4363 
4364         // Move to the next block of the destination matrix, for the next iter
4365         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
4366         // been updated earlier.
4367         // Have we reached the end row?
4368         "cmp %w[row], w7\n"
4369         "beq 20f\n"  // yes, end row.
4370         // Not end row. Move to the next row.
4371         "add %w[row], %w[row], #8\n"
4372         "b 21f\n"
4373         "20:\n"
4374         // Was already at end row.
4375         "mov %w[row], w6\n"  // Move back to first row.
4376         "add %w[col], %w[col], #8\n"  // Move to the next column.
4377         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
4378         "mov %[dst_ptr], %[dst_col_ptr]\n"
4379         "21:\n"
4380 
4381         // Main loop exit condition: have we hit the end column?
4382         "cmp %w[col], w8\n"
4383 
4384         // w1 is the number of levels of depth that we have already loaded
4385         // LHS and RHS data for. Corresponding to the initial ld1 instructions
4386         // above, this is currently 4.
4387         "mov w1, #4\n"
4388 
4389         "ble 1b\n"
4390 
4391         // clang-format on
4392 
4393         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
4394           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
4395           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
4396         : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
4397           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
4398           [dst_type_id] "r"(params.dst_type_id)
4399         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
4400           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
4401           "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
4402           "v26", "v27", "v28", "v29", "v30", "v31");
4403 }
4404 
4405 // A fork of the above 8bitNeonDotprod kernel, but with the MAX_STREAMING
4406 // manual unrolling removed. Manually unrolling the inner loops benefits some
4407 // GEMM shapes on the Cortex-A76 but destroys performance on the X1 by
4408 // increasing backend stalls. Therefore, we remove the MAX_STREAMING option in
4409 // this kernel. The target CPU for this kernel is currently only the Cortex-X1.
Kernel8bitNeonDotprodX1(const KernelParams8bit<8,8> & params)4410 void Kernel8bitNeonDotprodX1(const KernelParams8bit<8, 8>& params) {
4411   profiler::ScopeLabel label("Kernel (kNeonDotprod)");
4412 
4413   CheckOffsetsInKernelParams8bit(params);
4414 
4415   const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
4416   const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
4417   const std::int8_t* lhs_ptr = lhs_col_ptr;
4418   const std::int8_t* rhs_ptr = rhs_col_ptr;
4419   void* dst_col_ptr = params.dst_base_ptr;
4420   void* dst_ptr = dst_col_ptr;
4421   int row = params.start_row;
4422   int col = params.start_col;
4423 
4424   // The asm kernel below has the following NEON register allocation:
4425   //
4426   // v16 -- v31 are int32 accumulators.
4427   // During accumulation, v0 -- v15 are used to load int8 data from LHS and
4428   // RHS. At least v0 and v1 are used to load a 8x4 block of LHS, and v2 and
4429   // v3 are used to load a 4x8 block of RHS, like this:
4430   //
4431   //                                      int8 RHS 4x8 block
4432   //                           /-----------------------------------------|
4433   //                           |v2.b[0] ... v2.b[12] v3.b[0] ... v3.b[12]|
4434   //                           |  ...                              ...   |
4435   //                           |v2.b[3] ... v2.b[15] v3.b[3] ... v3.b[15]|
4436   //                           \-----------------------------------------/
4437   //    int8 LHS 8x4 block
4438   //  /---------------------\  /-----------------------------------------|
4439   //  |v0.b[0]  ... v0.b[3] |  |v16.s[0]           ...           v30.s[0]|
4440   //  |  ...          ...   |  |  ...                              ...   |
4441   //  |v0.b[12] ... v0.b[15]|  |v16.s[3]           ...           v30.s[3]|
4442   //  |v1.b[0]  ... v1.b[3] |  |v17.s[0]           ...           v31.s[0]|
4443   //  |  ...         ...    |  |  ...                              ...   |
4444   //  |v1.b[12] ... v1.b[15]|  |v17.s[3]           ...           v31.s[3]|
4445   //  \---------------------/  \-----------------------------------------/
4446   //                                  int32 accumulators 8x8 block
4447   //
4448   // In the RUY_OPT_MAX_STREAMING part of the kernel, this elementary step
4449   // is repeated 4 times, using 4x more registers for LHS and RHS, so that
4450   // is where instead of using v0 -- v3 for LHS and RHS, we use v0 -- v15.
4451   //
4452   // Outside of the RUY_OPT_MAX_STREAMING part of the kernel, v4 -- v7 are
4453   // unused, and v8 -- v15 are used for loading parameters used for the
4454   // post-accumulation part of the kernel.
4455   asm volatile(
4456 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
4457 
4458         // clang-format off
4459 
4460         // Load some parameters into registers.
4461         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
4462         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
4463         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
4464         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
4465         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
4466         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
4467         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
4468         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
4469 
4470         // Load the first 32 bytes of LHS and RHS data.
4471         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
4472         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
4473         "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
4474         "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
4475 
4476         // Clear accumulators.
4477         RUY_MAKE_ZERO(v16)
4478         RUY_MAKE_ZERO(v17)
4479         RUY_MAKE_ZERO(v18)
4480         RUY_MAKE_ZERO(v19)
4481         RUY_MAKE_ZERO(v20)
4482         RUY_MAKE_ZERO(v21)
4483         RUY_MAKE_ZERO(v22)
4484         RUY_MAKE_ZERO(v23)
4485         RUY_MAKE_ZERO(v24)
4486         RUY_MAKE_ZERO(v25)
4487         RUY_MAKE_ZERO(v26)
4488         RUY_MAKE_ZERO(v27)
4489         RUY_MAKE_ZERO(v28)
4490         RUY_MAKE_ZERO(v29)
4491         RUY_MAKE_ZERO(v30)
4492         RUY_MAKE_ZERO(v31)
4493 
4494         // w1 is the number of levels of depth that we have already loaded
4495         // LHS and RHS data for. Corresponding to the initial ld1 instructions
4496         // above, this is currently 4.
4497         "mov w1, #4\n"
4498 
4499         // Perform the first few multiply-adds on the data that we have already
4500         // loaded.
4501         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
4502         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
4503         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
4504         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
4505 
4506         // Main loop of the whole GEMM, over rows and columns of the
4507         // destination matrix.
4508         "1:\n"
4509 
4510         // Kernel inner loop (over depth).
4511         // Reminder - w1 is how many levels of depth we have already loaded
4512         // data for, w12 is the total depth.
4513         "cmp w1, w12\n"
4514         "beq 79f\n"
4515 
4516         "2:\n"
4517 
4518         // Because of the data that we have already loaded, we can start the
4519         // loop body right away with some multiply-adds.
4520         ".word 0x4f83e018  // sdot v24.4s, v0.16b, v3.4b[0]\n"
4521         ".word 0x4fa3e01a  // sdot v26.4s, v0.16b, v3.4b[1]\n"
4522         // Each iteration of this loop advances by 4 levels of depth.
4523         "add w1, w1, #4\n"
4524         ".word 0x4f83e81c  // sdot v28.4s, v0.16b, v3.4b[2]\n"
4525         ".word 0x4fa3e81e  // sdot v30.4s, v0.16b, v3.4b[3]\n"
4526         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
4527         ".word 0x4f82e031  // sdot v17.4s, v1.16b, v2.4b[0]\n"
4528         ".word 0x4fa2e033  // sdot v19.4s, v1.16b, v2.4b[1]\n"
4529         // Loop termination condition.
4530         "cmp w1, w12\n"
4531         ".word 0x4f82e835  // sdot v21.4s, v1.16b, v2.4b[2]\n"
4532         ".word 0x4fa2e837  // sdot v23.4s, v1.16b, v2.4b[3]\n"
4533         "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
4534         ".word 0x4f83e039  // sdot v25.4s, v1.16b, v3.4b[0]\n"
4535         ".word 0x4fa3e03b  // sdot v27.4s, v1.16b, v3.4b[1]\n"
4536         ".word 0x4f83e83d  // sdot v29.4s, v1.16b, v3.4b[2]\n"
4537         ".word 0x4fa3e83f  // sdot v31.4s, v1.16b, v3.4b[3]\n"
4538         "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
4539         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
4540         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
4541         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
4542         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
4543         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
4544 
4545         "blt 2b\n"
4546 
4547         "79:\n"
4548         // End of the inner loop on depth. Now perform the remaining
4549         // multiply-adds of the last 4 levels of depth, for which the LHS
4550         // and RHS data is already loaded.
4551 
4552         ".word 0x4f83e018  // sdot v24.4s, v0.16b, v3.4b[0]\n"
4553         ".word 0x4fa3e01a  // sdot v26.4s, v0.16b, v3.4b[1]\n"
4554         ".word 0x4f83e81c  // sdot v28.4s, v0.16b, v3.4b[2]\n"
4555         ".word 0x4fa3e81e  // sdot v30.4s, v0.16b, v3.4b[3]\n"
4556         ".word 0x4f82e031  // sdot v17.4s, v1.16b, v2.4b[0]\n"
4557         ".word 0x4fa2e033  // sdot v19.4s, v1.16b, v2.4b[1]\n"
4558         ".word 0x4f82e835  // sdot v21.4s, v1.16b, v2.4b[2]\n"
4559         ".word 0x4fa2e837  // sdot v23.4s, v1.16b, v2.4b[3]\n"
4560         ".word 0x4f83e039  // sdot v25.4s, v1.16b, v3.4b[0]\n"
4561         ".word 0x4fa3e03b  // sdot v27.4s, v1.16b, v3.4b[1]\n"
4562         ".word 0x4f83e83d  // sdot v29.4s, v1.16b, v3.4b[2]\n"
4563         ".word 0x4fa3e83f  // sdot v31.4s, v1.16b, v3.4b[3]\n"
4564 
4565         // End of accumulation. The registers v16 -- v31 contain the final
4566         // int32 accumulator values of the current 8x8 destination block.
4567         // We now have to compute the final 8-bit values from these int32
4568         // accumulators, and advance to the next 8x8 block. We intertwine
4569         // these two aspects whenever possible for optimal pipelining, both
4570         // at the data flow level (prefetch data for next block as early as
4571         // possible) and instruction pipelining level (some of the next-block
4572         // work can dual-issue with some of the final work on the current
4573         // block).
4574 
4575         // Logic to advance to the next block in preparation for the next
4576         // iteration of the main loop. For now, we only want to compute
4577         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
4578         // not yet ready to update the values of row and col, as we still need
4579         // the current values for the rest of the work on the current block.
4580 
4581         "cmp %w[row], w7\n"  // Have we finished the last row?
4582         "bge 4f\n"           // If finished last row, go to 4
4583         // Not finished last row: then advance to next row.
4584         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
4585         "b 5f\n"
4586         "4:\n"  // Finished last row...
4587         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
4588         // Now we need to advance to the next column. If we already
4589         // finished the last column, then in principle we are done, however
4590         // we can't just return here, as we need to allow the end work of the
4591         // current block to complete. The good news is that at this point it
4592         // doesn't matter what data we load for the next column, since
4593         // we will exit from the main loop below before actually storing
4594         // anything computed from that data.
4595         "cmp %w[col], w8\n"  // Have we finished the last column?
4596         "bge 5f\n" // If yes, just carry on without updating the column pointer.
4597         // Not finished last column: then advance to next column.
4598         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
4599         "5:\n"
4600 
4601         // Set the LHS and RHS data pointers to the start of the columns just
4602         // computed.
4603         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
4604         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
4605 
4606         // Load some parameters needed for the end work on current block.
4607         "mvni v8.4s, #0\n"
4608         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
4609         "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
4610         "dup v9.4s, w3\n"   // create prod_zp_depth_vec
4611 
4612         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
4613         // Determine the channel index.
4614         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
4615         "csel w3, %w[row], %w[col], eq\n"
4616 
4617         // Offset the bias pointer as needed given the current row, col.
4618         "add x5, x1, x3, lsl #2\n"
4619 
4620         // If there is no bias, use no offset, just address the passed zero
4621         // data.
4622         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
4623         "csel x1, x1, x5, eq\n"
4624 
4625         // Load 8 bias values.
4626         "ld1 {v14.4s}, [x1], #16\n"
4627         "ld1 {v15.4s}, [x1]\n"
4628 
4629         // Now that we know what LHS and RHS data the next iteration of the
4630         // main loop will need to load, we start loading the first 32 bytes of
4631         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
4632         // in the rest of the work on the current block.
4633         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
4634         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
4635         "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
4636         "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
4637 
4638         // Add to the bias values the product (depth * lhs_zero_point * rhs_zero_point),
4639         // See the term NZ1Z2 in equation (7) in https://arxiv.org/pdf/1712.05877.pdf
4640         "add v14.4s, v14.4s, v9.4s\n"
4641         "add v15.4s, v15.4s, v9.4s\n"
4642 
4643         // Perform the bias-addition (per the above, we have just folded into
4644         // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
4645         // Jump based on channel dimension.
4646         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
4647         "bne 6f\n"
4648         // Case where channels are rows
4649         "add v16.4s, v16.4s, v14.4s\n"
4650         "add v17.4s, v17.4s, v15.4s\n"
4651         "add v18.4s, v18.4s, v14.4s\n"
4652         "add v19.4s, v19.4s, v15.4s\n"
4653         "add v20.4s, v20.4s, v14.4s\n"
4654         "add v21.4s, v21.4s, v15.4s\n"
4655         "add v22.4s, v22.4s, v14.4s\n"
4656         "add v23.4s, v23.4s, v15.4s\n"
4657         "add v24.4s, v24.4s, v14.4s\n"
4658         "add v25.4s, v25.4s, v15.4s\n"
4659         "add v26.4s, v26.4s, v14.4s\n"
4660         "add v27.4s, v27.4s, v15.4s\n"
4661         "add v28.4s, v28.4s, v14.4s\n"
4662         "add v29.4s, v29.4s, v15.4s\n"
4663         "add v30.4s, v30.4s, v14.4s\n"
4664         "add v31.4s, v31.4s, v15.4s\n"
4665         "b 7f\n"
4666 
4667         "6:\n"
4668         // Case where channels are columns
4669         "dup v10.4s, v14.s[0]\n"
4670         "dup v11.4s, v14.s[1]\n"
4671         "dup v12.4s, v14.s[2]\n"
4672         "dup v13.4s, v14.s[3]\n"
4673         "add v16.4s, v16.4s, v10.4s\n"
4674         "add v17.4s, v17.4s, v10.4s\n"
4675         "add v18.4s, v18.4s, v11.4s\n"
4676         "add v19.4s, v19.4s, v11.4s\n"
4677         "add v20.4s, v20.4s, v12.4s\n"
4678         "add v21.4s, v21.4s, v12.4s\n"
4679         "add v22.4s, v22.4s, v13.4s\n"
4680         "add v23.4s, v23.4s, v13.4s\n"
4681         "dup v10.4s, v15.s[0]\n"
4682         "dup v11.4s, v15.s[1]\n"
4683         "dup v12.4s, v15.s[2]\n"
4684         "dup v13.4s, v15.s[3]\n"
4685         "add v24.4s, v24.4s, v10.4s\n"
4686         "add v25.4s, v25.4s, v10.4s\n"
4687         "add v26.4s, v26.4s, v11.4s\n"
4688         "add v27.4s, v27.4s, v11.4s\n"
4689         "add v28.4s, v28.4s, v12.4s\n"
4690         "add v29.4s, v29.4s, v12.4s\n"
4691         "add v30.4s, v30.4s, v13.4s\n"
4692         "add v31.4s, v31.4s, v13.4s\n"
4693         "7:\n"
4694 
4695         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
4696         "beq 401f\n"
4697         "ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
4698         "add x3, x3, %x[col], lsl #2\n"
4699         "ld1 {v14.4s}, [x3], #16\n"
4700         "ld1 {v15.4s}, [x3]\n"
4701         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
4702         "dup v10.4s, w5\n"  // create lhs_zero_point_vec
4703         // Subtract rhs_sums * lhs_zero_point, per
4704         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
4705         "mls v16.4s, v10.4s, v14.s[0]\n"
4706         "mls v17.4s, v10.4s, v14.s[0]\n"
4707         "mls v18.4s, v10.4s, v14.s[1]\n"
4708         "mls v19.4s, v10.4s, v14.s[1]\n"
4709         "mls v20.4s, v10.4s, v14.s[2]\n"
4710         "mls v21.4s, v10.4s, v14.s[2]\n"
4711         "mls v22.4s, v10.4s, v14.s[3]\n"
4712         "mls v23.4s, v10.4s, v14.s[3]\n"
4713         "mls v24.4s, v10.4s, v15.s[0]\n"
4714         "mls v25.4s, v10.4s, v15.s[0]\n"
4715         "mls v26.4s, v10.4s, v15.s[1]\n"
4716         "mls v27.4s, v10.4s, v15.s[1]\n"
4717         "mls v28.4s, v10.4s, v15.s[2]\n"
4718         "mls v29.4s, v10.4s, v15.s[2]\n"
4719         "mls v30.4s, v10.4s, v15.s[3]\n"
4720         "mls v31.4s, v10.4s, v15.s[3]\n"
4721         "401:\n"
4722 
4723         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
4724         "beq 402f\n"
4725         "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
4726         "add x2, x2, %x[row], lsl #2\n"
4727         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
4728         // Load 4 lhs_sums values.
4729         "ld1 {v11.4s}, [x2], #16\n"
4730         "ld1 {v12.4s}, [x2]\n"
4731         "ins v13.s[1], w5\n" // rhs_zero_point
4732         // Compute lhs_sums * rhs_zero_point.
4733         "mul v11.4s, v11.4s, v13.s[1]\n"
4734         "mul v12.4s, v12.4s, v13.s[1]\n"
4735         // Subtract lhs_sums * rhs_zero_point, per
4736         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
4737         "sub v16.4s, v16.4s, v11.4s\n"
4738         "sub v17.4s, v17.4s, v12.4s\n"
4739         "sub v18.4s, v18.4s, v11.4s\n"
4740         "sub v19.4s, v19.4s, v12.4s\n"
4741         "sub v20.4s, v20.4s, v11.4s\n"
4742         "sub v21.4s, v21.4s, v12.4s\n"
4743         "sub v22.4s, v22.4s, v11.4s\n"
4744         "sub v23.4s, v23.4s, v12.4s\n"
4745         "sub v24.4s, v24.4s, v11.4s\n"
4746         "sub v25.4s, v25.4s, v12.4s\n"
4747         "sub v26.4s, v26.4s, v11.4s\n"
4748         "sub v27.4s, v27.4s, v12.4s\n"
4749         "sub v28.4s, v28.4s, v11.4s\n"
4750         "sub v29.4s, v29.4s, v12.4s\n"
4751         "sub v30.4s, v30.4s, v11.4s\n"
4752         "sub v31.4s, v31.4s, v12.4s\n"
4753 
4754         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
4755         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
4756 
4757         "402:\n"
4758 
4759         // At this point we have computed the final int32 values. Now we
4760         // start down-quantizing them to obtain the final 8bit values from them.
4761 
4762         // As part of this down-quantization, our int32 values will be
4763         // multiplied by a multiplier that has a fixed-point component and an
4764         // exponent component.
4765 
4766         //Load the exponent part of the multiplier.
4767         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
4768         // Determine the channel index.
4769         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
4770         "csel w3, %w[row], %w[col], eq\n"
4771         // Compute the multiplier_exponent pointer
4772         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
4773         "add x5, x1, x3, lsl #2\n"
4774         "csel x1, x1, x5, eq\n"
4775         // Load multiplier_exponent
4776         "ldr q9, [x1]\n"
4777         "ldr q10, [x1, #16]\n"
4778         // Separate positive and negative exponents
4779         "smin v11.4s, v8.4s, v9.4s\n"
4780         "smin v12.4s, v8.4s, v10.4s\n"
4781         "sub v9.4s, v9.4s, v11.4s\n"
4782         "sub v10.4s, v10.4s, v12.4s\n"
4783 
4784         // Compute the multiplier_fixedpoint pointer
4785         "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
4786         "add x5, x4, x3, lsl #2\n"
4787         "csel x4, x4, x5, eq\n"
4788         // Load multiplier_fixedpoint
4789         "ldr q14, [x4]\n"
4790         "ldr q15, [x4, #16]\n"
4791 
4792         // Jump based on channel dimension.
4793         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
4794         "bne 8f\n"
4795         // Case where channels are rows
4796 
4797         // Apply the positive exponent part of the multiplier.
4798         "sshl v16.4s, v16.4s, v9.4s\n"
4799         "sshl v17.4s, v17.4s, v10.4s\n"
4800         "sshl v18.4s, v18.4s, v9.4s\n"
4801         "sshl v19.4s, v19.4s, v10.4s\n"
4802         "sshl v20.4s, v20.4s, v9.4s\n"
4803         "sshl v21.4s, v21.4s, v10.4s\n"
4804         "sshl v22.4s, v22.4s, v9.4s\n"
4805         "sshl v23.4s, v23.4s, v10.4s\n"
4806         "sshl v24.4s, v24.4s, v9.4s\n"
4807         "sshl v25.4s, v25.4s, v10.4s\n"
4808         "sshl v26.4s, v26.4s, v9.4s\n"
4809         "sshl v27.4s, v27.4s, v10.4s\n"
4810         "sshl v28.4s, v28.4s, v9.4s\n"
4811         "sshl v29.4s, v29.4s, v10.4s\n"
4812         "sshl v30.4s, v30.4s, v9.4s\n"
4813         "sshl v31.4s, v31.4s, v10.4s\n"
4814         "10:\n"
4815 
4816         // Apply the fixed-point part of the multiplier.
4817         "sqdmulh v16.4s, v16.4s, v14.4s\n"
4818         "sqdmulh v17.4s, v17.4s, v15.4s\n"
4819         "sqdmulh v18.4s, v18.4s, v14.4s\n"
4820         "sqdmulh v19.4s, v19.4s, v15.4s\n"
4821         "sqdmulh v20.4s, v20.4s, v14.4s\n"
4822         "sqdmulh v21.4s, v21.4s, v15.4s\n"
4823         "sqdmulh v22.4s, v22.4s, v14.4s\n"
4824         "sqdmulh v23.4s, v23.4s, v15.4s\n"
4825         "sqdmulh v24.4s, v24.4s, v14.4s\n"
4826         "sqdmulh v25.4s, v25.4s, v15.4s\n"
4827         "sqdmulh v26.4s, v26.4s, v14.4s\n"
4828         "sqdmulh v27.4s, v27.4s, v15.4s\n"
4829         "sqdmulh v28.4s, v28.4s, v14.4s\n"
4830         "sqdmulh v29.4s, v29.4s, v15.4s\n"
4831         "sqdmulh v30.4s, v30.4s, v14.4s\n"
4832         "sqdmulh v31.4s, v31.4s, v15.4s\n"
4833 
4834         // Apply the negative exponent part of the multiplier.
4835         "srshl v16.4s, v16.4s, v11.4s\n"
4836         "srshl v17.4s, v17.4s, v12.4s\n"
4837         "srshl v18.4s, v18.4s, v11.4s\n"
4838         "srshl v19.4s, v19.4s, v12.4s\n"
4839         "srshl v20.4s, v20.4s, v11.4s\n"
4840         "srshl v21.4s, v21.4s, v12.4s\n"
4841         "srshl v22.4s, v22.4s, v11.4s\n"
4842         "srshl v23.4s, v23.4s, v12.4s\n"
4843         "srshl v24.4s, v24.4s, v11.4s\n"
4844         "srshl v25.4s, v25.4s, v12.4s\n"
4845         "srshl v26.4s, v26.4s, v11.4s\n"
4846         "srshl v27.4s, v27.4s, v12.4s\n"
4847         "srshl v28.4s, v28.4s, v11.4s\n"
4848         "srshl v29.4s, v29.4s, v12.4s\n"
4849         "srshl v30.4s, v30.4s, v11.4s\n"
4850         "srshl v31.4s, v31.4s, v12.4s\n"
4851         "b 9f\n"
4852 
4853         "8:\n"
4854         // Case where channels are columns
4855 
4856         // Apply the positive exponent part of the multiplier.
4857         "dup v4.4s, v9.s[0]\n"
4858         "dup v5.4s, v9.s[1]\n"
4859         "dup v6.4s, v9.s[2]\n"
4860         "dup v7.4s, v9.s[3]\n"
4861         "sshl v16.4s, v16.4s, v4.4s\n"
4862         "sshl v17.4s, v17.4s, v4.4s\n"
4863         "sshl v18.4s, v18.4s, v5.4s\n"
4864         "sshl v19.4s, v19.4s, v5.4s\n"
4865         "sshl v20.4s, v20.4s, v6.4s\n"
4866         "sshl v21.4s, v21.4s, v6.4s\n"
4867         "sshl v22.4s, v22.4s, v7.4s\n"
4868         "sshl v23.4s, v23.4s, v7.4s\n"
4869         "dup v4.4s, v10.s[0]\n"
4870         "dup v5.4s, v10.s[1]\n"
4871         "dup v6.4s, v10.s[2]\n"
4872         "dup v7.4s, v10.s[3]\n"
4873         "sshl v24.4s, v24.4s, v4.4s\n"
4874         "sshl v25.4s, v25.4s, v4.4s\n"
4875         "sshl v26.4s, v26.4s, v5.4s\n"
4876         "sshl v27.4s, v27.4s, v5.4s\n"
4877         "sshl v28.4s, v28.4s, v6.4s\n"
4878         "sshl v29.4s, v29.4s, v6.4s\n"
4879         "sshl v30.4s, v30.4s, v7.4s\n"
4880         "sshl v31.4s, v31.4s, v7.4s\n"
4881         "11:\n"
4882 
4883         // Apply the fixed-point part of the multiplier.
4884         "sqdmulh v16.4s, v16.4s, v14.s[0]\n"
4885         "sqdmulh v17.4s, v17.4s, v14.s[0]\n"
4886         "sqdmulh v18.4s, v18.4s, v14.s[1]\n"
4887         "sqdmulh v19.4s, v19.4s, v14.s[1]\n"
4888         "sqdmulh v20.4s, v20.4s, v14.s[2]\n"
4889         "sqdmulh v21.4s, v21.4s, v14.s[2]\n"
4890         "sqdmulh v22.4s, v22.4s, v14.s[3]\n"
4891         "sqdmulh v23.4s, v23.4s, v14.s[3]\n"
4892         "sqdmulh v24.4s, v24.4s, v15.s[0]\n"
4893         "sqdmulh v25.4s, v25.4s, v15.s[0]\n"
4894         "sqdmulh v26.4s, v26.4s, v15.s[1]\n"
4895         "sqdmulh v27.4s, v27.4s, v15.s[1]\n"
4896         "sqdmulh v28.4s, v28.4s, v15.s[2]\n"
4897         "sqdmulh v29.4s, v29.4s, v15.s[2]\n"
4898         "sqdmulh v30.4s, v30.4s, v15.s[3]\n"
4899         "sqdmulh v31.4s, v31.4s, v15.s[3]\n"
4900 
4901         // Apply the negative exponent part of the multiplier.
4902         "dup v4.4s, v11.s[0]\n"
4903         "dup v5.4s, v11.s[1]\n"
4904         "dup v6.4s, v11.s[2]\n"
4905         "dup v7.4s, v11.s[3]\n"
4906         "srshl v16.4s, v16.4s, v4.4s\n"
4907         "srshl v17.4s, v17.4s, v4.4s\n"
4908         "srshl v18.4s, v18.4s, v5.4s\n"
4909         "srshl v19.4s, v19.4s, v5.4s\n"
4910         "srshl v20.4s, v20.4s, v6.4s\n"
4911         "srshl v21.4s, v21.4s, v6.4s\n"
4912         "srshl v22.4s, v22.4s, v7.4s\n"
4913         "srshl v23.4s, v23.4s, v7.4s\n"
4914         "dup v4.4s, v12.s[0]\n"
4915         "dup v5.4s, v12.s[1]\n"
4916         "dup v6.4s, v12.s[2]\n"
4917         "dup v7.4s, v12.s[3]\n"
4918         "srshl v24.4s, v24.4s, v4.4s\n"
4919         "srshl v25.4s, v25.4s, v4.4s\n"
4920         "srshl v26.4s, v26.4s, v5.4s\n"
4921         "srshl v27.4s, v27.4s, v5.4s\n"
4922         "srshl v28.4s, v28.4s, v6.4s\n"
4923         "srshl v29.4s, v29.4s, v6.4s\n"
4924         "srshl v30.4s, v30.4s, v7.4s\n"
4925         "srshl v31.4s, v31.4s, v7.4s\n"
4926         "9:\n"
4927 
4928         "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
4929         "ins v13.h[4], w4\n" // dst_zero_point
4930 
4931         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
4932         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
4933         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
4934         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
4935 
4936         RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
4937 
4938         // Cast-and-saturate from int32 to int16
4939         "sqxtn v16.4h, v16.4s\n"
4940         "sqxtn2 v16.8h, v17.4s\n"
4941         "sqxtn v17.4h, v18.4s\n"
4942         "sqxtn2 v17.8h, v19.4s\n"
4943         "sqxtn v18.4h, v20.4s\n"
4944         "sqxtn2 v18.8h, v21.4s\n"
4945         "sqxtn v19.4h, v22.4s\n"
4946         "sqxtn2 v19.8h, v23.4s\n"
4947         "sqxtn v20.4h, v24.4s\n"
4948         "sqxtn2 v20.8h, v25.4s\n"
4949         "sqxtn v21.4h, v26.4s\n"
4950         "sqxtn2 v21.8h, v27.4s\n"
4951         "sqxtn v22.4h, v28.4s\n"
4952         "sqxtn2 v22.8h, v29.4s\n"
4953         "sqxtn v23.4h, v30.4s\n"
4954         "sqxtn2 v23.8h, v31.4s\n"
4955 
4956         // At this point, v24 -- v31 aren't used anymore for the current block,
4957         // so we can start clearing these accumulators for the next block
4958         // (next iteration of the main loop).
4959         RUY_MAKE_ZERO(v24)
4960         RUY_MAKE_ZERO(v25)
4961         RUY_MAKE_ZERO(v26)
4962         RUY_MAKE_ZERO(v27)
4963         RUY_MAKE_ZERO(v28)
4964         RUY_MAKE_ZERO(v29)
4965         RUY_MAKE_ZERO(v30)
4966         RUY_MAKE_ZERO(v31)
4967 
4968         // Add the destination zero point
4969         "dup v14.8h, v13.h[4]\n"
4970         "sqadd v16.8h, v16.8h, v14.8h\n"
4971         "sqadd v17.8h, v17.8h, v14.8h\n"
4972         "sqadd v18.8h, v18.8h, v14.8h\n"
4973         "sqadd v19.8h, v19.8h, v14.8h\n"
4974         "sqadd v20.8h, v20.8h, v14.8h\n"
4975         "sqadd v21.8h, v21.8h, v14.8h\n"
4976         "sqadd v22.8h, v22.8h, v14.8h\n"
4977         "sqadd v23.8h, v23.8h, v14.8h\n"
4978 
4979         // Cast-and-saturate from int16 to uint8
4980         "sqxtun v16.8b, v16.8h\n"
4981         "sqxtun2 v16.16b, v17.8h\n"
4982         "sqxtun v17.8b, v18.8h\n"
4983         "sqxtun2 v17.16b, v19.8h\n"
4984         "sqxtun v18.8b, v20.8h\n"
4985         "sqxtun2 v18.16b, v21.8h\n"
4986         "sqxtun v19.8b, v22.8h\n"
4987         "sqxtun2 v19.16b, v23.8h\n"
4988 
4989         // Load the clamp_min, clamp_max bounds
4990         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
4991         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
4992         "dup v14.16b, w2\n"  // clamp_min
4993         "dup v15.16b, w3\n"  // clamp_max
4994 
4995         // Apply the clamp_min bound
4996         "umax v16.16b, v16.16b, v14.16b\n"
4997         "umax v17.16b, v17.16b, v14.16b\n"
4998         "umax v18.16b, v18.16b, v14.16b\n"
4999         "umax v19.16b, v19.16b, v14.16b\n"
5000 
5001         // Apply the clamp_max bound
5002         "umin v16.16b, v16.16b, v15.16b\n"
5003         "umin v17.16b, v17.16b, v15.16b\n"
5004         "umin v18.16b, v18.16b, v15.16b\n"
5005         "umin v19.16b, v19.16b, v15.16b\n"
5006 
5007         // Make it so that all of the final 8bit values are stored in the
5008         // first 64bits of 128bit NEON registers, so they can be stored
5009         // by 64bit st1 store instructions with byte alignment.
5010         "dup d20, v16.d[1]\n"
5011         "dup d21, v17.d[1]\n"
5012         "dup d22, v18.d[1]\n"
5013         "dup d23, v19.d[1]\n"
5014 
5015         // Compute how much of the 8x8 block of destination 8bit values that
5016         // we have computed, fit in the destination matrix. Typically, all of
5017         // it fits, but when the destination matrix shape is not a multiple
5018         // of 8x8, there are some 8x8 blocks along the boundaries that do
5019         // not fit entirely.
5020         "sub w1, %w[dst_rows], %w[row]\n"
5021         "sub w2, %w[dst_cols], %w[col]\n"
5022         "mov w3, #8\n"
5023         "cmp w1, #8\n"
5024         // Compute w1 = how many rows of the 8x8 block fit
5025         "csel w1, w1, w3, le\n"
5026         "cmp w2, #8\n"
5027         // Compute w2 = how many cols of the 8x8 block fit
5028         "csel w2, w2, w3, le\n"
5029 
5030         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
5031         "cmp w1, w3\n"
5032         "ccmp w2, w3, 0, eq\n"
5033         // Yes, all of the 8x8 block fits, go to fast path.
5034         "beq 30f\n"
5035         // Not all of the 8x8 block fits.
5036         // Set (x3 address, x4 stride) to write to dst_tmp_buf
5037         "mov x3, %[dst_tmp_buf]\n"
5038         "mov x4, #8\n"
5039         "b 31f\n"
5040         "30:\n"
5041         // Yes, all of the 8x8 block fits.
5042         // Set (x3 address, x4 stride) to write directly to destination matrix.
5043         "mov x3, %[dst_ptr]\n"
5044         "mov x4, x11\n"
5045         "31:\n"
5046 
5047         // Write our 8bit values to the destination described by
5048         // (x3 address, x4 stride).
5049         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5050         "st1 {v16.8b}, [x3], x4\n"
5051         RUY_MAKE_ZERO(v16)
5052         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5053         "st1 {v20.8b}, [x3], x4\n"
5054         RUY_MAKE_ZERO(v20)
5055         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5056         "st1 {v17.8b}, [x3], x4\n"
5057         RUY_MAKE_ZERO(v17)
5058         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5059         "st1 {v21.8b}, [x3], x4\n"
5060         RUY_MAKE_ZERO(v21)
5061         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5062         "st1 {v18.8b}, [x3], x4\n"
5063         RUY_MAKE_ZERO(v18)
5064         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5065         "st1 {v22.8b}, [x3], x4\n"
5066         RUY_MAKE_ZERO(v22)
5067         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5068         "st1 {v19.8b}, [x3], x4\n"
5069         RUY_MAKE_ZERO(v19)
5070         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5071         "st1 {v23.8b}, [x3], x4\n"
5072         RUY_MAKE_ZERO(v23)
5073 
5074         // For the next block: perform the first few multiply-adds on the data
5075         // that we have already loaded.
5076         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
5077         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
5078         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
5079         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
5080 
5081         // If all of the 8x8 block fits, we just finished writing it to the
5082         // destination, so we skip the next part.
5083         "beq 41f\n"
5084         // Not all of the 8x8 block fits in the destination matrix.  We just
5085         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
5086         // it to copy into the destination matrix the part that fits.
5087         "mov x3, %[dst_tmp_buf]\n"
5088         "mov x4, %[dst_ptr]\n"
5089         "mov w6, #0\n"
5090         "50:\n"
5091         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5092         "mov w5, #0\n"
5093         "51:\n"
5094         "ldrb w7, [x3, w5, uxtw]\n"
5095         "strb w7, [x4, w5, uxtw]\n"
5096         "add w5, w5, #1\n"
5097         "cmp w5, w1\n"
5098         "blt 51b\n"
5099         "add w6, w6, #1\n"
5100         "add x3, x3, #8\n"
5101         "add x4, x4, x11\n"
5102         "cmp w6, w2\n"
5103         "blt 50b\n"
5104         "41:\n"
5105         "add %[dst_ptr], %[dst_ptr], #8\n"
5106         // At this point we have completely finished writing values to the
5107         // destination matrix for the current block.
5108 
5109         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
5110 
5111         RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
5112 
5113         // Cast-and-saturate from int32 to int16
5114         "sqxtn v16.4h, v16.4s\n"
5115         "sqxtn2 v16.8h, v17.4s\n"
5116         "sqxtn v17.4h, v18.4s\n"
5117         "sqxtn2 v17.8h, v19.4s\n"
5118         "sqxtn v18.4h, v20.4s\n"
5119         "sqxtn2 v18.8h, v21.4s\n"
5120         "sqxtn v19.4h, v22.4s\n"
5121         "sqxtn2 v19.8h, v23.4s\n"
5122         "sqxtn v20.4h, v24.4s\n"
5123         "sqxtn2 v20.8h, v25.4s\n"
5124         "sqxtn v21.4h, v26.4s\n"
5125         "sqxtn2 v21.8h, v27.4s\n"
5126         "sqxtn v22.4h, v28.4s\n"
5127         "sqxtn2 v22.8h, v29.4s\n"
5128         "sqxtn v23.4h, v30.4s\n"
5129         "sqxtn2 v23.8h, v31.4s\n"
5130 
5131         // At this point, v24 -- v31 aren't used anymore for the current block,
5132         // so we can start clearing these accumulators for the next block
5133         // (next iteration of the main loop).
5134         RUY_MAKE_ZERO(v24)
5135         RUY_MAKE_ZERO(v25)
5136         RUY_MAKE_ZERO(v26)
5137         RUY_MAKE_ZERO(v27)
5138         RUY_MAKE_ZERO(v28)
5139         RUY_MAKE_ZERO(v29)
5140         RUY_MAKE_ZERO(v30)
5141         RUY_MAKE_ZERO(v31)
5142 
5143         // Add the destination zero point
5144         "dup v14.8h, v13.h[4]\n"
5145         "sqadd v16.8h, v16.8h, v14.8h\n"
5146         "sqadd v17.8h, v17.8h, v14.8h\n"
5147         "sqadd v18.8h, v18.8h, v14.8h\n"
5148         "sqadd v19.8h, v19.8h, v14.8h\n"
5149         "sqadd v20.8h, v20.8h, v14.8h\n"
5150         "sqadd v21.8h, v21.8h, v14.8h\n"
5151         "sqadd v22.8h, v22.8h, v14.8h\n"
5152         "sqadd v23.8h, v23.8h, v14.8h\n"
5153 
5154         // Cast-and-saturate from int16 to uint8
5155         "sqxtn v16.8b, v16.8h\n"
5156         "sqxtn2 v16.16b, v17.8h\n"
5157         "sqxtn v17.8b, v18.8h\n"
5158         "sqxtn2 v17.16b, v19.8h\n"
5159         "sqxtn v18.8b, v20.8h\n"
5160         "sqxtn2 v18.16b, v21.8h\n"
5161         "sqxtn v19.8b, v22.8h\n"
5162         "sqxtn2 v19.16b, v23.8h\n"
5163 
5164         // Load the clamp_min, clamp_max bounds
5165         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
5166         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
5167         "dup v14.16b, w2\n"  // clamp_min
5168         "dup v15.16b, w3\n"  // clamp_max
5169 
5170         // Apply the clamp_min bound
5171         "smax v16.16b, v16.16b, v14.16b\n"
5172         "smax v17.16b, v17.16b, v14.16b\n"
5173         "smax v18.16b, v18.16b, v14.16b\n"
5174         "smax v19.16b, v19.16b, v14.16b\n"
5175 
5176         // Apply the clamp_max bound
5177         "smin v16.16b, v16.16b, v15.16b\n"
5178         "smin v17.16b, v17.16b, v15.16b\n"
5179         "smin v18.16b, v18.16b, v15.16b\n"
5180         "smin v19.16b, v19.16b, v15.16b\n"
5181 
5182         // Make it so that all of the final 8bit values are stored in the
5183         // first 64bits of 128bit NEON registers, so they can be stored
5184         // by 64bit st1 store instructions with byte alignment.
5185         "dup d20, v16.d[1]\n"
5186         "dup d21, v17.d[1]\n"
5187         "dup d22, v18.d[1]\n"
5188         "dup d23, v19.d[1]\n"
5189 
5190         // Compute how much of the 8x8 block of destination 8bit values that
5191         // we have computed, fit in the destination matrix. Typically, all of
5192         // it fits, but when the destination matrix shape is not a multiple
5193         // of 8x8, there are some 8x8 blocks along the boundaries that do
5194         // not fit entirely.
5195         "sub w1, %w[dst_rows], %w[row]\n"
5196         "sub w2, %w[dst_cols], %w[col]\n"
5197         "mov w3, #8\n"
5198         "cmp w1, #8\n"
5199         // Compute w1 = how many rows of the 8x8 block fit
5200         "csel w1, w1, w3, le\n"
5201         "cmp w2, #8\n"
5202         // Compute w2 = how many cols of the 8x8 block fit
5203         "csel w2, w2, w3, le\n"
5204 
5205         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
5206         "cmp w1, w3\n"
5207         "ccmp w2, w3, 0, eq\n"
5208         // Yes, all of the 8x8 block fits, go to fast path.
5209         "beq 130f\n"
5210         // Not all of the 8x8 block fits.
5211         // Set (x3 address, x4 stride) to write to dst_tmp_buf
5212         "mov x3, %[dst_tmp_buf]\n"
5213         "mov x4, #8\n"
5214         "b 131f\n"
5215         "130:\n"
5216         // Yes, all of the 8x8 block fits.
5217         // Set (x3 address, x4 stride) to write directly to destination matrix.
5218         "mov x3, %[dst_ptr]\n"
5219         "mov x4, x11\n"
5220         "131:\n"
5221 
5222         // Write our 8bit values to the destination described by
5223         // (x3 address, x4 stride).
5224         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5225         "st1 {v16.8b}, [x3], x4\n"
5226         RUY_MAKE_ZERO(v16)
5227         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5228         "st1 {v20.8b}, [x3], x4\n"
5229         RUY_MAKE_ZERO(v20)
5230         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5231         "st1 {v17.8b}, [x3], x4\n"
5232         RUY_MAKE_ZERO(v17)
5233         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5234         "st1 {v21.8b}, [x3], x4\n"
5235         RUY_MAKE_ZERO(v21)
5236         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5237         "st1 {v18.8b}, [x3], x4\n"
5238         RUY_MAKE_ZERO(v18)
5239         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5240         "st1 {v22.8b}, [x3], x4\n"
5241         RUY_MAKE_ZERO(v22)
5242         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5243         "st1 {v19.8b}, [x3], x4\n"
5244         RUY_MAKE_ZERO(v19)
5245         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5246         "st1 {v23.8b}, [x3], x4\n"
5247         RUY_MAKE_ZERO(v23)
5248 
5249         // For the next block: perform the first few multiply-adds on the data
5250         // that we have already loaded.
5251         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
5252         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
5253         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
5254         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
5255 
5256         // If all of the 8x8 block fits, we just finished writing it to the
5257         // destination, so we skip the next part.
5258         "beq 141f\n"
5259         // Not all of the 8x8 block fits in the destination matrix.  We just
5260         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
5261         // it to copy into the destination matrix the part that fits.
5262         "mov x3, %[dst_tmp_buf]\n"
5263         "mov x4, %[dst_ptr]\n"
5264         "mov w6, #0\n"
5265         "150:\n"
5266         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5267         "mov w5, #0\n"
5268         "151:\n"
5269         "ldrb w7, [x3, w5, uxtw]\n"
5270         "strb w7, [x4, w5, uxtw]\n"
5271         "add w5, w5, #1\n"
5272         "cmp w5, w1\n"
5273         "blt 151b\n"
5274         "add w6, w6, #1\n"
5275         "add x3, x3, #8\n"
5276         "add x4, x4, x11\n"
5277         "cmp w6, w2\n"
5278         "blt 150b\n"
5279         "141:\n"
5280         "add %[dst_ptr], %[dst_ptr], #8\n"
5281         // At this point we have completely finished writing values to the
5282         // destination matrix for the current block.
5283 
5284         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
5285 
5286         RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
5287 
5288         // Add the destination zero point
5289         "dup v14.8h, v13.h[4]\n"
5290         "saddw v16.4s, v16.4s, v14.4h\n"
5291         "saddw v17.4s, v17.4s, v14.4h\n"
5292         "saddw v18.4s, v18.4s, v14.4h\n"
5293         "saddw v19.4s, v19.4s, v14.4h\n"
5294         "saddw v20.4s, v20.4s, v14.4h\n"
5295         "saddw v21.4s, v21.4s, v14.4h\n"
5296         "saddw v22.4s, v22.4s, v14.4h\n"
5297         "saddw v23.4s, v23.4s, v14.4h\n"
5298         "saddw v24.4s, v24.4s, v14.4h\n"
5299         "saddw v25.4s, v25.4s, v14.4h\n"
5300         "saddw v26.4s, v26.4s, v14.4h\n"
5301         "saddw v27.4s, v27.4s, v14.4h\n"
5302         "saddw v28.4s, v28.4s, v14.4h\n"
5303         "saddw v29.4s, v29.4s, v14.4h\n"
5304         "saddw v30.4s, v30.4s, v14.4h\n"
5305         "saddw v31.4s, v31.4s, v14.4h\n"
5306 
5307         // Cast-and-saturate from int32 to int16
5308         "sqxtn v16.4h, v16.4s\n"
5309         "sqxtn2 v16.8h, v17.4s\n"
5310         "sqxtn v17.4h, v18.4s\n"
5311         "sqxtn2 v17.8h, v19.4s\n"
5312         "sqxtn v18.4h, v20.4s\n"
5313         "sqxtn2 v18.8h, v21.4s\n"
5314         "sqxtn v19.4h, v22.4s\n"
5315         "sqxtn2 v19.8h, v23.4s\n"
5316         "sqxtn v20.4h, v24.4s\n"
5317         "sqxtn2 v20.8h, v25.4s\n"
5318         "sqxtn v21.4h, v26.4s\n"
5319         "sqxtn2 v21.8h, v27.4s\n"
5320         "sqxtn v22.4h, v28.4s\n"
5321         "sqxtn2 v22.8h, v29.4s\n"
5322         "sqxtn v23.4h, v30.4s\n"
5323         "sqxtn2 v23.8h, v31.4s\n"
5324 
5325         // At this point, v24 -- v31 aren't used anymore for the current block,
5326         // so we can start clearing these accumulators for the next block
5327         // (next iteration of the main loop).
5328         RUY_MAKE_ZERO(v24)
5329         RUY_MAKE_ZERO(v25)
5330         RUY_MAKE_ZERO(v26)
5331         RUY_MAKE_ZERO(v27)
5332         RUY_MAKE_ZERO(v28)
5333         RUY_MAKE_ZERO(v29)
5334         RUY_MAKE_ZERO(v30)
5335         RUY_MAKE_ZERO(v31)
5336 
5337         // Load the clamp_min, clamp_max bounds
5338         "ldrsh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
5339         "ldrsh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
5340         "dup v14.8h, w2\n"  // clamp_min
5341         "dup v15.8h, w3\n"  // clamp_max
5342 
5343         // Apply the clamp_min bound
5344         "smax v16.8h, v16.8h, v14.8h\n"
5345         "smax v17.8h, v17.8h, v14.8h\n"
5346         "smax v18.8h, v18.8h, v14.8h\n"
5347         "smax v19.8h, v19.8h, v14.8h\n"
5348         "smax v20.8h, v20.8h, v14.8h\n"
5349         "smax v21.8h, v21.8h, v14.8h\n"
5350         "smax v22.8h, v22.8h, v14.8h\n"
5351         "smax v23.8h, v23.8h, v14.8h\n"
5352         // Apply the clamp_max bound
5353         "smin v16.8h, v16.8h, v15.8h\n"
5354         "smin v17.8h, v17.8h, v15.8h\n"
5355         "smin v18.8h, v18.8h, v15.8h\n"
5356         "smin v19.8h, v19.8h, v15.8h\n"
5357         "smin v20.8h, v20.8h, v15.8h\n"
5358         "smin v21.8h, v21.8h, v15.8h\n"
5359         "smin v22.8h, v22.8h, v15.8h\n"
5360         "smin v23.8h, v23.8h, v15.8h\n"
5361 
5362         // Compute how much of the 8x8 block of destination 16bit values that
5363         // we have computed, fit in the destination matrix. Typically, all of
5364         // it fits, but when the destination matrix shape is not a multiple
5365         // of 8x8, there are some 8x8 blocks along the boundaries that do
5366         // not fit entirely.
5367         "sub w1, %w[dst_rows], %w[row]\n"
5368         "sub w2, %w[dst_cols], %w[col]\n"
5369         "mov w3, #8\n"
5370         "cmp w1, #8\n"
5371         // Compute w1 = how many rows of the 8x8 block fit
5372         "csel w1, w1, w3, le\n"
5373         "cmp w2, #8\n"
5374         // Compute w2 = how many cols of the 8x8 block fit
5375         "csel w2, w2, w3, le\n"
5376 
5377         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
5378         "cmp w1, w3\n"
5379         "ccmp w2, w3, 0, eq\n"
5380         // Yes, all of the 8x8 block fits, go to fast path.
5381         "beq 230f\n"
5382         // Not all of the 8x8 block fits.
5383         // Set (x3 address, x4 stride) to write to dst_tmp_buf
5384         "mov x3, %[dst_tmp_buf]\n"
5385         "mov x4, #16\n"
5386         "b 231f\n"
5387         "230:\n"
5388         // Yes, all of the 8x8 block fits.
5389         // Set (x3 address, x4 stride) to write directly to destination matrix.
5390         "mov x3, %[dst_ptr]\n"
5391         "mov x4, x11\n"
5392         "231:\n"
5393 
5394         // Write our 16bit values to the destination described by
5395         // (x3 address, x4 stride).
5396         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5397         "st1 {v16.8h}, [x3], x4\n"
5398         RUY_MAKE_ZERO(v16)
5399         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5400         "st1 {v17.8h}, [x3], x4\n"
5401         RUY_MAKE_ZERO(v17)
5402         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5403         "st1 {v18.8h}, [x3], x4\n"
5404         RUY_MAKE_ZERO(v18)
5405         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5406         "st1 {v19.8h}, [x3], x4\n"
5407         RUY_MAKE_ZERO(v19)
5408         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5409         "st1 {v20.8h}, [x3], x4\n"
5410         RUY_MAKE_ZERO(v20)
5411         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5412         "st1 {v21.8h}, [x3], x4\n"
5413         RUY_MAKE_ZERO(v21)
5414         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5415         "st1 {v22.8h}, [x3], x4\n"
5416         RUY_MAKE_ZERO(v22)
5417         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
5418         "st1 {v23.8h}, [x3], x4\n"
5419         RUY_MAKE_ZERO(v23)
5420 
5421         // For the next block: perform the first few multiply-adds on the data
5422         // that we have already loaded.
5423         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
5424         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
5425         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
5426         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
5427 
5428         // If all of the 8x8 block fits, we just finished writing it to the
5429         // destination, so we skip the next part.
5430         "beq 241f\n"
5431         // Not all of the 8x8 block fits in the destination matrix.  We just
5432         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
5433         // it to copy into the destination matrix the part that fits.
5434         "mov x3, %[dst_tmp_buf]\n"
5435         "mov x4, %[dst_ptr]\n"
5436         "mov w6, #0\n"
5437         "250:\n"
5438         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5439         "mov w5, #0\n"
5440         "251:\n"
5441         "ldrsh w7, [x3, x5, lsl #1]\n"
5442         "strh w7, [x4, x5, lsl #1]\n"
5443         "add w5, w5, #1\n"
5444         "cmp w5, w1\n"
5445         "blt 251b\n"
5446         "add w6, w6, #1\n"
5447         "add x3, x3, #16\n"
5448         "add x4, x4, x11\n"
5449         "cmp w6, w2\n"
5450         "blt 250b\n"
5451         "241:\n"
5452         "add %[dst_ptr], %[dst_ptr], #16\n"
5453         // At this point we have completely finished writing values to the
5454         // destination matrix for the current block.
5455 
5456         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
5457 
5458         RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
5459 
5460         // Since the store type is the same as the accum type, no need for
5461         // downcast. There's also no need for clamp by min/max.
5462 
5463         // Compute how much of the 8x8 block of destination 32bit values that
5464         // we have computed, fit in the destination matrix. Typically, all of
5465         // it fits, but when the destination matrix shape is not a multiple
5466         // of 8x8, there are some 8x8 blocks along the boundaries that do
5467         // not fit entirely.
5468         "sub w1, %w[dst_rows], %w[row]\n"
5469         "sub w2, %w[dst_cols], %w[col]\n"
5470         "mov w3, #8\n"
5471         "cmp w1, #8\n"
5472         // Compute w1 = how many rows of the 8x8 block fit
5473         "csel w1, w1, w3, le\n"
5474         "cmp w2, #8\n"
5475         // Compute w2 = how many cols of the 8x8 block fit
5476         "csel w2, w2, w3, le\n"
5477 
5478         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
5479         "cmp w1, w3\n"
5480         "ccmp w2, w3, 0, eq\n"
5481         // Yes, all of the 8x8 block fits, go to fast path.
5482         "beq 330f\n"
5483         // Not all of the 8x8 block fits.
5484         // Write to dst_tmp_buf
5485         "mov x3, %[dst_tmp_buf]\n"
5486         "st1 {v16.4s}, [x3], #16\n"
5487         RUY_MAKE_ZERO(v16)
5488         "st1 {v17.4s}, [x3], #16\n"
5489         RUY_MAKE_ZERO(v17)
5490         "st1 {v18.4s}, [x3], #16\n"
5491         RUY_MAKE_ZERO(v18)
5492         "st1 {v19.4s}, [x3], #16\n"
5493         RUY_MAKE_ZERO(v19)
5494         "st1 {v20.4s}, [x3], #16\n"
5495         RUY_MAKE_ZERO(v20)
5496         "st1 {v21.4s}, [x3], #16\n"
5497         RUY_MAKE_ZERO(v21)
5498         "st1 {v22.4s}, [x3], #16\n"
5499         RUY_MAKE_ZERO(v22)
5500         "st1 {v23.4s}, [x3], #16\n"
5501         RUY_MAKE_ZERO(v23)
5502         "st1 {v24.4s}, [x3], #16\n"
5503         RUY_MAKE_ZERO(v24)
5504         "st1 {v25.4s}, [x3], #16\n"
5505         RUY_MAKE_ZERO(v25)
5506         "st1 {v26.4s}, [x3], #16\n"
5507         RUY_MAKE_ZERO(v26)
5508         "st1 {v27.4s}, [x3], #16\n"
5509         RUY_MAKE_ZERO(v27)
5510         "st1 {v28.4s}, [x3], #16\n"
5511         RUY_MAKE_ZERO(v28)
5512         "st1 {v29.4s}, [x3], #16\n"
5513         RUY_MAKE_ZERO(v29)
5514         "st1 {v30.4s}, [x3], #16\n"
5515         RUY_MAKE_ZERO(v30)
5516         "st1 {v31.4s}, [x3], #16\n"
5517         RUY_MAKE_ZERO(v31)
5518 
5519         "b 331f\n"
5520 
5521         "330:\n"
5522         // Yes, all of the 8x8 block fits.
5523         "mov x4, %[dst_ptr]\n"
5524         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5525         "mov x3, x4\n"
5526         "st1 {v16.4s, v17.4s}, [x3], #32\n"
5527         RUY_MAKE_ZERO(v16)
5528         RUY_MAKE_ZERO(v17)
5529         "add x4, x4, x11\n"
5530         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5531         "mov x3, x4\n"
5532         "st1 {v18.4s, v19.4s}, [x3], #32\n"
5533         RUY_MAKE_ZERO(v18)
5534         RUY_MAKE_ZERO(v19)
5535         "add x4, x4, x11\n"
5536         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5537         "mov x3, x4\n"
5538         "st1 {v20.4s, v21.4s}, [x3], #32\n"
5539         RUY_MAKE_ZERO(v20)
5540         RUY_MAKE_ZERO(v21)
5541         "add x4, x4, x11\n"
5542         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5543         "mov x3, x4\n"
5544         "st1 {v22.4s, v23.4s}, [x3], #32\n"
5545         RUY_MAKE_ZERO(v22)
5546         RUY_MAKE_ZERO(v23)
5547         "add x4, x4, x11\n"
5548         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5549         "mov x3, x4\n"
5550         "st1 {v24.4s, v25.4s}, [x3], #32\n"
5551         RUY_MAKE_ZERO(v24)
5552         RUY_MAKE_ZERO(v25)
5553         "add x4, x4, x11\n"
5554         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5555         "mov x3, x4\n"
5556         "st1 {v26.4s, v27.4s}, [x3], #32\n"
5557         RUY_MAKE_ZERO(v26)
5558         RUY_MAKE_ZERO(v27)
5559         "add x4, x4, x11\n"
5560         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5561         "mov x3, x4\n"
5562         "st1 {v28.4s, v29.4s}, [x3], #32\n"
5563         RUY_MAKE_ZERO(v28)
5564         RUY_MAKE_ZERO(v29)
5565         "add x4, x4, x11\n"
5566         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5567         "mov x3, x4\n"
5568         "st1 {v30.4s, v31.4s}, [x3], #32\n"
5569         RUY_MAKE_ZERO(v30)
5570         RUY_MAKE_ZERO(v31)
5571 
5572         "331:\n"
5573 
5574         // For the next block: perform the first few multiply-adds on the data
5575         // that we have already loaded.
5576         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
5577         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
5578         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
5579         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
5580 
5581         // If all of the 8x8 block fits, we just finished writing it to the
5582         // destination, so we skip the next part.
5583         "beq 341f\n"
5584 
5585         // Not all of the 8x8 block fits in the destination matrix.  We just
5586         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
5587         // it to copy into the destination matrix the part that fits.
5588         "mov x3, %[dst_tmp_buf]\n"
5589         "mov x4, %[dst_ptr]\n"
5590         "mov w6, #0\n"
5591         "350:\n"
5592         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
5593         "mov w5, #0\n"
5594         "351:\n"
5595         "ldr w7, [x3, x5, lsl #2]\n"
5596         "str w7, [x4, x5, lsl #2]\n"
5597         "add w5, w5, #1\n"
5598         "cmp w5, w1\n"
5599         "blt 351b\n"
5600         "add w6, w6, #1\n"
5601         "add x3, x3, #32\n"
5602         "add x4, x4, x11\n"
5603         "cmp w6, w2\n"
5604         "blt 350b\n"
5605         "341:\n"
5606         "add %[dst_ptr], %[dst_ptr], #32\n"
5607         // At this point we have completely finished writing values to the
5608         // destination matrix for the current block.
5609 
5610         RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
5611 
5612         // Reload some params --- we had used x5 -- x7 for a few other things
5613         // since the last time we had loaded them.
5614         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
5615         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
5616         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
5617 
5618         // Move to the next block of the destination matrix, for the next iter
5619         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
5620         // been updated earlier.
5621         // Have we reached the end row?
5622         "cmp %w[row], w7\n"
5623         "beq 20f\n"  // yes, end row.
5624         // Not end row. Move to the next row.
5625         "add %w[row], %w[row], #8\n"
5626         "b 21f\n"
5627         "20:\n"
5628         // Was already at end row.
5629         "mov %w[row], w6\n"  // Move back to first row.
5630         "add %w[col], %w[col], #8\n"  // Move to the next column.
5631         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
5632         "mov %[dst_ptr], %[dst_col_ptr]\n"
5633         "21:\n"
5634 
5635         // Main loop exit condition: have we hit the end column?
5636         "cmp %w[col], w8\n"
5637 
5638         // w1 is the number of levels of depth that we have already loaded
5639         // LHS and RHS data for. Corresponding to the initial ld1 instructions
5640         // above, this is currently 4.
5641         "mov w1, #4\n"
5642 
5643         "ble 1b\n"
5644 
5645         // clang-format on
5646 
5647         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
5648           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
5649           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
5650         : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
5651           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
5652           [dst_type_id] "r"(params.dst_type_id)
5653         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
5654           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
5655           "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
5656           "v26", "v27", "v28", "v29", "v30", "v31");
5657 }
5658 
5659 
5660 // Similar to the above 8-bit dotprod kernel, but specialized for the case of
5661 // RHS cols == 1.
5662 // Relevant target CPUs for this kernel include ARM Cortex-A76,
5663 // since these are 64-bit, out-of-order and with dotprod support.
Kernel8bitNeonDotprod1Col(const KernelParams8bit<8,8> & params)5664 void Kernel8bitNeonDotprod1Col(const KernelParams8bit<8, 8>& params) {
5665   profiler::ScopeLabel label("Kernel (kNeonDotprod)");
5666 
5667   CheckOffsetsInKernelParams8bit(params);
5668 
5669   const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
5670   const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
5671   const std::int8_t* lhs_ptr = lhs_col_ptr;
5672   const std::int8_t* rhs_ptr = rhs_col_ptr;
5673   void* dst_col_ptr = params.dst_base_ptr;
5674   void* dst_ptr = dst_col_ptr;
5675   int row = params.start_row;
5676   int col = params.start_col;
5677 
5678   RUY_DCHECK(!(params.flags & RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL));
5679 
5680   // The asm kernel below has the following NEON register allocation:
5681   //
5682   // v16 -- v31 are int32 accumulators.
5683   // During accumulation, v0 -- v15 are used to load int8 data from LHS and
5684   // RHS. At least v0 and v1 are used to load a 8x4 block of LHS, and v2 and
5685   // v3 are used to load a 4x8 block of RHS, like this:
5686   //
5687   //                            int8 RHS 4x1 block
5688   //                           /-------|
5689   //                           |v2.b[0]|
5690   //                           |  ...  |
5691   //                           |v2.b[3]|
5692   //                           \-------/
5693   //    int8 LHS 8x4 block
5694   //  /---------------------\  /--------|
5695   //  |v0.b[0]  ... v0.b[3] |  |v16.s[0]|
5696   //  |  ...          ...   |  |  ...   |
5697   //  |v0.b[12] ... v0.b[15]|  |v16.s[3]|
5698   //  |v1.b[0]  ... v1.b[3] |  |v17.s[0]|
5699   //  |  ...         ...    |  |  ...   |
5700   //  |v1.b[12] ... v1.b[15]|  |v17.s[3]|
5701   //  \---------------------/  \--------/
5702   //                           int32 accumulators 8x1 block
5703   //
5704   // In the RUY_OPT_MAX_STREAMING part of the kernel, this elementary step
5705   // is repeated 4 times, using 4x more registers for LHS and RHS, so that
5706   // is where instead of using v0 -- v3 for LHS and RHS, we use v0 -- v15.
5707   //
5708   // Outside of the RUY_OPT_MAX_STREAMING part of the kernel, v4 -- v7 are
5709   // unused, and v8 -- v15 are used for loading parameters used for the
5710   // post-accumulation part of the kernel.
5711   asm volatile(
5712 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
5713 
5714         // clang-format off
5715 
5716         // Load some parameters into registers.
5717         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
5718         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
5719         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
5720         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
5721         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
5722         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
5723         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
5724         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
5725 
5726         // Load the first 32 bytes of LHS and RHS data.
5727         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
5728         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
5729         "ld1 {v2.8b}, [%[rhs_ptr]]\n"
5730         "add %[rhs_ptr], %[rhs_ptr], #32\n"
5731 
5732         // Clear accumulators.
5733         RUY_MAKE_ZERO(v16)
5734         RUY_MAKE_ZERO(v17)
5735 
5736         // w1 is the number of levels of depth that we have already loaded
5737         // LHS and RHS data for. Corresponding to the initial ld1 instructions
5738         // above, this is currently 4.
5739         "mov w1, #4\n"
5740 
5741         // Perform the first few multiply-adds on the data that we have already
5742         // loaded.
5743         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
5744 
5745         // Main loop of the whole GEMM, over rows and columns of the
5746         // destination matrix.
5747         "1:\n"
5748 
5749         // Ordinary kernel inner loop (over depth), the simpler loop that the
5750         // above was an equivalent 4x-partially-unrolled version of.
5751 
5752         // Reminder - w1 is how many levels of depth we have already loaded
5753         // data for, w12 is the total depth.
5754         "cmp w1, w12\n"
5755         "beq 79f\n"
5756 
5757         "2:\n"
5758 
5759         // Because of the data that we have already loaded, we can start the
5760         // loop body right away with some multiply-adds.
5761         // Each iteration of this loop advances by 4 levels of depth.
5762         "add w1, w1, #4\n"
5763         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
5764         ".word 0x4f82e031  // sdot v17.4s, v1.16b, v2.4b[0]\n"
5765         // Loop termination condition.
5766         "cmp w1, w12\n"
5767         "ld1 {v2.8b}, [%[rhs_ptr]]\n"
5768         "add %[rhs_ptr], %[rhs_ptr], #32\n"
5769         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
5770         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
5771 
5772         "blt 2b\n"
5773 
5774         "79:\n"
5775         // End of the inner loop on depth. Now perform the remaining
5776         // multiply-adds of the last 4 levels of depth, for which the LHS
5777         // and RHS data is already loaded.
5778 
5779         ".word 0x4f82e031  // sdot v17.4s, v1.16b, v2.4b[0]\n"
5780 
5781         // End of accumulation. The registers v16 -- v31 contain the final
5782         // int32 accumulator values of the current 8x8 destination block.
5783         // We now have to compute the final 8-bit values from these int32
5784         // accumulators, and advance to the next 8x8 block. We intertwine
5785         // these two aspects whenever possible for optimal pipelining, both
5786         // at the data flow level (prefetch data for next block as early as
5787         // possible) and instruction pipelining level (some of the next-block
5788         // work can dual-issue with some of the final work on the current
5789         // block).
5790 
5791         // Logic to advance to the next block in preparation for the next
5792         // iteration of the main loop. For now, we only want to compute
5793         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
5794         // not yet ready to update the values of row and col, as we still need
5795         // the current values for the rest of the work on the current block.
5796 
5797         "cmp %w[row], w7\n"  // Have we finished the last row?
5798         "bge 4f\n"           // If finished last row, go to 4
5799         // Not finished last row: then advance to next row.
5800         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
5801         "b 5f\n"
5802         "4:\n"  // Finished last row...
5803         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
5804         // Now we need to advance to the next column. If we already
5805         // finished the last column, then in principle we are done, however
5806         // we can't just return here, as we need to allow the end work of the
5807         // current block to complete. The good news is that at this point it
5808         // doesn't matter what data we load for the next column, since
5809         // we will exit from the main loop below before actually storing
5810         // anything computed from that data.
5811         "cmp %w[col], w8\n"  // Have we finished the last column?
5812         "bge 5f\n" // If yes, just carry on without updating the column pointer.
5813         // Not finished last column: then advance to next column.
5814         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
5815         "5:\n"
5816 
5817         // Set the LHS and RHS data pointers to the start of the columns just
5818         // computed.
5819         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
5820         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
5821 
5822         // Load some parameters needed for the end work on current block.
5823         "mvni v8.4s, #0\n"
5824         "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
5825         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
5826         "ins v13.h[4], w4\n" // dst_zero_point
5827         "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
5828         "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
5829         "dup v9.4s, w3\n"   // create prod_zp_depth_vec
5830         "add x5, x4, %x[row], lsl #2\n"
5831         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
5832         "csel x4, x4, x5, eq\n"
5833 
5834         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
5835         "add x5, x1, %x[row], lsl #2\n"
5836 
5837         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
5838         "csel x1, x1, x5, eq\n"
5839 
5840         // Load 8 bias values.
5841         "ld1 {v14.4s}, [x1], #16\n"
5842         "ld1 {v15.4s}, [x1]\n"
5843 
5844         // Now that we know what LHS and RHS data the next iteration of the
5845         // main loop will need to load, we start loading the first 32 bytes of
5846         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
5847         // in the rest of the work on the current block.
5848         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
5849         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
5850         "ld1 {v2.8b}, [%[rhs_ptr]]\n"
5851         "add %[rhs_ptr], %[rhs_ptr], #32\n"
5852 
5853         // Add to the bias values the product (depth * lhs_zero_point * rhs_zero_point),
5854         // See the term NZ1Z2 in equation (7) in https://arxiv.org/pdf/1712.05877.pdf
5855         "add v14.4s, v14.4s, v9.4s\n"
5856         "add v15.4s, v15.4s, v9.4s\n"
5857 
5858         // Perform the bias-addition (per the above, we have just folded into
5859         // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
5860         "add v16.4s, v16.4s, v14.4s\n"
5861         "add v17.4s, v17.4s, v15.4s\n"
5862 
5863         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
5864         "beq 401f\n"
5865         "ldr x3, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
5866         "add x3, x3, %x[col], lsl #2\n"
5867         "ld1 {v14.4s}, [x3], #16\n"
5868         "ld1 {v15.4s}, [x3]\n"
5869         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
5870         "dup v10.4s, w5\n"  // create lhs_zero_point_vec
5871         // Subtract rhs_sums * lhs_zero_point, per
5872         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
5873         "mls v16.4s, v10.4s, v14.s[0]\n"
5874         "mls v17.4s, v10.4s, v14.s[0]\n"
5875         "401:\n"
5876 
5877         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
5878         "beq 402f\n"
5879         "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
5880         "add x2, x2, %x[row], lsl #2\n"
5881         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
5882         // Load 4 lhs_sums values.
5883         "ld1 {v11.4s}, [x2], #16\n"
5884         "ld1 {v12.4s}, [x2]\n"
5885         "ins v13.s[1], w5\n" // rhs_zero_point
5886         // Compute lhs_sums * rhs_zero_point.
5887         "mul v11.4s, v11.4s, v13.s[1]\n"
5888         "mul v12.4s, v12.4s, v13.s[1]\n"
5889         // Subtract lhs_sums * rhs_zero_point, per
5890         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
5891         "sub v16.4s, v16.4s, v11.4s\n"
5892         "sub v17.4s, v17.4s, v12.4s\n"
5893 
5894         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
5895         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
5896 
5897         "402:\n"
5898 
5899         // At this point we have computed the final int32 values. Now we
5900         // start down-quantizing them to obtain the final 8bit values from them.
5901 
5902         // As part of this down-quantization, our int32 values will be
5903         // multiplied by a multiplier that has a fixed-point component and an
5904         // exponent component.
5905 
5906         //Load the exponent part of the multiplier.
5907         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
5908         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
5909         "add x5, x1, %x[row], lsl #2\n"
5910         "csel x1, x1, x5, eq\n"
5911 
5912         "ldr q9, [x1]\n"
5913         "ldr q10, [x1, #16]\n"
5914 
5915         "smin v11.4s, v8.4s, v9.4s\n"
5916         "smin v12.4s, v8.4s, v10.4s\n"
5917         "sub v9.4s, v9.4s, v11.4s\n"
5918         "sub v10.4s, v10.4s, v12.4s\n"
5919 
5920         // Apply the positive exponent part of the multiplier.
5921         "sshl v16.4s, v16.4s, v9.4s\n"
5922         "sshl v17.4s, v17.4s, v10.4s\n"
5923         "403:\n"
5924 
5925         "ldr q14, [x4]\n" // multiplier_fixedpoint
5926         "ldr q15, [x4, #16]\n" // multiplier_fixedpoint
5927 
5928         // Apply the fixed-point part of the multiplier.
5929         "sqdmulh v16.4s, v16.4s, v14.4s\n"
5930         "sqdmulh v17.4s, v17.4s, v15.4s\n"
5931 
5932         // Apply the negative exponent part of the multiplier.
5933         "srshl v16.4s, v16.4s, v11.4s\n"
5934         "srshl v17.4s, v17.4s, v12.4s\n"
5935 
5936         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
5937         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
5938         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
5939         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
5940 
5941         RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
5942 
5943         // Cast-and-saturate from int32 to int16
5944         "sqxtn v16.4h, v16.4s\n"
5945         "sqxtn2 v16.8h, v17.4s\n"
5946         // All data in v16 at this point.
5947 
5948         // Add the destination zero point
5949         "dup v14.8h, v13.h[4]\n"
5950         "sqadd v16.8h, v16.8h, v14.8h\n"
5951 
5952         // Cast-and-saturate from int16 to uint8, leaving all data in the
5953         // lower half of v16.
5954         "sqxtun v16.8b, v16.8h\n"
5955 
5956         // Load the clamp_min, clamp_max bounds
5957         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
5958         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
5959         "dup v14.16b, w2\n"  // clamp_min
5960         "dup v15.16b, w3\n"  // clamp_max
5961 
5962         // Apply the clamp_min bound
5963         "umax v16.16b, v16.16b, v14.16b\n"
5964 
5965         // Apply the clamp_max bound
5966         "umin v16.16b, v16.16b, v15.16b\n"
5967 
5968         // Make it so that all of the final 8bit values are stored in the
5969         // first 64bits of 128bit NEON registers, so they can be stored
5970         // by 64bit st1 store instructions with byte alignment.
5971         "dup d20, v16.d[1]\n"
5972 
5973         // Compute how much of the 8x1 block of destination 8bit values that
5974         // we have computed, fit in the destination matrix. Typically, all of
5975         // it fits, but when the destination matrix shape is not a multiple
5976         // of 8x1, there are some 8x1 blocks along the boundaries that do
5977         // not fit entirely.
5978         "sub w1, %w[dst_rows], %w[row]\n"
5979         "sub w2, %w[dst_cols], %w[col]\n"
5980         "mov w3, #8\n"
5981         "cmp w1, #8\n"
5982         // Compute w1 = how many rows of the 8x1 block fit
5983         "csel w1, w1, w3, le\n"
5984         "cmp w2, #8\n"
5985 
5986         // Test if w1==8, i.e. if all of the 8x1 block fits.
5987         "cmp w1, w3\n"
5988         // Yes, all of the 8x1 block fits, go to fast path.
5989         "beq 30f\n"
5990         // Not all of the 8x1 block fits.
5991         // Set (x3 address, x4 stride) to write to dst_tmp_buf
5992         "mov x3, %[dst_tmp_buf]\n"
5993         "mov x4, #8\n"
5994         "b 31f\n"
5995         "30:\n"
5996         // Yes, all of the 8x1 block fits.
5997         // Set (x3 address, x4 stride) to write directly to destination matrix.
5998         "mov x3, %[dst_ptr]\n"
5999         "mov x4, x11\n"
6000         "31:\n"
6001 
6002         // Write our 8bit values to the destination
6003         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
6004         "st1 {v16.8b}, [x3]\n"
6005         RUY_MAKE_ZERO(v16)
6006         RUY_MAKE_ZERO(v17)
6007 
6008         // For the next block: perform the first few multiply-adds on the data
6009         // that we have already loaded.
6010         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
6011 
6012         // If all of the 8x8 block fits, we just finished writing it to the
6013         // destination, so we skip the next part.
6014         "beq 41f\n"
6015         // Not all of the 8x8 block fits in the destination matrix.  We just
6016         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
6017         // it to copy into the destination matrix the part that fits.
6018         "mov x3, %[dst_tmp_buf]\n"
6019         "mov x4, %[dst_ptr]\n"
6020         "mov w6, #0\n"
6021         "50:\n"
6022         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
6023         "mov w5, #0\n"
6024         "51:\n"
6025         "ldrb w7, [x3, w5, uxtw]\n"
6026         "strb w7, [x4, w5, uxtw]\n"
6027         "add w5, w5, #1\n"
6028         "cmp w5, w1\n"
6029         "blt 51b\n"
6030         "41:\n"
6031         "add %[dst_ptr], %[dst_ptr], #8\n"
6032         // At this point we have completely finished writing values to the
6033         // destination matrix for the current block.
6034 
6035         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
6036 
6037         RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
6038 
6039         // Cast-and-saturate from int32 to int16
6040         "sqxtn v16.4h, v16.4s\n"
6041         "sqxtn2 v16.8h, v17.4s\n"
6042 
6043 
6044         // Add the destination zero point
6045         "dup v14.8h, v13.h[4]\n"
6046         "sqadd v16.8h, v16.8h, v14.8h\n"
6047 
6048         // Cast-and-saturate from int16 to uint8
6049         "sqxtn v16.8b, v16.8h\n"
6050 
6051         // Load the clamp_min, clamp_max bounds
6052         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
6053         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
6054         "dup v14.16b, w2\n"  // clamp_min
6055         "dup v15.16b, w3\n"  // clamp_max
6056 
6057         // Apply the clamp_min bound
6058         "smax v16.16b, v16.16b, v14.16b\n"
6059 
6060         // Apply the clamp_max bound
6061         "smin v16.16b, v16.16b, v15.16b\n"
6062 
6063         // Make it so that all of the final 8bit values are stored in the
6064         // first 64bits of 128bit NEON registers, so they can be stored
6065         // by 64bit st1 store instructions with byte alignment.
6066         "dup d20, v16.d[1]\n"
6067 
6068         // Compute how much of the 8x1 block of destination 8bit values that
6069         // we have computed, fit in the destination matrix. Typically, all of
6070         // it fits, but when the destination matrix shape is not a multiple
6071         // of 8x8, there are some 8x8 blocks along the boundaries that do
6072         // not fit entirely.
6073         "sub w1, %w[dst_rows], %w[row]\n"
6074         "sub w2, %w[dst_cols], %w[col]\n"
6075         "mov w3, #8\n"
6076         "cmp w1, #8\n"
6077         // Compute w1 = how many rows of the 8x1 block fit
6078         "csel w1, w1, w3, le\n"
6079         "cmp w2, #8\n"
6080 
6081         // Test if w1==8, i.e. if all of the 8x1 block fits.
6082         "cmp w1, w3\n"
6083         // Yes, all of the 8x1 block fits, go to fast path.
6084         "beq 130f\n"
6085         // Not all of the 8x1 block fits.
6086         // Set (x3 address, x4 stride) to write to dst_tmp_buf
6087         "mov x3, %[dst_tmp_buf]\n"
6088         "mov x4, #8\n"
6089         "b 131f\n"
6090         "130:\n"
6091         // Yes, all of the 8x8 block fits.
6092         // Set (x3 address, x4 stride) to write directly to destination matrix.
6093         "mov x3, %[dst_ptr]\n"
6094         "mov x4, x11\n"
6095         "131:\n"
6096 
6097         // Write our 8bit values to the destination
6098         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
6099         "st1 {v16.8b}, [x3]\n"
6100         RUY_MAKE_ZERO(v16)
6101         RUY_MAKE_ZERO(v17)
6102 
6103         // For the next block: perform the first few multiply-adds on the data
6104         // that we have already loaded.
6105         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
6106 
6107         // If all of the 8x8 block fits, we just finished writing it to the
6108         // destination, so we skip the next part.
6109         "beq 141f\n"
6110         // Not all of the 8x8 block fits in the destination matrix.  We just
6111         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
6112         // it to copy into the destination matrix the part that fits.
6113         "mov x3, %[dst_tmp_buf]\n"
6114         "mov x4, %[dst_ptr]\n"
6115         "mov w6, #0\n"
6116         "150:\n"
6117         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
6118         "mov w5, #0\n"
6119         "151:\n"
6120         "ldrb w7, [x3, w5, uxtw]\n"
6121         "strb w7, [x4, w5, uxtw]\n"
6122         "add w5, w5, #1\n"
6123         "cmp w5, w1\n"
6124         "blt 151b\n"
6125         "141:\n"
6126         "add %[dst_ptr], %[dst_ptr], #8\n"
6127         // At this point we have completely finished writing values to the
6128         // destination matrix for the current block.
6129 
6130         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
6131 
6132         RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
6133 
6134         // Add the destination zero point
6135         "dup v14.8h, v13.h[4]\n"
6136         "saddw v16.4s, v16.4s, v14.4h\n"
6137         "saddw v17.4s, v17.4s, v14.4h\n"
6138 
6139         // Cast-and-saturate from int32 to int16
6140         "sqxtn v16.4h, v16.4s\n"
6141         "sqxtn2 v16.8h, v17.4s\n"
6142 
6143         // Load the clamp_min, clamp_max bounds
6144         "ldrsh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
6145         "ldrsh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
6146         "dup v14.8h, w2\n"  // clamp_min
6147         "dup v15.8h, w3\n"  // clamp_max
6148 
6149         // Apply the clamp_min bound
6150         "smax v16.8h, v16.8h, v14.8h\n"
6151         // Apply the clamp_max bound
6152         "smin v16.8h, v16.8h, v15.8h\n"
6153 
6154         // Compute how much of the 8x1 block of destination 16bit values that
6155         // we have computed, fit in the destination matrix. Typically, all of
6156         // it fits, but when the destination matrix shape is not a multiple
6157         // of 8x8, there are some 8x1 blocks along the boundaries that do
6158         // not fit entirely.
6159         "sub w1, %w[dst_rows], %w[row]\n"
6160         "sub w2, %w[dst_cols], %w[col]\n"
6161         "mov w3, #8\n"
6162         "cmp w1, #8\n"
6163         // Compute w1 = how many rows of the 8x1 block fit
6164         "csel w1, w1, w3, le\n"
6165         "cmp w2, #8\n"
6166 
6167         // Test if w1==8, i.e. if all of the 8x8 block fits.
6168         "cmp w1, w3\n"
6169         // Yes, all of the 8x1 block fits, go to fast path.
6170         "beq 230f\n"
6171         // Not all of the 8x1 block fits.
6172         // Set (x3 address, x4 stride) to write to dst_tmp_buf
6173         "mov x3, %[dst_tmp_buf]\n"
6174         "mov x4, #16\n"
6175         "b 231f\n"
6176         "230:\n"
6177         // Yes, all of the 8x1 block fits.
6178         // Set (x3 address, x4 stride) to write directly to destination matrix.
6179         "mov x3, %[dst_ptr]\n"
6180         "mov x4, x11\n"
6181         "231:\n"
6182 
6183         // Write our 16bit values to the destination
6184         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
6185         "st1 {v16.8h}, [x3]\n"
6186         RUY_MAKE_ZERO(v16)
6187         RUY_MAKE_ZERO(v17)
6188 
6189         // For the next block: perform the first few multiply-adds on the data
6190         // that we have already loaded.
6191         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
6192 
6193         // If all of the 8x1 block fits, we just finished writing it to the
6194         // destination, so we skip the next part.
6195         "beq 241f\n"
6196         // Not all of the 8x1 block fits in the destination matrix.  We just
6197         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
6198         // it to copy into the destination matrix the part that fits.
6199         "mov x3, %[dst_tmp_buf]\n"
6200         "mov x4, %[dst_ptr]\n"
6201         "mov w6, #0\n"
6202         "250:\n"
6203         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
6204         "mov w5, #0\n"
6205         "251:\n"
6206         "ldrsh w7, [x3, x5, lsl #1]\n"
6207         "strh w7, [x4, x5, lsl #1]\n"
6208         "add w5, w5, #1\n"
6209         "cmp w5, w1\n"
6210         "blt 251b\n"
6211         "241:\n"
6212         "add %[dst_ptr], %[dst_ptr], #16\n"
6213         // At this point we have completely finished writing values to the
6214         // destination matrix for the current block.
6215 
6216         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
6217 
6218         RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
6219 
6220         // Since the store type is the same as the accum type, no need for
6221         // downcast. There's also no need for clamp by min/max.
6222 
6223         // Compute how much of the 8x1 block of destination 32 bit values that
6224         // we have computed, fit in the destination matrix. Typically, all of
6225         // it fits, but when the destination matrix shape is not a multiple
6226         // of 8x1, there are some 8x1 blocks along the boundaries that do
6227         // not fit entirely.
6228         "sub w1, %w[dst_rows], %w[row]\n"
6229         "sub w2, %w[dst_cols], %w[col]\n"
6230         "mov w3, #8\n"
6231         "cmp w1, #8\n"
6232         // Compute w1 = how many rows of the 8x1 block fit
6233         "csel w1, w1, w3, le\n"
6234         "cmp w2, #8\n"
6235         // Compute w1 = how many rows of the 8x8 block fit
6236         "csel w2, w2, w3, le\n"
6237 
6238         // Test if w1==8, i.e. if all of the 8x8 block fits.
6239         "cmp w1, w3\n"
6240         // Yes, all of the 8x1 block fits, go to fast path.
6241         "beq 330f\n"
6242         // Not all of the 8x1 block fits.
6243         // Set (x3 address, x4 stride) to write to dst_tmp_buf
6244         "mov x3, %[dst_tmp_buf]\n"
6245         "mov x4, #16\n"
6246 
6247         // Write our 32bit values to the destination described by
6248         // (x3 address, x4 stride).
6249         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
6250         "st1 {v16.4s}, [x3], x4\n"
6251         RUY_MAKE_ZERO(v16)
6252         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
6253         "st1 {v17.4s}, [x3], x4\n"
6254         RUY_MAKE_ZERO(v17)
6255 
6256         "b 331f\n"
6257 
6258         "330:\n"
6259         // Yes, all of the 8x1 block fits.
6260         // Set (x3 address, x4 stride) to write directly to destination matrix.
6261         "mov x4, %[dst_ptr]\n"
6262         "mov x3, x4\n"
6263 
6264         // Write our 32bit values to the destination described by
6265         // (x3 address, x4 stride).
6266         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
6267         "st1 {v16.4s, v17.4s}, [x3], #32\n"
6268         RUY_MAKE_ZERO(v16)
6269         RUY_MAKE_ZERO(v17)
6270 
6271         "331:\n"
6272 
6273         // For the next block: perform the first few multiply-adds on the data
6274         // that we have already loaded.
6275         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
6276 
6277         // If all of the 8x8 block fits, we just finished writing it to the
6278         // destination, so we skip the next part.
6279         "beq 341f\n"
6280 
6281         // Not all of the 8x8 block fits in the destination matrix.  We just
6282         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
6283         // it to copy into the destination matrix the part that fits.
6284         "mov x3, %[dst_tmp_buf]\n"
6285         "mov x4, %[dst_ptr]\n"
6286         "mov w6, #0\n"
6287         "350:\n"
6288         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
6289         "mov w5, #0\n"
6290         "351:\n"
6291         "ldr w7, [x3, x5, lsl #2]\n"
6292         "str w7, [x4, x5, lsl #2]\n"
6293         "add w5, w5, #1\n"
6294         "cmp w5, w1\n"
6295         "blt 351b\n"
6296         "341:\n"
6297         "add %[dst_ptr], %[dst_ptr], #32\n"
6298         // At this point we have completely finished writing values to the
6299         // destination matrix for the current block.
6300 
6301         RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
6302 
6303         // Reload some params --- we had used x5 -- x7 for a few other things
6304         // since the last time we had loaded them.
6305         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
6306         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
6307         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
6308 
6309         // Move to the next block of the destination matrix, for the next iter
6310         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
6311         // been updated earlier.
6312         // Have we reached the end row?
6313         "cmp %w[row], w7\n"
6314         "beq 20f\n"  // yes, end row.
6315         // Not end row. Move to the next row.
6316         "add %w[row], %w[row], #8\n"
6317         "b 21f\n"
6318         "20:\n"
6319         // Was already at end row.
6320         "mov %w[row], w6\n"  // Move back to first row.
6321         "add %w[col], %w[col], #8\n"  // Move to the next column.
6322         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
6323         "mov %[dst_ptr], %[dst_col_ptr]\n"
6324         "21:\n"
6325 
6326         // Main loop exit condition: have we hit the end column?
6327         "cmp %w[col], w8\n"
6328 
6329         // w1 is the number of levels of depth that we have already loaded
6330         // LHS and RHS data for. Corresponding to the initial ld1 instructions
6331         // above, this is currently 4.
6332         "mov w1, #4\n"
6333 
6334         "ble 1b\n"
6335 
6336         // clang-format on
6337 
6338         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
6339           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
6340           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
6341         : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
6342           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
6343           [dst_type_id] "r"(params.dst_type_id)
6344         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
6345           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
6346           "v13", "v14", "v15", "v16", "v17");
6347 }
6348 
6349 // Variant of the above Kernel8bitNeonDotprod, tuned for in-order
6350 // CPUs. Specifically here, the relevant in-order CPUs are ARM Cortex-A55r1,
6351 // since these are 64-bit and support dotprod.
6352 //
6353 // While this kernel does not have a direct equivalent in gemmlowp, it was
6354 // developed based on insights that David Mansell at ARM shared with their
6355 // contribution of gemmlowp kernels tuned for Cortex-A55r1, with very helpful
6356 // comments. Specifically, see this comment about tuning for Cortex-A55r1:
6357 // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4412
Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8,8> & params)6358 void Kernel8bitNeonDotprodA55ish(const KernelParams8bit<8, 8>& params) {
6359   profiler::ScopeLabel label(
6360       "Kernel (kNeonDotprod, optimized for in-order cores)");
6361 
6362   CheckOffsetsInKernelParams8bit(params);
6363 
6364   const std::int8_t* lhs_col_ptr = params.lhs_base_ptr;
6365   const std::int8_t* rhs_col_ptr = params.rhs_base_ptr;
6366   const std::int8_t* lhs_ptr = lhs_col_ptr;
6367   const std::int8_t* rhs_ptr = rhs_col_ptr;
6368   void* dst_col_ptr = params.dst_base_ptr;
6369   void* dst_ptr = dst_col_ptr;
6370   int row = params.start_row;
6371   int col = params.start_col;
6372 
6373   // The asm kernel below has the following NEON register allocation:
6374   //
6375   // v16 -- v31 are int32 accumulators.
6376   // During accumulation, v0 -- v3 are used to load int8 data from LHS and
6377   // RHS.
6378   //
6379   //                                      int8 RHS 4x8 block
6380   //                           /-----------------------------------------|
6381   //                           |v2.b[0] ... v2.b[12] v3.b[0] ... v3.b[12]|
6382   //                           |  ...                              ...   |
6383   //                           |v2.b[3] ... v2.b[15] v3.b[3] ... v3.b[15]|
6384   //                           \-----------------------------------------/
6385   //    int8 LHS 8x4 block
6386   //  /---------------------\  /-----------------------------------------|
6387   //  |v0.b[0]  ... v0.b[3] |  |v16.s[0]           ...           v30.s[0]|
6388   //  |  ...          ...   |  |  ...                              ...   |
6389   //  |v0.b[12] ... v0.b[15]|  |v16.s[3]           ...           v30.s[3]|
6390   //  |v1.b[0]  ... v1.b[3] |  |v17.s[0]           ...           v31.s[0]|
6391   //  |  ...         ...    |  |  ...                              ...   |
6392   //  |v1.b[12] ... v1.b[15]|  |v17.s[3]           ...           v31.s[3]|
6393   //  \---------------------/  \-----------------------------------------/
6394   //                                  int32 accumulators 8x8 block
6395   //
6396   // There is no RUY_OPT_MAX_STREAMING 4x-unrolled part in this kernel because
6397   // we did not observe a benefit of such partial unrolling on in-order CPUs.
6398   //
6399   // v4 -- v7 are unused, and v8 -- v15 are used for loading parameters used for
6400   // the post-accumulation part of the kernel.
6401   asm volatile(
6402 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
6403 
6404         // clang-format off
6405 
6406         // Load some parameters into registers.
6407         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
6408         RUY_MAKE_ZERO(v16)
6409         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
6410         RUY_MAKE_ZERO(v17)
6411         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
6412         RUY_MAKE_ZERO(v18)
6413         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
6414         RUY_MAKE_ZERO(v19)
6415         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
6416         RUY_MAKE_ZERO(v20)
6417         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
6418         RUY_MAKE_ZERO(v21)
6419         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
6420         RUY_MAKE_ZERO(v22)
6421         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
6422 
6423         // Load the first 32 bytes of LHS and RHS data.
6424         "ld1 {v0.16b}, [%[lhs_ptr]], #16\n"
6425         "ld1 {v1.16b}, [%[lhs_ptr]], #16\n"
6426         "ld1 {v2.16b}, [%[rhs_ptr]], #16\n"
6427         "ld1 {v3.16b}, [%[rhs_ptr]], #16\n"
6428 
6429         // Clear accumulators.
6430         RUY_MAKE_ZERO(v23)
6431         RUY_MAKE_ZERO(v24)
6432         RUY_MAKE_ZERO(v25)
6433         RUY_MAKE_ZERO(v26)
6434         RUY_MAKE_ZERO(v27)
6435         // Perform the first few multiply-adds on the data that we have already
6436         // loaded.
6437         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
6438         RUY_MAKE_ZERO(v28)
6439         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
6440         RUY_MAKE_ZERO(v29)
6441         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
6442         RUY_MAKE_ZERO(v30)
6443         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
6444         RUY_MAKE_ZERO(v31)
6445 
6446 
6447         "1:\n"
6448 
6449         "add x5, %[lhs_ptr], x12, lsl #3\n"
6450         "sub x5, x5, #32\n"
6451         "cmp %[lhs_ptr], x5\n"
6452 
6453         "beq 79f\n"
6454 
6455         // Main accumulation loop
6456         "2:\n"
6457         ".word 0x4f83e018  // sdot v24.4s, v0.16b, v3.4b[0]\n"
6458         "ldr x1, [%[lhs_ptr], #8]\n"
6459         ".word 0x4fa3e01a  // sdot v26.4s, v0.16b, v3.4b[1]\n"
6460         "ldr x3, [%[rhs_ptr], #8]\n"
6461         ".word 0x4f83e81c  // sdot v28.4s, v0.16b, v3.4b[2]\n"
6462         "ldr x4, [%[rhs_ptr], #24]\n"
6463         ".word 0x4fa3e81e  // sdot v30.4s, v0.16b, v3.4b[3]\n"
6464         "ldr d0, [%[lhs_ptr], #0]\n"
6465         ".word 0x4f82e031  // sdot v17.4s, v1.16b, v2.4b[0]\n"
6466         "ins v0.d[1], x1\n"
6467         ".word 0x4fa2e033  // sdot v19.4s, v1.16b, v2.4b[1]\n"
6468         "ldr x2, [%[lhs_ptr], #24]\n"
6469         ".word 0x4f82e835  // sdot v21.4s, v1.16b, v2.4b[2]\n"
6470         "add %[lhs_ptr], %[lhs_ptr], #32\n"
6471         ".word 0x4fa2e837  // sdot v23.4s, v1.16b, v2.4b[3]\n"
6472         "ldr d2, [%[rhs_ptr], #0]\n"
6473         ".word 0x4f83e039  // sdot v25.4s, v1.16b, v3.4b[0]\n"
6474         "ins v2.d[1], x3\n"
6475         ".word 0x4fa3e03b  // sdot v27.4s, v1.16b, v3.4b[1]\n"
6476         "cmp %[lhs_ptr], x5\n"
6477         ".word 0x4f83e83d  // sdot v29.4s, v1.16b, v3.4b[2]\n"
6478         "add %[rhs_ptr], %[rhs_ptr], #32\n"
6479         ".word 0x4fa3e83f  // sdot v31.4s, v1.16b, v3.4b[3]\n"
6480         "ldr d3, [%[rhs_ptr], #-16]\n"
6481         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
6482         "ldr d1, [%[lhs_ptr], #-16]\n"
6483         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
6484         "ins v3.d[1], x4\n"
6485         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
6486         "ins v1.d[1], x2\n"
6487         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
6488         "blt 2b\n"
6489 
6490         // Last accumulation steps, nothing left to load.
6491         "79:\n"
6492         ".word 0x4f83e018  // sdot v24.4s, v0.16b, v3.4b[0]\n"
6493         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
6494         ".word 0x4fa3e01a  // sdot v26.4s, v0.16b, v3.4b[1]\n"
6495         "cmp %w[row], w7\n"  // Have we finished the last row?
6496         ".word 0x4f83e81c  // sdot v28.4s, v0.16b, v3.4b[2]\n"
6497         ".word 0x4fa3e81e  // sdot v30.4s, v0.16b, v3.4b[3]\n"
6498         ".word 0x4f82e031  // sdot v17.4s, v1.16b, v2.4b[0]\n"
6499         ".word 0x4fa2e033  // sdot v19.4s, v1.16b, v2.4b[1]\n"
6500         ".word 0x4f82e835  // sdot v21.4s, v1.16b, v2.4b[2]\n"
6501         ".word 0x4fa2e837  // sdot v23.4s, v1.16b, v2.4b[3]\n"
6502         ".word 0x4f83e039  // sdot v25.4s, v1.16b, v3.4b[0]\n"
6503         ".word 0x4fa3e03b  // sdot v27.4s, v1.16b, v3.4b[1]\n"
6504         ".word 0x4f83e83d  // sdot v29.4s, v1.16b, v3.4b[2]\n"
6505         ".word 0x4fa3e83f  // sdot v31.4s, v1.16b, v3.4b[3]\n"
6506 
6507         // End of accumulation. The registers v16 -- v31 contain the final
6508         // int32 accumulator values of the current 8x8 destination block.
6509         // We now have to compute the final 8-bit values from these int32
6510         // accumulators, and advance to the next 8x8 block. We intertwine
6511         // these two aspects whenever possible for optimal pipelining, both
6512         // at the data flow level (prefetch data for next block as early as
6513         // possible) and instruction pipelining level (some of the next-block
6514         // work can dual-issue with some of the final work on the current
6515         // block).
6516 
6517         // Logic to advance to the next block in preparation for the next
6518         // iteration of the main loop. For now, we only want to compute
6519         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
6520         // not yet ready to update the values of row and col, as we still need
6521         // the current values for the rest of the work on the current block.
6522 
6523         "bge 4f\n"           // If finished last row, go to 4
6524         // Not finished last row: then advance to next row.
6525         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
6526         "b 5f\n"
6527         "4:\n"  // Finished last row...
6528         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
6529         // Now we need to advance to the next column. If we already
6530         // finished the last column, then in principle we are done, however
6531         // we can't just return here, as we need to allow the end work of the
6532         // current block to complete. The good news is that at this point it
6533         // doesn't matter what data we load for the next column, since
6534         // we will exit from the main loop below before actually storing
6535         // anything computed from that data.
6536         "cmp %w[col], w8\n"  // Have we finished the last column?
6537         "bge 5f\n" // If yes, just carry on without updating the column pointer.
6538         // Not finished last column: then advance to next column.
6539         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
6540         "5:\n"
6541 
6542         // Set the LHS and RHS data pointers to the start of the columns just
6543         // computed.
6544         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
6545         // Load some parameters needed for the end work on current block.
6546         "mvni v8.4s, #0\n"
6547         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
6548         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_PROD_ZP_DEPTH) "]\n"
6549         "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
6550         "dup v9.4s, w3\n"   // create prod_zp_depth_vec
6551 
6552         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
6553         // Determine the channel index.
6554         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
6555         "csel w3, %w[row], %w[col], eq\n"
6556 
6557         // Offset the bias pointer as needed given the current row, col.
6558         "add x5, x1, x3, lsl #2\n"
6559 
6560         // If there is no bias, use no offset, just address the passed zero
6561         // data.
6562         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
6563         "csel x1, x1, x5, eq\n"
6564 
6565         // Load 8 bias values.
6566         "ld1 {v14.2s}, [x1], #8\n"
6567         "ldr x5, [x1], #8\n"
6568         "ins v14.d[1], x5\n"
6569         "ld1 {v15.2s}, [x1], #8\n"
6570         "ldr x5, [x1], #8\n"
6571         "ins v15.d[1], x5\n"
6572 
6573         // Add to the bias values the product (depth * lhs_zero_point * rhs_zero_point),
6574         // See the term NZ1Z2 in equation (7) in https://arxiv.org/pdf/1712.05877.pdf
6575         "add v14.4s, v14.4s, v9.4s\n"
6576         "add v15.4s, v15.4s, v9.4s\n"
6577         // Perform the bias-addition (per the above, we have just folded into
6578         // the bias the (depth * lhs_zero_point * rhs_zero_point) term.)
6579         // Jump based on channel dimension.
6580         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
6581         "bne 6f\n"
6582         // Case where channels are rows
6583         "add v16.4s, v16.4s, v14.4s\n"
6584         "add v17.4s, v17.4s, v15.4s\n"
6585         "add v18.4s, v18.4s, v14.4s\n"
6586         "add v19.4s, v19.4s, v15.4s\n"
6587         "add v20.4s, v20.4s, v14.4s\n"
6588         "add v21.4s, v21.4s, v15.4s\n"
6589         "add v22.4s, v22.4s, v14.4s\n"
6590         "add v23.4s, v23.4s, v15.4s\n"
6591         "add v24.4s, v24.4s, v14.4s\n"
6592         "add v25.4s, v25.4s, v15.4s\n"
6593         "add v26.4s, v26.4s, v14.4s\n"
6594         "add v27.4s, v27.4s, v15.4s\n"
6595         "add v28.4s, v28.4s, v14.4s\n"
6596         "add v29.4s, v29.4s, v15.4s\n"
6597         "add v30.4s, v30.4s, v14.4s\n"
6598         "add v31.4s, v31.4s, v15.4s\n"
6599         "b 7f\n"
6600 
6601         "6:\n"
6602         // Case where channels are columns
6603         "dup v10.4s, v14.s[0]\n"
6604         "dup v11.4s, v14.s[1]\n"
6605         "add v16.4s, v16.4s, v10.4s\n"
6606         "dup v12.4s, v14.s[2]\n"
6607         "add v17.4s, v17.4s, v10.4s\n"
6608         "dup v13.4s, v14.s[3]\n"
6609         "add v18.4s, v18.4s, v11.4s\n"
6610         "dup v10.4s, v15.s[0]\n"
6611         "add v19.4s, v19.4s, v11.4s\n"
6612         "dup v11.4s, v15.s[1]\n"
6613         "add v20.4s, v20.4s, v12.4s\n"
6614         "add v21.4s, v21.4s, v12.4s\n"
6615         "dup v12.4s, v15.s[2]\n"
6616         "add v22.4s, v22.4s, v13.4s\n"
6617         "add v23.4s, v23.4s, v13.4s\n"
6618         "dup v13.4s, v15.s[3]\n"
6619         "add v24.4s, v24.4s, v10.4s\n"
6620         "add v25.4s, v25.4s, v10.4s\n"
6621         "add v26.4s, v26.4s, v11.4s\n"
6622         "add v27.4s, v27.4s, v11.4s\n"
6623         "add v28.4s, v28.4s, v12.4s\n"
6624         "add v29.4s, v29.4s, v12.4s\n"
6625         "add v30.4s, v30.4s, v13.4s\n"
6626         "add v31.4s, v31.4s, v13.4s\n"
6627         "7:\n"
6628 
6629         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_RHS_SUMS) "\n"
6630         "beq 401f\n"
6631         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_ZERO_POINT) "]\n"
6632         "dup v10.4s, w5\n"  // create lhs_zero_point_vec
6633         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_SUMS) "]\n"
6634         "add x5, x5, %x[col], lsl #2\n"
6635         // Load 8 rhs_sums values.
6636         "ld1 {v14.2s}, [x5], #8\n"
6637         "ldr x7, [x5], #8\n"
6638         "ld1 {v15.2s}, [x5], #8\n"
6639         "ins v14.d[1], x7\n"
6640         "ldr x7, [x5], #8\n"
6641         "ins v15.d[1], x7\n"
6642         // Subtract rhs_sums * lhs_zero_point, per
6643         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
6644         "mls v16.4s, v10.4s, v14.s[0]\n"
6645         "mls v17.4s, v10.4s, v14.s[0]\n"
6646         "mls v18.4s, v10.4s, v14.s[1]\n"
6647         "mls v19.4s, v10.4s, v14.s[1]\n"
6648         "mls v20.4s, v10.4s, v14.s[2]\n"
6649         "mls v21.4s, v10.4s, v14.s[2]\n"
6650         "mls v22.4s, v10.4s, v14.s[3]\n"
6651         "mls v23.4s, v10.4s, v14.s[3]\n"
6652         "mls v24.4s, v10.4s, v15.s[0]\n"
6653         "mls v25.4s, v10.4s, v15.s[0]\n"
6654         "mls v26.4s, v10.4s, v15.s[1]\n"
6655         "mls v27.4s, v10.4s, v15.s[1]\n"
6656         "mls v28.4s, v10.4s, v15.s[2]\n"
6657         "mls v29.4s, v10.4s, v15.s[2]\n"
6658         "mls v30.4s, v10.4s, v15.s[3]\n"
6659         "mls v31.4s, v10.4s, v15.s[3]\n"
6660         "401:\n"
6661 
6662         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_LHS_SUMS) "\n"
6663         "beq 402f\n"
6664         "ldr x2, [%[params], #" RUY_STR(RUY_OFFSET_LHS_SUMS) "]\n"
6665         "add x2, x2, %x[row], lsl #2\n"
6666         "ldr w5, [%[params], #" RUY_STR(RUY_OFFSET_RHS_ZERO_POINT) "]\n"
6667         "ins v13.s[1], w5\n" // rhs_zero_point
6668         // Load 8 lhs_sums values.
6669         "ld1 {v11.2s}, [x2], #8\n"
6670         "ldr x4, [x2], #8\n"
6671         "ins v11.d[1], x4\n"
6672         "ld1 {v12.2s}, [x2], #8\n"
6673         "ldr x4, [x2], #8\n"
6674         "ins v12.d[1], x4\n"
6675         // Compute lhs_sums * rhs_zero_point.
6676         "mul v11.4s, v11.4s, v13.s[1]\n"
6677         "mul v12.4s, v12.4s, v13.s[1]\n"
6678         // Subtract lhs_sums * rhs_zero_point, per
6679         // equation (7) in https://arxiv.org/pdf/1712.05877.pdf
6680         "sub v16.4s, v16.4s, v11.4s\n"
6681         "sub v17.4s, v17.4s, v12.4s\n"
6682         "sub v18.4s, v18.4s, v11.4s\n"
6683         "sub v19.4s, v19.4s, v12.4s\n"
6684         "sub v20.4s, v20.4s, v11.4s\n"
6685         "sub v21.4s, v21.4s, v12.4s\n"
6686         "sub v22.4s, v22.4s, v11.4s\n"
6687         "sub v23.4s, v23.4s, v12.4s\n"
6688         "sub v24.4s, v24.4s, v11.4s\n"
6689         "sub v25.4s, v25.4s, v12.4s\n"
6690         "sub v26.4s, v26.4s, v11.4s\n"
6691         "sub v27.4s, v27.4s, v12.4s\n"
6692         "sub v28.4s, v28.4s, v11.4s\n"
6693         "sub v29.4s, v29.4s, v12.4s\n"
6694         "sub v30.4s, v30.4s, v11.4s\n"
6695         "sub v31.4s, v31.4s, v12.4s\n"
6696 
6697         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT32) "\n"
6698         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT32) "f\n"
6699 
6700         "402:\n"
6701 
6702         // At this point we have computed the final int32 values. Now we
6703         // start down-quantizing them to obtain the final 8bit values from them.
6704 
6705         // As part of this down-quantization, our int32 values will be
6706         // multiplied by a multiplier that has a fixed-point component and an
6707         // exponent component.
6708 
6709         //Load the exponent part of the multiplier.
6710         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_EXPONENT) "]\n"
6711         // Compute the multiplier_exponent pointer
6712         "ldrb w6, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
6713         "tst w6, #" RUY_STR(RUY_ASM_FLAG_HAS_PERCHANNEL) "\n"
6714         "add x5, x1, x3, lsl #2\n"
6715         "csel x1, x1, x5, eq\n"
6716         // Load multiplier_exponent
6717         "ldr q9, [x1]\n"
6718         "ldr q10, [x1, #16]\n"
6719         // Separate positive and negative exponents
6720         "smin v11.4s, v8.4s, v9.4s\n"
6721         "smin v12.4s, v8.4s, v10.4s\n"
6722         "sub v9.4s, v9.4s, v11.4s\n"
6723         "sub v10.4s, v10.4s, v12.4s\n"
6724 
6725         // Compute the multiplier_fixedpoint pointer
6726         "ldr x4, [%[params], #" RUY_STR(RUY_OFFSET_MULTIPLIER_FIXEDPOINT) "]\n"
6727         "add x5, x4, x3, lsl #2\n"
6728         "csel x4, x4, x5, eq\n"
6729         // Load multiplier_fixedpoint
6730         "ldr q14, [x4]\n"
6731         "ldr q15, [x4, #16]\n"
6732 
6733         // Jump based on channel dimension.
6734         "tst w6, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
6735         "bne 8f\n"
6736         // Case where channels are rows
6737 
6738         // Apply the positive exponent part of the multiplier.
6739         "sshl v16.4s, v16.4s, v9.4s\n"
6740         "sshl v17.4s, v17.4s, v10.4s\n"
6741         "sshl v18.4s, v18.4s, v9.4s\n"
6742         "sshl v19.4s, v19.4s, v10.4s\n"
6743         "sshl v20.4s, v20.4s, v9.4s\n"
6744         "sshl v21.4s, v21.4s, v10.4s\n"
6745         "sshl v22.4s, v22.4s, v9.4s\n"
6746         "sshl v23.4s, v23.4s, v10.4s\n"
6747         "sshl v24.4s, v24.4s, v9.4s\n"
6748         "sshl v25.4s, v25.4s, v10.4s\n"
6749         "sshl v26.4s, v26.4s, v9.4s\n"
6750         "sshl v27.4s, v27.4s, v10.4s\n"
6751         "sshl v28.4s, v28.4s, v9.4s\n"
6752         "sshl v29.4s, v29.4s, v10.4s\n"
6753         "sshl v30.4s, v30.4s, v9.4s\n"
6754         "sshl v31.4s, v31.4s, v10.4s\n"
6755         "10:\n"
6756 
6757         // Apply the fixed-point part of the multiplier.
6758         //
6759         // ... and, interleaved into that:
6760         // Now that we know what LHS and RHS data the next iteration of the
6761         // main loop will need to load, we start loading the first 32 bytes of
6762         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
6763         // in the rest of the work on the current block.
6764         "ld1 {v0.8b}, [%[lhs_ptr]], #8\n"
6765         "sqdmulh v16.4s, v16.4s, v14.4s\n"
6766         "ldr x1, [%[lhs_ptr]], #8\n"
6767         "sqdmulh v17.4s, v17.4s, v15.4s\n"
6768         "ld1 {v1.8b}, [%[lhs_ptr]], #8\n"
6769         "sqdmulh v18.4s, v18.4s, v14.4s\n"
6770         "ldr x2, [%[lhs_ptr]], #8\n"
6771         "sqdmulh v19.4s, v19.4s, v15.4s\n"
6772         "ld1 {v2.8b}, [%[rhs_ptr]], #8\n"
6773         "sqdmulh v20.4s, v20.4s, v14.4s\n"
6774         "ldr x5, [%[rhs_ptr]], #8\n"
6775         "sqdmulh v21.4s, v21.4s, v15.4s\n"
6776         "ld1 {v3.8b}, [%[rhs_ptr]], #8\n"
6777         "sqdmulh v22.4s, v22.4s, v14.4s\n"
6778         "ldr x6, [%[rhs_ptr]], #8\n"
6779         "sqdmulh v23.4s, v23.4s, v15.4s\n"
6780         "sqdmulh v24.4s, v24.4s, v14.4s\n"
6781         "sqdmulh v25.4s, v25.4s, v15.4s\n"
6782         "sqdmulh v26.4s, v26.4s, v14.4s\n"
6783         "sqdmulh v27.4s, v27.4s, v15.4s\n"
6784         "sqdmulh v28.4s, v28.4s, v14.4s\n"
6785         "sqdmulh v29.4s, v29.4s, v15.4s\n"
6786         "sqdmulh v30.4s, v30.4s, v14.4s\n"
6787         "sqdmulh v31.4s, v31.4s, v15.4s\n"
6788 
6789         // Apply the negative exponent part of the multiplier.
6790         "srshl v16.4s, v16.4s, v11.4s\n"
6791         "srshl v17.4s, v17.4s, v12.4s\n"
6792         "srshl v18.4s, v18.4s, v11.4s\n"
6793         "srshl v19.4s, v19.4s, v12.4s\n"
6794         "srshl v20.4s, v20.4s, v11.4s\n"
6795         "srshl v21.4s, v21.4s, v12.4s\n"
6796         "srshl v22.4s, v22.4s, v11.4s\n"
6797         "srshl v23.4s, v23.4s, v12.4s\n"
6798         "srshl v24.4s, v24.4s, v11.4s\n"
6799         "srshl v25.4s, v25.4s, v12.4s\n"
6800         "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
6801         "srshl v26.4s, v26.4s, v11.4s\n"
6802         "ins v13.h[4], w4\n" // dst_zero_point
6803         "srshl v27.4s, v27.4s, v12.4s\n"
6804         "ins v0.d[1], x1\n"
6805         "srshl v28.4s, v28.4s, v11.4s\n"
6806         "ins v1.d[1], x2\n"
6807         "srshl v29.4s, v29.4s, v12.4s\n"
6808         "ins v2.d[1], x5\n"
6809         "srshl v30.4s, v30.4s, v11.4s\n"
6810         "ins v3.d[1], x6\n"
6811         "srshl v31.4s, v31.4s, v12.4s\n"
6812         "b 9f\n"
6813 
6814         "8:\n"
6815         // Case where channels are columns
6816 
6817         // Apply the positive exponent part of the multiplier.
6818         "dup v4.4s, v9.s[0]\n"
6819         "dup v5.4s, v9.s[1]\n"
6820         "sshl v16.4s, v16.4s, v4.4s\n"
6821         "dup v6.4s, v9.s[2]\n"
6822         "sshl v17.4s, v17.4s, v4.4s\n"
6823         "dup v7.4s, v9.s[3]\n"
6824         "sshl v18.4s, v18.4s, v5.4s\n"
6825         "dup v4.4s, v10.s[0]\n"
6826         "sshl v19.4s, v19.4s, v5.4s\n"
6827         "dup v5.4s, v10.s[1]\n"
6828         "sshl v20.4s, v20.4s, v6.4s\n"
6829         "sshl v21.4s, v21.4s, v6.4s\n"
6830         "dup v6.4s, v10.s[2]\n"
6831         "sshl v22.4s, v22.4s, v7.4s\n"
6832         "sshl v23.4s, v23.4s, v7.4s\n"
6833         "dup v7.4s, v10.s[3]\n"
6834         "sshl v24.4s, v24.4s, v4.4s\n"
6835         "sshl v25.4s, v25.4s, v4.4s\n"
6836         "sshl v26.4s, v26.4s, v5.4s\n"
6837         "sshl v27.4s, v27.4s, v5.4s\n"
6838         "sshl v28.4s, v28.4s, v6.4s\n"
6839         "sshl v29.4s, v29.4s, v6.4s\n"
6840         "sshl v30.4s, v30.4s, v7.4s\n"
6841         "sshl v31.4s, v31.4s, v7.4s\n"
6842         "11:\n"
6843 
6844         // Apply the fixed-point part of the multiplier.
6845         //
6846         // ... and, interleaved into that:
6847         // Now that we know what LHS and RHS data the next iteration of the
6848         // main loop will need to load, we start loading the first 32 bytes of
6849         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
6850         // in the rest of the work on the current block.
6851         "ld1 {v0.8b}, [%[lhs_ptr]], #8\n"
6852         "sqdmulh v16.4s, v16.4s, v14.s[0]\n"
6853         "ldr x1, [%[lhs_ptr]], #8\n"
6854         "sqdmulh v17.4s, v17.4s, v14.s[0]\n"
6855         "ld1 {v1.8b}, [%[lhs_ptr]], #8\n"
6856         "sqdmulh v18.4s, v18.4s, v14.s[1]\n"
6857         "ldr x2, [%[lhs_ptr]], #8\n"
6858         "sqdmulh v19.4s, v19.4s, v14.s[1]\n"
6859         "ld1 {v2.8b}, [%[rhs_ptr]], #8\n"
6860         "sqdmulh v20.4s, v20.4s, v14.s[2]\n"
6861         "ldr x5, [%[rhs_ptr]], #8\n"
6862         "sqdmulh v21.4s, v21.4s, v14.s[2]\n"
6863         "ld1 {v3.8b}, [%[rhs_ptr]], #8\n"
6864         "sqdmulh v22.4s, v22.4s, v14.s[3]\n"
6865         "ldr x6, [%[rhs_ptr]], #8\n"
6866         "sqdmulh v23.4s, v23.4s, v14.s[3]\n"
6867         "dup v4.4s, v11.s[0]\n"
6868         "sqdmulh v24.4s, v24.4s, v15.s[0]\n"
6869         "dup v5.4s, v11.s[1]\n"
6870         "sqdmulh v25.4s, v25.4s, v15.s[0]\n"
6871         "dup v6.4s, v11.s[2]\n"
6872         "sqdmulh v26.4s, v26.4s, v15.s[1]\n"
6873         "dup v7.4s, v11.s[3]\n"
6874         "sqdmulh v27.4s, v27.4s, v15.s[1]\n"
6875         "sqdmulh v28.4s, v28.4s, v15.s[2]\n"
6876         "sqdmulh v29.4s, v29.4s, v15.s[2]\n"
6877         "sqdmulh v30.4s, v30.4s, v15.s[3]\n"
6878         "sqdmulh v31.4s, v31.4s, v15.s[3]\n"
6879 
6880         // Apply the negative exponent part of the multiplier.
6881         "srshl v16.4s, v16.4s, v4.4s\n"
6882         "srshl v17.4s, v17.4s, v4.4s\n"
6883         "dup v4.4s, v12.s[0]\n"
6884         "srshl v18.4s, v18.4s, v5.4s\n"
6885         "srshl v19.4s, v19.4s, v5.4s\n"
6886         "dup v5.4s, v12.s[1]\n"
6887         "srshl v20.4s, v20.4s, v6.4s\n"
6888         "srshl v21.4s, v21.4s, v6.4s\n"
6889         "dup v6.4s, v12.s[2]\n"
6890         "srshl v22.4s, v22.4s, v7.4s\n"
6891         "srshl v23.4s, v23.4s, v7.4s\n"
6892         "dup v7.4s, v12.s[3]\n"
6893         "srshl v24.4s, v24.4s, v4.4s\n"
6894         "ldr w4, [%[params], #" RUY_STR(RUY_OFFSET_DST_ZERO_POINT) "]\n"
6895         "srshl v25.4s, v25.4s, v4.4s\n"
6896         "ins v13.h[4], w4\n" // dst_zero_point
6897         "srshl v26.4s, v26.4s, v5.4s\n"
6898         "ins v0.d[1], x1\n"
6899         "srshl v27.4s, v27.4s, v5.4s\n"
6900         "ins v1.d[1], x2\n"
6901         "srshl v28.4s, v28.4s, v6.4s\n"
6902         "ins v2.d[1], x5\n"
6903         "srshl v29.4s, v29.4s, v6.4s\n"
6904         "ins v3.d[1], x6\n"
6905         "srshl v30.4s, v30.4s, v7.4s\n"
6906         "srshl v31.4s, v31.4s, v7.4s\n"
6907         "9:\n"
6908 
6909         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT16) "\n"
6910         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT16) "f\n"
6911         "cmp %w[dst_type_id], #" RUY_STR(RUY_ASM_TYPE_ID_INT8) "\n"
6912         "beq " RUY_STR(RUY_ASM_LABEL_STORE_INT8) "f\n"
6913 
6914         RUY_STR(RUY_ASM_LABEL_STORE_UINT8) ":\n"
6915 
6916         // Cast-and-saturate from int32 to int16
6917         "sqxtn v16.4h, v16.4s\n"
6918         "sqxtn2 v16.8h, v17.4s\n"
6919         "sqxtn v17.4h, v18.4s\n"
6920         "sqxtn2 v17.8h, v19.4s\n"
6921         "sqxtn v18.4h, v20.4s\n"
6922         "sqxtn2 v18.8h, v21.4s\n"
6923         "sqxtn v19.4h, v22.4s\n"
6924         "sqxtn2 v19.8h, v23.4s\n"
6925         "sqxtn v20.4h, v24.4s\n"
6926         "sqxtn2 v20.8h, v25.4s\n"
6927         "sqxtn v21.4h, v26.4s\n"
6928         "sqxtn2 v21.8h, v27.4s\n"
6929         "sqxtn v22.4h, v28.4s\n"
6930         "sqxtn2 v22.8h, v29.4s\n"
6931         "sqxtn v23.4h, v30.4s\n"
6932         "sqxtn2 v23.8h, v31.4s\n"
6933 
6934         // Destination zero_point
6935         "dup v14.8h, v13.h[4]\n"
6936         // At this point, v24 -- v31 aren't used anymore for the current block,
6937         // so we can start clearing these accumulators for the next block
6938         // (next iteration of the main loop).
6939         RUY_MAKE_ZERO(v24)
6940         RUY_MAKE_ZERO(v25)
6941         RUY_MAKE_ZERO(v26)
6942         RUY_MAKE_ZERO(v27)
6943         RUY_MAKE_ZERO(v28)
6944         RUY_MAKE_ZERO(v29)
6945         RUY_MAKE_ZERO(v30)
6946         RUY_MAKE_ZERO(v31)
6947 
6948         // Add the destination zero point
6949         "sqadd v16.8h, v16.8h, v14.8h\n"
6950         "sqadd v17.8h, v17.8h, v14.8h\n"
6951         "sqadd v18.8h, v18.8h, v14.8h\n"
6952         "sqadd v19.8h, v19.8h, v14.8h\n"
6953         "sqadd v20.8h, v20.8h, v14.8h\n"
6954         "sqadd v21.8h, v21.8h, v14.8h\n"
6955         "sqadd v22.8h, v22.8h, v14.8h\n"
6956         "sqadd v23.8h, v23.8h, v14.8h\n"
6957 
6958         // Load the clamp_min, clamp_max bounds
6959         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
6960         // Cast-and-saturate from int16 to uint8
6961         "sqxtun v16.8b, v16.8h\n"
6962         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
6963         "sqxtun2 v16.16b, v17.8h\n"
6964         "sqxtun v17.8b, v18.8h\n"
6965         "sqxtun2 v17.16b, v19.8h\n"
6966         "sqxtun v18.8b, v20.8h\n"
6967         "sqxtun2 v18.16b, v21.8h\n"
6968         "sqxtun v19.8b, v22.8h\n"
6969         "sqxtun2 v19.16b, v23.8h\n"
6970 
6971         "dup v14.16b, w2\n"  // clamp_min
6972         "dup v15.16b, w3\n"  // clamp_max
6973 
6974         // Compute how much of the 8x8 block of destination 8bit values that
6975         // we have computed, fit in the destination matrix. Typically, all of
6976         // it fits, but when the destination matrix shape is not a multiple
6977         // of 8x8, there are some 8x8 blocks along the boundaries that do
6978         // not fit entirely.
6979         "sub w1, %w[dst_rows], %w[row]\n"
6980         // Apply the clamp_min bound
6981         "umax v16.16b, v16.16b, v14.16b\n"
6982         "sub w2, %w[dst_cols], %w[col]\n"
6983         "umax v17.16b, v17.16b, v14.16b\n"
6984         "mov w3, #8\n"
6985         "umax v18.16b, v18.16b, v14.16b\n"
6986         "cmp w1, #8\n"
6987         "umax v19.16b, v19.16b, v14.16b\n"
6988         // Compute w1 = how many rows of the 8x8 block fit
6989         "csel w1, w1, w3, le\n"
6990         // Apply the clamp_max bound
6991         "umin v16.16b, v16.16b, v15.16b\n"
6992         "cmp w2, #8\n"
6993         "umin v17.16b, v17.16b, v15.16b\n"
6994         // Compute w2 = how many cols of the 8x8 block fit
6995         "csel w2, w2, w3, le\n"
6996         "umin v18.16b, v18.16b, v15.16b\n"
6997         "umin v19.16b, v19.16b, v15.16b\n"
6998 
6999         // Make it so that all of the final 8bit values are stored in the
7000         // first 64bits of 128bit NEON registers, so they can be stored
7001         // by 64bit st1 store instructions with byte alignment.
7002         "dup d20, v16.d[1]\n"
7003         "dup d21, v17.d[1]\n"
7004         "dup d22, v18.d[1]\n"
7005         "dup d23, v19.d[1]\n"
7006 
7007         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
7008         "cmp w1, w3\n"
7009         "ccmp w2, w3, 0, eq\n"
7010         // Yes, all of the 8x8 block fits, go to fast path.
7011         "beq 30f\n"
7012         // Not all of the 8x8 block fits.
7013         // Set (x3 address, x4 stride) to write to dst_tmp_buf
7014         "mov x3, %[dst_tmp_buf]\n"
7015         "mov x4, #8\n"
7016         "b 31f\n"
7017         "30:\n"
7018         // Yes, all of the 8x8 block fits.
7019         // Set (x3 address, x4 stride) to write directly to destination matrix.
7020         "mov x3, %[dst_ptr]\n"
7021         "mov x4, x11\n"
7022         "31:\n"
7023 
7024         // Write our 8bit values to the destination described by
7025         // (x3 address, x4 stride).
7026         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7027         "st1 {v16.8b}, [x3], x4\n"
7028         RUY_MAKE_ZERO(v16)
7029         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7030         "st1 {v20.8b}, [x3], x4\n"
7031         RUY_MAKE_ZERO(v20)
7032         // For the next block: perform the first few multiply-adds on the data
7033         // that we have already loaded.
7034         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
7035         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7036         "st1 {v17.8b}, [x3], x4\n"
7037         RUY_MAKE_ZERO(v17)
7038         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
7039         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7040         "st1 {v21.8b}, [x3], x4\n"
7041         RUY_MAKE_ZERO(v21)
7042         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7043         "st1 {v18.8b}, [x3], x4\n"
7044         RUY_MAKE_ZERO(v18)
7045         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7046         "st1 {v22.8b}, [x3], x4\n"
7047         RUY_MAKE_ZERO(v22)
7048         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
7049         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7050         "st1 {v19.8b}, [x3], x4\n"
7051         RUY_MAKE_ZERO(v19)
7052         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
7053         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7054         "st1 {v23.8b}, [x3], x4\n"
7055         RUY_MAKE_ZERO(v23)
7056 
7057         // If all of the 8x8 block fits, we just finished writing it to the
7058         // destination, so we skip the next part.
7059         "beq 41f\n"
7060         // Not all of the 8x8 block fits in the destination matrix.  We just
7061         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
7062         // it to copy into the destination matrix the part that fits.
7063         "mov x3, %[dst_tmp_buf]\n"
7064         "mov x4, %[dst_ptr]\n"
7065         "mov w6, #0\n"
7066         "50:\n"
7067         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7068         "mov w5, #0\n"
7069         "51:\n"
7070         "ldrb w7, [x3, w5, uxtw]\n"
7071         "strb w7, [x4, w5, uxtw]\n"
7072         "add w5, w5, #1\n"
7073         "cmp w5, w1\n"
7074         "blt 51b\n"
7075         "add w6, w6, #1\n"
7076         "add x3, x3, #8\n"
7077         "add x4, x4, x11\n"
7078         "cmp w6, w2\n"
7079         "blt 50b\n"
7080         "41:\n"
7081         "add %[dst_ptr], %[dst_ptr], #8\n"
7082 
7083         // At this point we have completely finished writing values to the
7084         // destination matrix for the current block.
7085 
7086         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
7087 
7088         RUY_STR(RUY_ASM_LABEL_STORE_INT8) ":\n"
7089 
7090         // Cast-and-saturate from int32 to int16
7091         "sqxtn v16.4h, v16.4s\n"
7092         "sqxtn2 v16.8h, v17.4s\n"
7093         "sqxtn v17.4h, v18.4s\n"
7094         "sqxtn2 v17.8h, v19.4s\n"
7095         "sqxtn v18.4h, v20.4s\n"
7096         "sqxtn2 v18.8h, v21.4s\n"
7097         "sqxtn v19.4h, v22.4s\n"
7098         "sqxtn2 v19.8h, v23.4s\n"
7099         "sqxtn v20.4h, v24.4s\n"
7100         "sqxtn2 v20.8h, v25.4s\n"
7101         "sqxtn v21.4h, v26.4s\n"
7102         "sqxtn2 v21.8h, v27.4s\n"
7103         "sqxtn v22.4h, v28.4s\n"
7104         "sqxtn2 v22.8h, v29.4s\n"
7105         "sqxtn v23.4h, v30.4s\n"
7106         "sqxtn2 v23.8h, v31.4s\n"
7107 
7108         // Destination zero_point
7109         "dup v14.8h, v13.h[4]\n"
7110         // At this point, v24 -- v31 aren't used anymore for the current block,
7111         // so we can start clearing these accumulators for the next block
7112         // (next iteration of the main loop).
7113         RUY_MAKE_ZERO(v24)
7114         RUY_MAKE_ZERO(v25)
7115         RUY_MAKE_ZERO(v26)
7116         RUY_MAKE_ZERO(v27)
7117         RUY_MAKE_ZERO(v28)
7118         RUY_MAKE_ZERO(v29)
7119         RUY_MAKE_ZERO(v30)
7120         RUY_MAKE_ZERO(v31)
7121 
7122         // Add the destination zero point
7123         "sqadd v16.8h, v16.8h, v14.8h\n"
7124         "sqadd v17.8h, v17.8h, v14.8h\n"
7125         "sqadd v18.8h, v18.8h, v14.8h\n"
7126         "sqadd v19.8h, v19.8h, v14.8h\n"
7127         "sqadd v20.8h, v20.8h, v14.8h\n"
7128         "sqadd v21.8h, v21.8h, v14.8h\n"
7129         "sqadd v22.8h, v22.8h, v14.8h\n"
7130         "sqadd v23.8h, v23.8h, v14.8h\n"
7131 
7132         // Load the clamp_min, clamp_max bounds
7133         "ldrb w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
7134         // Cast-and-saturate from int16 to uint8
7135         "sqxtn v16.8b, v16.8h\n"
7136         "ldrb w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
7137         "sqxtn2 v16.16b, v17.8h\n"
7138         "sqxtn v17.8b, v18.8h\n"
7139         "sqxtn2 v17.16b, v19.8h\n"
7140         "sqxtn v18.8b, v20.8h\n"
7141         "sqxtn2 v18.16b, v21.8h\n"
7142         "sqxtn v19.8b, v22.8h\n"
7143         "sqxtn2 v19.16b, v23.8h\n"
7144 
7145         "dup v14.16b, w2\n"  // clamp_min
7146         "dup v15.16b, w3\n"  // clamp_max
7147 
7148         // Compute how much of the 8x8 block of destination 8bit values that
7149         // we have computed, fit in the destination matrix. Typically, all of
7150         // it fits, but when the destination matrix shape is not a multiple
7151         // of 8x8, there are some 8x8 blocks along the boundaries that do
7152         // not fit entirely.
7153         "sub w1, %w[dst_rows], %w[row]\n"
7154         // Apply the clamp_min bound
7155         "smax v16.16b, v16.16b, v14.16b\n"
7156         "sub w2, %w[dst_cols], %w[col]\n"
7157         "smax v17.16b, v17.16b, v14.16b\n"
7158         "mov w3, #8\n"
7159         "smax v18.16b, v18.16b, v14.16b\n"
7160         "cmp w1, #8\n"
7161         "smax v19.16b, v19.16b, v14.16b\n"
7162         // Compute w1 = how many rows of the 8x8 block fit
7163         "csel w1, w1, w3, le\n"
7164         // Apply the clamp_max bound
7165         "smin v16.16b, v16.16b, v15.16b\n"
7166         "cmp w2, #8\n"
7167         "smin v17.16b, v17.16b, v15.16b\n"
7168         // Compute w2 = how many cols of the 8x8 block fit
7169         "csel w2, w2, w3, le\n"
7170         "smin v18.16b, v18.16b, v15.16b\n"
7171         "smin v19.16b, v19.16b, v15.16b\n"
7172 
7173         // Make it so that all of the final 8bit values are stored in the
7174         // first 64bits of 128bit NEON registers, so they can be stored
7175         // by 64bit st1 store instructions with byte alignment.
7176         "dup d20, v16.d[1]\n"
7177         "dup d21, v17.d[1]\n"
7178         "dup d22, v18.d[1]\n"
7179         "dup d23, v19.d[1]\n"
7180 
7181         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
7182         "cmp w1, w3\n"
7183         "ccmp w2, w3, 0, eq\n"
7184         // Yes, all of the 8x8 block fits, go to fast path.
7185         "beq 130f\n"
7186         // Not all of the 8x8 block fits.
7187         // Set (x3 address, x4 stride) to write to dst_tmp_buf
7188         "mov x3, %[dst_tmp_buf]\n"
7189         "mov x4, #8\n"
7190         "b 131f\n"
7191         "130:\n"
7192         // Yes, all of the 8x8 block fits.
7193         // Set (x3 address, x4 stride) to write directly to destination matrix.
7194         "mov x3, %[dst_ptr]\n"
7195         "mov x4, x11\n"
7196         "131:\n"
7197 
7198         // Write our 8bit values to the destination described by
7199         // (x3 address, x4 stride).
7200         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7201         "st1 {v16.8b}, [x3], x4\n"
7202         RUY_MAKE_ZERO(v16)
7203         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7204         "st1 {v20.8b}, [x3], x4\n"
7205         RUY_MAKE_ZERO(v20)
7206         // For the next block: perform the first few multiply-adds on the data
7207         // that we have already loaded.
7208         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
7209         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7210         "st1 {v17.8b}, [x3], x4\n"
7211         RUY_MAKE_ZERO(v17)
7212         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
7213         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7214         "st1 {v21.8b}, [x3], x4\n"
7215         RUY_MAKE_ZERO(v21)
7216         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7217         "st1 {v18.8b}, [x3], x4\n"
7218         RUY_MAKE_ZERO(v18)
7219         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7220         "st1 {v22.8b}, [x3], x4\n"
7221         RUY_MAKE_ZERO(v22)
7222         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
7223         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7224         "st1 {v19.8b}, [x3], x4\n"
7225         RUY_MAKE_ZERO(v19)
7226         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
7227         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7228         "st1 {v23.8b}, [x3], x4\n"
7229         RUY_MAKE_ZERO(v23)
7230 
7231         // If all of the 8x8 block fits, we just finished writing it to the
7232         // destination, so we skip the next part.
7233         "beq 141f\n"
7234         // Not all of the 8x8 block fits in the destination matrix.  We just
7235         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
7236         // it to copy into the destination matrix the part that fits.
7237         "mov x3, %[dst_tmp_buf]\n"
7238         "mov x4, %[dst_ptr]\n"
7239         "mov w6, #0\n"
7240         "150:\n"
7241         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7242         "mov w5, #0\n"
7243         "151:\n"
7244         "ldrb w7, [x3, w5, uxtw]\n"
7245         "strb w7, [x4, w5, uxtw]\n"
7246         "add w5, w5, #1\n"
7247         "cmp w5, w1\n"
7248         "blt 151b\n"
7249         "add w6, w6, #1\n"
7250         "add x3, x3, #8\n"
7251         "add x4, x4, x11\n"
7252         "cmp w6, w2\n"
7253         "blt 150b\n"
7254         "141:\n"
7255         "add %[dst_ptr], %[dst_ptr], #8\n"
7256 
7257         // At this point we have completely finished writing values to the
7258         // destination matrix for the current block.
7259 
7260         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
7261 
7262         RUY_STR(RUY_ASM_LABEL_STORE_INT16) ":\n"
7263 
7264         // Add the destination zero point
7265         "dup v14.8h, v13.h[4]\n"
7266         "saddw v16.4s, v16.4s, v14.4h\n"
7267         "saddw v17.4s, v17.4s, v14.4h\n"
7268         "saddw v18.4s, v18.4s, v14.4h\n"
7269         "saddw v19.4s, v19.4s, v14.4h\n"
7270         "saddw v20.4s, v20.4s, v14.4h\n"
7271         "saddw v21.4s, v21.4s, v14.4h\n"
7272         "saddw v22.4s, v22.4s, v14.4h\n"
7273         "saddw v23.4s, v23.4s, v14.4h\n"
7274         "saddw v24.4s, v24.4s, v14.4h\n"
7275         "saddw v25.4s, v25.4s, v14.4h\n"
7276         "saddw v26.4s, v26.4s, v14.4h\n"
7277         "saddw v27.4s, v27.4s, v14.4h\n"
7278         "saddw v28.4s, v28.4s, v14.4h\n"
7279         "saddw v29.4s, v29.4s, v14.4h\n"
7280         "saddw v30.4s, v30.4s, v14.4h\n"
7281         "saddw v31.4s, v31.4s, v14.4h\n"
7282 
7283         // Cast-and-saturate from int32 to int16
7284         "sqxtn v16.4h, v16.4s\n"
7285         "sqxtn2 v16.8h, v17.4s\n"
7286         "sqxtn v17.4h, v18.4s\n"
7287         "sqxtn2 v17.8h, v19.4s\n"
7288         "sqxtn v18.4h, v20.4s\n"
7289         "sqxtn2 v18.8h, v21.4s\n"
7290         "sqxtn v19.4h, v22.4s\n"
7291         "sqxtn2 v19.8h, v23.4s\n"
7292         "sqxtn v20.4h, v24.4s\n"
7293         "sqxtn2 v20.8h, v25.4s\n"
7294         "sqxtn v21.4h, v26.4s\n"
7295         "sqxtn2 v21.8h, v27.4s\n"
7296         "sqxtn v22.4h, v28.4s\n"
7297         "sqxtn2 v22.8h, v29.4s\n"
7298         "sqxtn v23.4h, v30.4s\n"
7299         "sqxtn2 v23.8h, v31.4s\n"
7300 
7301         // At this point, v24 -- v31 aren't used anymore for the current block,
7302         // so we can start clearing these accumulators for the next block
7303         // (next iteration of the main loop).
7304         RUY_MAKE_ZERO(v24)
7305         RUY_MAKE_ZERO(v25)
7306         RUY_MAKE_ZERO(v26)
7307         RUY_MAKE_ZERO(v27)
7308         RUY_MAKE_ZERO(v28)
7309         RUY_MAKE_ZERO(v29)
7310         RUY_MAKE_ZERO(v30)
7311         RUY_MAKE_ZERO(v31)
7312 
7313         // Load the clamp_min, clamp_max bounds
7314         "ldrsh w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
7315         "ldrsh w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
7316         "dup v14.8h, w2\n"  // clamp_min
7317         "dup v15.8h, w3\n"  // clamp_max
7318 
7319         // Apply the clamp_min bound
7320         "smax v16.8h, v16.8h, v14.8h\n"
7321         "smax v17.8h, v17.8h, v14.8h\n"
7322         "smax v18.8h, v18.8h, v14.8h\n"
7323         "smax v19.8h, v19.8h, v14.8h\n"
7324         "smax v20.8h, v20.8h, v14.8h\n"
7325         "smax v21.8h, v21.8h, v14.8h\n"
7326         "smax v22.8h, v22.8h, v14.8h\n"
7327         "smax v23.8h, v23.8h, v14.8h\n"
7328         // Apply the clamp_max bound
7329         "smin v16.8h, v16.8h, v15.8h\n"
7330         "smin v17.8h, v17.8h, v15.8h\n"
7331         "smin v18.8h, v18.8h, v15.8h\n"
7332         "smin v19.8h, v19.8h, v15.8h\n"
7333         "smin v20.8h, v20.8h, v15.8h\n"
7334         "smin v21.8h, v21.8h, v15.8h\n"
7335         "smin v22.8h, v22.8h, v15.8h\n"
7336         "smin v23.8h, v23.8h, v15.8h\n"
7337 
7338         // Compute how much of the 8x8 block of destination 16bit values that
7339         // we have computed, fit in the destination matrix. Typically, all of
7340         // it fits, but when the destination matrix shape is not a multiple
7341         // of 8x8, there are some 8x8 blocks along the boundaries that do
7342         // not fit entirely.
7343         "sub w1, %w[dst_rows], %w[row]\n"
7344         "sub w2, %w[dst_cols], %w[col]\n"
7345         "mov w3, #8\n"
7346         "cmp w1, #8\n"
7347         // Compute w1 = how many rows of the 8x8 block fit
7348         "csel w1, w1, w3, le\n"
7349         "cmp w2, #8\n"
        // Compute w2 = how many cols of the 8x8 block fit
7351         "csel w2, w2, w3, le\n"
7352 
7353         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
7354         "cmp w1, w3\n"
7355         "ccmp w2, w3, 0, eq\n"
7356         // Yes, all of the 8x8 block fits, go to fast path.
7357         "beq 230f\n"
7358         // Not all of the 8x8 block fits.
7359         // Set (x3 address, x4 stride) to write to dst_tmp_buf
7360         "mov x3, %[dst_tmp_buf]\n"
7361         "mov x4, #16\n"
7362         "b 231f\n"
7363         "230:\n"
7364         // Yes, all of the 8x8 block fits.
7365         // Set (x3 address, x4 stride) to write directly to destination matrix.
7366         "mov x3, %[dst_ptr]\n"
7367         "mov x4, x11\n"
7368         "231:\n"
7369 
        // Write our 16bit values to the destination described by
7371         // (x3 address, x4 stride).
7372         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7373         "st1 {v16.8h}, [x3], x4\n"
7374         RUY_MAKE_ZERO(v16)
7375         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7376         "st1 {v17.8h}, [x3], x4\n"
7377         RUY_MAKE_ZERO(v17)
7378         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7379         "st1 {v18.8h}, [x3], x4\n"
7380         RUY_MAKE_ZERO(v18)
7381         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7382         "st1 {v19.8h}, [x3], x4\n"
7383         RUY_MAKE_ZERO(v19)
7384         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7385         "st1 {v20.8h}, [x3], x4\n"
7386         RUY_MAKE_ZERO(v20)
7387         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7388         "st1 {v21.8h}, [x3], x4\n"
7389         RUY_MAKE_ZERO(v21)
7390         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7391         "st1 {v22.8h}, [x3], x4\n"
7392         RUY_MAKE_ZERO(v22)
7393         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
7394         "st1 {v23.8h}, [x3], x4\n"
7395         RUY_MAKE_ZERO(v23)
7396 
7397         // For the next block: perform the first few multiply-adds on the data
7398         // that we have already loaded.
7399         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
7400         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
7401         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
7402         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
7403 
7404         // If all of the 8x8 block fits, we just finished writing it to the
7405         // destination, so we skip the next part.
7406         "beq 241f\n"
7407         // Not all of the 8x8 block fits in the destination matrix.  We just
7408         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
7409         // it to copy into the destination matrix the part that fits.
7410         "mov x3, %[dst_tmp_buf]\n"
7411         "mov x4, %[dst_ptr]\n"
7412         "mov w6, #0\n"
7413         "250:\n"
7414         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7415         "mov w5, #0\n"
7416         "251:\n"
7417         "ldrsh w7, [x3, x5, lsl #1]\n"
7418         "strh w7, [x4, x5, lsl #1]\n"
7419         "add w5, w5, #1\n"
7420         "cmp w5, w1\n"
7421         "blt 251b\n"
7422         "add w6, w6, #1\n"
7423         "add x3, x3, #16\n"
7424         "add x4, x4, x11\n"
7425         "cmp w6, w2\n"
7426         "blt 250b\n"
7427         "241:\n"
7428         "add %[dst_ptr], %[dst_ptr], #16\n"
7429         // At this point we have completely finished writing values to the
7430         // destination matrix for the current block.
7431 
7432         "b " RUY_STR(RUY_ASM_LABEL_AFTER_STORE) "f\n"
7433 
7434         RUY_STR(RUY_ASM_LABEL_STORE_INT32) ":\n"
7435 
7436         "ld1 {v0.8b}, [%[lhs_ptr]], #8\n"
7437         "ldr x1, [%[lhs_ptr]], #8\n"
7438         "ld1 {v1.8b}, [%[lhs_ptr]], #8\n"
7439         "ldr x2, [%[lhs_ptr]], #8\n"
7440         "ld1 {v2.8b}, [%[rhs_ptr]], #8\n"
7441         "ldr x5, [%[rhs_ptr]], #8\n"
7442         "ld1 {v3.8b}, [%[rhs_ptr]], #8\n"
7443         "ldr x6, [%[rhs_ptr]], #8\n"
7444         "ins v0.d[1], x1\n"
7445         "ins v1.d[1], x2\n"
7446         "ins v2.d[1], x5\n"
7447         "ins v3.d[1], x6\n"
7448 
7449         // Since the store type is the same as the accum type, no need for
7450         // downcast. There's also no need for clamp by min/max.
7451 
        // Compute how much of the 8x8 block of destination 32bit values that
7453         // we have computed, fit in the destination matrix. Typically, all of
7454         // it fits, but when the destination matrix shape is not a multiple
7455         // of 8x8, there are some 8x8 blocks along the boundaries that do
7456         // not fit entirely.
7457         "sub w1, %w[dst_rows], %w[row]\n"
7458         "sub w2, %w[dst_cols], %w[col]\n"
7459         "mov w3, #8\n"
7460         "cmp w1, #8\n"
7461         // Compute w1 = how many rows of the 8x8 block fit
7462         "csel w1, w1, w3, le\n"
7463         "cmp w2, #8\n"
        // Compute w2 = how many cols of the 8x8 block fit
7465         "csel w2, w2, w3, le\n"
7466 
7467         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
7468         "cmp w1, w3\n"
7469         "ccmp w2, w3, 0, eq\n"
7470         // Yes, all of the 8x8 block fits, go to fast path.
7471         "beq 330f\n"
7472         // Not all of the 8x8 block fits.
7473         // Write to dst_tmp_buf
7474         "mov x3, %[dst_tmp_buf]\n"
7475         "st1 {v16.4s}, [x3], #16\n"
7476         RUY_MAKE_ZERO(v16)
7477         "st1 {v17.4s}, [x3], #16\n"
7478         RUY_MAKE_ZERO(v17)
7479         "st1 {v18.4s}, [x3], #16\n"
7480         RUY_MAKE_ZERO(v18)
7481         "st1 {v19.4s}, [x3], #16\n"
7482         RUY_MAKE_ZERO(v19)
7483         "st1 {v20.4s}, [x3], #16\n"
7484         RUY_MAKE_ZERO(v20)
7485         "st1 {v21.4s}, [x3], #16\n"
7486         RUY_MAKE_ZERO(v21)
7487         "st1 {v22.4s}, [x3], #16\n"
7488         RUY_MAKE_ZERO(v22)
7489         "st1 {v23.4s}, [x3], #16\n"
7490         RUY_MAKE_ZERO(v23)
7491         "st1 {v24.4s}, [x3], #16\n"
7492         RUY_MAKE_ZERO(v24)
7493         "st1 {v25.4s}, [x3], #16\n"
7494         RUY_MAKE_ZERO(v25)
7495         "st1 {v26.4s}, [x3], #16\n"
7496         RUY_MAKE_ZERO(v26)
7497         "st1 {v27.4s}, [x3], #16\n"
7498         RUY_MAKE_ZERO(v27)
7499         "st1 {v28.4s}, [x3], #16\n"
7500         RUY_MAKE_ZERO(v28)
7501         "st1 {v29.4s}, [x3], #16\n"
7502         RUY_MAKE_ZERO(v29)
7503         "st1 {v30.4s}, [x3], #16\n"
7504         RUY_MAKE_ZERO(v30)
7505         "st1 {v31.4s}, [x3], #16\n"
7506         RUY_MAKE_ZERO(v31)
7507 
7508         "b 331f\n"
7509 
7510         "330:\n"
7511         // Yes, all of the 8x8 block fits.
7512         "mov x4, %[dst_ptr]\n"
7513         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7514         "st1 {v16.4s, v17.4s}, [x4], x11\n"
7515         RUY_MAKE_ZERO(v16)
7516         RUY_MAKE_ZERO(v17)
7517         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7518         "st1 {v18.4s, v19.4s}, [x4], x11\n"
7519         RUY_MAKE_ZERO(v18)
7520         RUY_MAKE_ZERO(v19)
7521         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7522         "st1 {v20.4s, v21.4s}, [x4], x11\n"
7523         RUY_MAKE_ZERO(v20)
7524         RUY_MAKE_ZERO(v21)
7525         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7526         "st1 {v22.4s, v23.4s}, [x4], x11\n"
7527         RUY_MAKE_ZERO(v22)
7528         RUY_MAKE_ZERO(v23)
7529         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7530         "st1 {v24.4s, v25.4s}, [x4], x11\n"
7531         RUY_MAKE_ZERO(v24)
7532         RUY_MAKE_ZERO(v25)
7533         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7534         "st1 {v26.4s, v27.4s}, [x4], x11\n"
7535         RUY_MAKE_ZERO(v26)
7536         RUY_MAKE_ZERO(v27)
7537         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7538         "st1 {v28.4s, v29.4s}, [x4], x11\n"
7539         RUY_MAKE_ZERO(v28)
7540         RUY_MAKE_ZERO(v29)
7541         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7542         "st1 {v30.4s, v31.4s}, [x4], x11\n"
7543         RUY_MAKE_ZERO(v30)
7544         RUY_MAKE_ZERO(v31)
7545 
7546         "331:\n"
7547 
7548         // For the next block: perform the first few multiply-adds on the data
7549         // that we have already loaded.
7550         ".word 0x4f82e010  // sdot v16.4s, v0.16b, v2.4b[0]\n"
7551         ".word 0x4fa2e012  // sdot v18.4s, v0.16b, v2.4b[1]\n"
7552         ".word 0x4f82e814  // sdot v20.4s, v0.16b, v2.4b[2]\n"
7553         ".word 0x4fa2e816  // sdot v22.4s, v0.16b, v2.4b[3]\n"
7554 
7555         // If all of the 8x8 block fits, we just finished writing it to the
7556         // destination, so we skip the next part.
7557         "beq 341f\n"
7558 
7559         // Not all of the 8x8 block fits in the destination matrix.  We just
7560         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
7561         // it to copy into the destination matrix the part that fits.
7562         "mov x3, %[dst_tmp_buf]\n"
7563         "mov x4, %[dst_ptr]\n"
7564         "mov w6, #0\n"
7565         "350:\n"
7566         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
7567         "mov w5, #0\n"
7568         "351:\n"
7569         "ldr w7, [x3, x5, lsl #2]\n"
7570         "str w7, [x4, x5, lsl #2]\n"
7571         "add w5, w5, #1\n"
7572         "cmp w5, w1\n"
7573         "blt 351b\n"
7574         "add w6, w6, #1\n"
7575         "add x3, x3, #32\n"
7576         "add x4, x4, x11\n"
7577         "cmp w6, w2\n"
7578         "blt 350b\n"
7579         "341:\n"
7580         "add %[dst_ptr], %[dst_ptr], #32\n"
7581         // At this point we have completely finished writing values to the
7582         // destination matrix for the current block.
7583 
7584         RUY_STR(RUY_ASM_LABEL_AFTER_STORE) ":\n"
7585 
7586         // Reload some params --- we had used x5 -- x7 for a few other things
7587         // since the last time we had loaded them.
7588         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
7589         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
7590 
7591         // Move to the next block of the destination matrix, for the next iter
7592         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
7593         // been updated earlier.
7594         // Have we reached the end row?
7595         "cmp %w[row], w7\n"
7596         "beq 20f\n"  // yes, end row.
7597         // Not end row. Move to the next row.
7598         "add %w[row], %w[row], #8\n"
7599         "b 21f\n"
7600         "20:\n"
7601         // Was already at end row.
7602         "mov %w[row], w6\n"  // Move back to first row.
7603         "add %w[col], %w[col], #8\n"  // Move to the next column.
7604         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
7605         "mov %[dst_ptr], %[dst_col_ptr]\n"
7606         "21:\n"
7607 
7608         // Main loop exit condition: have we hit the end column?
7609         "cmp %w[col], w8\n"
7610         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
7611         "ble 1b\n"
7612 
7613         // clang-format on
7614 
7615         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
7616           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
7617           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
7618         : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
7619           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf),
7620           [dst_type_id] "r"(params.dst_type_id)
7621         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
7622           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
7623           "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
7624           "v26", "v27", "v28", "v29", "v30", "v31");
7625 }
7626 #undef RUY_OFFSET_BIAS
7627 #undef RUY_OFFSET_LHS_SUMS
7628 #undef RUY_OFFSET_RHS_SUMS
7629 #undef RUY_OFFSET_LHS_BASE_PTR
7630 #undef RUY_OFFSET_MULTIPLIER_FIXEDPOINT
7631 #undef RUY_OFFSET_MULTIPLIER_EXPONENT
7632 #undef RUY_OFFSET_RHS_BASE_PTR
7633 #undef RUY_OFFSET_DST_BASE_PTR
7634 #undef RUY_OFFSET_LHS_ZERO_POINT
7635 #undef RUY_OFFSET_RHS_ZERO_POINT
7636 #undef RUY_OFFSET_DST_ZERO_POINT
7637 #undef RUY_OFFSET_PROD_ZP_DEPTH
7638 #undef RUY_OFFSET_START_ROW
7639 #undef RUY_OFFSET_START_COL
7640 #undef RUY_OFFSET_LAST_ROW
7641 #undef RUY_OFFSET_LAST_COL
7642 #undef RUY_OFFSET_DST_ROWS
7643 #undef RUY_OFFSET_DST_COLS
7644 #undef RUY_OFFSET_LHS_STRIDE
7645 #undef RUY_OFFSET_RHS_STRIDE
7646 #undef RUY_OFFSET_DST_STRIDE
7647 #undef RUY_OFFSET_DEPTH
7648 #undef RUY_OFFSET_CLAMP_MIN
7649 #undef RUY_OFFSET_CLAMP_MAX
7650 #undef RUY_OFFSET_FLAGS
7651 
7652 #define RUY_OFFSET_LHS_BASE_PTR 0
7653 #define RUY_OFFSET_RHS_BASE_PTR 8
7654 #define RUY_OFFSET_DST_BASE_PTR 16
7655 #define RUY_OFFSET_BIAS 24
7656 #define RUY_OFFSET_START_ROW 32
7657 #define RUY_OFFSET_START_COL 36
7658 #define RUY_OFFSET_LAST_ROW 40
7659 #define RUY_OFFSET_LAST_COL 44
7660 #define RUY_OFFSET_LHS_STRIDE 56
7661 #define RUY_OFFSET_RHS_STRIDE 60
7662 #define RUY_OFFSET_DST_STRIDE 64
7663 #define RUY_OFFSET_DEPTH 68
7664 #define RUY_OFFSET_CLAMP_MIN 72
7665 #define RUY_OFFSET_CLAMP_MAX 76
7666 #define RUY_OFFSET_FLAGS 80
7667 
// Compile-time verification that the RUY_OFFSET_* constants hard-coded for
// the float kernel's inline asm match the actual field offsets of the
// kernel-params struct. The asm below addresses fields as
// [%[params], #RUY_OFFSET_...], so a silent layout change in the struct
// would make the asm read the wrong fields; these static_asserts turn that
// into a build error instead. The function is never meant to do anything at
// runtime — it exists only so the asserts are instantiated for the concrete
// Params type actually used by the kernel.
template <typename Params>
void CheckOffsetsInKernelParamsFloat(const Params&) {
  static_assert(offsetof(Params, lhs_base_ptr) == RUY_OFFSET_LHS_BASE_PTR, "");
  static_assert(offsetof(Params, rhs_base_ptr) == RUY_OFFSET_RHS_BASE_PTR, "");
  static_assert(offsetof(Params, dst_base_ptr) == RUY_OFFSET_DST_BASE_PTR, "");
  static_assert(offsetof(Params, bias) == RUY_OFFSET_BIAS, "");
  static_assert(offsetof(Params, start_row) == RUY_OFFSET_START_ROW, "");
  static_assert(offsetof(Params, start_col) == RUY_OFFSET_START_COL, "");
  static_assert(offsetof(Params, last_row) == RUY_OFFSET_LAST_ROW, "");
  static_assert(offsetof(Params, last_col) == RUY_OFFSET_LAST_COL, "");
  static_assert(offsetof(Params, lhs_stride) == RUY_OFFSET_LHS_STRIDE, "");
  static_assert(offsetof(Params, rhs_stride) == RUY_OFFSET_RHS_STRIDE, "");
  static_assert(offsetof(Params, dst_stride) == RUY_OFFSET_DST_STRIDE, "");
  static_assert(offsetof(Params, depth) == RUY_OFFSET_DEPTH, "");
  static_assert(offsetof(Params, clamp_min) == RUY_OFFSET_CLAMP_MIN, "");
  static_assert(offsetof(Params, clamp_max) == RUY_OFFSET_CLAMP_MAX, "");
  static_assert(offsetof(Params, flags) == RUY_OFFSET_FLAGS, "");
}
7686 
7687 // Just a plain float kernel; good enough for out-of-order cores.
7688 // The closest to it in the gemmlowp collection would be
7689 // NEON_64bit_GEMM_Float32_WithScalar,
7690 // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L3925
7691 //
7692 // Besides ruy-ification, the main nuance here is that we stick to a 8x8
7693 // width instead of the wider 12x8 that the register space permits and that
7694 // the aforementioned gemmlowp kernel uses.  Ruy likes powers of two for now
7695 // and we don't have evidence that going beyond 8x8 is needed.
KernelFloatNeon(const KernelParamsFloat<8,8> & params)7696 void KernelFloatNeon(const KernelParamsFloat<8, 8>& params) {
7697   CheckOffsetsInKernelParamsFloat(params);
7698   profiler::ScopeLabel label("Kernel (kNeon)");
7699 
7700   const float* lhs_col_ptr = params.lhs_base_ptr;
7701   const float* rhs_col_ptr = params.rhs_base_ptr;
7702   const float* lhs_ptr = lhs_col_ptr;
7703   const float* rhs_ptr = rhs_col_ptr;
7704   float* dst_col_ptr = params.dst_base_ptr;
7705   float* dst_ptr = dst_col_ptr;
7706   int row = params.start_row;
7707   int col = params.start_col;
7708 
7709   // The asm kernel below has the following NEON register allocation:
7710   //
7711   // v16 -- v31 are accumulators.
7712   // During accumulation, v0 -- v15 are used to load data from LHS and RHS.
7713   // At least v0 and v1 are used to load a 8x1 block of LHS, and v2 and
7714   // v3 are used to load a 1x8 block of RHS, like this:
7715   //
7716   //                                          RHS 1x8 block
7717   //                           /-----------------------------------------|
7718   //                           |v2.s[0] ... v2.s[3]   v3.s[0] ... v3.s[3]|
7719   //                           \-----------------------------------------/
7720   //        LHS 8x1 block
7721   //  /---------------------\  /-----------------------------------------|
7722   //  |        v0.s[0]      |  |v16.s[0]           ...           v30.s[0]|
7723   //  |         ...         |  |  ...                              ...   |
7724   //  |        v0.s[3]      |  |v16.s[3]           ...           v30.s[3]|
7725   //  |        v1.s[0]      |  |v17.s[0]           ...           v31.s[0]|
7726   //  |         ...         |  |  ...                              ...   |
7727   //  |        v1.s[3]      |  |v17.s[3]           ...           v31.s[3]|
7728   //  \---------------------/  \-----------------------------------------/
7729   //                                      accumulators 8x8 block
7730   //
7731   // In the RUY_OPT_MAX_STREAMING part of the kernel, this elementary step
7732   // is repeated 4 times, using 4x more registers for LHS and RHS, so that
7733   // is where instead of using v0 -- v3 for LHS and RHS, we use v0 -- v15.
7734   //
7735   // Outside of the RUY_OPT_MAX_STREAMING part of the kernel, v4 -- v7 are
  // unused, and v8 -- v15 are used for loading parameters used for the
7737   // post-accumulation part of the kernel.
7738   asm volatile(
7739 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
7740 
7741         // clang-format off
7742 
7743         // Load some parameters into registers.
7744         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
7745         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
7746         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
7747         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
7748         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
7749         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
7750         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
7751         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
7752 
7753         // Load the first 32 bytes of LHS and RHS data.
7754         "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
7755         "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
7756         "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
7757         "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
7758 
7759         // Clear accumulators.
7760         RUY_MAKE_ZERO(v16)
7761         RUY_MAKE_ZERO(v17)
7762         RUY_MAKE_ZERO(v18)
7763         RUY_MAKE_ZERO(v19)
7764         RUY_MAKE_ZERO(v20)
7765         RUY_MAKE_ZERO(v21)
7766         RUY_MAKE_ZERO(v22)
7767         RUY_MAKE_ZERO(v23)
7768         RUY_MAKE_ZERO(v24)
7769         RUY_MAKE_ZERO(v25)
7770         RUY_MAKE_ZERO(v26)
7771         RUY_MAKE_ZERO(v27)
7772         RUY_MAKE_ZERO(v28)
7773         RUY_MAKE_ZERO(v29)
7774         RUY_MAKE_ZERO(v30)
7775         RUY_MAKE_ZERO(v31)
7776 
7777         // w1 is the number of levels of depth that we have already loaded
7778         // LHS and RHS data for. Corresponding to the initial ld1 instructions
7779         // above, this is currently 1.
7780         "mov w1, #1\n"
7781 
7782         // Main loop of the whole GEMM, over rows and columns of the
7783         // destination matrix.
7784         "1:\n"
7785 
7786         "fmla v16.4s, v0.4s, v2.s[0]\n"
7787         "fmla v18.4s, v0.4s, v2.s[1]\n"
7788         "fmla v20.4s, v0.4s, v2.s[2]\n"
7789         "fmla v22.4s, v0.4s, v2.s[3]\n"
7790 
7791 #if RUY_OPT(MAX_STREAMING)
7792         "cmp w12, #8\n"
7793         "blt 78f\n"
7794         "and w2, w12, #-4\n"
7795 
7796         "ld1 {v4.4s}, [%[lhs_ptr]], #16\n"
7797         "ld1 {v5.4s}, [%[lhs_ptr]], #16\n"
7798         "ld1 {v6.4s}, [%[rhs_ptr]], #16\n"
7799         "ld1 {v7.4s}, [%[rhs_ptr]], #16\n"
7800 
7801         "ld1 {v8.4s}, [%[lhs_ptr]], #16\n"
7802         "ld1 {v9.4s}, [%[lhs_ptr]], #16\n"
7803         "ld1 {v10.4s}, [%[rhs_ptr]], #16\n"
7804         "ld1 {v11.4s}, [%[rhs_ptr]], #16\n"
7805 
7806         "ld1 {v12.4s}, [%[lhs_ptr]], #16\n"
7807         "ld1 {v13.4s}, [%[lhs_ptr]], #16\n"
7808         "ld1 {v14.4s}, [%[rhs_ptr]], #16\n"
7809         "ld1 {v15.4s}, [%[rhs_ptr]], #16\n"
7810         "mov w1, #4\n"
7811 
7812         "80:\n"
7813 
7814         "add %[lhs_ptr], %[lhs_ptr], #128\n"
7815         "add %[rhs_ptr], %[rhs_ptr], #128\n"
7816 
7817         "fmla v24.4s, v0.4s, v3.s[0]\n"
7818         "fmla v26.4s, v0.4s, v3.s[1]\n"
7819         "fmla v28.4s, v0.4s, v3.s[2]\n"
7820         "fmla v30.4s, v0.4s, v3.s[3]\n"
7821         "ldr q0, [%[lhs_ptr], #-128]\n"
7822         "fmla v25.4s, v1.4s, v3.s[0]\n"
7823         "fmla v27.4s, v1.4s, v3.s[1]\n"
7824         "fmla v29.4s, v1.4s, v3.s[2]\n"
7825         "fmla v31.4s, v1.4s, v3.s[3]\n"
7826         "ldr q3, [%[rhs_ptr], #-112]\n"
7827         "fmla v17.4s, v1.4s, v2.s[0]\n"
7828         "fmla v19.4s, v1.4s, v2.s[1]\n"
7829         "fmla v21.4s, v1.4s, v2.s[2]\n"
7830         "fmla v23.4s, v1.4s, v2.s[3]\n"
7831         "ldr q1, [%[lhs_ptr], #-112]\n"
7832         "fmla v16.4s, v4.4s, v6.s[0]\n"
7833         "fmla v18.4s, v4.4s, v6.s[1]\n"
7834         "ldr q2, [%[rhs_ptr], #-128]\n"
7835         "fmla v20.4s, v4.4s, v6.s[2]\n"
7836         "fmla v22.4s, v4.4s, v6.s[3]\n"
7837 
7838         "fmla v24.4s, v4.4s, v7.s[0]\n"
7839         "fmla v26.4s, v4.4s, v7.s[1]\n"
7840         "fmla v28.4s, v4.4s, v7.s[2]\n"
7841         "fmla v30.4s, v4.4s, v7.s[3]\n"
7842         "ldr q4, [%[lhs_ptr], #-96]\n"
7843         "fmla v25.4s, v5.4s, v7.s[0]\n"
7844         "fmla v27.4s, v5.4s, v7.s[1]\n"
7845         "fmla v29.4s, v5.4s, v7.s[2]\n"
7846         "fmla v31.4s, v5.4s, v7.s[3]\n"
7847         "ldr q7, [%[rhs_ptr], #-80]\n"
7848         "fmla v17.4s, v5.4s, v6.s[0]\n"
7849         "fmla v19.4s, v5.4s, v6.s[1]\n"
7850         "fmla v21.4s, v5.4s, v6.s[2]\n"
7851         "fmla v23.4s, v5.4s, v6.s[3]\n"
7852         "ldr q5, [%[lhs_ptr], #-80]\n"
7853         "fmla v16.4s, v8.4s, v10.s[0]\n"
7854         "fmla v18.4s, v8.4s, v10.s[1]\n"
7855         "ldr q6, [%[rhs_ptr], #-96]\n"
7856         "fmla v20.4s, v8.4s, v10.s[2]\n"
7857         "fmla v22.4s, v8.4s, v10.s[3]\n"
7858 
7859         "fmla v24.4s, v8.4s, v11.s[0]\n"
7860         "fmla v26.4s, v8.4s, v11.s[1]\n"
7861         "fmla v28.4s, v8.4s, v11.s[2]\n"
7862         "fmla v30.4s, v8.4s, v11.s[3]\n"
7863         "ldr q8, [%[lhs_ptr], #-64]\n"
7864         "fmla v25.4s, v9.4s, v11.s[0]\n"
7865         "fmla v27.4s, v9.4s, v11.s[1]\n"
7866         "fmla v29.4s, v9.4s, v11.s[2]\n"
7867         "fmla v31.4s, v9.4s, v11.s[3]\n"
7868         "ldr q11, [%[rhs_ptr], #-48]\n"
7869         "fmla v17.4s, v9.4s, v10.s[0]\n"
7870         "fmla v19.4s, v9.4s, v10.s[1]\n"
7871         "fmla v21.4s, v9.4s, v10.s[2]\n"
7872         "fmla v23.4s, v9.4s, v10.s[3]\n"
7873         "ldr q9, [%[lhs_ptr], #-48]\n"
7874         "fmla v16.4s, v12.4s, v14.s[0]\n"
7875         "fmla v18.4s, v12.4s, v14.s[1]\n"
7876         "ldr q10, [%[rhs_ptr], #-64]\n"
7877         "fmla v20.4s, v12.4s, v14.s[2]\n"
7878         "fmla v22.4s, v12.4s, v14.s[3]\n"
7879 
7880         "fmla v24.4s, v12.4s, v15.s[0]\n"
7881         "fmla v26.4s, v12.4s, v15.s[1]\n"
7882         "fmla v28.4s, v12.4s, v15.s[2]\n"
7883         "fmla v30.4s, v12.4s, v15.s[3]\n"
7884         "ldr q12, [%[lhs_ptr], #-32]\n"
7885         "fmla v25.4s, v13.4s, v15.s[0]\n"
7886         "fmla v27.4s, v13.4s, v15.s[1]\n"
7887         "fmla v29.4s, v13.4s, v15.s[2]\n"
7888         "fmla v31.4s, v13.4s, v15.s[3]\n"
7889         "ldr q15, [%[rhs_ptr], #-16]\n"
7890         "fmla v17.4s, v13.4s, v14.s[0]\n"
7891         "fmla v19.4s, v13.4s, v14.s[1]\n"
7892         "fmla v21.4s, v13.4s, v14.s[2]\n"
7893         "fmla v23.4s, v13.4s, v14.s[3]\n"
7894         "ldr q13, [%[lhs_ptr], #-16]\n"
7895         "fmla v16.4s, v0.4s, v2.s[0]\n"
7896         "fmla v18.4s, v0.4s, v2.s[1]\n"
7897         "ldr q14, [%[rhs_ptr], #-32]\n"
7898         "fmla v20.4s, v0.4s, v2.s[2]\n"
7899         "fmla v22.4s, v0.4s, v2.s[3]\n"
7900 
7901         "add w1, w1, #4\n"
7902         "cmp w1, w2\n"
7903         "blt 80b\n"
7904 
7905         "fmla v16.4s, v4.4s, v6.s[0]\n"
7906         "fmla v18.4s, v4.4s, v6.s[1]\n"
7907         "fmla v20.4s, v4.4s, v6.s[2]\n"
7908         "fmla v22.4s, v4.4s, v6.s[3]\n"
7909         "fmla v24.4s, v4.4s, v7.s[0]\n"
7910         "fmla v26.4s, v4.4s, v7.s[1]\n"
7911         "fmla v28.4s, v4.4s, v7.s[2]\n"
7912         "fmla v30.4s, v4.4s, v7.s[3]\n"
7913         "fmla v25.4s, v5.4s, v7.s[0]\n"
7914         "fmla v27.4s, v5.4s, v7.s[1]\n"
7915         "fmla v29.4s, v5.4s, v7.s[2]\n"
7916         "fmla v31.4s, v5.4s, v7.s[3]\n"
7917         "fmla v17.4s, v5.4s, v6.s[0]\n"
7918         "fmla v19.4s, v5.4s, v6.s[1]\n"
7919         "fmla v21.4s, v5.4s, v6.s[2]\n"
7920         "fmla v23.4s, v5.4s, v6.s[3]\n"
7921 
7922         "fmla v16.4s, v8.4s, v10.s[0]\n"
7923         "fmla v18.4s, v8.4s, v10.s[1]\n"
7924         "fmla v20.4s, v8.4s, v10.s[2]\n"
7925         "fmla v22.4s, v8.4s, v10.s[3]\n"
7926         "fmla v24.4s, v8.4s, v11.s[0]\n"
7927         "fmla v26.4s, v8.4s, v11.s[1]\n"
7928         "fmla v28.4s, v8.4s, v11.s[2]\n"
7929         "fmla v30.4s, v8.4s, v11.s[3]\n"
7930         "fmla v25.4s, v9.4s, v11.s[0]\n"
7931         "fmla v27.4s, v9.4s, v11.s[1]\n"
7932         "fmla v29.4s, v9.4s, v11.s[2]\n"
7933         "fmla v31.4s, v9.4s, v11.s[3]\n"
7934         "fmla v17.4s, v9.4s, v10.s[0]\n"
7935         "fmla v19.4s, v9.4s, v10.s[1]\n"
7936         "fmla v21.4s, v9.4s, v10.s[2]\n"
7937         "fmla v23.4s, v9.4s, v10.s[3]\n"
7938 
7939         "fmla v16.4s, v12.4s, v14.s[0]\n"
7940         "fmla v18.4s, v12.4s, v14.s[1]\n"
7941         "fmla v20.4s, v12.4s, v14.s[2]\n"
7942         "fmla v22.4s, v12.4s, v14.s[3]\n"
7943         "fmla v24.4s, v12.4s, v15.s[0]\n"
7944         "fmla v26.4s, v12.4s, v15.s[1]\n"
7945         "fmla v28.4s, v12.4s, v15.s[2]\n"
7946         "fmla v30.4s, v12.4s, v15.s[3]\n"
7947         "fmla v25.4s, v13.4s, v15.s[0]\n"
7948         "fmla v27.4s, v13.4s, v15.s[1]\n"
7949         "fmla v29.4s, v13.4s, v15.s[2]\n"
7950         "fmla v31.4s, v13.4s, v15.s[3]\n"
7951         "fmla v17.4s, v13.4s, v14.s[0]\n"
7952         "fmla v19.4s, v13.4s, v14.s[1]\n"
7953         "fmla v21.4s, v13.4s, v14.s[2]\n"
7954         "fmla v23.4s, v13.4s, v14.s[3]\n"
7955 
7956         "78:\n"
7957 #endif
7958 
7959         // Accumulation loop
7960         "cmp w1, w12\n"
7961         "beq 79f\n"
7962 
7963         "2:\n"
7964         "fmla v24.4s, v0.4s, v3.s[0]\n"
7965         "fmla v26.4s, v0.4s, v3.s[1]\n"
7966         "ld1 {v4.4s}, [%[rhs_ptr]], #16\n"
7967         "fmla v28.4s, v0.4s, v3.s[2]\n"
7968         "fmla v30.4s, v0.4s, v3.s[3]\n"
7969         "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
7970         "fmla v25.4s, v1.4s, v3.s[0]\n"
7971         "fmla v27.4s, v1.4s, v3.s[1]\n"
7972         "add w1, w1, #1\n"
7973         "fmla v29.4s, v1.4s, v3.s[2]\n"
7974         "fmla v31.4s, v1.4s, v3.s[3]\n"
7975         "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
7976         "fmla v17.4s, v1.4s, v2.s[0]\n"
7977         "fmla v19.4s, v1.4s, v2.s[1]\n"
7978         "cmp w1, w12\n"
7979         "fmla v21.4s, v1.4s, v2.s[2]\n"
7980         "fmla v23.4s, v1.4s, v2.s[3]\n"
7981         "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
7982         "fmla v16.4s, v0.4s, v4.s[0]\n"
7983         "fmla v18.4s, v0.4s, v4.s[1]\n"
7984         "mov v2.16b, v4.16b\n"
7985         "fmla v20.4s, v0.4s, v4.s[2]\n"
7986         "fmla v22.4s, v0.4s, v4.s[3]\n"
7987         "blt 2b\n"
7988 
7989         "79:\n"
7990 
7991         // End of the inner loop on depth. Now perform the remaining
7992         // multiply-adds of the last level of depth, for which the LHS
7993         // and RHS data is already loaded.
7994 
7995         "fmla v24.4s, v0.4s, v3.s[0]\n"
7996         "fmla v26.4s, v0.4s, v3.s[1]\n"
7997         "fmla v28.4s, v0.4s, v3.s[2]\n"
7998         "fmla v30.4s, v0.4s, v3.s[3]\n"
7999         "fmla v25.4s, v1.4s, v3.s[0]\n"
8000         "fmla v27.4s, v1.4s, v3.s[1]\n"
8001         "fmla v29.4s, v1.4s, v3.s[2]\n"
8002         "fmla v31.4s, v1.4s, v3.s[3]\n"
8003         "fmla v17.4s, v1.4s, v2.s[0]\n"
8004         "fmla v19.4s, v1.4s, v2.s[1]\n"
8005         "fmla v21.4s, v1.4s, v2.s[2]\n"
8006         "fmla v23.4s, v1.4s, v2.s[3]\n"
8007 
8008         // End of accumulation. The registers v16 -- v31 contain the final
8009         // float32 accumulator values of the current 8x8 destination block.
8010         // We now have to perform the final work on these float32
8011         // accumulators, and advance to the next 8x8 block. We intertwine
8012         // these two aspects whenever possible for optimal pipelining, both
8013         // at the data flow level (prefetch data for next block as early as
8014         // possible) and instruction pipelining level (some of the next-block
8015         // work can dual-issue with some of the final work on the current
8016         // block).
8017 
8018         // Logic to advance to the next block in preparation for the next
8019         // iteration of the main loop. For now, we only want to compute
8020         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
8021         // not yet ready to update the values of row and col, as we still need
8022         // the current values for the rest of the work on the current block.
8023 
8024         "cmp %w[row], w7\n"  // Have we finished the last row?
8025         "bge 4f\n"           // If finished last row, go to 4
8026         // Not finished last row: then advance to next row.
8027         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
8028         "b 5f\n"
8029         "4:\n"  // Finished last row...
8030         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
8031         // Now we need to advance to the next column. If we already
8032         // finished the last column, then in principle we are done, however
8033         // we can't just return here, as we need to allow the end work of the
8034         // current block to complete. The good news is that at this point it
8035         // doesn't matter what data we load for the next column, since
8036         // we will exit from the main loop below before actually storing
8037         // anything computed from that data.
8038         "cmp %w[col], w8\n"  // Have we finished the last column?
8039         "bge 5f\n" // If yes, just carry on without updating the column pointer.
8040         // Not finished last column: then advance to next column.
8041         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
8042         "5:\n"
8043 
8044         // Set the LHS and RHS data pointers to the start of the columns just
8045         // computed.
8046         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
8047         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
8048 
8049         // Load some parameters needed for the end work on current block.
8050         "ldrb w4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
8051         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
8052 
8053         // Determine the channel index.
8054         "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
8055         "csel w3, %w[row], %w[col], eq\n"
8056 
8057         // Offset the bias pointer as needed given the current row, col.
8058         "add x5, x1, x3, lsl #2\n"
8059 
8060         // If there is no bias, use no offset, just address the passed zero
8061         // data.
8062         "tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
8063         "csel x1, x1, x5, eq\n"
8064 
8065         // Load 8 bias values.
8066         "ld1 {v14.4s}, [x1], #16\n"
8067         "ld1 {v15.4s}, [x1]\n"
8068 
8069         // Now that we know what LHS and RHS data the next iteration of the
8070         // main loop will need to load, we start loading the first 32 bytes of
8071         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
8072         // in the rest of the work on the current block.
8073         "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
8074         "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
8075         "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
8076         "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
8077 
8078         // Perform the bias-addition.
8079         // Jump based on channel dimension.
8080         "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
8081         "bne 6f\n"
8082         // Case where channels are rows
8083         "fadd v16.4s, v16.4s, v14.4s\n"
8084         "fadd v17.4s, v17.4s, v15.4s\n"
8085         "fadd v18.4s, v18.4s, v14.4s\n"
8086         "fadd v19.4s, v19.4s, v15.4s\n"
8087         "fadd v20.4s, v20.4s, v14.4s\n"
8088         "fadd v21.4s, v21.4s, v15.4s\n"
8089         "fadd v22.4s, v22.4s, v14.4s\n"
8090         "fadd v23.4s, v23.4s, v15.4s\n"
8091         "fadd v24.4s, v24.4s, v14.4s\n"
8092         "fadd v25.4s, v25.4s, v15.4s\n"
8093         "fadd v26.4s, v26.4s, v14.4s\n"
8094         "fadd v27.4s, v27.4s, v15.4s\n"
8095         "fadd v28.4s, v28.4s, v14.4s\n"
8096         "fadd v29.4s, v29.4s, v15.4s\n"
8097         "fadd v30.4s, v30.4s, v14.4s\n"
8098         "fadd v31.4s, v31.4s, v15.4s\n"
8099         "b 7f\n"
8100 
8101         "6:\n"
8102         // Case where channels are columns
8103         "dup v8.4s, v14.s[0]\n"
8104         "dup v9.4s, v14.s[1]\n"
8105         "dup v10.4s, v14.s[2]\n"
8106         "dup v11.4s, v14.s[3]\n"
8107         "dup v12.4s, v15.s[0]\n"
8108         "dup v13.4s, v15.s[1]\n"
8109         "dup v14.4s, v15.s[2]\n"
8110         "dup v15.4s, v15.s[3]\n"
8111         "fadd v16.4s, v16.4s, v8.4s\n"
8112         "fadd v17.4s, v17.4s, v8.4s\n"
8113         "fadd v18.4s, v18.4s, v9.4s\n"
8114         "fadd v19.4s, v19.4s, v9.4s\n"
8115         "fadd v20.4s, v20.4s, v10.4s\n"
8116         "fadd v21.4s, v21.4s, v10.4s\n"
8117         "fadd v22.4s, v22.4s, v11.4s\n"
8118         "fadd v23.4s, v23.4s, v11.4s\n"
8119         "fadd v24.4s, v24.4s, v12.4s\n"
8120         "fadd v25.4s, v25.4s, v12.4s\n"
8121         "fadd v26.4s, v26.4s, v13.4s\n"
8122         "fadd v27.4s, v27.4s, v13.4s\n"
8123         "fadd v28.4s, v28.4s, v14.4s\n"
8124         "fadd v29.4s, v29.4s, v14.4s\n"
8125         "fadd v30.4s, v30.4s, v15.4s\n"
8126         "fadd v31.4s, v31.4s, v15.4s\n"
8127         "7:\n"
8128 
8129         // Load the clamp_min, clamp_max bounds
8130         "ldr w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
8131         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
8132         "dup v14.4s, w2\n"  // clamp_min
8133         "dup v15.4s, w3\n"  // clamp_max
8134 
8135         // Apply the clamp_min bound
8136         "fmax v16.4s, v16.4s, v14.4s\n"
8137         "fmax v17.4s, v17.4s, v14.4s\n"
8138         "fmax v18.4s, v18.4s, v14.4s\n"
8139         "fmax v19.4s, v19.4s, v14.4s\n"
8140         "fmax v20.4s, v20.4s, v14.4s\n"
8141         "fmax v21.4s, v21.4s, v14.4s\n"
8142         "fmax v22.4s, v22.4s, v14.4s\n"
8143         "fmax v23.4s, v23.4s, v14.4s\n"
8144         "fmax v24.4s, v24.4s, v14.4s\n"
8145         "fmax v25.4s, v25.4s, v14.4s\n"
8146         "fmax v26.4s, v26.4s, v14.4s\n"
8147         "fmax v27.4s, v27.4s, v14.4s\n"
8148         "fmax v28.4s, v28.4s, v14.4s\n"
8149         "fmax v29.4s, v29.4s, v14.4s\n"
8150         "fmax v30.4s, v30.4s, v14.4s\n"
8151         "fmax v31.4s, v31.4s, v14.4s\n"
8152 
8153         // Apply the clamp_max bound
8154         "fmin v16.4s, v16.4s, v15.4s\n"
8155         "fmin v17.4s, v17.4s, v15.4s\n"
8156         "fmin v18.4s, v18.4s, v15.4s\n"
8157         "fmin v19.4s, v19.4s, v15.4s\n"
8158         "fmin v20.4s, v20.4s, v15.4s\n"
8159         "fmin v21.4s, v21.4s, v15.4s\n"
8160         "fmin v22.4s, v22.4s, v15.4s\n"
8161         "fmin v23.4s, v23.4s, v15.4s\n"
8162         "fmin v24.4s, v24.4s, v15.4s\n"
8163         "fmin v25.4s, v25.4s, v15.4s\n"
8164         "fmin v26.4s, v26.4s, v15.4s\n"
8165         "fmin v27.4s, v27.4s, v15.4s\n"
8166         "fmin v28.4s, v28.4s, v15.4s\n"
8167         "fmin v29.4s, v29.4s, v15.4s\n"
8168         "fmin v30.4s, v30.4s, v15.4s\n"
8169         "fmin v31.4s, v31.4s, v15.4s\n"
8170 
8171         // Compute how much of the 8x8 block of destination float values that
8172         // we have computed, fit in the destination matrix. Typically, all of
8173         // it fits, but when the destination matrix shape is not a multiple
8174         // of 8x8, there are some 8x8 blocks along the boundaries that do
8175         // not fit entirely.
8176         "sub w1, %w[dst_rows], %w[row]\n"
8177         "sub w2, %w[dst_cols], %w[col]\n"
8178         "mov w3, #8\n"
8179         "cmp w1, #8\n"
8180         // Compute w1 = how many rows of the 8x8 block fit
8181         "csel w1, w1, w3, le\n"
8182         "cmp w2, #8\n"
8183         // Compute w2 = how many cols of the 8x8 block fit
8184         "csel w2, w2, w3, le\n"
8185 
8186         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
8187         "cmp w1, w3\n"
8188         "ccmp w2, w3, 0, eq\n"
8189         // Yes, all of the 8x8 block fits, go to fast path.
8190         "beq 30f\n"
8191         // Not all of the 8x8 block fits.
8192         // Set (x3 address, x4 stride) to write to dst_tmp_buf
8193         "mov x3, %[dst_tmp_buf]\n"
8194         "mov x4, #32\n"
8195         "b 31f\n"
8196         "30:\n"
8197         // Yes, all of the 8x8 block fits.
8198         // Set (x3 address, x4 stride) to write directly to destination matrix.
8199         "mov x3, %[dst_ptr]\n"
8200         "mov x4, x11\n"
8201         "31:\n"
8202 
8203         // Write our float values to the destination described by
8204         // (x3 address, x4 stride).
8205         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8206         "str q16, [x3, #0]\n"
8207         "str q17, [x3, #16]\n"
8208         "add x3, x3, x4\n"
8209         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8210         RUY_MAKE_ZERO(v16)
8211         RUY_MAKE_ZERO(v17)
8212         "str q18, [x3, #0]\n"
8213         "str q19, [x3, #16]\n"
8214         "add x3, x3, x4\n"
8215         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8216         RUY_MAKE_ZERO(v18)
8217         RUY_MAKE_ZERO(v19)
8218         "str q20, [x3, #0]\n"
8219         "str q21, [x3, #16]\n"
8220         "add x3, x3, x4\n"
8221         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8222         RUY_MAKE_ZERO(v20)
8223         RUY_MAKE_ZERO(v21)
8224         "str q22, [x3, #0]\n"
8225         "str q23, [x3, #16]\n"
8226         "add x3, x3, x4\n"
8227         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8228         RUY_MAKE_ZERO(v22)
8229         RUY_MAKE_ZERO(v23)
8230         "str q24, [x3, #0]\n"
8231         "str q25, [x3, #16]\n"
8232         "add x3, x3, x4\n"
8233         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8234         RUY_MAKE_ZERO(v24)
8235         RUY_MAKE_ZERO(v25)
8236         "str q26, [x3, #0]\n"
8237         "str q27, [x3, #16]\n"
8238         "add x3, x3, x4\n"
8239         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8240         RUY_MAKE_ZERO(v26)
8241         RUY_MAKE_ZERO(v27)
8242         "str q28, [x3, #0]\n"
8243         "str q29, [x3, #16]\n"
8244         "add x3, x3, x4\n"
8245         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8246         RUY_MAKE_ZERO(v28)
8247         RUY_MAKE_ZERO(v29)
8248         "str q30, [x3, #0]\n"
8249         "str q31, [x3, #16]\n"
8250         RUY_MAKE_ZERO(v30)
8251         RUY_MAKE_ZERO(v31)
8252 
8253         // If all of the 8x8 block fits, we just finished writing it to the
8254         // destination, so we skip the next part.
8255         "beq 41f\n"
8256         // Not all of the 8x8 block fits in the destination matrix.  We just
8257         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
8258         // it to copy into the destination matrix the part that fits.
8259         "mov x3, %[dst_tmp_buf]\n"
8260         "mov x4, %[dst_ptr]\n"
8261         "mov w6, #0\n"
8262         "50:\n"
8263         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
8264         "mov w5, #0\n"
8265         "51:\n"
8266         "ldr w7, [x3, x5, lsl #2]\n"
8267         "str w7, [x4, x5, lsl #2]\n"
8268         "add w5, w5, #1\n"
8269         "cmp w5, w1\n"
8270         "blt 51b\n"
8271         "add w6, w6, #1\n"
8272         "add x3, x3, #32\n"
8273         "add x4, x4, x11\n"
8274         "cmp w6, w2\n"
8275         "blt 50b\n"
8276         "41:\n"
8277         "add %[dst_ptr], %[dst_ptr], #32\n"
8278         // At this point we have completely finished writing values to the
8279         // destination matrix for the current block.
8280 
8281         // Reload some params --- we had used x5 -- x7 for a few other things
8282         // since the last time we had loaded them.
8283         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
8284         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
8285         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
8286 
8287         // Move to the next block of the destination matrix, for the next iter
8288         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
8289         // been updated earlier.
8290         // Have we reached the end row?
8291         "cmp %w[row], w7\n"
8292         "beq 20f\n"  // yes, end row.
8293         // Not end row. Move to the next row.
8294         "add %w[row], %w[row], #8\n"
8295         "b 21f\n"
8296         "20:\n"
8297         // Was already at end row.
8298         "mov %w[row], w6\n"  // Move back to first row.
8299         "add %w[col], %w[col], #8\n"  // Move to the next column.
8300         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
8301         "mov %[dst_ptr], %[dst_col_ptr]\n"
8302         "21:\n"
8303 
8304         // Main loop exit condition: have we hit the end column?
8305         "cmp %w[col], w8\n"
8306 
8307         // w1 is the number of levels of depth that we have already loaded
8308         // LHS and RHS data for. Corresponding to the initial ld1 instructions
8309         // above, this is currently 1.
8310         "mov w1, #1\n"
8311 
8312         "ble 1b\n"
8313 
8314         // clang-format on
8315 
8316         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
8317           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
8318           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
8319         : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
8320           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
8321         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
8322           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
8323           "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
8324           "v26", "v27", "v28", "v29", "v30", "v31");
8325 }
8326 
8327 // A fork of the standard float kernel where we omit the manual loop unrolling
8328 // to recover performance on the X1. For now, the X1 core is the only CPU that
8329 // uses this kernel.
KernelFloatNeonX1(const KernelParamsFloat<8,8> & params)8330 void KernelFloatNeonX1(const KernelParamsFloat<8, 8>& params) {
8331   CheckOffsetsInKernelParamsFloat(params);
8332   profiler::ScopeLabel label("Kernel (kNeon) X1");
8333 
8334   const float* lhs_col_ptr = params.lhs_base_ptr;
8335   const float* rhs_col_ptr = params.rhs_base_ptr;
8336   const float* lhs_ptr = lhs_col_ptr;
8337   const float* rhs_ptr = rhs_col_ptr;
8338   float* dst_col_ptr = params.dst_base_ptr;
8339   float* dst_ptr = dst_col_ptr;
8340   int row = params.start_row;
8341   int col = params.start_col;
8342 
8343   // The asm kernel below has the following NEON register allocation:
8344   //
8345   // v16 -- v31 are accumulators.
8346   // During accumulation, v0 -- v15 are used to load data from LHS and RHS.
8347   // At least v0 and v1 are used to load a 8x1 block of LHS, and v2 and
8348   // v3 are used to load a 1x8 block of RHS, like this:
8349   //
8350   //                                          RHS 1x8 block
8351   //                           /-----------------------------------------|
8352   //                           |v2.s[0] ... v2.s[3]   v3.s[0] ... v3.s[3]|
8353   //                           \-----------------------------------------/
8354   //        LHS 8x1 block
8355   //  /---------------------\  /-----------------------------------------|
8356   //  |        v0.s[0]      |  |v16.s[0]           ...           v30.s[0]|
8357   //  |         ...         |  |  ...                              ...   |
8358   //  |        v0.s[3]      |  |v16.s[3]           ...           v30.s[3]|
8359   //  |        v1.s[0]      |  |v17.s[0]           ...           v31.s[0]|
8360   //  |         ...         |  |  ...                              ...   |
8361   //  |        v1.s[3]      |  |v17.s[3]           ...           v31.s[3]|
8362   //  \---------------------/  \-----------------------------------------/
8363   //                                      accumulators 8x8 block
8364   //
8365   // In the RUY_OPT_MAX_STREAMING part of the kernel, this elementary step
8366   // is repeated 4 times, using 4x more registers for LHS and RHS, so that
8367   // is where instead of using v0 -- v3 for LHS and RHS, we use v0 -- v15.
8368   //
8369   // Outside of the RUY_OPT_MAX_STREAMING part of the kernel, v4 -- v7 are
8370   // unused, and v8 -- v15 are used for loading parameters used for the
8371   // post-accumulation part of the kernel.
8372   asm volatile(
8373 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
8374 
8375         // clang-format off
8376 
8377         // Load some parameters into registers.
8378         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
8379         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
8380         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
8381         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
8382         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
8383         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
8384         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
8385         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
8386 
8387         // Load the first 32 bytes of LHS and RHS data.
8388         "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
8389         "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
8390         "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
8391         "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
8392 
8393         // Clear accumulators.
8394         RUY_MAKE_ZERO(v16)
8395         RUY_MAKE_ZERO(v17)
8396         RUY_MAKE_ZERO(v18)
8397         RUY_MAKE_ZERO(v19)
8398         RUY_MAKE_ZERO(v20)
8399         RUY_MAKE_ZERO(v21)
8400         RUY_MAKE_ZERO(v22)
8401         RUY_MAKE_ZERO(v23)
8402         RUY_MAKE_ZERO(v24)
8403         RUY_MAKE_ZERO(v25)
8404         RUY_MAKE_ZERO(v26)
8405         RUY_MAKE_ZERO(v27)
8406         RUY_MAKE_ZERO(v28)
8407         RUY_MAKE_ZERO(v29)
8408         RUY_MAKE_ZERO(v30)
8409         RUY_MAKE_ZERO(v31)
8410 
8411         // w1 is the number of levels of depth that we have already loaded
8412         // LHS and RHS data for. Corresponding to the initial ld1 instructions
8413         // above, this is currently 1.
8414         "mov w1, #1\n"
8415 
8416         // Main loop of the whole GEMM, over rows and columns of the
8417         // destination matrix.
8418         "1:\n"
8419 
8420         "fmla v16.4s, v0.4s, v2.s[0]\n"
8421         "fmla v18.4s, v0.4s, v2.s[1]\n"
8422         "fmla v20.4s, v0.4s, v2.s[2]\n"
8423         "fmla v22.4s, v0.4s, v2.s[3]\n"
8424 
8425         // Accumulation loop
8426         "cmp w1, w12\n"
8427         "beq 79f\n"
8428 
8429         "2:\n"
8430         "fmla v24.4s, v0.4s, v3.s[0]\n"
8431         "fmla v26.4s, v0.4s, v3.s[1]\n"
8432         "ld1 {v4.4s}, [%[rhs_ptr]], #16\n"
8433         "fmla v28.4s, v0.4s, v3.s[2]\n"
8434         "fmla v30.4s, v0.4s, v3.s[3]\n"
8435         "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
8436         "fmla v25.4s, v1.4s, v3.s[0]\n"
8437         "fmla v27.4s, v1.4s, v3.s[1]\n"
8438         "add w1, w1, #1\n"
8439         "fmla v29.4s, v1.4s, v3.s[2]\n"
8440         "fmla v31.4s, v1.4s, v3.s[3]\n"
8441         "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
8442         "fmla v17.4s, v1.4s, v2.s[0]\n"
8443         "fmla v19.4s, v1.4s, v2.s[1]\n"
8444         "cmp w1, w12\n"
8445         "fmla v21.4s, v1.4s, v2.s[2]\n"
8446         "fmla v23.4s, v1.4s, v2.s[3]\n"
8447         "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
8448         "fmla v16.4s, v0.4s, v4.s[0]\n"
8449         "fmla v18.4s, v0.4s, v4.s[1]\n"
8450         "mov v2.16b, v4.16b\n"
8451         "fmla v20.4s, v0.4s, v4.s[2]\n"
8452         "fmla v22.4s, v0.4s, v4.s[3]\n"
8453         "blt 2b\n"
8454 
8455         "79:\n"
8456 
8457         // End of the inner loop on depth. Now perform the remaining
8458         // multiply-adds of the last level of depth, for which the LHS
8459         // and RHS data is already loaded.
8460 
8461         "fmla v24.4s, v0.4s, v3.s[0]\n"
8462         "fmla v26.4s, v0.4s, v3.s[1]\n"
8463         "fmla v28.4s, v0.4s, v3.s[2]\n"
8464         "fmla v30.4s, v0.4s, v3.s[3]\n"
8465         "fmla v25.4s, v1.4s, v3.s[0]\n"
8466         "fmla v27.4s, v1.4s, v3.s[1]\n"
8467         "fmla v29.4s, v1.4s, v3.s[2]\n"
8468         "fmla v31.4s, v1.4s, v3.s[3]\n"
8469         "fmla v17.4s, v1.4s, v2.s[0]\n"
8470         "fmla v19.4s, v1.4s, v2.s[1]\n"
8471         "fmla v21.4s, v1.4s, v2.s[2]\n"
8472         "fmla v23.4s, v1.4s, v2.s[3]\n"
8473 
8474         // End of accumulation. The registers v16 -- v31 contain the final
8475         // float32 accumulator values of the current 8x8 destination block.
8476         // We now have to perform the final work on these float32
8477         // accumulators, and advance to the next 8x8 block. We intertwine
8478         // these two aspects whenever possible for optimal pipelining, both
8479         // at the data flow level (prefetch data for next block as early as
8480         // possible) and instruction pipelining level (some of the next-block
8481         // work can dual-issue with some of the final work on the current
8482         // block).
8483 
8484         // Logic to advance to the next block in preparation for the next
8485         // iteration of the main loop. For now, we only want to compute
8486         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
8487         // not yet ready to update the values of row and col, as we still need
8488         // the current values for the rest of the work on the current block.
8489 
8490         "cmp %w[row], w7\n"  // Have we finished the last row?
8491         "bge 4f\n"           // If finished last row, go to 4
8492         // Not finished last row: then advance to next row.
8493         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
8494         "b 5f\n"
8495         "4:\n"  // Finished last row...
8496         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
8497         // Now we need to advance to the next column. If we already
8498         // finished the last column, then in principle we are done, however
8499         // we can't just return here, as we need to allow the end work of the
8500         // current block to complete. The good news is that at this point it
8501         // doesn't matter what data we load for the next column, since
8502         // we will exit from the main loop below before actually storing
8503         // anything computed from that data.
8504         "cmp %w[col], w8\n"  // Have we finished the last column?
8505         "bge 5f\n" // If yes, just carry on without updating the column pointer.
8506         // Not finished last column: then advance to next column.
8507         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
8508         "5:\n"
8509 
8510         // Set the LHS and RHS data pointers to the start of the columns just
8511         // computed.
8512         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
8513         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
8514 
8515         // Load some parameters needed for the end work on current block.
8516         "ldrb w4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
8517         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
8518 
8519         // Determine the channel index.
8520         "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
8521         "csel w3, %w[row], %w[col], eq\n"
8522 
8523         // Offset the bias pointer as needed given the current row, col.
8524         "add x5, x1, x3, lsl #2\n"
8525 
8526         // If there is no bias, use no offset, just address the passed zero
8527         // data.
8528         "tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
8529         "csel x1, x1, x5, eq\n"
8530 
8531         // Load 8 bias values.
8532         "ld1 {v14.4s}, [x1], #16\n"
8533         "ld1 {v15.4s}, [x1]\n"
8534 
8535         // Now that we know what LHS and RHS data the next iteration of the
8536         // main loop will need to load, we start loading the first 32 bytes of
8537         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
8538         // in the rest of the work on the current block.
8539         "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
8540         "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
8541         "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
8542         "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
8543 
8544         // Perform the bias-addition.
8545         // Jump based on channel dimension.
8546         "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
8547         "bne 6f\n"
8548         // Case where channels are rows
8549         "fadd v16.4s, v16.4s, v14.4s\n"
8550         "fadd v17.4s, v17.4s, v15.4s\n"
8551         "fadd v18.4s, v18.4s, v14.4s\n"
8552         "fadd v19.4s, v19.4s, v15.4s\n"
8553         "fadd v20.4s, v20.4s, v14.4s\n"
8554         "fadd v21.4s, v21.4s, v15.4s\n"
8555         "fadd v22.4s, v22.4s, v14.4s\n"
8556         "fadd v23.4s, v23.4s, v15.4s\n"
8557         "fadd v24.4s, v24.4s, v14.4s\n"
8558         "fadd v25.4s, v25.4s, v15.4s\n"
8559         "fadd v26.4s, v26.4s, v14.4s\n"
8560         "fadd v27.4s, v27.4s, v15.4s\n"
8561         "fadd v28.4s, v28.4s, v14.4s\n"
8562         "fadd v29.4s, v29.4s, v15.4s\n"
8563         "fadd v30.4s, v30.4s, v14.4s\n"
8564         "fadd v31.4s, v31.4s, v15.4s\n"
8565         "b 7f\n"
8566 
8567         "6:\n"
8568         // Case where channels are columns
8569         "dup v8.4s, v14.s[0]\n"
8570         "dup v9.4s, v14.s[1]\n"
8571         "dup v10.4s, v14.s[2]\n"
8572         "dup v11.4s, v14.s[3]\n"
8573         "dup v12.4s, v15.s[0]\n"
8574         "dup v13.4s, v15.s[1]\n"
8575         "dup v14.4s, v15.s[2]\n"
8576         "dup v15.4s, v15.s[3]\n"
8577         "fadd v16.4s, v16.4s, v8.4s\n"
8578         "fadd v17.4s, v17.4s, v8.4s\n"
8579         "fadd v18.4s, v18.4s, v9.4s\n"
8580         "fadd v19.4s, v19.4s, v9.4s\n"
8581         "fadd v20.4s, v20.4s, v10.4s\n"
8582         "fadd v21.4s, v21.4s, v10.4s\n"
8583         "fadd v22.4s, v22.4s, v11.4s\n"
8584         "fadd v23.4s, v23.4s, v11.4s\n"
8585         "fadd v24.4s, v24.4s, v12.4s\n"
8586         "fadd v25.4s, v25.4s, v12.4s\n"
8587         "fadd v26.4s, v26.4s, v13.4s\n"
8588         "fadd v27.4s, v27.4s, v13.4s\n"
8589         "fadd v28.4s, v28.4s, v14.4s\n"
8590         "fadd v29.4s, v29.4s, v14.4s\n"
8591         "fadd v30.4s, v30.4s, v15.4s\n"
8592         "fadd v31.4s, v31.4s, v15.4s\n"
8593         "7:\n"
8594 
8595         // Load the clamp_min, clamp_max bounds
8596         "ldr w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
8597         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
8598         "dup v14.4s, w2\n"  // clamp_min
8599         "dup v15.4s, w3\n"  // clamp_max
8600 
8601         // Apply the clamp_min bound
8602         "fmax v16.4s, v16.4s, v14.4s\n"
8603         "fmax v17.4s, v17.4s, v14.4s\n"
8604         "fmax v18.4s, v18.4s, v14.4s\n"
8605         "fmax v19.4s, v19.4s, v14.4s\n"
8606         "fmax v20.4s, v20.4s, v14.4s\n"
8607         "fmax v21.4s, v21.4s, v14.4s\n"
8608         "fmax v22.4s, v22.4s, v14.4s\n"
8609         "fmax v23.4s, v23.4s, v14.4s\n"
8610         "fmax v24.4s, v24.4s, v14.4s\n"
8611         "fmax v25.4s, v25.4s, v14.4s\n"
8612         "fmax v26.4s, v26.4s, v14.4s\n"
8613         "fmax v27.4s, v27.4s, v14.4s\n"
8614         "fmax v28.4s, v28.4s, v14.4s\n"
8615         "fmax v29.4s, v29.4s, v14.4s\n"
8616         "fmax v30.4s, v30.4s, v14.4s\n"
8617         "fmax v31.4s, v31.4s, v14.4s\n"
8618 
8619         // Apply the clamp_max bound
8620         "fmin v16.4s, v16.4s, v15.4s\n"
8621         "fmin v17.4s, v17.4s, v15.4s\n"
8622         "fmin v18.4s, v18.4s, v15.4s\n"
8623         "fmin v19.4s, v19.4s, v15.4s\n"
8624         "fmin v20.4s, v20.4s, v15.4s\n"
8625         "fmin v21.4s, v21.4s, v15.4s\n"
8626         "fmin v22.4s, v22.4s, v15.4s\n"
8627         "fmin v23.4s, v23.4s, v15.4s\n"
8628         "fmin v24.4s, v24.4s, v15.4s\n"
8629         "fmin v25.4s, v25.4s, v15.4s\n"
8630         "fmin v26.4s, v26.4s, v15.4s\n"
8631         "fmin v27.4s, v27.4s, v15.4s\n"
8632         "fmin v28.4s, v28.4s, v15.4s\n"
8633         "fmin v29.4s, v29.4s, v15.4s\n"
8634         "fmin v30.4s, v30.4s, v15.4s\n"
8635         "fmin v31.4s, v31.4s, v15.4s\n"
8636 
8637         // Compute how much of the 8x8 block of destination 8bit values that
8638         // we have computed, fit in the destination matrix. Typically, all of
8639         // it fits, but when the destination matrix shape is not a multiple
8640         // of 8x8, there are some 8x8 blocks along the boundaries that do
8641         // not fit entirely.
8642         "sub w1, %w[dst_rows], %w[row]\n"
8643         "sub w2, %w[dst_cols], %w[col]\n"
8644         "mov w3, #8\n"
8645         "cmp w1, #8\n"
8646         // Compute w1 = how many rows of the 8x8 block fit
8647         "csel w1, w1, w3, le\n"
8648         "cmp w2, #8\n"
8649         // Compute w2 = how many cols of the 8x8 block fit
8650         "csel w2, w2, w3, le\n"
8651 
8652         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
8653         "cmp w1, w3\n"
8654         "ccmp w2, w3, 0, eq\n"
8655         // Yes, all of the 8x8 block fits, go to fast path.
8656         "beq 30f\n"
8657         // Not all of the 8x8 block fits.
8658         // Set (x3 address, x4 stride) to write to dst_tmp_buf
8659         "mov x3, %[dst_tmp_buf]\n"
8660         "mov x4, #32\n"
8661         "b 31f\n"
8662         "30:\n"
8663         // Yes, all of the 8x8 block fits.
8664         // Set (x3 address, x4 stride) to write directly to destination matrix.
8665         "mov x3, %[dst_ptr]\n"
8666         "mov x4, x11\n"
8667         "31:\n"
8668 
8669         // Write our 8bit values to the destination described by
8670         // (x3 address, x4 stride).
8671         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8672         "str q16, [x3, #0]\n"
8673         "str q17, [x3, #16]\n"
8674         "add x3, x3, x4\n"
8675         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8676         RUY_MAKE_ZERO(v16)
8677         RUY_MAKE_ZERO(v17)
8678         "str q18, [x3, #0]\n"
8679         "str q19, [x3, #16]\n"
8680         "add x3, x3, x4\n"
8681         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8682         RUY_MAKE_ZERO(v18)
8683         RUY_MAKE_ZERO(v19)
8684         "str q20, [x3, #0]\n"
8685         "str q21, [x3, #16]\n"
8686         "add x3, x3, x4\n"
8687         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8688         RUY_MAKE_ZERO(v20)
8689         RUY_MAKE_ZERO(v21)
8690         "str q22, [x3, #0]\n"
8691         "str q23, [x3, #16]\n"
8692         "add x3, x3, x4\n"
8693         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8694         RUY_MAKE_ZERO(v22)
8695         RUY_MAKE_ZERO(v23)
8696         "str q24, [x3, #0]\n"
8697         "str q25, [x3, #16]\n"
8698         "add x3, x3, x4\n"
8699         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8700         RUY_MAKE_ZERO(v24)
8701         RUY_MAKE_ZERO(v25)
8702         "str q26, [x3, #0]\n"
8703         "str q27, [x3, #16]\n"
8704         "add x3, x3, x4\n"
8705         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8706         RUY_MAKE_ZERO(v26)
8707         RUY_MAKE_ZERO(v27)
8708         "str q28, [x3, #0]\n"
8709         "str q29, [x3, #16]\n"
8710         "add x3, x3, x4\n"
8711         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
8712         RUY_MAKE_ZERO(v28)
8713         RUY_MAKE_ZERO(v29)
8714         "str q30, [x3, #0]\n"
8715         "str q31, [x3, #16]\n"
8716         RUY_MAKE_ZERO(v30)
8717         RUY_MAKE_ZERO(v31)
8718 
8719         // If all of the 8x8 block fits, we just finished writing it to the
8720         // destination, so we skip the next part.
8721         "beq 41f\n"
8722         // Not all of the 8x8 block fits in the destination matrix.  We just
8723         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
8724         // it to copy into the destination matrix the part that fits.
8725         "mov x3, %[dst_tmp_buf]\n"
8726         "mov x4, %[dst_ptr]\n"
8727         "mov w6, #0\n"
8728         "50:\n"
8729         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
8730         "mov w5, #0\n"
8731         "51:\n"
8732         "ldr w7, [x3, x5, lsl #2]\n"
8733         "str w7, [x4, x5, lsl #2]\n"
8734         "add w5, w5, #1\n"
8735         "cmp w5, w1\n"
8736         "blt 51b\n"
8737         "add w6, w6, #1\n"
8738         "add x3, x3, #32\n"
8739         "add x4, x4, x11\n"
8740         "cmp w6, w2\n"
8741         "blt 50b\n"
8742         "41:\n"
8743         "add %[dst_ptr], %[dst_ptr], #32\n"
8744         // At this point we have completely finished writing values to the
8745         // destination matrix for the current block.
8746 
8747         // Reload some params --- we had used x5 -- x7 for a few other things
8748         // since the last time we had loaded them.
8749         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
8750         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
8751         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
8752 
8753         // Move to the next block of the destination matrix, for the next iter
8754         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
8755         // been updated earlier.
8756         // Have we reached the end row?
8757         "cmp %w[row], w7\n"
8758         "beq 20f\n"  // yes, end row.
8759         // Not end row. Move to the next row.
8760         "add %w[row], %w[row], #8\n"
8761         "b 21f\n"
8762         "20:\n"
8763         // Was already at end row.
8764         "mov %w[row], w6\n"  // Move back to first row.
8765         "add %w[col], %w[col], #8\n"  // Move to the next column.
8766         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
8767         "mov %[dst_ptr], %[dst_col_ptr]\n"
8768         "21:\n"
8769 
8770         // Main loop exit condition: have we hit the end column?
8771         "cmp %w[col], w8\n"
8772 
8773         // w1 is the number of levels of depth that we have already loaded
8774         // LHS and RHS data for. Corresponding to the initial ld1 instructions
8775         // above, this is currently 1.
8776         "mov w1, #1\n"
8777 
8778         "ble 1b\n"
8779 
8780         // clang-format on
8781 
8782         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
8783           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
8784           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
8785         : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
8786           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
8787         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
8788           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
8789           "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
8790           "v26", "v27", "v28", "v29", "v30", "v31");
8791 }
8792 
8793 // Variant of KernelFloatNeon tuned for in-order CPUs that do not
8794 // support dotprod (while dotprod by itself is not relevant to floating-point,
8795 // this additional bit of information that we have about the target happens to
8796 // be useful here).
8797 //
8798 // So a typical target CPU here would be ARM Cortex-A53 or the original
8799 // Cortex-A55.
8800 //
8801 // This kernel is similar to and inspired by gemmlowp's
8802 // NEON_64bit_GEMM_Float32_WithScalar_A53,
8803 // which was contributed by David Mansell with very helpful
8804 // comments. Specifically, see this comment about tuning for Cortex-A53:
8805 // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4215
KernelFloatNeonA55ish(const KernelParamsFloat<8,8> & params)8806 void KernelFloatNeonA55ish(const KernelParamsFloat<8, 8>& params) {
8807   profiler::ScopeLabel label("Kernel (kNeon, optimized for in-order cores)");
8808 
8809   CheckOffsetsInKernelParamsFloat(params);
8810 
8811   const float* lhs_col_ptr = params.lhs_base_ptr;
8812   const float* rhs_col_ptr = params.rhs_base_ptr;
8813   const float* lhs_ptr = lhs_col_ptr;
8814   const float* rhs_ptr = rhs_col_ptr;
8815   float* dst_col_ptr = params.dst_base_ptr;
8816   float* dst_ptr = dst_col_ptr;
8817   int row = params.start_row;
8818   int col = params.start_col;
8819 
8820   // The asm kernel below has the following NEON register allocation:
8821   //
8822   // v16 -- v31 are accumulators.
8823   // During accumulation, v0 -- v3 are used to load data from LHS and RHS.
8824   //
8825   //                                          RHS 1x8 block
8826   //                           /-----------------------------------------|
8827   //                           |v2.s[0] ... v2.s[3]   v3.s[0] ... v3.s[3]|
8828   //                           \-----------------------------------------/
8829   //        LHS 8x1 block
8830   //  /---------------------\  /-----------------------------------------|
8831   //  |        v0.s[0]      |  |v16.s[0]           ...           v30.s[0]|
8832   //  |         ...         |  |  ...                              ...   |
8833   //  |        v0.s[3]      |  |v16.s[3]           ...           v30.s[3]|
8834   //  |        v1.s[0]      |  |v17.s[0]           ...           v31.s[0]|
8835   //  |         ...         |  |  ...                              ...   |
8836   //  |        v1.s[3]      |  |v17.s[3]           ...           v31.s[3]|
8837   //  \---------------------/  \-----------------------------------------/
8838   //                                      accumulators 8x8 block
8839   //
8840   // There is no RUY_OPT_MAX_STREAMING 4x-unrolled part in this kernel because
8841   // we did not observe a benefit of such partial unrolling on in-order CPUs.
8842   //
8843   // v4 is also used, as a staging register for freshly loaded RHS data in the
8844   // accumulation loop below; v5 -- v7 are unused. v8 -- v15 are used for
       // loading parameters used for the post-accumulation part of the kernel.
8845   asm volatile(
8846 #define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"
8847 
8848         // clang-format off
8849 
8850         // Load some parameters into registers.
8851         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
8852         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
8853         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
8854         "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
8855         "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
8856         "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
8857         "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
8858         "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"
8859 
8860 
8861         // Clear accumulators.
8862         RUY_MAKE_ZERO(v16)
8863         // Load the first 32 bytes of LHS and RHS data.
8864         "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
8865         RUY_MAKE_ZERO(v17)
8866         "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
8867         RUY_MAKE_ZERO(v18)
8868         "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
8869         RUY_MAKE_ZERO(v19)
8870         "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
8871         RUY_MAKE_ZERO(v20)
8872         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #64]\n")
8873         RUY_MAKE_ZERO(v21)
8874         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #64]\n")
8875         RUY_MAKE_ZERO(v22)
8876         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #128]\n")
8877         RUY_MAKE_ZERO(v23)
8878         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #128]\n")
8879         RUY_MAKE_ZERO(v24)
8880         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #192]\n")
8881         RUY_MAKE_ZERO(v25)
8882         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #192]\n")
8883         RUY_MAKE_ZERO(v26)
8884         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #256]\n")
8885         RUY_MAKE_ZERO(v27)
8886         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #256]\n")
8887         RUY_MAKE_ZERO(v28)
8888         RUY_MAKE_ZERO(v29)
8889         RUY_MAKE_ZERO(v30)
8890         RUY_MAKE_ZERO(v31)
8891 
8892         // w1 is the number of levels of depth that remain to load
8893         // LHS and RHS data for. Corresponding to the initial ld1 instructions
8894         // above, this is currently depth - 1.
8895         "sub w1, w12, #1\n"
8896 
8897         // Main loop of the whole GEMM, over rows and columns of the
8898         // destination matrix.
8899         "1:\n"
8900 
8901         "cmp w1, #0\n"
8902         "fmla v16.4s, v0.4s, v2.s[0]\n"
8903         "fmla v18.4s, v0.4s, v2.s[1]\n"
8904         "fmla v20.4s, v0.4s, v2.s[2]\n"
8905         "fmla v22.4s, v0.4s, v2.s[3]\n"
8906 
8907         // Accumulation loop
8908         "beq 79f\n"
8909 
8910         "2:\n"
8911 
             // In this loop, 128-bit vector loads are split into a 64-bit
             // "ldr d" for the low half plus a scalar "ldr x" and an "ins"
             // for the high half, interleaved with the fmla instructions --
             // the scheduling scheme from the Cortex-A53-tuned gemmlowp
             // kernel referenced in the comment above this function.
8912         "fmla v24.4s, v0.4s, v3.s[0]\n"
8913         "ldr x2, [%[lhs_ptr], #8]\n"
8914         "fmla v26.4s, v0.4s, v3.s[1]\n"
8915         "ldr x3, [%[lhs_ptr], #24]\n"
8916         "fmla v28.4s, v0.4s, v3.s[2]\n"
8917         "ldr x5, [%[rhs_ptr], #24]\n"
8918         "fmla v30.4s, v0.4s, v3.s[3]\n"
8919         "ldr x4, [%[rhs_ptr], #8]\n"
8920         "fmla v25.4s, v1.4s, v3.s[0]\n"
8921         "subs w1, w1, #1\n"
8922         "ldr d0, [%[lhs_ptr]], #32\n"
8923         "fmla v27.4s, v1.4s, v3.s[1]\n"
8924         "fmla v29.4s, v1.4s, v3.s[2]\n"
8925         "fmla v31.4s, v1.4s, v3.s[3]\n"
8926         "ins v0.d[1], x2\n"
8927         "ldr d3, [%[rhs_ptr], #16]\n"
8928         "fmla v17.4s, v1.4s, v2.s[0]\n"
8929         "fmla v19.4s, v1.4s, v2.s[1]\n"
8930         "ins v3.d[1], x5\n"
8931         "ldr d4, [%[rhs_ptr]], #32\n"
8932         "fmla v21.4s, v1.4s, v2.s[2]\n"
8933         "fmla v23.4s, v1.4s, v2.s[3]\n"
8934         "fmla v16.4s, v0.4s, v4.s[0]\n"
8935         "ins v4.d[1], x4\n"
8936         "ldr d1, [%[lhs_ptr], #-16]\n"
8937         "fmla v18.4s, v0.4s, v4.s[1]\n"
8938         "fmla v20.4s, v0.4s, v4.s[2]\n"
8939         "ins v1.d[1], x3\n"
8940         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #256]\n")
8941         "mov v2.16b, v4.16b\n"
8942         RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #256]\n")
8943         "fmla v22.4s, v0.4s, v4.s[3]\n"
8944         "bne 2b\n"
8945 
8946         "79:\n"
8947 
8948         // End of the inner loop on depth. Now perform the remaining
8949         // multiply-adds of the last level of depth, for which the LHS
8950         // and RHS data is already loaded.
8951 
8952         "fmla v24.4s, v0.4s, v3.s[0]\n"
8953         "fmla v26.4s, v0.4s, v3.s[1]\n"
8954         "fmla v28.4s, v0.4s, v3.s[2]\n"
8955         "fmla v30.4s, v0.4s, v3.s[3]\n"
8956         "fmla v25.4s, v1.4s, v3.s[0]\n"
8957         "fmla v27.4s, v1.4s, v3.s[1]\n"
8958         "fmla v29.4s, v1.4s, v3.s[2]\n"
8959         "fmla v31.4s, v1.4s, v3.s[3]\n"
8960         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
8961         "fmla v17.4s, v1.4s, v2.s[0]\n"
8962         "fmla v19.4s, v1.4s, v2.s[1]\n"
8963         "fmla v21.4s, v1.4s, v2.s[2]\n"
8964         "fmla v23.4s, v1.4s, v2.s[3]\n"
8965 
8966         // End of accumulation. The registers v16 -- v31 contain the final
8967         // float accumulator values of the current 8x8 destination block.
8968         // We now have to apply the bias addition and clamping to these
8969         // accumulators, and advance to the next 8x8 block. We intertwine
8970         // these two aspects whenever possible for optimal pipelining, both
8971         // at the data flow level (prefetch data for next block as early as
8972         // possible) and instruction pipelining level (some of the next-block
8973         // work can dual-issue with some of the final work on the current
8974         // block).
8975 
8976         // Logic to advance to the next block in preparation for the next
8977         // iteration of the main loop. For now, we only want to compute
8978         // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
8979         // not yet ready to update the values of row and col, as we still need
8980         // the current values for the rest of the work on the current block.
8981 
8982         "cmp %w[row], w7\n"  // Have we finished the last row?
8983         "bge 4f\n"           // If finished last row, go to 4
8984         // Not finished last row: then advance to next row.
8985         "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
8986         "b 5f\n"
8987         "4:\n"  // Finished last row...
8988         "mov %[lhs_col_ptr], x5\n"  // Go back to first row
8989         // Now we need to advance to the next column. If we already
8990         // finished the last column, then in principle we are done, however
8991         // we can't just return here, as we need to allow the end work of the
8992         // current block to complete. The good news is that at this point it
8993         // doesn't matter what data we load for the next column, since
8994         // we will exit from the main loop below before actually storing
8995         // anything computed from that data.
8996         "cmp %w[col], w8\n"  // Have we finished the last column?
8997         "bge 5f\n" // If yes, just carry on without updating the column pointer.
8998         // Not finished last column: then advance to next column.
8999         "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
9000         "5:\n"
9001 
9002         // Set the LHS and RHS data pointers to the start of the columns just
9003         // computed.
9004         "mov %[lhs_ptr], %[lhs_col_ptr]\n"
9005         "mov %[rhs_ptr], %[rhs_col_ptr]\n"
9006 
9007         // Load some parameters needed for the end work on current block.
9008         "ldrb w4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
9009         "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"
9010 
9011         // Determine the channel index.
9012         "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
9013         "csel w3, %w[row], %w[col], eq\n"
9014 
9015         // Offset the bias pointer as needed given the current row, col.
9016         "add x5, x1, x3, lsl #2\n"
9017 
9018         // If there is no bias, use no offset, just address the passed zero
9019         // data.
9020 
9021         "tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
9022         "csel x1, x1, x5, eq\n"
9023 
9024         // Load 8 bias values.
9025         "ld1 {v14.4s}, [x1], #16\n"
9026         "ld1 {v15.4s}, [x1]\n"
9027 
9028         // Now that we know what LHS and RHS data the next iteration of the
9029         // main loop will need to load, we start loading the first 32 bytes of
9030         // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
9031         // in the rest of the work on the current block.
9032         "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
9033         "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
9034         "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
9035         "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
9036 
9037         // Perform the bias-addition.
9038         // Jump based on channel dimension.
9039         "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
9040         "bne 6f\n"
9041         // Case where channels are rows
9042         "fadd v16.4s, v16.4s, v14.4s\n"
9043         "fadd v17.4s, v17.4s, v15.4s\n"
9044         "fadd v18.4s, v18.4s, v14.4s\n"
9045         "fadd v19.4s, v19.4s, v15.4s\n"
9046         "fadd v20.4s, v20.4s, v14.4s\n"
9047         "fadd v21.4s, v21.4s, v15.4s\n"
9048         "fadd v22.4s, v22.4s, v14.4s\n"
9049         "fadd v23.4s, v23.4s, v15.4s\n"
9050         "fadd v24.4s, v24.4s, v14.4s\n"
9051         "fadd v25.4s, v25.4s, v15.4s\n"
9052         "fadd v26.4s, v26.4s, v14.4s\n"
9053         "fadd v27.4s, v27.4s, v15.4s\n"
9054         "fadd v28.4s, v28.4s, v14.4s\n"
9055         "fadd v29.4s, v29.4s, v15.4s\n"
9056         "fadd v30.4s, v30.4s, v14.4s\n"
9057         "fadd v31.4s, v31.4s, v15.4s\n"
9058         "b 7f\n"
9059 
9060         "6:\n"
9061         // Case where channels are columns
9062         "dup v8.4s, v14.s[0]\n"
9063         "dup v9.4s, v14.s[1]\n"
9064         "fadd v16.4s, v16.4s, v8.4s\n"
9065         "dup v10.4s, v14.s[2]\n"
9066         "fadd v17.4s, v17.4s, v8.4s\n"
9067         "dup v11.4s, v14.s[3]\n"
9068         "fadd v18.4s, v18.4s, v9.4s\n"
9069         "dup v12.4s, v15.s[0]\n"
9070         "fadd v19.4s, v19.4s, v9.4s\n"
9071         "dup v13.4s, v15.s[1]\n"
9072         "fadd v20.4s, v20.4s, v10.4s\n"
9073         "dup v14.4s, v15.s[2]\n"
9074         "fadd v21.4s, v21.4s, v10.4s\n"
9075         "dup v15.4s, v15.s[3]\n"
9076         "fadd v22.4s, v22.4s, v11.4s\n"
9077         "fadd v23.4s, v23.4s, v11.4s\n"
9078         "fadd v24.4s, v24.4s, v12.4s\n"
9079         "fadd v25.4s, v25.4s, v12.4s\n"
9080         "fadd v26.4s, v26.4s, v13.4s\n"
9081         "fadd v27.4s, v27.4s, v13.4s\n"
9082         "fadd v28.4s, v28.4s, v14.4s\n"
9083         "fadd v29.4s, v29.4s, v14.4s\n"
9084         "fadd v30.4s, v30.4s, v15.4s\n"
9085         "fadd v31.4s, v31.4s, v15.4s\n"
9086         "7:\n"
9087 
9088         // Load the clamp_min, clamp_max bounds
9089         "ldr w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
9090         "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
9091         "dup v14.4s, w2\n"  // clamp_min
9092         "dup v15.4s, w3\n"  // clamp_max
9093 
9094         // Apply the clamp_min bound
9095         "fmax v16.4s, v16.4s, v14.4s\n"
9096         "fmax v17.4s, v17.4s, v14.4s\n"
9097         "fmax v18.4s, v18.4s, v14.4s\n"
9098         "fmax v19.4s, v19.4s, v14.4s\n"
9099         "fmax v20.4s, v20.4s, v14.4s\n"
9100         "fmax v21.4s, v21.4s, v14.4s\n"
9101         "fmax v22.4s, v22.4s, v14.4s\n"
9102         "fmax v23.4s, v23.4s, v14.4s\n"
9103         "fmax v24.4s, v24.4s, v14.4s\n"
9104         "fmax v25.4s, v25.4s, v14.4s\n"
9105         "fmax v26.4s, v26.4s, v14.4s\n"
9106         "fmax v27.4s, v27.4s, v14.4s\n"
9107         "fmax v28.4s, v28.4s, v14.4s\n"
9108         "fmax v29.4s, v29.4s, v14.4s\n"
9109         "fmax v30.4s, v30.4s, v14.4s\n"
9110         "fmax v31.4s, v31.4s, v14.4s\n"
9111 
9112         // Apply the clamp_max bound
9113         "fmin v16.4s, v16.4s, v15.4s\n"
9114         "fmin v17.4s, v17.4s, v15.4s\n"
9115         "fmin v18.4s, v18.4s, v15.4s\n"
9116         "fmin v19.4s, v19.4s, v15.4s\n"
9117         "fmin v20.4s, v20.4s, v15.4s\n"
9118         "fmin v21.4s, v21.4s, v15.4s\n"
9119         "fmin v22.4s, v22.4s, v15.4s\n"
9120         "fmin v23.4s, v23.4s, v15.4s\n"
9121         "fmin v24.4s, v24.4s, v15.4s\n"
9122         "fmin v25.4s, v25.4s, v15.4s\n"
9123         "fmin v26.4s, v26.4s, v15.4s\n"
9124         "fmin v27.4s, v27.4s, v15.4s\n"
9125         "fmin v28.4s, v28.4s, v15.4s\n"
9126         "fmin v29.4s, v29.4s, v15.4s\n"
9127         "fmin v30.4s, v30.4s, v15.4s\n"
9128         "fmin v31.4s, v31.4s, v15.4s\n"
9129 
9130         // Compute how much of the 8x8 block of destination float values that
9131         // we have computed, fit in the destination matrix. Typically, all of
9132         // it fits, but when the destination matrix shape is not a multiple
9133         // of 8x8, there are some 8x8 blocks along the boundaries that do
9134         // not fit entirely.
9135         "sub w1, %w[dst_rows], %w[row]\n"
9136         "sub w2, %w[dst_cols], %w[col]\n"
9137         "mov w3, #8\n"
9138         "cmp w1, #8\n"
9139         // Compute w1 = how many rows of the 8x8 block fit
9140         "csel w1, w1, w3, le\n"
9141         "cmp w2, #8\n"
9142         // Compute w2 = how many cols of the 8x8 block fit
9143         "csel w2, w2, w3, le\n"
9144 
9145         // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
9146         "cmp w1, w3\n"
9147         "ccmp w2, w3, 0, eq\n"
9148         // Yes, all of the 8x8 block fits, go to fast path.
9149         "beq 30f\n"
9150         // Not all of the 8x8 block fits.
9151         // Set (x3 address, x4 stride) to write to dst_tmp_buf
9152         "mov x3, %[dst_tmp_buf]\n"
9153         "mov x4, #32\n"  // tmp-buf column stride: 8 floats = 32 bytes.
9154         "b 31f\n"
9155         "30:\n"
9156         // Yes, all of the 8x8 block fits.
9157         // Set (x3 address, x4 stride) to write directly to destination matrix.
9158         "mov x3, %[dst_ptr]\n"
9159         "mov x4, x11\n"
9160         "31:\n"
9161 
9162         // Write our float values to the destination described by
9163         // (x3 address, x4 stride).
9164         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
9165         "str q16, [x3, #0]\n"
9166         "str q17, [x3, #16]\n"
9167         "add x3, x3, x4\n"
9168         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
9169         RUY_MAKE_ZERO(v16)
9170         RUY_MAKE_ZERO(v17)
9171         "str q18, [x3, #0]\n"
9172         "str q19, [x3, #16]\n"
9173         "add x3, x3, x4\n"
9174         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
9175         RUY_MAKE_ZERO(v18)
9176         RUY_MAKE_ZERO(v19)
9177         "str q20, [x3, #0]\n"
9178         "str q21, [x3, #16]\n"
9179         "add x3, x3, x4\n"
9180         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
9181         RUY_MAKE_ZERO(v20)
9182         RUY_MAKE_ZERO(v21)
9183         "str q22, [x3, #0]\n"
9184         "str q23, [x3, #16]\n"
9185         "add x3, x3, x4\n"
9186         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
9187         RUY_MAKE_ZERO(v22)
9188         RUY_MAKE_ZERO(v23)
9189         "str q24, [x3, #0]\n"
9190         "str q25, [x3, #16]\n"
9191         "add x3, x3, x4\n"
9192         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
9193         RUY_MAKE_ZERO(v24)
9194         RUY_MAKE_ZERO(v25)
9195         "str q26, [x3, #0]\n"
9196         "str q27, [x3, #16]\n"
9197         "add x3, x3, x4\n"
9198         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
9199         RUY_MAKE_ZERO(v26)
9200         RUY_MAKE_ZERO(v27)
9201         "str q28, [x3, #0]\n"
9202         "str q29, [x3, #16]\n"
9203         "add x3, x3, x4\n"
9204         RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
9205         RUY_MAKE_ZERO(v28)
9206         RUY_MAKE_ZERO(v29)
9207         "str q30, [x3, #0]\n"
9208         "str q31, [x3, #16]\n"
9209         RUY_MAKE_ZERO(v30)
9210         RUY_MAKE_ZERO(v31)
9211 
9212         // If all of the 8x8 block fits, we just finished writing it to the
9213         // destination, so we skip the next part.
9214         "beq 41f\n"
9215         // Not all of the 8x8 block fits in the destination matrix.  We just
9216         // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
9217         // it to copy into the destination matrix the part that fits.
9218         "mov x3, %[dst_tmp_buf]\n"
9219         "mov x4, %[dst_ptr]\n"
9220         "mov w6, #0\n"
9221         "50:\n"
9222         RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
9223         "mov w5, #0\n"
9224         "51:\n"
9225         "ldr w7, [x3, x5, lsl #2]\n"  // Copy one 32-bit (one float) value.
9226         "str w7, [x4, x5, lsl #2]\n"
9227         "add w5, w5, #1\n"
9228         "cmp w5, w1\n"
9229         "blt 51b\n"
9230         "add w6, w6, #1\n"
9231         "add x3, x3, #32\n"
9232         "add x4, x4, x11\n"
9233         "cmp w6, w2\n"
9234         "blt 50b\n"
9235         "41:\n"
9236         "add %[dst_ptr], %[dst_ptr], #32\n"
9237         // At this point we have completely finished writing values to the
9238         // destination matrix for the current block.
9239 
9240         // Reload some params --- we had used x5 -- x7 for a few other things
9241         // since the last time we had loaded them.
9242         "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
9243         "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
9244         "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
9245 
9246         // Move to the next block of the destination matrix, for the next iter
9247         // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
9248         // been updated earlier.
9249         // Have we reached the end row?
9250         "cmp %w[row], w7\n"
9251         "beq 20f\n"  // yes, end row.
9252         // Not end row. Move to the next row.
9253         "add %w[row], %w[row], #8\n"
9254         "b 21f\n"
9255         "20:\n"
9256         // Was already at end row.
9257         "mov %w[row], w6\n"  // Move back to first row.
9258         "add %w[col], %w[col], #8\n"  // Move to the next column.
9259         "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
9260         "mov %[dst_ptr], %[dst_col_ptr]\n"
9261         "21:\n"
9262 
9263         // Main loop exit condition: have we hit the end column?
9264         "cmp %w[col], w8\n"
9265 
9266         // w1 is the number of levels of depth that remain to load
9267         // LHS and RHS data for. Corresponding to the initial ld1 instructions
9268         // above, this is currently depth - 1.
9269         "sub w1, w12, #1\n"
9270 
9271         "ble 1b\n"
9272 
9273         // clang-format on
9274 
9275         : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
9276           [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
9277           [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
9278         : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
9279           [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
9280         : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
9281           "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
9282           "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
9283           "v26", "v27", "v28", "v29", "v30", "v31");
9284 }
9285 
9286 // Variant of KernelFloatNeonA55ish tuned for in-order CPUs that do
9287 // support dotprod (while dotprod by itself is not relevant to floating-point,
9288 // this additional bit of information that we have about the target happens to
9289 // be useful here).
9290 //
9291 // So a typical target CPU here would be ARM Cortex-A55r1.
9292 //
9293 // This kernel is similar to and inspired by gemmlowp's
9294 // NEON_64bit_GEMM_Float32_WithScalar_A55r1,
9295 // which was contributed by David Mansell with very helpful
9296 // comments. Specifically, see this comment about tuning for Cortex-A55r1:
9297 // https://github.com/google/gemmlowp/blob/36212ad3651871bc3e9a599f1a6d5324778aea25/standalone/neon-gemm-kernel-benchmark.cc#L4412
void KernelFloatNeonDotprodA55ish(const KernelParamsFloat<8, 8>& params) {
  profiler::ScopeLabel label(
      "Kernel (kNeonDotprod, optimized for in-order cores)");

  CheckOffsetsInKernelParamsFloat(params);

  const float* lhs_col_ptr = params.lhs_base_ptr;
  const float* rhs_col_ptr = params.rhs_base_ptr;
  const float* lhs_ptr = lhs_col_ptr;
  const float* rhs_ptr = rhs_col_ptr;
  float* dst_col_ptr = params.dst_base_ptr;
  float* dst_ptr = dst_col_ptr;
  int row = params.start_row;
  int col = params.start_col;

  // The asm kernel below has the following NEON register allocation:
  //
  // v16 -- v31 are accumulators.
  // During accumulation, v0 -- v3 are used to load data from LHS and RHS.
  //
  //                                          RHS 1x8 block
  //                           /-----------------------------------------|
  //                           |v2.s[0] ... v2.s[3]   v3.s[0] ... v3.s[3]|
  //                           \-----------------------------------------/
  //        LHS 8x1 block
  //  /---------------------\  /-----------------------------------------|
  //  |        v0.s[0]      |  |v16.s[0]           ...           v30.s[0]|
  //  |         ...         |  |  ...                              ...   |
  //  |        v0.s[3]      |  |v16.s[3]           ...           v30.s[3]|
  //  |        v1.s[0]      |  |v17.s[0]           ...           v31.s[0]|
  //  |         ...         |  |  ...                              ...   |
  //  |        v1.s[3]      |  |v17.s[3]           ...           v31.s[3]|
  //  \---------------------/  \-----------------------------------------/
  //                                      accumulators 8x8 block
  //
  // There is no RUY_OPT_MAX_STREAMING 4x-unrolled part in this kernel because
  // we did not observe a benefit of such partial unrolling on in-order CPUs.
  //
  // v4 -- v7 are unused, and v8 -- v15 are used for loading parameters used
  // for the post-accumulation part of the kernel.
  asm volatile(
#define RUY_MAKE_ZERO(reg) "movi " #reg ".4s, #0\n"

        // clang-format off

        // Load some parameters into registers.
        "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
        "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
        "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"
        "ldr w8, [%[params], #" RUY_STR(RUY_OFFSET_LAST_COL) "]\n"
        "ldr w9, [%[params], #" RUY_STR(RUY_OFFSET_LHS_STRIDE) "]\n"
        "ldr w10, [%[params], #" RUY_STR(RUY_OFFSET_RHS_STRIDE) "]\n"
        "ldr w11, [%[params], #" RUY_STR(RUY_OFFSET_DST_STRIDE) "]\n"
        "ldr w12, [%[params], #" RUY_STR(RUY_OFFSET_DEPTH) "]\n"


        // Clear accumulators.
        RUY_MAKE_ZERO(v16)
        // Load the first 32 bytes of LHS and RHS data.
        "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
        RUY_MAKE_ZERO(v17)
        "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
        RUY_MAKE_ZERO(v18)
        "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
        RUY_MAKE_ZERO(v19)
        "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"
        RUY_MAKE_ZERO(v20)
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #64]\n")
        RUY_MAKE_ZERO(v21)
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #64]\n")
        RUY_MAKE_ZERO(v22)
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #128]\n")
        RUY_MAKE_ZERO(v23)
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #128]\n")
        RUY_MAKE_ZERO(v24)
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #192]\n")
        RUY_MAKE_ZERO(v25)
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #192]\n")
        RUY_MAKE_ZERO(v26)
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #256]\n")
        RUY_MAKE_ZERO(v27)
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #256]\n")
        RUY_MAKE_ZERO(v28)
        RUY_MAKE_ZERO(v29)
        RUY_MAKE_ZERO(v30)
        RUY_MAKE_ZERO(v31)

        // w1 is the number of levels of depth that remain to load
        // LHS and RHS data for. Corresponding to the initial ld1 instructions
        // above, this is currently depth - 1.
        "sub w1, w12, #1\n"

        // Main loop of the whole GEMM, over rows and columns of the
        // destination matrix.
        "1:\n"

        "cmp w1, #0\n"
        "fmla v16.4s, v0.4s, v2.s[0]\n"
        "fmla v18.4s, v0.4s, v2.s[1]\n"
        "fmla v20.4s, v0.4s, v2.s[2]\n"
        "fmla v22.4s, v0.4s, v2.s[3]\n"

        // Accumulation loop
        "beq 79f\n"

        "2:\n"

        // In this inner loop, 128-bit vector loads are split into a 64-bit
        // "ldr d" plus a scalar "ldr x" whose value is inserted into the upper
        // half with "ins". NOTE(review): this appears to be the in-order-core
        // scheduling idiom from the gemmlowp A55r1 kernel referenced above
        // (better dual-issue with fmla) -- confirm against the Cortex-A55
        // software optimization guide.
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[lhs_ptr], #256]\n")
        "fmla v24.4s, v0.4s, v3.s[0]\n"
        "ldr x2, [%[lhs_ptr], #8]\n"
        "fmla v26.4s, v0.4s, v3.s[1]\n"
        "ldr x3, [%[lhs_ptr], #24]\n"
        "fmla v28.4s, v0.4s, v3.s[2]\n"
        "ldr x5, [%[rhs_ptr], #24]\n"
        "fmla v30.4s, v0.4s, v3.s[3]\n"
        "ldr d0, [%[lhs_ptr]], #32\n"
        "fmla v25.4s, v1.4s, v3.s[0]\n"
        "ldr x4, [%[rhs_ptr], #8]\n"
        "fmla v27.4s, v1.4s, v3.s[1]\n"
        "subs w1, w1, #1\n"
        "fmla v29.4s, v1.4s, v3.s[2]\n"
        "ins v0.d[1], x2\n"
        "fmla v31.4s, v1.4s, v3.s[3]\n"
        "ldr d3, [%[rhs_ptr], #16]\n"
        "fmla v17.4s, v1.4s, v2.s[0]\n"
        "ins v3.d[1], x5\n"
        "fmla v19.4s, v1.4s, v2.s[1]\n"
        "ldr d4, [%[rhs_ptr]], #32\n"
        "fmla v21.4s, v1.4s, v2.s[2]\n"
        "ins v4.d[1], x4\n"
        "fmla v23.4s, v1.4s, v2.s[3]\n"
        RUY_PREFETCH_LOAD("prfm pldl1keep, [%[rhs_ptr], #256]\n")
        "fmla v16.4s, v0.4s, v4.s[0]\n"
        "ldr d1, [%[lhs_ptr], #-16]\n"
        "fmla v18.4s, v0.4s, v4.s[1]\n"
        "ins v1.d[1], x3\n"
        "fmla v20.4s, v0.4s, v4.s[2]\n"
        // Rotate the freshly loaded RHS data (v4) into v2, where the top of
        // the loop (and the tail below) expects it.
        "mov v2.16b, v4.16b\n"
        "fmla v22.4s, v0.4s, v4.s[3]\n"
        "bne 2b\n"

        "79:\n"

        // End of the inner loop on depth. Now perform the remaining
        // multiply-adds of the last level of depth, for which the LHS
        // and RHS data is already loaded.

        "fmla v24.4s, v0.4s, v3.s[0]\n"
        "fmla v26.4s, v0.4s, v3.s[1]\n"
        "fmla v28.4s, v0.4s, v3.s[2]\n"
        "fmla v30.4s, v0.4s, v3.s[3]\n"
        "fmla v25.4s, v1.4s, v3.s[0]\n"
        "fmla v27.4s, v1.4s, v3.s[1]\n"
        "fmla v29.4s, v1.4s, v3.s[2]\n"
        "fmla v31.4s, v1.4s, v3.s[3]\n"
        "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
        "fmla v17.4s, v1.4s, v2.s[0]\n"
        "fmla v19.4s, v1.4s, v2.s[1]\n"
        "fmla v21.4s, v1.4s, v2.s[2]\n"
        "fmla v23.4s, v1.4s, v2.s[3]\n"

        // End of accumulation. The registers v16 -- v31 contain the final
        // float32 accumulator values of the current 8x8 destination block.
        // We now have to apply the bias-addition and clamping to these
        // accumulators, and advance to the next 8x8 block. We intertwine
        // these two aspects whenever possible for optimal pipelining, both
        // at the data flow level (prefetch data for next block as early as
        // possible) and instruction pipelining level (some of the next-block
        // work can dual-issue with some of the final work on the current
        // block).

        // Logic to advance to the next block in preparation for the next
        // iteration of the main loop. For now, we only want to compute
        // the LHS and RHS data pointers, lhs_col_ptr and rhs_col_ptr. We are
        // not yet ready to update the values of row and col, as we still need
        // the current values for the rest of the work on the current block.

        "cmp %w[row], w7\n"  // Have we finished the last row?
        "bge 4f\n"           // If finished last row, go to 4
        // Not finished last row: then advance to next row.
        "add %[lhs_col_ptr], %[lhs_col_ptr], x9, lsl #3\n"
        "b 5f\n"
        "4:\n"  // Finished last row...
        "mov %[lhs_col_ptr], x5\n"  // Go back to first row
        // Now we need to advance to the next column. If we already
        // finished the last column, then in principle we are done, however
        // we can't just return here, as we need to allow the end work of the
        // current block to complete. The good news is that at this point it
        // doesn't matter what data we load for the next column, since
        // we will exit from the main loop below before actually storing
        // anything computed from that data.
        "cmp %w[col], w8\n"  // Have we finished the last column?
        "bge 5f\n" // If yes, just carry on without updating the column pointer.
        // Not finished last column: then advance to next column.
        "add %[rhs_col_ptr], %[rhs_col_ptr], x10, lsl #3\n"
        "5:\n"

        // Set the LHS and RHS data pointers to the start of the columns just
        // computed.
        "mov %[lhs_ptr], %[lhs_col_ptr]\n"
        "mov %[rhs_ptr], %[rhs_col_ptr]\n"

        // Load some parameters needed for the end work on current block.
        "ldrb w4, [%[params], #" RUY_STR(RUY_OFFSET_FLAGS) "]\n"
        "ldr x1, [%[params], #" RUY_STR(RUY_OFFSET_BIAS) "]\n"

        // Determine the channel index.
        "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
        "csel w3, %w[row], %w[col], eq\n"

        // Offset the bias pointer as needed given the current row, col.
        "add x5, x1, x3, lsl #2\n"

        // If there is no bias, use no offset, just address the passed zero
        // data.

        "tst w4, #" RUY_STR(RUY_ASM_FLAG_HAS_BIAS) "\n"
        "csel x1, x1, x5, eq\n"

        // Load 8 bias values.
        "ld1 {v14.4s}, [x1], #16\n"
        "ld1 {v15.4s}, [x1]\n"

        // Now that we know what LHS and RHS data the next iteration of the
        // main loop will need to load, we start loading the first 32 bytes of
        // each of LHS and RHS, into v0 -- v3, as we don't need v0 -- v3 anymore
        // in the rest of the work on the current block.
        "ld1 {v0.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v1.4s}, [%[lhs_ptr]], #16\n"
        "ld1 {v2.4s}, [%[rhs_ptr]], #16\n"
        "ld1 {v3.4s}, [%[rhs_ptr]], #16\n"

        // Perform the bias-addition.
        // Jump based on channel dimension.
        "tst w4, #" RUY_STR(RUY_ASM_FLAG_CHANNEL_DIMENSION_IS_COL) "\n"
        "bne 6f\n"
        // Case where channels are rows
        "fadd v16.4s, v16.4s, v14.4s\n"
        "fadd v17.4s, v17.4s, v15.4s\n"
        "fadd v18.4s, v18.4s, v14.4s\n"
        "fadd v19.4s, v19.4s, v15.4s\n"
        "fadd v20.4s, v20.4s, v14.4s\n"
        "fadd v21.4s, v21.4s, v15.4s\n"
        "fadd v22.4s, v22.4s, v14.4s\n"
        "fadd v23.4s, v23.4s, v15.4s\n"
        "fadd v24.4s, v24.4s, v14.4s\n"
        "fadd v25.4s, v25.4s, v15.4s\n"
        "fadd v26.4s, v26.4s, v14.4s\n"
        "fadd v27.4s, v27.4s, v15.4s\n"
        "fadd v28.4s, v28.4s, v14.4s\n"
        "fadd v29.4s, v29.4s, v15.4s\n"
        "fadd v30.4s, v30.4s, v14.4s\n"
        "fadd v31.4s, v31.4s, v15.4s\n"
        "b 7f\n"

        "6:\n"
        // Case where channels are columns
        "dup v8.4s, v14.s[0]\n"
        "dup v9.4s, v14.s[1]\n"
        "fadd v16.4s, v16.4s, v8.4s\n"
        "dup v10.4s, v14.s[2]\n"
        "fadd v17.4s, v17.4s, v8.4s\n"
        "dup v11.4s, v14.s[3]\n"
        "fadd v18.4s, v18.4s, v9.4s\n"
        "dup v12.4s, v15.s[0]\n"
        "fadd v19.4s, v19.4s, v9.4s\n"
        "dup v13.4s, v15.s[1]\n"
        "fadd v20.4s, v20.4s, v10.4s\n"
        "dup v14.4s, v15.s[2]\n"
        "fadd v21.4s, v21.4s, v10.4s\n"
        "dup v15.4s, v15.s[3]\n"
        "fadd v22.4s, v22.4s, v11.4s\n"
        "fadd v23.4s, v23.4s, v11.4s\n"
        "fadd v24.4s, v24.4s, v12.4s\n"
        "fadd v25.4s, v25.4s, v12.4s\n"
        "fadd v26.4s, v26.4s, v13.4s\n"
        "fadd v27.4s, v27.4s, v13.4s\n"
        "fadd v28.4s, v28.4s, v14.4s\n"
        "fadd v29.4s, v29.4s, v14.4s\n"
        "fadd v30.4s, v30.4s, v15.4s\n"
        "fadd v31.4s, v31.4s, v15.4s\n"
        "7:\n"

        // Load the clamp_min, clamp_max bounds
        "ldr w2, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MIN) "]\n"
        "ldr w3, [%[params], #" RUY_STR(RUY_OFFSET_CLAMP_MAX) "]\n"
        "dup v14.4s, w2\n"  // clamp_min
        "dup v15.4s, w3\n"  // clamp_max

        // Apply the clamp_min bound
        "fmax v16.4s, v16.4s, v14.4s\n"
        "fmax v17.4s, v17.4s, v14.4s\n"
        "fmax v18.4s, v18.4s, v14.4s\n"
        "fmax v19.4s, v19.4s, v14.4s\n"
        "fmax v20.4s, v20.4s, v14.4s\n"
        "fmax v21.4s, v21.4s, v14.4s\n"
        "fmax v22.4s, v22.4s, v14.4s\n"
        "fmax v23.4s, v23.4s, v14.4s\n"
        "fmax v24.4s, v24.4s, v14.4s\n"
        "fmax v25.4s, v25.4s, v14.4s\n"
        "fmax v26.4s, v26.4s, v14.4s\n"
        "fmax v27.4s, v27.4s, v14.4s\n"
        "fmax v28.4s, v28.4s, v14.4s\n"
        "fmax v29.4s, v29.4s, v14.4s\n"
        "fmax v30.4s, v30.4s, v14.4s\n"
        "fmax v31.4s, v31.4s, v14.4s\n"

        // Apply the clamp_max bound
        "fmin v16.4s, v16.4s, v15.4s\n"
        "fmin v17.4s, v17.4s, v15.4s\n"
        "fmin v18.4s, v18.4s, v15.4s\n"
        "fmin v19.4s, v19.4s, v15.4s\n"
        "fmin v20.4s, v20.4s, v15.4s\n"
        "fmin v21.4s, v21.4s, v15.4s\n"
        "fmin v22.4s, v22.4s, v15.4s\n"
        "fmin v23.4s, v23.4s, v15.4s\n"
        "fmin v24.4s, v24.4s, v15.4s\n"
        "fmin v25.4s, v25.4s, v15.4s\n"
        "fmin v26.4s, v26.4s, v15.4s\n"
        "fmin v27.4s, v27.4s, v15.4s\n"
        "fmin v28.4s, v28.4s, v15.4s\n"
        "fmin v29.4s, v29.4s, v15.4s\n"
        "fmin v30.4s, v30.4s, v15.4s\n"
        "fmin v31.4s, v31.4s, v15.4s\n"

        // Compute how much of the 8x8 block of destination float values that
        // we have computed, fit in the destination matrix. Typically, all of
        // it fits, but when the destination matrix shape is not a multiple
        // of 8x8, there are some 8x8 blocks along the boundaries that do
        // not fit entirely.
        "sub w1, %w[dst_rows], %w[row]\n"
        "sub w2, %w[dst_cols], %w[col]\n"
        "mov w3, #8\n"
        "cmp w1, #8\n"
        // Compute w1 = how many rows of the 8x8 block fit
        "csel w1, w1, w3, le\n"
        "cmp w2, #8\n"
        // Compute w2 = how many cols of the 8x8 block fit
        "csel w2, w2, w3, le\n"

        // Test if w1==8 && w2 == 8, i.e. if all of the 8x8 block fits.
        "cmp w1, w3\n"
        "ccmp w2, w3, 0, eq\n"
        // Yes, all of the 8x8 block fits, go to fast path.
        "beq 30f\n"
        // Not all of the 8x8 block fits.
        // Set (x3 address, x4 stride) to write to dst_tmp_buf
        "mov x3, %[dst_tmp_buf]\n"
        "mov x4, #32\n"
        "b 31f\n"
        "30:\n"
        // Yes, all of the 8x8 block fits.
        // Set (x3 address, x4 stride) to write directly to destination matrix.
        "mov x3, %[dst_ptr]\n"
        "mov x4, x11\n"
        "31:\n"

        // Write our float values to the destination described by
        // (x3 address, x4 stride).
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        "str q16, [x3, #0]\n"
        "str q17, [x3, #16]\n"
        "add x3, x3, x4\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        RUY_MAKE_ZERO(v16)
        RUY_MAKE_ZERO(v17)
        "str q18, [x3, #0]\n"
        "str q19, [x3, #16]\n"
        "add x3, x3, x4\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        RUY_MAKE_ZERO(v18)
        RUY_MAKE_ZERO(v19)
        "str q20, [x3, #0]\n"
        "str q21, [x3, #16]\n"
        "add x3, x3, x4\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        RUY_MAKE_ZERO(v20)
        RUY_MAKE_ZERO(v21)
        "str q22, [x3, #0]\n"
        "str q23, [x3, #16]\n"
        "add x3, x3, x4\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        RUY_MAKE_ZERO(v22)
        RUY_MAKE_ZERO(v23)
        "str q24, [x3, #0]\n"
        "str q25, [x3, #16]\n"
        "add x3, x3, x4\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        RUY_MAKE_ZERO(v24)
        RUY_MAKE_ZERO(v25)
        "str q26, [x3, #0]\n"
        "str q27, [x3, #16]\n"
        "add x3, x3, x4\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        RUY_MAKE_ZERO(v26)
        RUY_MAKE_ZERO(v27)
        "str q28, [x3, #0]\n"
        "str q29, [x3, #16]\n"
        "add x3, x3, x4\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x3]\n")
        RUY_MAKE_ZERO(v28)
        RUY_MAKE_ZERO(v29)
        "str q30, [x3, #0]\n"
        "str q31, [x3, #16]\n"
        RUY_MAKE_ZERO(v30)
        RUY_MAKE_ZERO(v31)

        // If all of the 8x8 block fits, we just finished writing it to the
        // destination, so we skip the next part.
        // (The condition flags tested here are still those set by the "ccmp"
        // above: the str/movi/add instructions in between do not write flags.)
        "beq 41f\n"
        // Not all of the 8x8 block fits in the destination matrix.  We just
        // wrote it to dst_tmp_buf. Now we perform the slow scalar loop over
        // it to copy into the destination matrix the part that fits.
        "mov x3, %[dst_tmp_buf]\n"
        "mov x4, %[dst_ptr]\n"
        "mov w6, #0\n"
        "50:\n"
        RUY_PREFETCH_STORE("prfm pstl1strm, [x4]\n")
        "mov w5, #0\n"
        "51:\n"
        "ldr w7, [x3, x5, lsl #2]\n"
        "str w7, [x4, x5, lsl #2]\n"
        "add w5, w5, #1\n"
        "cmp w5, w1\n"
        "blt 51b\n"
        "add w6, w6, #1\n"
        "add x3, x3, #32\n"
        "add x4, x4, x11\n"
        "cmp w6, w2\n"
        "blt 50b\n"
        "41:\n"
        "add %[dst_ptr], %[dst_ptr], #32\n"
        // At this point we have completely finished writing values to the
        // destination matrix for the current block.

        // Reload some params --- we had used x5 -- x7 for a few other things
        // since the last time we had loaded them.
        "ldr x5, [%[params], #" RUY_STR(RUY_OFFSET_LHS_BASE_PTR) "]\n"
        "ldr w6, [%[params], #" RUY_STR(RUY_OFFSET_START_ROW) "]\n"
        "ldr w7, [%[params], #" RUY_STR(RUY_OFFSET_LAST_ROW) "]\n"

        // Move to the next block of the destination matrix, for the next iter
        // of the main loop.  Notice that lhs_col_ptr, rhs_col_ptr have already
        // been updated earlier.
        // Have we reached the end row?
        "cmp %w[row], w7\n"
        "beq 20f\n"  // yes, end row.
        // Not end row. Move to the next row.
        "add %w[row], %w[row], #8\n"
        "b 21f\n"
        "20:\n"
        // Was already at end row.
        "mov %w[row], w6\n"  // Move back to first row.
        "add %w[col], %w[col], #8\n"  // Move to the next column.
        "add %[dst_col_ptr], %[dst_col_ptr], x11, lsl #3\n"
        "mov %[dst_ptr], %[dst_col_ptr]\n"
        "21:\n"

        // Main loop exit condition: have we hit the end column?
        "cmp %w[col], w8\n"

        // w1 is the number of levels of depth that remain to load
        // LHS and RHS data for. Corresponding to the initial ld1 instructions
        // above, this is currently depth - 1.
        "sub w1, w12, #1\n"

        "ble 1b\n"

        // clang-format on

        : [ lhs_col_ptr ] "+r"(lhs_col_ptr), [rhs_col_ptr] "+r"(rhs_col_ptr),
          [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
          [dst_col_ptr] "+r"(dst_col_ptr), [dst_ptr] "+r"(dst_ptr), [row] "+r"(row), [col] "+r"(col)
        : [ params ] "r"(&params), [dst_rows] "r"(params.dst_rows),
          [dst_cols] "r"(params.dst_cols), [dst_tmp_buf] "r"(params.dst_tmp_buf)
        : "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc",
          "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
          "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
          "v26", "v27", "v28", "v29", "v30", "v31");
}
9778 #undef RUY_OFFSET_BIAS
9779 #undef RUY_OFFSET_FLAGS
9780 #undef RUY_OFFSET_LHS_BASE_PTR
9781 #undef RUY_OFFSET_CLAMP_MIN
9782 #undef RUY_OFFSET_CLAMP_MAX
9783 #undef RUY_OFFSET_START_ROW
9784 #undef RUY_OFFSET_LAST_ROW
9785 #undef RUY_OFFSET_LAST_COL
9786 #undef RUY_OFFSET_LHS_STRIDE
9787 #undef RUY_OFFSET_RHS_STRIDE
9788 #undef RUY_OFFSET_DST_STRIDE
9789 #undef RUY_OFFSET_DEPTH
9790 #undef RUY_OFFSET_START_COL
9791 #undef RUY_OFFSET_RHS_BASE_PTR
9792 #undef RUY_OFFSET_DST_BASE_PTR
9793 
9794 #endif  // RUY_PLATFORM_NEON_64 && RUY_OPT(ASM)
9795 
9796 }  // namespace ruy
9797