// Auto-generated file. Do not edit!
//   Template: src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.


#include <xnnpack/assembly.h>

# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const int8_t* restrict a,  x3
#     size_t a_stride,           x4
#     const void* restrict w,    x5
#     int8_t* restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> x12
#     const union xnn_qs8_minmax_params params)  [sp + 8] -> x11

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0  v4
# A1 x15  v1  v5
# A2 x13  v2  v6
# A3  x4  v3  v7
# B   x5  v8  v9 v10 v11
# C0  x6 v16 v20 v24 v28
# C1  x8 v17 v21 v25 v29
# C2  x9 v18 v22 v26 v30
# C3  x7 v19 v23 v27 v31
# unused v12 v13 v14 v15

# x14 temp for Cortex-A55 loads

BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

        // Control-flow map (numeric local labels):
        //   0:     start of one 16-column tile; reloads accumulators from w
        //   1:     software-pipelined main loop, 16 bytes of A per row per pass
        //   2:     epilogue, the last full 16 bytes of A (no further preloads)
        //   4:/5:  remainder of 4, 8 or 12 bytes of A
        //   3:     int32 -> float scale -> int8 conversion and clamping
        //   6:-10: partial-width store (fewer than 16 columns left) and return
        // The instruction order in the loops is hand-interleaved; do not reorder.

        # Clamp A and C pointers
        CMP     x0, 2                   // if mr < 2

        // Stack arguments are read before sp is moved by the STP below.
        LDP     x12, x11, [sp]          // cn_stride, params

        ADD     x15, x3, x4             // a1 = a0 + a_stride
        ADD     x8, x6, x7              // c1 = c0 + cm_stride

        // Allocate 32 bytes and save d8/d9 (d10/d11 are saved further down).
        STP     d8,  d9, [sp, -32]!

        CSEL    x15, x3, x15, LO        //   a1 = a0
        CSEL    x8, x6,  x8, LO         //   c1 = c0
        ADD     x2, x2, 3               // kc = (kc + 3) & ~3

        ADD     x13, x15, x4            // a2 = a1 + a_stride
        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x13, x15, x13, LS       //   a2 = a1
        CSEL    x9,  x8,  x9, LS        //   c2 = c1
        BIC     x2, x2, 3

        STP     d10, d11, [sp, 16]

        CMP     x0, 4                   // if mr < 4
        ADD     x4, x13, x4             // a3 = a2 + a_stride
        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
        CSEL    x4, x13, x4, LO         //   a3 = a2
        CSEL    x7,  x9, x7, LO         //   c3 = c2

        .p2align 3
0:
        # Load initial bias from w into accumulators
        // The same four bias vectors seed all four rows.
        LDP     q16, q20, [x5], 32
        MOV     v17.16b, v16.16b
        MOV     v18.16b, v16.16b
        LDP     q24, q28, [x5], 32
        MOV     v19.16b, v16.16b
        MOV     v21.16b, v20.16b
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v20.16b
        MOV     v25.16b, v24.16b
        MOV     v26.16b, v24.16b
        SUBS    x0, x2, 16              // k = kc - 16
        MOV     v27.16b, v24.16b
        MOV     v29.16b, v28.16b
        MOV     v30.16b, v28.16b
        MOV     v31.16b, v28.16b
        # Is there at least 16 bytes for prologue/epilogue?
        B.LO    4f

        # prologue - read A and B values for block 0 and 1
        // On exit: v8 is complete, d9 holds the low half of v9 and x14 its
        // high half. The main loop and the epilogue both expect this state.
        LDR     d0,  [x3], 8
        LDR     q8,  [x5], 16
        LDR     d1, [x15], 8
        LDR     d2, [x13], 8
        LDR     d3,  [x4], 8
        SUBS    x0, x0, 16              // is there 16 for main loop?
        LDR     d9,  [x5], 8
        LDR     x14,  [x5], 8
        # Is there at least 16 bytes for main loop?
        B.LO    2f

        # Main loop - 16 bytes of A in 4 groups.
        # 4 rows of 4 vectors wide = 16 sdot instructions for 4 channels
        # 4 LD64 for A
        # 4 LD128 for W. = 2 LD64 + INS.
        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
        // Each 128-bit W vector is built from LDR d (low half) plus
        // LDR x14 / INS (high half), one block ahead of its use.

        .p2align 3
1:
        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        LDR     d4,  [x3], 8

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7,  [x4], 8

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[0]
        LDR     d0,  [x3], 8

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[0]
        LDR     d1, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]
        LDR     d2, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]
        LDR     d3,  [x4], 8

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[1]
        LDR     d8,  [x5], 8            // First B values for block 0 and 1
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[1]
        SUBS    x0, x0, 16              // flags consumed by B.HS below

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[1]
        B.HS    1b

        # Epilogue.  Same as main loop but no preloads in final group
2:
        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        LDR     d4,  [x3], 8

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        LDR     d5, [x15], 8

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[0]
        LDR     d6, [x13], 8

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[0]
        LDR     d7,  [x4], 8

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v3.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v3.4b[1]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v0.4b[1]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v1.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v3.4b[1]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v0.4b[1]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v1.4b[1]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v2.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v3.4b[1]

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[0]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[0]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[0]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[0]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[0]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[0]

        # BLOCK 2
        SDOT    v24.4s, v10.16b, v4.4b[0]
        LDR     d8,  [x5], 8
        SDOT    v25.4s, v10.16b, v5.4b[0]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v27.4s, v10.16b, v7.4b[0]

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[0]
        LDR     d9,  [x5], 8
        SDOT    v29.4s, v11.16b, v5.4b[0]
        INS     v8.d[1], x14
        SDOT    v30.4s, v11.16b, v6.4b[0]
        LDR     x14,  [x5], 8
        SDOT    v31.4s, v11.16b, v7.4b[0]

        # BLOCK 0
        SDOT    v16.4s,  v8.16b, v4.4b[1]
        LDR     d10,  [x5], 8
        SDOT    v17.4s,  v8.16b, v5.4b[1]
        INS     v9.d[1], x14
        SDOT    v18.4s,  v8.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v19.4s,  v8.16b, v7.4b[1]

        # BLOCK 1
        SDOT    v20.4s,  v9.16b, v4.4b[1]
        LDR     d11,  [x5], 8
        SDOT    v21.4s,  v9.16b, v5.4b[1]
        INS     v10.d[1], x14
        SDOT    v22.4s,  v9.16b, v6.4b[1]
        LDR     x14,  [x5], 8
        SDOT    v23.4s,  v9.16b, v7.4b[1]

        # BLOCK 2
        // Final group: only the pending high half of v11 is completed; no
        // further loads from x5 are issued here.
        SDOT    v24.4s, v10.16b, v4.4b[1]
        SDOT    v25.4s, v10.16b, v5.4b[1]
        INS     v11.d[1], x14
        SDOT    v26.4s, v10.16b, v6.4b[1]
        SDOT    v27.4s, v10.16b, v7.4b[1]
        AND     x0, x2, 15              // kc remainder 0 to 12

        # BLOCK 3
        SDOT    v28.4s, v11.16b, v4.4b[1]
        SDOT    v29.4s, v11.16b, v5.4b[1]
        SDOT    v30.4s, v11.16b, v6.4b[1]
        SDOT    v31.4s, v11.16b, v7.4b[1]

        # Is there a remainder? - 4 to 12 bytes of A
        CBNZ    x0, 5f

        .p2align 3
3:
        // Convert the int32 accumulators to float.
        SCVTF   v16.4s, v16.4s
        SCVTF   v17.4s, v17.4s
        # Load per channel scale values from weights
        LDR     q4, [x5], 16
        SCVTF   v18.4s, v18.4s
        SCVTF   v19.4s, v19.4s
        LDR     q5, [x5], 16
        SCVTF   v20.4s, v20.4s
        SCVTF   v21.4s, v21.4s
        SCVTF   v22.4s, v22.4s
        SCVTF   v23.4s, v23.4s
        SCVTF   v24.4s, v24.4s
        SCVTF   v25.4s, v25.4s
        SCVTF   v26.4s, v26.4s
        SCVTF   v27.4s, v27.4s
        SCVTF   v28.4s, v28.4s
        SCVTF   v29.4s, v29.4s
        SCVTF   v30.4s, v30.4s
        SCVTF   v31.4s, v31.4s

        // Multiply each column group by its scale vector. v4 is reloaded with
        // the fourth scale vector once its first four uses have issued.
        LDR     q6, [x5], 16
        FMUL    v16.4s, v16.4s, v4.4s
        FMUL    v17.4s, v17.4s, v4.4s
        FMUL    v18.4s, v18.4s, v4.4s
        FMUL    v19.4s, v19.4s, v4.4s
        FMUL    v20.4s, v20.4s, v5.4s
        LDR     q4, [x5], 16
        FMUL    v21.4s, v21.4s, v5.4s
        FMUL    v22.4s, v22.4s, v5.4s
        FMUL    v23.4s, v23.4s, v5.4s
        FMUL    v24.4s, v24.4s, v6.4s
        FMUL    v25.4s, v25.4s, v6.4s
        FMUL    v26.4s, v26.4s, v6.4s
        FMUL    v27.4s, v27.4s, v6.4s
        FMUL    v28.4s, v28.4s, v4.4s
        FMUL    v29.4s, v29.4s, v4.4s
        FMUL    v30.4s, v30.4s, v4.4s
        FMUL    v31.4s, v31.4s, v4.4s

        // Convert back to int32, rounding to nearest with ties to even.
        FCVTNS  v16.4s, v16.4s
        FCVTNS  v17.4s, v17.4s
        FCVTNS  v18.4s, v18.4s
        FCVTNS  v19.4s, v19.4s
        FCVTNS  v20.4s, v20.4s
        FCVTNS  v21.4s, v21.4s
        FCVTNS  v22.4s, v22.4s
        FCVTNS  v23.4s, v23.4s
        FCVTNS  v24.4s, v24.4s
        FCVTNS  v25.4s, v25.4s
        FCVTNS  v26.4s, v26.4s
        FCVTNS  v27.4s, v27.4s
        FCVTNS  v28.4s, v28.4s
        FCVTNS  v29.4s, v29.4s
        FCVTNS  v30.4s, v30.4s
        FCVTNS  v31.4s, v31.4s

        // Saturating narrow int32 -> int16: columns 0-7 of each row land in
        // v16-v19 and columns 8-15 in v24-v27.
        SQXTN   v16.4h, v16.4s
        SQXTN   v17.4h, v17.4s
        SQXTN   v18.4h, v18.4s
        SQXTN   v19.4h, v19.4s
        SQXTN   v24.4h, v24.4s
        SQXTN   v25.4h, v25.4s
        SQXTN   v26.4h, v26.4s
        SQXTN   v27.4h, v27.4s
        LD1R    {v6.8h}, [x11], 2       // add bias

        SQXTN2  v16.8h, v20.4s
        SQXTN2  v17.8h, v21.4s
        SQXTN2  v18.8h, v22.4s
        SQXTN2  v19.8h, v23.4s
        SQXTN2  v24.8h, v28.4s
        SQXTN2  v25.8h, v29.4s
        SQXTN2  v26.8h, v30.4s
        SQXTN2  v27.8h, v31.4s

        SQADD   v16.8h, v16.8h, v6.8h
        SQADD   v17.8h, v17.8h, v6.8h
        SQADD   v18.8h, v18.8h, v6.8h
        SQADD   v19.8h, v19.8h, v6.8h
        SQADD   v24.8h, v24.8h, v6.8h
        SQADD   v25.8h, v25.8h, v6.8h
        SQADD   v26.8h, v26.8h, v6.8h
        SQADD   v27.8h, v27.8h, v6.8h
        LD1R    {v4.16b}, [x11], 1      // clamp min value

        // Saturating narrow int16 -> int8: one 16-byte output row per v0-v3.
        SQXTN   v0.8b, v16.8h
        SQXTN   v1.8b, v17.8h
        SQXTN   v2.8b, v18.8h
        SQXTN   v3.8b, v19.8h
        LD1R    {v5.16b}, [x11]         // clamp max value
        SQXTN2  v0.16b, v24.8h
        SQXTN2  v1.16b, v25.8h
        SQXTN2  v2.16b, v26.8h
        SQXTN2  v3.16b, v27.8h
        SUB     x11, x11, 3            // rewind params pointer

        SMAX    v0.16b, v0.16b, v4.16b
        SMAX    v1.16b, v1.16b, v4.16b
        SMAX    v2.16b, v2.16b, v4.16b
        SMAX    v3.16b, v3.16b, v4.16b
        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.NE
        SMIN    v0.16b, v0.16b, v5.16b
        SMIN    v1.16b, v1.16b, v5.16b
        SMIN    v2.16b, v2.16b, v5.16b
        SMIN    v3.16b, v3.16b, v5.16b
        B.LO    6f

        # Store full 4 x 16
        ST1     {v0.16b}, [x6], x12
        SUB     x3,  x3, x2             // a0 -= kc
        ST1     {v1.16b}, [x8], x12
        SUB     x15, x15, x2            // a1 -= kc
        ST1     {v2.16b}, [x9], x12
        SUB     x13, x13, x2            // a2 -= kc
        ST1     {v3.16b}, [x7], x12
        SUB     x4,  x4, x2             // a3 -= kc
        // Flags are still those of SUBS x1 above: loop while nc != 0.
        B.NE    0b

        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 32
        RET

        # Remainder - 4 to 12 bytes of A
        # Although C4, it's safe to read 16 bytes.
        .p2align 3
4:
        AND     x0, x2, 15              // kc remainder 4 to 12
5:
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        // Each LD1 reads 16 bytes but advances the A pointer by only x0.
        LD1     {v0.16b},  [x3], x0
        LD1     {v1.16b}, [x15], x0
        LD1     {v2.16b}, [x13], x0
        LD1     {v3.16b},  [x4], x0
        SDOT    v16.4s,  v8.16b, v0.4b[0]
        SDOT    v17.4s,  v8.16b, v1.4b[0]
        SDOT    v18.4s,  v8.16b, v2.4b[0]
        SDOT    v19.4s,  v8.16b, v3.4b[0]
        SDOT    v20.4s,  v9.16b, v0.4b[0]
        SDOT    v21.4s,  v9.16b, v1.4b[0]
        SDOT    v22.4s,  v9.16b, v2.4b[0]
        SDOT    v23.4s,  v9.16b, v3.4b[0]
        SDOT    v24.4s, v10.16b, v0.4b[0]
        SDOT    v25.4s, v10.16b, v1.4b[0]
        SDOT    v26.4s, v10.16b, v2.4b[0]
        SDOT    v27.4s, v10.16b, v3.4b[0]
        SDOT    v28.4s, v11.16b, v0.4b[0]
        SDOT    v29.4s, v11.16b, v1.4b[0]
        SDOT    v30.4s, v11.16b, v2.4b[0]
        SDOT    v31.4s, v11.16b, v3.4b[0]
        CMP     x0, 4
        B.LS    3b                      // remainder was 4 bytes: done
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        SDOT    v16.4s,  v8.16b, v0.4b[1]
        SDOT    v17.4s,  v8.16b, v1.4b[1]
        SDOT    v18.4s,  v8.16b, v2.4b[1]
        SDOT    v19.4s,  v8.16b, v3.4b[1]
        SDOT    v20.4s,  v9.16b, v0.4b[1]
        SDOT    v21.4s,  v9.16b, v1.4b[1]
        SDOT    v22.4s,  v9.16b, v2.4b[1]
        SDOT    v23.4s,  v9.16b, v3.4b[1]
        SDOT    v24.4s, v10.16b, v0.4b[1]
        SDOT    v25.4s, v10.16b, v1.4b[1]
        SDOT    v26.4s, v10.16b, v2.4b[1]
        SDOT    v27.4s, v10.16b, v3.4b[1]
        SDOT    v28.4s, v11.16b, v0.4b[1]
        SDOT    v29.4s, v11.16b, v1.4b[1]
        SDOT    v30.4s, v11.16b, v2.4b[1]
        SDOT    v31.4s, v11.16b, v3.4b[1]
        CMP     x0, 8
        B.LS    3b                      // remainder was 8 bytes: done
        LDP     q8,  q9,  [x5], 32
        LDP     q10, q11,  [x5], 32
        SDOT    v16.4s,  v8.16b, v0.4b[2]
        SDOT    v17.4s,  v8.16b, v1.4b[2]
        SDOT    v18.4s,  v8.16b, v2.4b[2]
        SDOT    v19.4s,  v8.16b, v3.4b[2]
        SDOT    v20.4s,  v9.16b, v0.4b[2]
        SDOT    v21.4s,  v9.16b, v1.4b[2]
        SDOT    v22.4s,  v9.16b, v2.4b[2]
        SDOT    v23.4s,  v9.16b, v3.4b[2]
        SDOT    v24.4s, v10.16b, v0.4b[2]
        SDOT    v25.4s, v10.16b, v1.4b[2]
        SDOT    v26.4s, v10.16b, v2.4b[2]
        SDOT    v27.4s, v10.16b, v3.4b[2]
        SDOT    v28.4s, v11.16b, v0.4b[2]
        SDOT    v29.4s, v11.16b, v1.4b[2]
        SDOT    v30.4s, v11.16b, v2.4b[2]
        SDOT    v31.4s, v11.16b, v3.4b[2]
        B       3b

        # Store odd width
        // x1 holds nc - 16 here; bits 3..0 select 8-, 4-, 2- and 1-byte
        // stores. After each store the next unwritten bytes are moved down
        // to lane 0.
        .p2align 3
6:
        TBZ     x1, 3, 7f
        STR     d0, [x6], 8
        STR     d1, [x8], 8
        DUP     d0, v0.d[1]
        DUP     d1, v1.d[1]
        STR     d2, [x9], 8
        STR     d3, [x7], 8
        DUP     d2, v2.d[1]
        DUP     d3, v3.d[1]
7:
        TBZ     x1, 2, 8f
        STR     s0, [x6], 4
        STR     s1, [x8], 4
        DUP     s0, v0.s[1]
        DUP     s1, v1.s[1]
        STR     s2, [x9], 4
        STR     s3, [x7], 4
        DUP     s2, v2.s[1]
        DUP     s3, v3.s[1]
8:
        TBZ     x1, 1, 9f
        STR     h0, [x6], 2
        STR     h1, [x8], 2
        DUP     h0, v0.h[1]
        DUP     h1, v1.h[1]
        STR     h2, [x9], 2
        STR     h3, [x7], 2
        DUP     h2, v2.h[1]
        DUP     h3, v3.h[1]
9:
        TBZ     x1, 0, 10f
        STR     b0, [x6]
        STR     b1, [x8]
        STR     b2, [x9]
        STR     b3, [x7]
10:
        # Restore d8-d11 from stack
        LDP     d10, d11, [sp, 16]
        LDP     d8,  d9, [sp], 32
        RET

END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55

// On ELF targets, emit an empty .note.GNU-stack section (no flags), the GNU
// toolchain convention for declaring that this object does not need an
// executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
