• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert REQUANTIZATION in ["FP32", "RNDNU"]
7$assert not CHANNELWISE or REQUANTIZATION == "FP32"
8
9#include <xnnpack/assembly.h>
10
11$DATATYPE = "qc8" if CHANNELWISE else "qs8"
12$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
// REWIND_DECREMENT is the number of bytes the requantization code advances
// the params pointer (x11) with post-incremented LD1R loads, so that it can
// be rewound before the next tile: RNDNU = 3 x 4 + 2 + 1 = 15,
// FP32 = 4 + 2 + 1 = 7, channelwise = 2 + 1 = 3 (scales are read via x5).
// The final LD1R (clamp max) has no post-increment.
13$REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
14# void xnn_${DATATYPE}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
15#     size_t mr,                 x0
16#     size_t nc,                 x1
17#     size_t kc,                 x2 / x0
18#     const int8_t* restrict a,  x3
19#     size_t a_stride,           x4
20#     const void* restrict w,    x5
21#     int8_t* restrict c,        x6
22#     size_t cm_stride,          x7
23#     size_t cn_stride,          [sp] -> x12
24#     const union ${PARAMS_UNION} params)  [sp + 8] -> x11
25
26# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
27
28# Register usage
29# A0  x3  v0  v4
30# A1 x15  v1  v5
31# A2 x13  v2  v6
32# A3  x4  v3  v7
33# B   x5  v8  v9 v10 v11
34# C0  x6 v16 v20 v24 v28
35# C1  x8 v17 v21 v25 v29
36# C2  x9 v18 v22 v26 v30
37# C3  x7 v19 v23 v27 v31
38# unused v12 v13 v14 v15
39
40# x14 temp for Cortex-A55 loads
41
42BEGIN_FUNCTION xnn_${DATATYPE}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
43
        // Function entry: loads the two stack arguments, derives the row
        // pointers a1-a3 / c1-c3 (a row at index >= mr aliases the previous
        // row), rounds kc up to a multiple of 4 and saves d8-d11, whose
        // registers v8-v11 hold B values below.
44        # Clamp A and C pointers
45        CMP     x0, 2                   // if mr < 2
46
47        LDP     x12, x11, [sp]          // cn_stride, params
48
49        ADD     x15, x3, x4             // a1 = a0 + a_stride
50        ADD     x8, x6, x7              // c1 = c0 + cm_stride
51
52        STP     d8,  d9, [sp, -32]!     // push a 32-byte frame; d8-d9 at [sp]
53
54        CSEL    x15, x3, x15, LO        //   a1 = a0
55        CSEL    x8, x6,  x8, LO         //   c1 = c0
56        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
57
58        ADD     x13, x15, x4            // a2 = a1 + a_stride
59        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
60                                        // if mr <= 2 (flags still from CMP x0, 2)
61        CSEL    x13, x15, x13, LS       //   a2 = a1
62        CSEL    x9,  x8,  x9, LS        //   c2 = c1
63        BIC     x2, x2, 3               // clear low 2 bits: completes the kc round-up
64
65        STP     d10, d11, [sp, 16]      // d10-d11 in the upper half of the frame
66
67        CMP     x0, 4                   // if mr < 4
68        ADD     x4, x13, x4             // a3 = a2 + a_stride
69        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
70        CSEL    x4, x13, x4, LO         //   a3 = a2
71        CSEL    x7,  x9, x7, LO         //   c3 = c2
72
73        .p2align 3
740:                                    // top of the per-tile loop (re-entered via B.NE 0b after a full store)
75        # Load initial bias from w into accumulators
76        LDP     q16, q20, [x5], 32      // row 0 accumulators for the first 8 columns
77        MOV     v17.16b, v16.16b        // rows 1-3 start from the same bias as row 0
78        MOV     v18.16b, v16.16b
79        LDP     q24, q28, [x5], 32      // row 0 accumulators for the last 8 columns
80        MOV     v19.16b, v16.16b
81        MOV     v21.16b, v20.16b
82        MOV     v22.16b, v20.16b
83        MOV     v23.16b, v20.16b
84        MOV     v25.16b, v24.16b
85        MOV     v26.16b, v24.16b
86        SUBS    x0, x2, 16              // k = kc - 16
87        MOV     v27.16b, v24.16b
88        MOV     v29.16b, v28.16b
89        MOV     v30.16b, v28.16b
90        MOV     v31.16b, v28.16b
91        # Is there at least 16 bytes for prologue/epilogue?
92        B.LO    4f                      // kc < 16: only the remainder path runs
93
94        # prologue - read A and B values for block 0 and 1
95        LDR     d0,  [x3], 8            // first 8 bytes of each A row into v0-v3
96        LDR     q8,  [x5], 16           // first B vector, loaded whole
97        LDR     d1, [x15], 8
98        LDR     d2, [x13], 8
99        LDR     d3,  [x4], 8
100        SUBS    x0, x0, 16              // is there 16 for main loop?
101        LDR     d9,  [x5], 8            // low half of the second B vector
102        LDR     x14,  [x5], 8           // high half, merged later by INS v9.d[1], x14
103        # Is there at least 16 bytes for main loop?
104        B.LO    2f                      // kc < 32: go straight to the epilogue
105
106        # Main loop - 16 bytes of A in 4 groups.
107        # 4 rows of 4 vectors wide = 16 sdot instructions for 4 channels
108        # 4 LD64 for A
109        # 4 LD128 for W. = 2 LD64 + INS.
110        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
111
112        .p2align 3
1131:                                   // main loop: each iteration consumes 16 bytes of every A row
114        # BLOCK 0
115        SDOT    v16.4s,  v8.16b, v0.4b[0]    // A bytes 0-3 of this iteration
116        LDR     d10,  [x5], 8           // low half of the next B vector
117        SDOT    v17.4s,  v8.16b, v1.4b[0]
118        INS     v9.d[1], x14            // complete v9 with the half loaded into x14
119        SDOT    v18.4s,  v8.16b, v2.4b[0]
120        LDR     x14,  [x5], 8           // high half for v10
121        SDOT    v19.4s,  v8.16b, v3.4b[0]
122        LDR     d4,  [x3], 8            // next 8 bytes of A0, used by the v4 groups below
123
124        # BLOCK 1
125        SDOT    v20.4s,  v9.16b, v0.4b[0]
126        LDR     d11,  [x5], 8
127        SDOT    v21.4s,  v9.16b, v1.4b[0]
128        INS     v10.d[1], x14
129        SDOT    v22.4s,  v9.16b, v2.4b[0]
130        LDR     x14,  [x5], 8
131        SDOT    v23.4s,  v9.16b, v3.4b[0]
132        LDR     d5, [x15], 8            // next 8 bytes of A1
133
134        # BLOCK 2
135        SDOT    v24.4s, v10.16b, v0.4b[0]
136        LDR     d8,  [x5], 8
137        SDOT    v25.4s, v10.16b, v1.4b[0]
138        INS     v11.d[1], x14
139        SDOT    v26.4s, v10.16b, v2.4b[0]
140        LDR     x14,  [x5], 8
141        SDOT    v27.4s, v10.16b, v3.4b[0]
142        LDR     d6, [x13], 8            // next 8 bytes of A2
143
144        # BLOCK 3
145        SDOT    v28.4s, v11.16b, v0.4b[0]
146        LDR     d9,  [x5], 8
147        SDOT    v29.4s, v11.16b, v1.4b[0]
148        INS     v8.d[1], x14
149        SDOT    v30.4s, v11.16b, v2.4b[0]
150        LDR     x14,  [x5], 8
151        SDOT    v31.4s, v11.16b, v3.4b[0]
152        LDR     d7,  [x4], 8            // next 8 bytes of A3
153
154        # BLOCK 0
155        SDOT    v16.4s,  v8.16b, v0.4b[1]    // A bytes 4-7 of this iteration
156        LDR     d10,  [x5], 8
157        SDOT    v17.4s,  v8.16b, v1.4b[1]
158        INS     v9.d[1], x14
159        SDOT    v18.4s,  v8.16b, v2.4b[1]
160        LDR     x14,  [x5], 8
161        SDOT    v19.4s,  v8.16b, v3.4b[1]
162
163        # BLOCK 1
164        SDOT    v20.4s,  v9.16b, v0.4b[1]
165        LDR     d11,  [x5], 8
166        SDOT    v21.4s,  v9.16b, v1.4b[1]
167        INS     v10.d[1], x14
168        SDOT    v22.4s,  v9.16b, v2.4b[1]
169        LDR     x14,  [x5], 8
170        SDOT    v23.4s,  v9.16b, v3.4b[1]
171
172        # BLOCK 2
173        SDOT    v24.4s, v10.16b, v0.4b[1]
174        LDR     d8,  [x5], 8
175        SDOT    v25.4s, v10.16b, v1.4b[1]
176        INS     v11.d[1], x14
177        SDOT    v26.4s, v10.16b, v2.4b[1]
178        LDR     x14,  [x5], 8
179        SDOT    v27.4s, v10.16b, v3.4b[1]
180
181        # BLOCK 3
182        SDOT    v28.4s, v11.16b, v0.4b[1]
183        LDR     d9,  [x5], 8
184        SDOT    v29.4s, v11.16b, v1.4b[1]
185        INS     v8.d[1], x14
186        SDOT    v30.4s, v11.16b, v2.4b[1]
187        LDR     x14,  [x5], 8
188        SDOT    v31.4s, v11.16b, v3.4b[1]
189
190        # BLOCK 0
191        SDOT    v16.4s,  v8.16b, v4.4b[0]    // A bytes 8-11 of this iteration
192        LDR     d10,  [x5], 8
193        SDOT    v17.4s,  v8.16b, v5.4b[0]
194        INS     v9.d[1], x14
195        SDOT    v18.4s,  v8.16b, v6.4b[0]
196        LDR     x14,  [x5], 8
197        SDOT    v19.4s,  v8.16b, v7.4b[0]
198        LDR     d0,  [x3], 8            // preload A0 for the next iteration or the epilogue
199
200        # BLOCK 1
201        SDOT    v20.4s,  v9.16b, v4.4b[0]
202        LDR     d11,  [x5], 8
203        SDOT    v21.4s,  v9.16b, v5.4b[0]
204        INS     v10.d[1], x14
205        SDOT    v22.4s,  v9.16b, v6.4b[0]
206        LDR     x14,  [x5], 8
207        SDOT    v23.4s,  v9.16b, v7.4b[0]
208        LDR     d1, [x15], 8            // preload A1
209
210        # BLOCK 2
211        SDOT    v24.4s, v10.16b, v4.4b[0]
212        LDR     d8,  [x5], 8
213        SDOT    v25.4s, v10.16b, v5.4b[0]
214        INS     v11.d[1], x14
215        SDOT    v26.4s, v10.16b, v6.4b[0]
216        LDR     x14,  [x5], 8
217        SDOT    v27.4s, v10.16b, v7.4b[0]
218        LDR     d2, [x13], 8            // preload A2
219
220        # BLOCK 3
221        SDOT    v28.4s, v11.16b, v4.4b[0]
222        LDR     d9,  [x5], 8
223        SDOT    v29.4s, v11.16b, v5.4b[0]
224        INS     v8.d[1], x14
225        SDOT    v30.4s, v11.16b, v6.4b[0]
226        LDR     x14,  [x5], 8
227        SDOT    v31.4s, v11.16b, v7.4b[0]
228        LDR     d3,  [x4], 8            // preload A3
229
230        # BLOCK 0
231        SDOT    v16.4s,  v8.16b, v4.4b[1]    // A bytes 12-15 of this iteration
232        LDR     d10,  [x5], 8
233        SDOT    v17.4s,  v8.16b, v5.4b[1]
234        INS     v9.d[1], x14
235        SDOT    v18.4s,  v8.16b, v6.4b[1]
236        LDR     x14,  [x5], 8
237        SDOT    v19.4s,  v8.16b, v7.4b[1]
238
239        # BLOCK 1
240        SDOT    v20.4s,  v9.16b, v4.4b[1]
241        LDR     d11,  [x5], 8
242        SDOT    v21.4s,  v9.16b, v5.4b[1]
243        INS     v10.d[1], x14
244        SDOT    v22.4s,  v9.16b, v6.4b[1]
245        LDR     x14,  [x5], 8
246        SDOT    v23.4s,  v9.16b, v7.4b[1]
247
248        # BLOCK 2
249        SDOT    v24.4s, v10.16b, v4.4b[1]
250        LDR     d8,  [x5], 8            // First B values for block 0 and 1
251        SDOT    v25.4s, v10.16b, v5.4b[1]
252        INS     v11.d[1], x14
253        SDOT    v26.4s, v10.16b, v6.4b[1]
254        LDR     x14,  [x5], 8
255        SDOT    v27.4s, v10.16b, v7.4b[1]
256        SUBS    x0, x0, 16              // k -= 16; sets the flags tested by B.HS below
257
258        # BLOCK 3
259        SDOT    v28.4s, v11.16b, v4.4b[1]
260        LDR     d9,  [x5], 8
261        SDOT    v29.4s, v11.16b, v5.4b[1]
262        INS     v8.d[1], x14
263        SDOT    v30.4s, v11.16b, v6.4b[1]
264        LDR     x14,  [x5], 8
265        SDOT    v31.4s, v11.16b, v7.4b[1]
266        B.HS    1b                      // loop again unless the subtraction borrowed
267
268        # Epilogue.  Same as main loop but no preloads in final group
2692:                                   // processes the last full 16 bytes of every A row
270        # BLOCK 0
271        SDOT    v16.4s,  v8.16b, v0.4b[0]
272        LDR     d10,  [x5], 8
273        SDOT    v17.4s,  v8.16b, v1.4b[0]
274        INS     v9.d[1], x14
275        SDOT    v18.4s,  v8.16b, v2.4b[0]
276        LDR     x14,  [x5], 8
277        SDOT    v19.4s,  v8.16b, v3.4b[0]
278        LDR     d4,  [x3], 8            // second 8 bytes of A0 for this 16-byte step
279
280        # BLOCK 1
281        SDOT    v20.4s,  v9.16b, v0.4b[0]
282        LDR     d11,  [x5], 8
283        SDOT    v21.4s,  v9.16b, v1.4b[0]
284        INS     v10.d[1], x14
285        SDOT    v22.4s,  v9.16b, v2.4b[0]
286        LDR     x14,  [x5], 8
287        SDOT    v23.4s,  v9.16b, v3.4b[0]
288        LDR     d5, [x15], 8
289
290        # BLOCK 2
291        SDOT    v24.4s, v10.16b, v0.4b[0]
292        LDR     d8,  [x5], 8
293        SDOT    v25.4s, v10.16b, v1.4b[0]
294        INS     v11.d[1], x14
295        SDOT    v26.4s, v10.16b, v2.4b[0]
296        LDR     x14,  [x5], 8
297        SDOT    v27.4s, v10.16b, v3.4b[0]
298        LDR     d6, [x13], 8
299
300        # BLOCK 3
301        SDOT    v28.4s, v11.16b, v0.4b[0]
302        LDR     d9,  [x5], 8
303        SDOT    v29.4s, v11.16b, v1.4b[0]
304        INS     v8.d[1], x14
305        SDOT    v30.4s, v11.16b, v2.4b[0]
306        LDR     x14,  [x5], 8
307        SDOT    v31.4s, v11.16b, v3.4b[0]
308        LDR     d7,  [x4], 8
309
310        # BLOCK 0
311        SDOT    v16.4s,  v8.16b, v0.4b[1]
312        LDR     d10,  [x5], 8
313        SDOT    v17.4s,  v8.16b, v1.4b[1]
314        INS     v9.d[1], x14
315        SDOT    v18.4s,  v8.16b, v2.4b[1]
316        LDR     x14,  [x5], 8
317        SDOT    v19.4s,  v8.16b, v3.4b[1]
318
319        # BLOCK 1
320        SDOT    v20.4s,  v9.16b, v0.4b[1]
321        LDR     d11,  [x5], 8
322        SDOT    v21.4s,  v9.16b, v1.4b[1]
323        INS     v10.d[1], x14
324        SDOT    v22.4s,  v9.16b, v2.4b[1]
325        LDR     x14,  [x5], 8
326        SDOT    v23.4s,  v9.16b, v3.4b[1]
327
328        # BLOCK 2
329        SDOT    v24.4s, v10.16b, v0.4b[1]
330        LDR     d8,  [x5], 8
331        SDOT    v25.4s, v10.16b, v1.4b[1]
332        INS     v11.d[1], x14
333        SDOT    v26.4s, v10.16b, v2.4b[1]
334        LDR     x14,  [x5], 8
335        SDOT    v27.4s, v10.16b, v3.4b[1]
336
337        # BLOCK 3
338        SDOT    v28.4s, v11.16b, v0.4b[1]
339        LDR     d9,  [x5], 8
340        SDOT    v29.4s, v11.16b, v1.4b[1]
341        INS     v8.d[1], x14
342        SDOT    v30.4s, v11.16b, v2.4b[1]
343        LDR     x14,  [x5], 8
344        SDOT    v31.4s, v11.16b, v3.4b[1]
345
346        # BLOCK 0
347        SDOT    v16.4s,  v8.16b, v4.4b[0]    // unlike the main loop, no A preloads from here on
348        LDR     d10,  [x5], 8
349        SDOT    v17.4s,  v8.16b, v5.4b[0]
350        INS     v9.d[1], x14
351        SDOT    v18.4s,  v8.16b, v6.4b[0]
352        LDR     x14,  [x5], 8
353        SDOT    v19.4s,  v8.16b, v7.4b[0]
354
355        # BLOCK 1
356        SDOT    v20.4s,  v9.16b, v4.4b[0]
357        LDR     d11,  [x5], 8
358        SDOT    v21.4s,  v9.16b, v5.4b[0]
359        INS     v10.d[1], x14
360        SDOT    v22.4s,  v9.16b, v6.4b[0]
361        LDR     x14,  [x5], 8
362        SDOT    v23.4s,  v9.16b, v7.4b[0]
363
364        # BLOCK 2
365        SDOT    v24.4s, v10.16b, v4.4b[0]
366        LDR     d8,  [x5], 8
367        SDOT    v25.4s, v10.16b, v5.4b[0]
368        INS     v11.d[1], x14
369        SDOT    v26.4s, v10.16b, v6.4b[0]
370        LDR     x14,  [x5], 8
371        SDOT    v27.4s, v10.16b, v7.4b[0]
372
373        # BLOCK 3
374        SDOT    v28.4s, v11.16b, v4.4b[0]
375        LDR     d9,  [x5], 8
376        SDOT    v29.4s, v11.16b, v5.4b[0]
377        INS     v8.d[1], x14
378        SDOT    v30.4s, v11.16b, v6.4b[0]
379        LDR     x14,  [x5], 8
380        SDOT    v31.4s, v11.16b, v7.4b[0]
381
382        # BLOCK 0
383        SDOT    v16.4s,  v8.16b, v4.4b[1]
384        LDR     d10,  [x5], 8
385        SDOT    v17.4s,  v8.16b, v5.4b[1]
386        INS     v9.d[1], x14
387        SDOT    v18.4s,  v8.16b, v6.4b[1]
388        LDR     x14,  [x5], 8
389        SDOT    v19.4s,  v8.16b, v7.4b[1]
390
391        # BLOCK 1
392        SDOT    v20.4s,  v9.16b, v4.4b[1]
393        LDR     d11,  [x5], 8           // last B load of the epilogue (low half of v11)
394        SDOT    v21.4s,  v9.16b, v5.4b[1]
395        INS     v10.d[1], x14
396        SDOT    v22.4s,  v9.16b, v6.4b[1]
397        LDR     x14,  [x5], 8           // high half of v11
398        SDOT    v23.4s,  v9.16b, v7.4b[1]
399
400        # BLOCK 2
401        SDOT    v24.4s, v10.16b, v4.4b[1]
402        SDOT    v25.4s, v10.16b, v5.4b[1]
403        INS     v11.d[1], x14           // final INS; nothing further is loaded from x5 here
404        SDOT    v26.4s, v10.16b, v6.4b[1]
405        SDOT    v27.4s, v10.16b, v7.4b[1]
406        AND     x0, x2, 15              // kc remainder 0 to 12
407
408        # BLOCK 3
409        SDOT    v28.4s, v11.16b, v4.4b[1]
410        SDOT    v29.4s, v11.16b, v5.4b[1]
411        SDOT    v30.4s, v11.16b, v6.4b[1]
412        SDOT    v31.4s, v11.16b, v7.4b[1]
413
414        # Is there a remainder?- 4 to 12 bytes of A
415        CBNZ    x0, 5f                  // non-zero tail: handle it; otherwise fall through to label 3
416
417        .p2align 3
4183:                                   // requantize the 4x16 int32 accumulators to int8, clamp and store
419        $if REQUANTIZATION == "RNDNU":
420          # Apply params - preshift, scale, postshift, bias and clamp
421          LD1R    {v4.4s}, [x11], 4       // v4 = first 32-bit param, broadcast (preshift per the list above)
422          SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
423          SQSHL   v17.4s, v17.4s, v4.4s
424          SQSHL   v18.4s, v18.4s, v4.4s
425          SQSHL   v19.4s, v19.4s, v4.4s
426          SQSHL   v20.4s, v20.4s, v4.4s
427          SQSHL   v21.4s, v21.4s, v4.4s
428          SQSHL   v22.4s, v22.4s, v4.4s
429          SQSHL   v23.4s, v23.4s, v4.4s
430          LD1R    {v5.4s}, [x11], 4       // v5 = second 32-bit param (scale multiplier)
431          SQSHL   v24.4s, v24.4s, v4.4s
432          SQSHL   v25.4s, v25.4s, v4.4s
433          SQSHL   v26.4s, v26.4s, v4.4s
434          SQSHL   v27.4s, v27.4s, v4.4s
435          SQSHL   v28.4s, v28.4s, v4.4s
436          SQSHL   v29.4s, v29.4s, v4.4s
437          SQSHL   v30.4s, v30.4s, v4.4s
438          SQSHL   v31.4s, v31.4s, v4.4s
439          LD1R    {v6.4s}, [x11], 4       // v6 = third 32-bit param (postshift)
440          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
441          SQDMULH v17.4s, v17.4s, v5.4s
442          SQDMULH v18.4s, v18.4s, v5.4s
443          SQDMULH v19.4s, v19.4s, v5.4s
444          SQDMULH v20.4s, v20.4s, v5.4s
445          SQDMULH v21.4s, v21.4s, v5.4s
446          SQDMULH v22.4s, v22.4s, v5.4s
447          SQDMULH v23.4s, v23.4s, v5.4s
448          SQDMULH v24.4s, v24.4s, v5.4s
449          SQDMULH v25.4s, v25.4s, v5.4s
450          SQDMULH v26.4s, v26.4s, v5.4s
451          SQDMULH v27.4s, v27.4s, v5.4s
452          SQDMULH v28.4s, v28.4s, v5.4s
453          SQDMULH v29.4s, v29.4s, v5.4s
454          SQDMULH v30.4s, v30.4s, v5.4s
455          SQDMULH v31.4s, v31.4s, v5.4s
456          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
457          SRSHL   v17.4s, v17.4s, v6.4s
458          SRSHL   v18.4s, v18.4s, v6.4s
459          SRSHL   v19.4s, v19.4s, v6.4s
460          SRSHL   v20.4s, v20.4s, v6.4s
461          SRSHL   v21.4s, v21.4s, v6.4s
462          SRSHL   v22.4s, v22.4s, v6.4s
463          SRSHL   v23.4s, v23.4s, v6.4s
464          SRSHL   v24.4s, v24.4s, v6.4s
465          SRSHL   v25.4s, v25.4s, v6.4s
466          SRSHL   v26.4s, v26.4s, v6.4s
467          SRSHL   v27.4s, v27.4s, v6.4s
468          SRSHL   v28.4s, v28.4s, v6.4s
469          SRSHL   v29.4s, v29.4s, v6.4s
470          SRSHL   v30.4s, v30.4s, v6.4s
471          SRSHL   v31.4s, v31.4s, v6.4s
472        $elif REQUANTIZATION == "FP32":
473          SCVTF   v16.4s, v16.4s          // int32 accumulators -> float32
474          SCVTF   v17.4s, v17.4s
475          $if not CHANNELWISE:
476            # Apply params - scale, bias and clamp
477            LD1R    {v4.4s}, [x11], 4     // one scale from params, broadcast to all lanes
478            SCVTF   v18.4s, v18.4s
479            SCVTF   v19.4s, v19.4s
480          $else:
481            # Load per channel scale values from weights
482            LDR     q4, [x5], 16          // scales applied to v16-v19
483            SCVTF   v18.4s, v18.4s
484            SCVTF   v19.4s, v19.4s
485            LDR     q5, [x5], 16          // scales applied to v20-v23
486          SCVTF   v20.4s, v20.4s
487          SCVTF   v21.4s, v21.4s
488          SCVTF   v22.4s, v22.4s
489          SCVTF   v23.4s, v23.4s
490          SCVTF   v24.4s, v24.4s
491          SCVTF   v25.4s, v25.4s
492          SCVTF   v26.4s, v26.4s
493          SCVTF   v27.4s, v27.4s
494          SCVTF   v28.4s, v28.4s
495          SCVTF   v29.4s, v29.4s
496          SCVTF   v30.4s, v30.4s
497          SCVTF   v31.4s, v31.4s
498
499          $if CHANNELWISE:
500            LDR     q6, [x5], 16          // scales applied to v24-v27
501            FMUL    v16.4s, v16.4s, v4.4s
502            FMUL    v17.4s, v17.4s, v4.4s
503            FMUL    v18.4s, v18.4s, v4.4s
504            FMUL    v19.4s, v19.4s, v4.4s
505            FMUL    v20.4s, v20.4s, v5.4s
506            LDR     q4, [x5], 16          // v4 reused: scales applied to v28-v31
507            FMUL    v21.4s, v21.4s, v5.4s
508            FMUL    v22.4s, v22.4s, v5.4s
509            FMUL    v23.4s, v23.4s, v5.4s
510            FMUL    v24.4s, v24.4s, v6.4s
511            FMUL    v25.4s, v25.4s, v6.4s
512            FMUL    v26.4s, v26.4s, v6.4s
513            FMUL    v27.4s, v27.4s, v6.4s
514            FMUL    v28.4s, v28.4s, v4.4s
515            FMUL    v29.4s, v29.4s, v4.4s
516            FMUL    v30.4s, v30.4s, v4.4s
517            FMUL    v31.4s, v31.4s, v4.4s
518          $else:
519            FMUL    v16.4s, v16.4s, v4.4s
520            FMUL    v17.4s, v17.4s, v4.4s
521            FMUL    v18.4s, v18.4s, v4.4s
522            FMUL    v19.4s, v19.4s, v4.4s
523            FMUL    v20.4s, v20.4s, v4.4s
524            FMUL    v21.4s, v21.4s, v4.4s
525            FMUL    v22.4s, v22.4s, v4.4s
526            FMUL    v23.4s, v23.4s, v4.4s
527            FMUL    v24.4s, v24.4s, v4.4s
528            FMUL    v25.4s, v25.4s, v4.4s
529            FMUL    v26.4s, v26.4s, v4.4s
530            FMUL    v27.4s, v27.4s, v4.4s
531            FMUL    v28.4s, v28.4s, v4.4s
532            FMUL    v29.4s, v29.4s, v4.4s
533            FMUL    v30.4s, v30.4s, v4.4s
534            FMUL    v31.4s, v31.4s, v4.4s
535
536          FCVTNS  v16.4s, v16.4s          // float32 -> int32, round to nearest (ties to even)
537          FCVTNS  v17.4s, v17.4s
538          FCVTNS  v18.4s, v18.4s
539          FCVTNS  v19.4s, v19.4s
540          FCVTNS  v20.4s, v20.4s
541          FCVTNS  v21.4s, v21.4s
542          FCVTNS  v22.4s, v22.4s
543          FCVTNS  v23.4s, v23.4s
544          FCVTNS  v24.4s, v24.4s
545          FCVTNS  v25.4s, v25.4s
546          FCVTNS  v26.4s, v26.4s
547          FCVTNS  v27.4s, v27.4s
548          FCVTNS  v28.4s, v28.4s
549          FCVTNS  v29.4s, v29.4s
550          FCVTNS  v30.4s, v30.4s
551          FCVTNS  v31.4s, v31.4s
552
553        SQXTN   v16.4h, v16.4s          // saturating narrow int32 -> int16 (low halves)
554        SQXTN   v17.4h, v17.4s
555        SQXTN   v18.4h, v18.4s
556        SQXTN   v19.4h, v19.4s
557        SQXTN   v24.4h, v24.4s
558        SQXTN   v25.4h, v25.4s
559        SQXTN   v26.4h, v26.4s
560        SQXTN   v27.4h, v27.4s
561        LD1R    {v6.8h}, [x11], 2       // add bias
562
563        SQXTN2  v16.8h, v20.4s          // high halves: v16-v19 now hold 8 int16 columns per row
564        SQXTN2  v17.8h, v21.4s
565        SQXTN2  v18.8h, v22.4s
566        SQXTN2  v19.8h, v23.4s
567        SQXTN2  v24.8h, v28.4s          // v24-v27 hold the other 8 int16 columns per row
568        SQXTN2  v25.8h, v29.4s
569        SQXTN2  v26.8h, v30.4s
570        SQXTN2  v27.8h, v31.4s
571
572        SQADD   v16.8h, v16.8h, v6.8h   // saturating add of the 16-bit value loaded into v6
573        SQADD   v17.8h, v17.8h, v6.8h
574        SQADD   v18.8h, v18.8h, v6.8h
575        SQADD   v19.8h, v19.8h, v6.8h
576        SQADD   v24.8h, v24.8h, v6.8h
577        SQADD   v25.8h, v25.8h, v6.8h
578        SQADD   v26.8h, v26.8h, v6.8h
579        SQADD   v27.8h, v27.8h, v6.8h
580        LD1R    {v4.16b}, [x11], 1      // clamp min value
581
582        SQXTN   v0.8b, v16.8h           // saturating narrow int16 -> int8; v0-v3 = rows 0-3
583        SQXTN   v1.8b, v17.8h
584        SQXTN   v2.8b, v18.8h
585        SQXTN   v3.8b, v19.8h
586        LD1R    {v5.16b}, [x11]         // clamp max value
587        SQXTN2  v0.16b, v24.8h
588        SQXTN2  v1.16b, v25.8h
589        SQXTN2  v2.16b, v26.8h
590        SQXTN2  v3.16b, v27.8h
591        SUB     x11, x11, ${REWIND_DECREMENT}            // rewind params pointer
592
593        SMAX    v0.16b, v0.16b, v4.16b  // apply lower clamp
594        SMAX    v1.16b, v1.16b, v4.16b
595        SMAX    v2.16b, v2.16b, v4.16b
596        SMAX    v3.16b, v3.16b, v4.16b
597        SUBS    x1, x1, 16              // nc -= 16; flags feed B.LO and B.NE below
598        SMIN    v0.16b, v0.16b, v5.16b  // apply upper clamp
599        SMIN    v1.16b, v1.16b, v5.16b
600        SMIN    v2.16b, v2.16b, v5.16b
601        SMIN    v3.16b, v3.16b, v5.16b
602        B.LO    6f                      // fewer than 16 columns were left: partial store
603
604        # Store full 4 x 16
605        ST1     {v0.16b}, [x6], x12     // store row 0, then c0 += cn_stride
606        SUB     x3,  x3, x2             // a0 -= kc
607        ST1     {v1.16b}, [x8], x12
608        SUB     x15, x15, x2            // a1 -= kc
609        ST1     {v2.16b}, [x9], x12
610        SUB     x13, x13, x2            // a2 -= kc
611        ST1     {v3.16b}, [x7], x12
612        SUB     x4,  x4, x2             // a3 -= kc
613        B.NE    0b                      // flags still from SUBS x1: columns remain, next tile
614
615        # Restore d8-d11 from stack
616        LDP     d10, d11, [sp, 16]
617        LDP     d8,  d9, [sp], 32       // pops the 32-byte frame pushed at entry
618        RET
619
620        # Remainder- 4 to 12 bytes of A
621        # Although C4, it's safe to read 16 bytes.
622        .p2align 3
6234:                                   // entered from label 0 when kc < 16
624        AND     x0, x2, 15              // kc remainder 4 to 12
6255:                                   // entered from the epilogue with x0 = kc & 15 (non-zero)
626        LDP     q8,  q9,  [x5], 32      // four whole B vectors for the first 4 bytes of A
627        LDP     q10, q11,  [x5], 32
628        LD1     {v0.16b},  [x3], x0     // loads 16 bytes but advances a0 by the remainder only
629        LD1     {v1.16b}, [x15], x0
630        LD1     {v2.16b}, [x13], x0
631        LD1     {v3.16b},  [x4], x0
632        SDOT    v16.4s,  v8.16b, v0.4b[0]
633        SDOT    v17.4s,  v8.16b, v1.4b[0]
634        SDOT    v18.4s,  v8.16b, v2.4b[0]
635        SDOT    v19.4s,  v8.16b, v3.4b[0]
636        SDOT    v20.4s,  v9.16b, v0.4b[0]
637        SDOT    v21.4s,  v9.16b, v1.4b[0]
638        SDOT    v22.4s,  v9.16b, v2.4b[0]
639        SDOT    v23.4s,  v9.16b, v3.4b[0]
640        SDOT    v24.4s, v10.16b, v0.4b[0]
641        SDOT    v25.4s, v10.16b, v1.4b[0]
642        SDOT    v26.4s, v10.16b, v2.4b[0]
643        SDOT    v27.4s, v10.16b, v3.4b[0]
644        SDOT    v28.4s, v11.16b, v0.4b[0]
645        SDOT    v29.4s, v11.16b, v1.4b[0]
646        SDOT    v30.4s, v11.16b, v2.4b[0]
647        SDOT    v31.4s, v11.16b, v3.4b[0]
648        CMP     x0, 4
649        B.LS    3b                      // x0 <= 4: remainder consumed, go requantize
650        LDP     q8,  q9,  [x5], 32      // B vectors for A bytes 4-7
651        LDP     q10, q11,  [x5], 32
652        SDOT    v16.4s,  v8.16b, v0.4b[1]
653        SDOT    v17.4s,  v8.16b, v1.4b[1]
654        SDOT    v18.4s,  v8.16b, v2.4b[1]
655        SDOT    v19.4s,  v8.16b, v3.4b[1]
656        SDOT    v20.4s,  v9.16b, v0.4b[1]
657        SDOT    v21.4s,  v9.16b, v1.4b[1]
658        SDOT    v22.4s,  v9.16b, v2.4b[1]
659        SDOT    v23.4s,  v9.16b, v3.4b[1]
660        SDOT    v24.4s, v10.16b, v0.4b[1]
661        SDOT    v25.4s, v10.16b, v1.4b[1]
662        SDOT    v26.4s, v10.16b, v2.4b[1]
663        SDOT    v27.4s, v10.16b, v3.4b[1]
664        SDOT    v28.4s, v11.16b, v0.4b[1]
665        SDOT    v29.4s, v11.16b, v1.4b[1]
666        SDOT    v30.4s, v11.16b, v2.4b[1]
667        SDOT    v31.4s, v11.16b, v3.4b[1]
668        CMP     x0, 8
669        B.LS    3b                      // x0 <= 8: remainder consumed, go requantize
670        LDP     q8,  q9,  [x5], 32      // B vectors for A bytes 8-11
671        LDP     q10, q11,  [x5], 32
672        SDOT    v16.4s,  v8.16b, v0.4b[2]
673        SDOT    v17.4s,  v8.16b, v1.4b[2]
674        SDOT    v18.4s,  v8.16b, v2.4b[2]
675        SDOT    v19.4s,  v8.16b, v3.4b[2]
676        SDOT    v20.4s,  v9.16b, v0.4b[2]
677        SDOT    v21.4s,  v9.16b, v1.4b[2]
678        SDOT    v22.4s,  v9.16b, v2.4b[2]
679        SDOT    v23.4s,  v9.16b, v3.4b[2]
680        SDOT    v24.4s, v10.16b, v0.4b[2]
681        SDOT    v25.4s, v10.16b, v1.4b[2]
682        SDOT    v26.4s, v10.16b, v2.4b[2]
683        SDOT    v27.4s, v10.16b, v3.4b[2]
684        SDOT    v28.4s, v11.16b, v0.4b[2]
685        SDOT    v29.4s, v11.16b, v1.4b[2]
686        SDOT    v30.4s, v11.16b, v2.4b[2]
687        SDOT    v31.4s, v11.16b, v3.4b[2]
688        B       3b                      // up to 12 bytes handled; go requantize
689
690        # Store odd width
691        .p2align 3
6926:                                   // reached via B.LO 6f; stores 8/4/2/1 columns according to bits 3..0 of x1
693        TBZ     x1, 3, 7f               // bit 3 clear: no 8-byte piece
694        STR     d0, [x6], 8
695        STR     d1, [x8], 8
696        DUP     d0, v0.d[1]             // move the upper 8 bytes down for the following pieces
697        DUP     d1, v1.d[1]
698        STR     d2, [x9], 8
699        STR     d3, [x7], 8
700        DUP     d2, v2.d[1]
701        DUP     d3, v3.d[1]
7027:
703        TBZ     x1, 2, 8f               // bit 2 clear: no 4-byte piece
704        STR     s0, [x6], 4
705        STR     s1, [x8], 4
706        DUP     s0, v0.s[1]             // move the next 4 bytes down
707        DUP     s1, v1.s[1]
708        STR     s2, [x9], 4
709        STR     s3, [x7], 4
710        DUP     s2, v2.s[1]
711        DUP     s3, v3.s[1]
7128:
713        TBZ     x1, 1, 9f               // bit 1 clear: no 2-byte piece
714        STR     h0, [x6], 2
715        STR     h1, [x8], 2
716        DUP     h0, v0.h[1]             // move the next 2 bytes down
717        DUP     h1, v1.h[1]
718        STR     h2, [x9], 2
719        STR     h3, [x7], 2
720        DUP     h2, v2.h[1]
721        DUP     h3, v3.h[1]
7229:
723        TBZ     x1, 0, 10f              // bit 0 clear: no final byte
724        STR     b0, [x6]
725        STR     b1, [x8]
726        STR     b2, [x9]
727        STR     b3, [x7]
72810:
729        # Restore d8-d11 from stack
730        LDP     d10, d11, [sp, 16]
731        LDP     d8,  d9, [sp], 32       // pops the 32-byte frame pushed at entry
732        RET
733
734END_FUNCTION xnn_${DATATYPE}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
735
736#ifdef __ELF__
// On ELF targets, emit an empty .note.GNU-stack section (no flags); by GNU
// toolchain convention this declares that the object does not need an
// executable stack.
737.section ".note.GNU-stack","",%progbits
738#endif
739