• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2020 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10
11#include <xnnpack/assembly.h>
12
13# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55(
14#     size_t mr,                 x0
15#     size_t nc,                 x1
16#     size_t kc,                 x2 / x0
17#     size_t ks,                 x3 / x9
18#     const int8_t**restrict a,  x4
19#     const int8_t* restrict w,  x5
20#     int8_t* restrict c,        x6
21#     size_t cm_stride,          x7
22#     size_t cn_stride,                  [sp] -> (x0)
23#     size_t a_offset,                   [sp + 8] -> x8
24#     const int8_t* zero,                [sp + 16] -> x12
25#     const union xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
26
27# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
28
29# Register usage
30# A0  x13  v0  v4
31# A1  x14  v1  v5
32# A2  x15  v2  v6
33# A3  x10  v3  v7
34# B    x5  v8  v9 v10 v11
35# C0   x6 v16 v20 v24 v28
36# C1  x16 v17 v21 v25 v29
37# C2  x17 v18 v22 v26 v30
38# C3   x7 v19 v23 v27 v31
39# unused v12 v13 v14 v15
40
41# x11 temp for Cortex-A55 loads
42
43BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55
44        # Each pass of the nc loop (label 0) accumulates a 4 row x 16 column tile with SDOT, requantizes it with params and stores it as int8.
45        # Clamp C pointers: a row at or beyond mr reuses the previous row's pointer
46        CMP     x0, 2                   // if mr < 2
47        LDR     x8, [sp, 8]             // Load a_offset
48        ADD     x16, x6, x7             // c1 = c0 + cm_stride
49        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
50        CSEL    x16, x6,  x16, LO       //   c1 = c0
51        ADD     x2, x2, 3               // kc = (kc + 3) & ~3
52        STP     d8,  d9, [sp, -32]!     // Save d8-d11 on stack (sp -= 32, so stack args are now at [sp + 32] and up)
53
54        ADD     x17, x16, x7            // c2 = c1 + cm_stride
55        STP     d10, d11, [sp, 16]      // second half of the d8-d11 save area
56                                        // if mr <= 2 (flags still from CMP x0, 2)
57        CSEL    x17, x16, x17, LS       //   c2 = c1
58        BIC     x2, x2, 3               // completes kc = (kc + 3) & ~3
59
60        CMP     x0, 4                   // if mr < 4
61        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
62        CSEL    x7,  x17, x7, LO        //   c3 = c2
63
64        .p2align 3
650:
66        # nc loop entry: load initial bias from w into accumulators (64 bytes, replicated to all 4 rows)
67        LDP     q16, q20, [x5], 32
68        MOV     v17.16b, v16.16b
69        MOV     v18.16b, v16.16b
70        LDP     q24, q28, [x5], 32
71        MOV     v19.16b, v16.16b
72        MOV     v21.16b, v20.16b
73        MOV     v22.16b, v20.16b
74        MOV     v23.16b, v20.16b
75        MOV     v25.16b, v24.16b
76        MOV     v26.16b, v24.16b
77        MOV     v27.16b, v24.16b
78        MOV     v29.16b, v28.16b
79        MOV     v30.16b, v28.16b
80        MOV     v31.16b, v28.16b
81        MOV     x9, x3                  // p = ks
82
83        .p2align 3
841:
85        # Load next 4 A pointers
86        LDP     x13, x14, [x4], 16
87        LDP     x15, x10, [x4], 16
88
89        CMP     x13, x12                // if a0 == zero
90        ADD     x13, x13, x8            // a0 += a_offset
91        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
92        CMP     x14, x12                // if a1 == zero
93        ADD     x14, x14, x8            // a1 += a_offset
94        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
95        CMP     x15, x12                // if a2 == zero
96        ADD     x15, x15, x8            // a2 += a_offset
97        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
98        CMP     x10, x12                // if a3 == zero
99        ADD     x10, x10, x8            // a3 += a_offset
100        CSEL    x10, x12, x10, EQ       //   a3 = zero, else a3 + a_offset
101
102        # Is there at least 16 bytes for prologue/epilogue?
103        SUBS    x0, x2, 16              // k = kc - 16
104        B.LO    5f                      // kc < 16: only the remainder code runs
105
106        # prologue - read A and B values for block 0 and 1
107        LDR     d0, [x13], 8
108        LDR     q8,  [x5], 16
109        LDR     d1, [x14], 8
110        LDR     d2, [x15], 8
111        LDR     d3, [x10], 8
112        SUBS    x0, x0, 16              // k -= 16: is there 16 for main loop?
113        LDR     d9,  [x5], 8            // low 64 bits of v9
114        LDR     x11,  [x5], 8           // high 64 bits of v9, inserted by INS in BLOCK 0 (clobbers params pointer)
115        # Is there at least 16 bytes for main loop?
116        B.LO    3f                      // no: go straight to the epilogue
117
118        # Main loop - 16 bytes of A in 4 groups.
119        # 4 rows of 4 vectors wide = 16 sdot instructions for 4 channels
120        # 4 LD64 for A
121        # 4 LD128 for W. = 2 LD64 + INS.
122        # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
123
124        .p2align 3
1252:
126        # BLOCK 0
127        SDOT    v16.4s,  v8.16b, v0.4b[0]
128        LDR     d10,  [x5], 8           // low 64 bits of v10
129        SDOT    v17.4s,  v8.16b, v1.4b[0]
130        INS     v9.d[1], x11            // complete v9 with the high 64 bits loaded earlier
131        SDOT    v18.4s,  v8.16b, v2.4b[0]
132        LDR     x11,  [x5], 8           // high 64 bits of v10, inserted in BLOCK 1
133        SDOT    v19.4s,  v8.16b, v3.4b[0]
134        LDR     d4,  [x13], 8           // next 8 bytes of A0, used by the v4 groups below
135
136        # BLOCK 1
137        SDOT    v20.4s,  v9.16b, v0.4b[0]
138        LDR     d11,  [x5], 8
139        SDOT    v21.4s,  v9.16b, v1.4b[0]
140        INS     v10.d[1], x11
141        SDOT    v22.4s,  v9.16b, v2.4b[0]
142        LDR     x11,  [x5], 8
143        SDOT    v23.4s,  v9.16b, v3.4b[0]
144        LDR     d5, [x14], 8
145
146        # BLOCK 2
147        SDOT    v24.4s, v10.16b, v0.4b[0]
148        LDR     d8,  [x5], 8
149        SDOT    v25.4s, v10.16b, v1.4b[0]
150        INS     v11.d[1], x11
151        SDOT    v26.4s, v10.16b, v2.4b[0]
152        LDR     x11,  [x5], 8
153        SDOT    v27.4s, v10.16b, v3.4b[0]
154        LDR     d6, [x15], 8
155
156        # BLOCK 3
157        SDOT    v28.4s, v11.16b, v0.4b[0]
158        LDR     d9,  [x5], 8
159        SDOT    v29.4s, v11.16b, v1.4b[0]
160        INS     v8.d[1], x11
161        SDOT    v30.4s, v11.16b, v2.4b[0]
162        LDR     x11,  [x5], 8
163        SDOT    v31.4s, v11.16b, v3.4b[0]
164        LDR     d7,  [x10], 8
165
166        # BLOCK 0
167        SDOT    v16.4s,  v8.16b, v0.4b[1]
168        LDR     d10,  [x5], 8
169        SDOT    v17.4s,  v8.16b, v1.4b[1]
170        INS     v9.d[1], x11
171        SDOT    v18.4s,  v8.16b, v2.4b[1]
172        LDR     x11,  [x5], 8
173        SDOT    v19.4s,  v8.16b, v3.4b[1]
174
175        # BLOCK 1
176        SDOT    v20.4s,  v9.16b, v0.4b[1]
177        LDR     d11,  [x5], 8
178        SDOT    v21.4s,  v9.16b, v1.4b[1]
179        INS     v10.d[1], x11
180        SDOT    v22.4s,  v9.16b, v2.4b[1]
181        LDR     x11,  [x5], 8
182        SDOT    v23.4s,  v9.16b, v3.4b[1]
183
184        # BLOCK 2
185        SDOT    v24.4s, v10.16b, v0.4b[1]
186        LDR     d8,  [x5], 8
187        SDOT    v25.4s, v10.16b, v1.4b[1]
188        INS     v11.d[1], x11
189        SDOT    v26.4s, v10.16b, v2.4b[1]
190        LDR     x11,  [x5], 8
191        SDOT    v27.4s, v10.16b, v3.4b[1]
192
193        # BLOCK 3
194        SDOT    v28.4s, v11.16b, v0.4b[1]
195        LDR     d9,  [x5], 8
196        SDOT    v29.4s, v11.16b, v1.4b[1]
197        INS     v8.d[1], x11
198        SDOT    v30.4s, v11.16b, v2.4b[1]
199        LDR     x11,  [x5], 8
200        SDOT    v31.4s, v11.16b, v3.4b[1]
201
202        # BLOCK 0
203        SDOT    v16.4s,  v8.16b, v4.4b[0]
204        LDR     d10,  [x5], 8
205        SDOT    v17.4s,  v8.16b, v5.4b[0]
206        INS     v9.d[1], x11
207        SDOT    v18.4s,  v8.16b, v6.4b[0]
208        LDR     x11,  [x5], 8
209        SDOT    v19.4s,  v8.16b, v7.4b[0]
210        LDR     d0,  [x13], 8           // preload A0 for the next loop iteration or the epilogue
211
212        # BLOCK 1
213        SDOT    v20.4s,  v9.16b, v4.4b[0]
214        LDR     d11,  [x5], 8
215        SDOT    v21.4s,  v9.16b, v5.4b[0]
216        INS     v10.d[1], x11
217        SDOT    v22.4s,  v9.16b, v6.4b[0]
218        LDR     x11,  [x5], 8
219        SDOT    v23.4s,  v9.16b, v7.4b[0]
220        LDR     d1, [x14], 8
221
222        # BLOCK 2
223        SDOT    v24.4s, v10.16b, v4.4b[0]
224        LDR     d8,  [x5], 8
225        SDOT    v25.4s, v10.16b, v5.4b[0]
226        INS     v11.d[1], x11
227        SDOT    v26.4s, v10.16b, v6.4b[0]
228        LDR     x11,  [x5], 8
229        SDOT    v27.4s, v10.16b, v7.4b[0]
230        LDR     d2, [x15], 8
231
232        # BLOCK 3
233        SDOT    v28.4s, v11.16b, v4.4b[0]
234        LDR     d9,  [x5], 8
235        SDOT    v29.4s, v11.16b, v5.4b[0]
236        INS     v8.d[1], x11
237        SDOT    v30.4s, v11.16b, v6.4b[0]
238        LDR     x11,  [x5], 8
239        SDOT    v31.4s, v11.16b, v7.4b[0]
240        LDR     d3,  [x10], 8
241
242        # BLOCK 0
243        SDOT    v16.4s,  v8.16b, v4.4b[1]
244        LDR     d10,  [x5], 8
245        SDOT    v17.4s,  v8.16b, v5.4b[1]
246        INS     v9.d[1], x11
247        SDOT    v18.4s,  v8.16b, v6.4b[1]
248        LDR     x11,  [x5], 8
249        SDOT    v19.4s,  v8.16b, v7.4b[1]
250
251        # BLOCK 1
252        SDOT    v20.4s,  v9.16b, v4.4b[1]
253        LDR     d11,  [x5], 8
254        SDOT    v21.4s,  v9.16b, v5.4b[1]
255        INS     v10.d[1], x11
256        SDOT    v22.4s,  v9.16b, v6.4b[1]
257        LDR     x11,  [x5], 8
258        SDOT    v23.4s,  v9.16b, v7.4b[1]
259
260        # BLOCK 2
261        SDOT    v24.4s, v10.16b, v4.4b[1]
262        LDR     d8,  [x5], 8            // First B values for block 0 and 1
263        SDOT    v25.4s, v10.16b, v5.4b[1]
264        INS     v11.d[1], x11
265        SDOT    v26.4s, v10.16b, v6.4b[1]
266        LDR     x11,  [x5], 8
267        SDOT    v27.4s, v10.16b, v7.4b[1]
268        SUBS    x0, x0, 16              // k -= 16; flags are consumed by B.HS below
269
270        # BLOCK 3
271        SDOT    v28.4s, v11.16b, v4.4b[1]
272        LDR     d9,  [x5], 8
273        SDOT    v29.4s, v11.16b, v5.4b[1]
274        INS     v8.d[1], x11
275        SDOT    v30.4s, v11.16b, v6.4b[1]
276        LDR     x11,  [x5], 8
277        SDOT    v31.4s, v11.16b, v7.4b[1]
278        B.HS    2b                      // repeat while the SUBS above did not borrow
279
280        # Epilogue.  Same as main loop but no preloads in final group
2813:
282        # BLOCK 0
283        SDOT    v16.4s,  v8.16b, v0.4b[0]
284        LDR     d10,  [x5], 8
285        SDOT    v17.4s,  v8.16b, v1.4b[0]
286        INS     v9.d[1], x11
287        SDOT    v18.4s,  v8.16b, v2.4b[0]
288        LDR     x11,  [x5], 8
289        SDOT    v19.4s,  v8.16b, v3.4b[0]
290        LDR     d4,  [x13], 8
291
292        # BLOCK 1
293        SDOT    v20.4s,  v9.16b, v0.4b[0]
294        LDR     d11,  [x5], 8
295        SDOT    v21.4s,  v9.16b, v1.4b[0]
296        INS     v10.d[1], x11
297        SDOT    v22.4s,  v9.16b, v2.4b[0]
298        LDR     x11,  [x5], 8
299        SDOT    v23.4s,  v9.16b, v3.4b[0]
300        LDR     d5, [x14], 8
301
302        # BLOCK 2
303        SDOT    v24.4s, v10.16b, v0.4b[0]
304        LDR     d8,  [x5], 8
305        SDOT    v25.4s, v10.16b, v1.4b[0]
306        INS     v11.d[1], x11
307        SDOT    v26.4s, v10.16b, v2.4b[0]
308        LDR     x11,  [x5], 8
309        SDOT    v27.4s, v10.16b, v3.4b[0]
310        LDR     d6, [x15], 8
311
312        # BLOCK 3
313        SDOT    v28.4s, v11.16b, v0.4b[0]
314        LDR     d9,  [x5], 8
315        SDOT    v29.4s, v11.16b, v1.4b[0]
316        INS     v8.d[1], x11
317        SDOT    v30.4s, v11.16b, v2.4b[0]
318        LDR     x11,  [x5], 8
319        SDOT    v31.4s, v11.16b, v3.4b[0]
320        LDR     d7,  [x10], 8
321
322        # BLOCK 0
323        SDOT    v16.4s,  v8.16b, v0.4b[1]
324        LDR     d10,  [x5], 8
325        SDOT    v17.4s,  v8.16b, v1.4b[1]
326        INS     v9.d[1], x11
327        SDOT    v18.4s,  v8.16b, v2.4b[1]
328        LDR     x11,  [x5], 8
329        SDOT    v19.4s,  v8.16b, v3.4b[1]
330
331        # BLOCK 1
332        SDOT    v20.4s,  v9.16b, v0.4b[1]
333        LDR     d11,  [x5], 8
334        SDOT    v21.4s,  v9.16b, v1.4b[1]
335        INS     v10.d[1], x11
336        SDOT    v22.4s,  v9.16b, v2.4b[1]
337        LDR     x11,  [x5], 8
338        SDOT    v23.4s,  v9.16b, v3.4b[1]
339
340        # BLOCK 2
341        SDOT    v24.4s, v10.16b, v0.4b[1]
342        LDR     d8,  [x5], 8
343        SDOT    v25.4s, v10.16b, v1.4b[1]
344        INS     v11.d[1], x11
345        SDOT    v26.4s, v10.16b, v2.4b[1]
346        LDR     x11,  [x5], 8
347        SDOT    v27.4s, v10.16b, v3.4b[1]
348
349        # BLOCK 3
350        SDOT    v28.4s, v11.16b, v0.4b[1]
351        LDR     d9,  [x5], 8
352        SDOT    v29.4s, v11.16b, v1.4b[1]
353        INS     v8.d[1], x11
354        SDOT    v30.4s, v11.16b, v2.4b[1]
355        LDR     x11,  [x5], 8
356        SDOT    v31.4s, v11.16b, v3.4b[1]
357
358        # BLOCK 0
359        SDOT    v16.4s,  v8.16b, v4.4b[0]
360        LDR     d10,  [x5], 8
361        SDOT    v17.4s,  v8.16b, v5.4b[0]
362        INS     v9.d[1], x11
363        SDOT    v18.4s,  v8.16b, v6.4b[0]
364        LDR     x11,  [x5], 8
365        SDOT    v19.4s,  v8.16b, v7.4b[0]
366
367        # BLOCK 1
368        SDOT    v20.4s,  v9.16b, v4.4b[0]
369        LDR     d11,  [x5], 8
370        SDOT    v21.4s,  v9.16b, v5.4b[0]
371        INS     v10.d[1], x11
372        SDOT    v22.4s,  v9.16b, v6.4b[0]
373        LDR     x11,  [x5], 8
374        SDOT    v23.4s,  v9.16b, v7.4b[0]
375
376        # BLOCK 2
377        SDOT    v24.4s, v10.16b, v4.4b[0]
378        LDR     d8,  [x5], 8
379        SDOT    v25.4s, v10.16b, v5.4b[0]
380        INS     v11.d[1], x11
381        SDOT    v26.4s, v10.16b, v6.4b[0]
382        LDR     x11,  [x5], 8
383        SDOT    v27.4s, v10.16b, v7.4b[0]
384
385        # BLOCK 3
386        SDOT    v28.4s, v11.16b, v4.4b[0]
387        LDR     d9,  [x5], 8
388        SDOT    v29.4s, v11.16b, v5.4b[0]
389        INS     v8.d[1], x11
390        SDOT    v30.4s, v11.16b, v6.4b[0]
391        LDR     x11,  [x5], 8
392        SDOT    v31.4s, v11.16b, v7.4b[0]
393
394        # BLOCK 0
395        SDOT    v16.4s,  v8.16b, v4.4b[1]
396        LDR     d10,  [x5], 8
397        SDOT    v17.4s,  v8.16b, v5.4b[1]
398        INS     v9.d[1], x11
399        SDOT    v18.4s,  v8.16b, v6.4b[1]
400        LDR     x11,  [x5], 8
401        SDOT    v19.4s,  v8.16b, v7.4b[1]
402
403        # BLOCK 1
404        SDOT    v20.4s,  v9.16b, v4.4b[1]
405        LDR     d11,  [x5], 8
406        SDOT    v21.4s,  v9.16b, v5.4b[1]
407        INS     v10.d[1], x11
408        SDOT    v22.4s,  v9.16b, v6.4b[1]
409        LDR     x11,  [x5], 8
410        SDOT    v23.4s,  v9.16b, v7.4b[1]
411
412        # BLOCK 2
413        SDOT    v24.4s, v10.16b, v4.4b[1]
414        SDOT    v25.4s, v10.16b, v5.4b[1]
415        INS     v11.d[1], x11
416        SDOT    v26.4s, v10.16b, v6.4b[1]
417        SDOT    v27.4s, v10.16b, v7.4b[1]
418        AND     x0, x2, 15              // kc remainder 0 to 12
419
420        # BLOCK 3
421        SDOT    v28.4s, v11.16b, v4.4b[1]
422        SDOT    v29.4s, v11.16b, v5.4b[1]
423        LDR     x11, [sp, 56]            // reload params pointer (x11 was used as a load temp)
424        SDOT    v30.4s, v11.16b, v6.4b[1]
425        SDOT    v31.4s, v11.16b, v7.4b[1]
426
427        # Is there a remainder? - 4 to 12 bytes of A
428        CBNZ    x0, 6f
429
430        .p2align 3
4314:
432        # ks loop
433        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
434        B.HI    1b
435
436        # Apply params - preshift, scale, postshift, bias and clamp
437        LD1R    {v4.4s}, [x11], 4       // shift operand for SQSHL below
438        SQSHL   v16.4s, v16.4s, v4.4s   // shift to upper bits
439        SQSHL   v17.4s, v17.4s, v4.4s
440        SQSHL   v18.4s, v18.4s, v4.4s
441        SQSHL   v19.4s, v19.4s, v4.4s
442        SQSHL   v20.4s, v20.4s, v4.4s
443        SQSHL   v21.4s, v21.4s, v4.4s
444        SQSHL   v22.4s, v22.4s, v4.4s
445        SQSHL   v23.4s, v23.4s, v4.4s
446        LD1R    {v5.4s}, [x11], 4       // multiplier for SQDMULH below
447        SQSHL   v24.4s, v24.4s, v4.4s
448        SQSHL   v25.4s, v25.4s, v4.4s
449        SQSHL   v26.4s, v26.4s, v4.4s
450        SQSHL   v27.4s, v27.4s, v4.4s
451        SQSHL   v28.4s, v28.4s, v4.4s
452        SQSHL   v29.4s, v29.4s, v4.4s
453        SQSHL   v30.4s, v30.4s, v4.4s
454        SQSHL   v31.4s, v31.4s, v4.4s
455        LD1R    {v6.4s}, [x11], 4       // shift operand for SRSHL below
456        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
457        SQDMULH v17.4s, v17.4s, v5.4s
458        SQDMULH v18.4s, v18.4s, v5.4s
459        SQDMULH v19.4s, v19.4s, v5.4s
460        SQDMULH v20.4s, v20.4s, v5.4s
461        SQDMULH v21.4s, v21.4s, v5.4s
462        SQDMULH v22.4s, v22.4s, v5.4s
463        SQDMULH v23.4s, v23.4s, v5.4s
464        SQDMULH v24.4s, v24.4s, v5.4s
465        SQDMULH v25.4s, v25.4s, v5.4s
466        SQDMULH v26.4s, v26.4s, v5.4s
467        SQDMULH v27.4s, v27.4s, v5.4s
468        SQDMULH v28.4s, v28.4s, v5.4s
469        SQDMULH v29.4s, v29.4s, v5.4s
470        SQDMULH v30.4s, v30.4s, v5.4s
471        SQDMULH v31.4s, v31.4s, v5.4s
472        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
473        SRSHL   v17.4s, v17.4s, v6.4s
474        SRSHL   v18.4s, v18.4s, v6.4s
475        SRSHL   v19.4s, v19.4s, v6.4s
476        SRSHL   v20.4s, v20.4s, v6.4s
477        SRSHL   v21.4s, v21.4s, v6.4s
478        SRSHL   v22.4s, v22.4s, v6.4s
479        SRSHL   v23.4s, v23.4s, v6.4s
480        SRSHL   v24.4s, v24.4s, v6.4s
481        SRSHL   v25.4s, v25.4s, v6.4s
482        SRSHL   v26.4s, v26.4s, v6.4s
483        SRSHL   v27.4s, v27.4s, v6.4s
484        SRSHL   v28.4s, v28.4s, v6.4s
485        SRSHL   v29.4s, v29.4s, v6.4s
486        SRSHL   v30.4s, v30.4s, v6.4s
487        SRSHL   v31.4s, v31.4s, v6.4s
488
489        SQXTN   v16.4h, v16.4s          // saturating narrow 32-bit -> 16-bit lanes
490        SQXTN   v17.4h, v17.4s
491        SQXTN   v18.4h, v18.4s
492        SQXTN   v19.4h, v19.4s
493        SQXTN   v24.4h, v24.4s
494        SQXTN   v25.4h, v25.4s
495        SQXTN   v26.4h, v26.4s
496        SQXTN   v27.4h, v27.4s
497        LD1R    {v6.8h}, [x11], 2       // load 16-bit bias added by SQADD below
498
499        SQXTN2  v16.8h, v20.4s
500        SQXTN2  v17.8h, v21.4s
501        SQXTN2  v18.8h, v22.4s
502        SQXTN2  v19.8h, v23.4s
503        SQXTN2  v24.8h, v28.4s
504        SQXTN2  v25.8h, v29.4s
505        SQXTN2  v26.8h, v30.4s
506        SQXTN2  v27.8h, v31.4s
507
508        SQADD   v16.8h, v16.8h, v6.8h
509        SQADD   v17.8h, v17.8h, v6.8h
510        SQADD   v18.8h, v18.8h, v6.8h
511        SQADD   v19.8h, v19.8h, v6.8h
512        SQADD   v24.8h, v24.8h, v6.8h
513        SQADD   v25.8h, v25.8h, v6.8h
514        SQADD   v26.8h, v26.8h, v6.8h
515        SQADD   v27.8h, v27.8h, v6.8h
516        LD1R    {v4.16b}, [x11], 1      // clamp min value
517
518        SQXTN   v0.8b, v16.8h           // saturating narrow 16-bit -> 8-bit lanes
519        SQXTN   v1.8b, v17.8h
520        SQXTN   v2.8b, v18.8h
521        SQXTN   v3.8b, v19.8h
522        LD1R    {v5.16b}, [x11]         // clamp max value
523        SQXTN2  v0.16b, v24.8h
524        SQXTN2  v1.16b, v25.8h
525        SQXTN2  v2.16b, v26.8h
526        SQXTN2  v3.16b, v27.8h
527        LDR     x0, [sp, 32]            // cn_stride
528        SMAX    v0.16b, v0.16b, v4.16b
529        SMAX    v1.16b, v1.16b, v4.16b
530        SUB     x11, x11, 15          // rewind params pointer (4+4+4+2+1 bytes of post-increment above)
531        SMAX    v2.16b, v2.16b, v4.16b
532        SMAX    v3.16b, v3.16b, v4.16b
533        SUBS    x1, x1, 16              // nc -= 16; flags used by B.LO and B.HI below
534        SMIN    v0.16b, v0.16b, v5.16b
535        SMIN    v1.16b, v1.16b, v5.16b
536        SMIN    v2.16b, v2.16b, v5.16b
537        SMIN    v3.16b, v3.16b, v5.16b
538        B.LO    7f                      // fewer than 16 columns left: partial store
539
540        # Store full 4 x 16
541        ST1     {v3.16b},  [x7], x0
542        ST1     {v2.16b}, [x17], x0
543        ST1     {v1.16b}, [x16], x0
544        ST1     {v0.16b},  [x6], x0
545
546        SUB     x4, x4, x3              // a -= ks
547
548        # nc loop
549        B.HI    0b                      // flags from SUBS x1 above: loop while columns remain
550
551        # Restore d8-d11 from stack
552        LDP     d10, d11, [sp, 16]
553        LDP     d8,  d9, [sp], 32
554        RET
555
556        # Remainder- 4 to 12 bytes of A
557        # Although C4, it's safe to read 16 bytes.
558        .p2align 3
5595:
560        AND     x0, x2, 15              // kc remainder 4 to 12
5616:
562        LDR     q0, [x13]
563        LDP     q8,  q9,  [x5], 32
564        LDR     q1, [x14]
565        LDR     q2, [x15]
566        LDR     q3, [x10]
567        LDP     q10, q11, [x5], 32
568        SDOT    v16.4s,  v8.16b, v0.4b[0]
569        SDOT    v17.4s,  v8.16b, v1.4b[0]
570        SDOT    v18.4s,  v8.16b, v2.4b[0]
571        SDOT    v19.4s,  v8.16b, v3.4b[0]
572        SDOT    v20.4s,  v9.16b, v0.4b[0]
573        SDOT    v21.4s,  v9.16b, v1.4b[0]
574        SDOT    v22.4s,  v9.16b, v2.4b[0]
575        SDOT    v23.4s,  v9.16b, v3.4b[0]
576        SDOT    v24.4s, v10.16b, v0.4b[0]
577        SDOT    v25.4s, v10.16b, v1.4b[0]
578        SDOT    v26.4s, v10.16b, v2.4b[0]
579        SDOT    v27.4s, v10.16b, v3.4b[0]
580        SDOT    v28.4s, v11.16b, v0.4b[0]
581        SDOT    v29.4s, v11.16b, v1.4b[0]
582        SDOT    v30.4s, v11.16b, v2.4b[0]
583        SDOT    v31.4s, v11.16b, v3.4b[0]
584        CMP     x0, 4
585        B.LS    4b                      // remainder <= 4: back to the ks loop
586        LDP     q8,  q9,  [x5], 32
587        LDP     q10, q11,  [x5], 32
588        SDOT    v16.4s,  v8.16b, v0.4b[1]
589        SDOT    v17.4s,  v8.16b, v1.4b[1]
590        SDOT    v18.4s,  v8.16b, v2.4b[1]
591        SDOT    v19.4s,  v8.16b, v3.4b[1]
592        SDOT    v20.4s,  v9.16b, v0.4b[1]
593        SDOT    v21.4s,  v9.16b, v1.4b[1]
594        SDOT    v22.4s,  v9.16b, v2.4b[1]
595        SDOT    v23.4s,  v9.16b, v3.4b[1]
596        SDOT    v24.4s, v10.16b, v0.4b[1]
597        SDOT    v25.4s, v10.16b, v1.4b[1]
598        SDOT    v26.4s, v10.16b, v2.4b[1]
599        SDOT    v27.4s, v10.16b, v3.4b[1]
600        SDOT    v28.4s, v11.16b, v0.4b[1]
601        SDOT    v29.4s, v11.16b, v1.4b[1]
602        SDOT    v30.4s, v11.16b, v2.4b[1]
603        SDOT    v31.4s, v11.16b, v3.4b[1]
604        CMP     x0, 8
605        B.LS    4b                      // remainder <= 8: back to the ks loop
606        LDP     q8,  q9,  [x5], 32
607        LDP     q10, q11,  [x5], 32
608        SDOT    v16.4s,  v8.16b, v0.4b[2]
609        SDOT    v17.4s,  v8.16b, v1.4b[2]
610        SDOT    v18.4s,  v8.16b, v2.4b[2]
611        SDOT    v19.4s,  v8.16b, v3.4b[2]
612        SDOT    v20.4s,  v9.16b, v0.4b[2]
613        SDOT    v21.4s,  v9.16b, v1.4b[2]
614        SDOT    v22.4s,  v9.16b, v2.4b[2]
615        SDOT    v23.4s,  v9.16b, v3.4b[2]
616        SDOT    v24.4s, v10.16b, v0.4b[2]
617        SDOT    v25.4s, v10.16b, v1.4b[2]
618        SDOT    v26.4s, v10.16b, v2.4b[2]
619        SDOT    v27.4s, v10.16b, v3.4b[2]
620        SDOT    v28.4s, v11.16b, v0.4b[2]
621        SDOT    v29.4s, v11.16b, v1.4b[2]
622        SDOT    v30.4s, v11.16b, v2.4b[2]
623        SDOT    v31.4s, v11.16b, v3.4b[2]
624        B       4b
625
626        # Store odd width: bits 3..0 of x1 select 8, 4, 2 and 1 byte stores per row
627        .p2align 3
6287:
629        TBZ     x1, 3, 8f
630        STR     d3, [x7], 8
631        STR     d2, [x17], 8
632        DUP     d3, v3.d[1]             // move the upper 8 bytes down for the following stores
633        DUP     d2, v2.d[1]
634        STR     d1, [x16], 8
635        STR     d0, [x6], 8
636        DUP     d1, v1.d[1]
637        DUP     d0, v0.d[1]
6388:
639        TBZ     x1, 2, 9f
640        STR     s3, [x7], 4
641        STR     s2, [x17], 4
642        DUP     s3, v3.s[1]
643        DUP     s2, v2.s[1]
644        STR     s1, [x16], 4
645        STR     s0, [x6], 4
646        DUP     s1, v1.s[1]
647        DUP     s0, v0.s[1]
6489:
649        TBZ     x1, 1, 10f
650        STR     h3, [x7], 2
651        STR     h2, [x17], 2
652        DUP     h3, v3.h[1]
653        DUP     h2, v2.h[1]
654        STR     h1, [x16], 2
655        STR     h0, [x6], 2
656        DUP     h1, v1.h[1]
657        DUP     h0, v0.h[1]
65810:
659        TBZ     x1, 0, 11f
660        STR     b3, [x7]
661        STR     b2, [x17]
662        STR     b1, [x16]
663        STR     b0, [x6]
66411:
665        # Restore d8-d11 from stack
666        LDP     d10, d11, [sp, 16]
667        LDP     d8,  d9, [sp], 32
668        RET
669
670END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55
671
672#ifdef __ELF__
673.section ".note.GNU-stack","",%progbits
674#endif
675