// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8

// Overview (AArch64, GNU assembler syntax, preprocessed by the C preprocessor).
//
// Each pass of the nc loop produces one tile of up to 4 rows x 8 floats:
//   - the 8 accumulators of row 0 are loaded from w and copied to rows 1-3,
//   - for every group of 4 A pointers (ks loop) the kernel accumulates
//     kc bytes of A against the packed B values that follow in w,
//   - the tile is clamped with the two floats read from params and stored
//     through c0..c3.
//
// Units as used by the code below: kc is consumed in bytes (16 bytes = 4
// floats per main-loop step), ks is consumed in bytes of the pointer array
// (32 bytes = 4 pointers per step), nc is consumed in floats (8 per tile),
// a_offset / cm_stride / cn_stride are added to pointers as byte offsets.
//
// x0 holds mr only while the C pointers are clamped; afterwards it is the
// remaining-k counter. x8 holds the params pointer only until the clamp
// values are loaded; afterwards it is the a3 pointer.
//
// Local labels:
//   0  nc loop (next 8 output columns)      6  remainder, 2 floats of A
//   1  ks loop (next 4 A pointers)          7  remainder, 1 float of A
//   2  main loop                            8  partial store, 4 columns
//   3  epilogue                             9  partial store, 2 columns
//   4  remainder dispatch                  10  partial store, 1 column
//   5  ks loop tail, clamp and store       11  restore and return

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x13 a0
# x14 a1
# x15 a2
#  x8 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3

# Other general purpose registers
#  x9 p, remaining bytes of the A pointer array for the current tile
# x10 cn_stride
# x11 a_offset
# x12 zero

# x19 temporary vector shadow register
#     (64-bit loads are staged in x19 and then inserted into the upper
#      half of a vector register with INS)

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# Unused by this kernel: v2 v5 v8 v9 v10 v11 v28 v29 v30 v31

BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp C pointers
        // Rows beyond mr alias the previous row so that stores stay in bounds.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load clamping_params values
        // First float is replicated into v6 (FMIN operand, upper bound),
        // second float into v7 (FMAX operand, lower bound).
        LD2R {v6.4s, v7.4s}, [x8]

        // Save x19, d12-d15 on stack
        // 48-byte frame: d12,d13 at [sp], d14,d15 at [sp, 16], x19 at [sp, 32].
        // x19 is written to both 8-byte slots; only [sp, 32] is reloaded.
        STP d12, d13, [sp, -48]!
        STP d14, d15, [sp, 16]
        STP x19, x19, [sp, 32]

0:
        # Load initial bias from w into accumulators
        // The A prefetches use whatever x13-x15 and x8 currently hold: the
        // pointers left by the previous tile, or on the first pass registers
        // that have not been loaded yet (x8 is still the params pointer).
        // PRFM is only a hint, so these addresses are never dereferenced
        // architecturally.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x13,  0]  // Prefetch A
        PRFM PLDL1KEEP, [x13, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x14,  0]
        PRFM PLDL1KEEP, [x14, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x15,  0]
        PRFM PLDL1KEEP, [x15, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x8,  0]
        PRFM PLDL1KEEP, [x8, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        PRFM PLDL1KEEP, [x5,  64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3  // p = ks

1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x8, [x4], 16

        // A pointer equal to zero is used as is; any other pointer gets
        // a_offset added. The ADD is unconditional and CSEL picks the result.
        CMP x13, x12            // if a0 == zero
        ADD x13, x13, x11       // a0 += a_offset
        CSEL x13, x12, x13, EQ  //   a0 = zero, else a0 + a_offset
        CMP x14, x12            // if a1 == zero
        ADD x14, x14, x11       // a1 += a_offset
        CSEL x14, x12, x14, EQ  //   a1 = zero, else a1 + a_offset
        CMP x15, x12            // if a2 == zero
        ADD x15, x15, x11       // a2 += a_offset
        CSEL x15, x12, x15, EQ  //   a2 = zero, else a2 + a_offset
        CMP x8, x12             // if a3 == zero
        ADD x8, x8, x11         // a3 += a_offset
        CSEL x8, x12, x8, EQ    //   a3 = zero, else a3 + a_offset

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        // Rows 0 and 1 share v0 (low / high half), rows 2 and 3 share v1.
        LDR   d0, [x13], 8            // a0
        LDP q16, q17, [x5], 32        // b
        LDR   d1, [x15], 8            // a2
        LD1  {v0.d}[1],  [x14], 8     // a1
        LD1  {v1.d}[1], [x8], 8       // a3
        SUBS x0, x0, 16
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR  x19, [x5], 8   // upper half of v19; ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Every 128-bit operand is assembled from a 64-bit LDR into the low
        // half plus a 64-bit LDR into x19 that is inserted into the high half
        // one block later. Instruction order is part of the schedule; do not
        // reorder.
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x13], 8              // a0
        INS v19.d[1], x19               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR  x19, [x14], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x19                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  x19, [x5, 8]   // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR   d4, [x15], 8              // a2
        INS v12.d[1], x19  // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR  x19, [x8], 8               // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR  d13, [x5, 16]
        INS v4.d[1], x19                // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  x19, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR  d14, [x5, 32]
        INS v13.d[1], x19  // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  x19, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR  d15, [x5, 48]
        INS v14.d[1], x19  // b from previous
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x19, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR   d0, [x13], 8              // a0
        INS v15.d[1], x19  // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR  x19, [x14], 8              // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x13, 128]      // Prefetch A0

        // BLOCK 1
        LDR  d16, [x5, 64]
        INS v0.d[1], x19               // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR  x19, [x5, 72]  // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        PRFM PLDL1KEEP, [x14, 128]      // Prefetch A1

        // BLOCK 2
        LDR   d1, [x15], 8             // a2
        INS v16.d[1], x19  // b
        FMLA v25.4s, v13.4s,  v4.s[0]
        LDR  x19, [x8], 8             // a3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]
        PRFM PLDL1KEEP, [x15, 128]     // Prefetch A2

        // BLOCK 3
        LDR  d17, [x5, 80]
        INS v1.d[1], x19               // a3 ins
        FMLA v22.4s, v14.4s,  v3.s[3]
        LDR  x19, [x5, 88]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        PRFM PLDL1KEEP, [x8, 128]     // Prefetch A3

        // BLOCK 4
        LDR  d18, [x5, 96]
        INS v17.d[1], x19  // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR  x19, [x5, 104]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR  d19, [x5, 112]
        INS v18.d[1], x19
        FMLA v27.4s, v15.4s,  v4.s[3]
        LDR  x19, [x5, 120]            // upper half of v19 for next BLOCK 0
        SUBS x0, x0, 16                // k -= 16 bytes
        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
        ADD x5, x5, 128                // w += 2 groups of 4 x 4 floats of B
        B.HS 2b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Same as one main-loop iteration, but the second group issues no
        // loads for a following iteration.
3:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x13], 8              // a0
        INS v19.d[1], x19              // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR  x19, [x14], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x19               // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  x19, [x5, 8]   // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR   d4, [x15], 8             // a2
        INS v12.d[1], x19  // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR  x19, [x8], 8             // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR  d13, [x5, 16]
        INS v4.d[1], x19               // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  x19, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR  d14, [x5, 32]
        INS v13.d[1], x19  // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  x19, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR  d15, [x5, 48]
        INS v14.d[1], x19
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x19, [x5, 56]
        NOP // fma
        NOP
        NOP // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x19  // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]

        // BLOCK 4
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        ADD x5, x5, 64                 // w += second group of B read above

        // BLOCK 5
        FMLA v27.4s, v15.4s,  v4.s[3]

4:
        // x0 has only had multiples of 16 subtracted from kc, so bits 3 and 2
        // of x0 still equal bits 3 and 2 of kc.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder?- 1 float of A (4 bytes)
        TBNZ x0, 2, 7f
5:
        # ks loop
        SUBS x9, x9, 32  // p -= MR * sizeof(void*)
        B.NE 1b

        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        SUBS x1, x1, 8                 // nc -= 8; fewer than 8 columns left?
        B.LO 8f

        // Rows are stored from c3 down to c0. When mr < 4 the clamped row
        // pointers alias a lower row, and the lower row is written last.
        STP q26, q27, [x7]
        ADD x7, x7, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21,  [x6]
        ADD  x6,  x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        // Flags are still those of SUBS x1 above: the stores and the
        // non-flag-setting ADD / SUB leave them untouched.
        B.HI 0b

        // Restore x19, d12-d15 from stack
        LDR x19,      [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

        # Remainder - 2 floats of A (8 bytes)
        # 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR   d0,  [x13], 8
        LDP  q16,  q17, [x5], 32
        LD1   {v0.d}[1], [x14], 8
        LDR   d1, [x15], 8
        LD1   {v1.d}[1], [x8], 8
        LDP  q18,  q19, [x5], 32
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 5b

7:
        # Remainder- 1 float of A (4 bytes)
        // Lane 0 carries the even row (a0 / a2), lane 2 the odd row (a1 / a3),
        // matching the lane indices used by the FMLAs below.
        LDR   s0,  [x13], 4
        LDP  q16,  q17, [x5], 32
        LD1   {v0.s}[2], [x14], 4
        LDR   s1, [x15], 4
        LD1   {v1.s}[2], [x8], 4

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        B 5b

        # Store odd width
        // x1 is nc - 8 here (negative), so its low three bits equal those of
        // the remaining column count. After each partial store the not yet
        // stored columns are moved down into the low lanes.
8:
        TBZ x1, 2, 9f                  // 4 columns
        STR q26,  [x7], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f                 // 2 columns
        STR d26,  [x7], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f                 // 1 column
        STR s26,  [x7]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
11:
        // Restore x19, d12-d15 from stack
        LDR x19,      [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif