• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
#
# Produces one 4 row x 8 column tile of C per pass of the outer loop (label 0):
# the 8 accumulator vectors are seeded from acc (128 bytes per tile), updated
# with FMLA over kc bytes of each A row against the packed weights w, clamped
# against the two values loaded from params, and stored through c0..c3.
# kc is a byte count: the k counter is stepped in units of 16 bytes (4 floats)
# and the A pointers are rewound by kc after each tile.
# Rows at or beyond mr reuse the previous row's A and C pointers.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is deliberately not used: AAPCS64 designates it as the platform register.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3 (replaces cm_stride, which is dead once c3 has been derived)

# x4 temporary vector shadow register
#    (holds a_stride on entry; reused after the A pointers are set up to carry
#     the upper 64 bits of a vector from LDR Xt to INS Vd.d[1])

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7
#    v6 = first float at params, applied with FMIN (upper bound)
#    v7 = second float at params, applied with FMAX (lower bound)

# Unused by this kernel:
#    v2 v5 v8 v9 v10 v11 v28 v29 v30 v31
#    x12 x13 x18

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x7, x17, x7          // c3 = c2 + cm_stride (last read of cm_stride; x7 is c3 from here on)
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x7, x17, x7, LO     //   c3 = c2

        # Load acc, params pointer
        // Stack arguments are read before sp is moved by the d12-d15 spill below.
        LDP x15, x8, [sp, 8]

        # Load clamping_params values
        // LD2R replicates the two consecutive floats at [x8] into v6 and v7.
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        // Save d12-d15 on stack
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
        PRFM PLDL1KEEP,  [x3, 64]
        PRFM PLDL1KEEP,  [x9,  0]
        PRFM PLDL1KEEP,  [x9, 64]
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        PRFM PLDL1KEEP, [x5,  64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        LDR   d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32         // b
        LDR   d1, [x10], 8             // a2
        LD1  {v0.d}[1],  [x9], 8       // a1
        LD1  {v1.d}[1], [x11], 8       // a3
        SUBS x0, x0, 16                // flags consumed by B.LO 2f; the loads in between leave NZCV alone
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR  x4, [x5], 8   // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
1:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x4               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR  x4, [x9], 8               // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x4                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  x4, [x5, 8]   // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x4  // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR  x4, [x11], 8              // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR  d13, [x5, 16]
        INS v4.d[1], x4                // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR  d14, [x5, 32]
        INS v13.d[1], x4  // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR  d15, [x5, 48]
        INS v14.d[1], x4  // b from previous
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR   d0, [x3], 8              // a0
        INS v15.d[1], x4  // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR  x4, [x9], 8               // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0

        // BLOCK 1
        LDR  d16, [x5, 64]
        INS v0.d[1], x4                // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR  x4, [x5, 72]  // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        PRFM PLDL1KEEP, [x9, 128]      // Prefetch A1

        // BLOCK 2
        LDR   d1, [x10], 8             // a2
        INS v16.d[1], x4  // b
        FMLA v25.4s, v13.4s,  v4.s[0]
        LDR  x4, [x11], 8              // a3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]
        PRFM PLDL1KEEP, [x10, 128]     // Prefetch A2

        // BLOCK 3
        LDR  d17, [x5, 80]
        INS v1.d[1], x4                // a3 ins
        FMLA v22.4s, v14.4s,  v3.s[3]
        LDR  x4, [x5, 88]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        PRFM PLDL1KEEP, [x11, 128]     // Prefetch A3

        // BLOCK 4
        LDR  d18, [x5, 96]
        INS v17.d[1], x4  // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR  x4, [x5, 104]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR  d19, [x5, 112]
        INS v18.d[1], x4
        FMLA v27.4s, v15.4s,  v4.s[3]
        LDR  x4, [x5, 120]             // upper half of v19; inserted at BLOCK 0 of the next pass or of the epilogue
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
        ADD x5, x5, 128                // one pass consumes 128 bytes of w (ADD leaves the SUBS flags intact)
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x4               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR  x4, [x9], 8               // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x4                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  x4, [x5, 8]   // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x4  // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR  x4, [x11], 8              // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR  d13, [x5, 16]
        INS v4.d[1], x4                // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR  d14, [x5, 32]
        INS v13.d[1], x4  // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR  d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP // fma
        NOP
        NOP // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x4  // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        TST x0, 15                     // any 1..3 float remainder? flags consumed by B.NE 5f below

        // BLOCK 4
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        ADD x5, x5, 64                 // skip the 64 bytes of w read at [x5]..[x5, 56] above

        // BLOCK 5
        FMLA v27.4s, v15.4s,  v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8                 // nc -= 8; flags consumed by B.LO 8f and B.HI 0b below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 8f

        // Rows are stored from c3 down to c0; each C pointer advances by cn_stride.
        ST1 {v26.16b, v27.16b},  [x7], x14
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB x11, x11, x2 // a3 -= kc

        B.HI 0b                        // more columns left (nc > 0 after the SUBS above)

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        // Only multiples of 16 have been subtracted from kc to form x0, so
        // bits 3 and 2 of x0 still match those of kc.
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LD1   {v0.d}[1], [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR  q17, [x5], 16
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

6:
        # Remainder- 1 floats of A (4 bytes)
        // a0/a2 land in lane 0 and a1/a3 in lane 2, matching the lane layout
        // used by the FMLAs below.
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LD1   {v0.s}[2], [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR  q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        B 4b

        # Store odd width
        // Reached with x1 = nc - 8 (wrapped below zero); its low three bits
        // equal those of the remaining column count, so bits 2/1/0 select
        // the 4-, 2- and 1-column partial stores.
8:
        TBZ x1, 2, 9f
        STR q26,  [x7], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d26,  [x7], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s26,  [x7]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53
466
467#ifdef __ELF__
468.section ".note.GNU-stack","",%progbits
469#endif
470