// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

// Summary
//   Computes up to 4 rows x 8 columns of f32 output per pass of the outer
//   loop (label 0).  For every 8-column tile the accumulators are seeded
//   from the first 8 floats at w, multiply-accumulated with A over kc,
//   clamped against the two floats read from params, and stored to C.
//   kc is counted in bytes (16 bytes = 4 floats per main-loop iteration),
//   nc is counted in floats (8 per full tile).
//   Rows beyond mr alias the previous row (a and c pointers are clamped),
//   so the kernel always computes 4 rows.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
// x18 is the AAPCS64 platform register and is deliberately NOT used here.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3 (overwrites cm_stride, which is dead once c3 has been computed)

# x4 temporary vector shadow register
#    (a_stride is only needed while the A pointers are set up)

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

// Registers not touched by this kernel:
//   x12 x13 x15
//   v2 v5 v8 v9 v10 v11 v28 v29 v30 v31

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x7, x17, x7          // c3 = c2 + cm_stride (last use of cm_stride)
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x7, x17, x7, LO     //   c3 = c2

        # Load params pointer
        LDR x8, [sp, 8]

        # Load clamping_params values
        // v6 = params float 0 replicated (upper bound, used by FMIN)
        // v7 = params float 1 replicated (lower bound, used by FMAX)
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        // Save d12-d15 on stack
        // Stack arguments were read above, before sp is moved.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
        PRFM PLDL1KEEP,  [x3, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP,  [x9,  0]
        PRFM PLDL1KEEP,  [x9, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5,  64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        LDR   d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32         // b
        LDR   d1, [x10], 8             // a2
        LD1  {v0.d}[1],  [x9], 8       // a1
        LD1  {v1.d}[1], [x11], 8       // a3
        SUBS x0, x0, 16
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR  x4, [x5], 8   // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
1:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x4               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR  x4, [x9], 8               // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x4                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  x4, [x5, 8]   // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x4  // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR  x4, [x11], 8              // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR  d13, [x5, 16]
        INS v4.d[1], x4                // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR  d14, [x5, 32]
        INS v13.d[1], x4  // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR  d15, [x5, 48]
        INS v14.d[1], x4  // b from previous
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR   d0, [x3], 8              // a0
        INS v15.d[1], x4  // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR  x4, [x9], 8               // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0

        // BLOCK 1
        LDR  d16, [x5, 64]
        INS v0.d[1], x4                // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR  x4, [x5, 72]  // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        PRFM PLDL1KEEP, [x9, 128]      // Prefetch A1

        // BLOCK 2
        LDR   d1, [x10], 8             // a2
        INS v16.d[1], x4  // b
        FMLA v25.4s, v13.4s,  v4.s[0]
        LDR  x4, [x11], 8              // a3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]
        PRFM PLDL1KEEP, [x10, 128]     // Prefetch A2

        // BLOCK 3
        LDR  d17, [x5, 80]
        INS v1.d[1], x4                // a3 ins
        FMLA v22.4s, v14.4s,  v3.s[3]
        LDR  x4, [x5, 88]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        PRFM PLDL1KEEP, [x11, 128]     // Prefetch A3

        // BLOCK 4
        LDR  d18, [x5, 96]
        INS v17.d[1], x4  // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR  x4, [x5, 104]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR  d19, [x5, 112]
        INS v18.d[1], x4
        FMLA v27.4s, v15.4s,  v4.s[3]
        LDR  x4, [x5, 120]
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
        ADD x5, x5, 128
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x4               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR  x4, [x9], 8               // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x4                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  x4, [x5, 8]   // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x4  // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR  x4, [x11], 8              // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR  d13, [x5, 16]
        INS v4.d[1], x4                // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR  d14, [x5, 32]
        INS v13.d[1], x4  // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR  d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP // fma
        NOP
        NOP // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x4  // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        // Low 4 bits of k equal the low 4 bits of kc (k only ever changes
        // by multiples of 16), so this tests for a 1-3 float remainder.
        TST x0, 15

        // BLOCK 4
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        ADD x5, x5, 64

        // BLOCK 5
        FMLA v27.4s, v15.4s,  v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO and B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 8f                  // fewer than 8 columns left: partial store

        // Each C pointer advances by cn_stride; each A pointer is rewound
        // by kc so the next column tile re-reads the same rows of A.
        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v26.16b, v27.16b},  [x7], x14
        SUB x11, x11, x2 // a3 -= kc

        // Flags are still those of SUBS x1, x1, 8: loop while nc > 0.
        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LD1   {v0.d}[1], [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR  q17, [x5], 16
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

6:
        # Remainder- 1 floats of A (4 bytes)
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LD1   {v0.s}[2], [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR  q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        B 4b

        # Store odd width
        // x1 holds nc - 8 here (negative); its low 3 bits equal those of the
        // remaining column count, so bits 2/1/0 select a 4/2/1 float store.
8:
        TBZ x1, 2, 9f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26,  [x7], 16
        MOV v26.16b, v27.16b

9:
        TBZ x1, 1, 10f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26,  [x7], 8
        DUP d26, v26.d[1]

10:
        TBZ x1, 0, 11f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26,  [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53
468
469#ifdef __ELF__
470.section ".note.GNU-stack","",%progbits
471#endif
472