// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Syntax: GNU assembler for AArch64 (numeric local labels, .section), run
// through the C preprocessor (#include / #ifdef).
// BEGIN_FUNCTION / END_FUNCTION are not defined in this file; presumably they
// are provided by xnnpack/assembly.h.
//
// Overview (as implemented below):
//   For every group of 8 output columns, a 4x8 tile of accumulators (v20-v27)
//   is loaded from acc, updated with FMLA using floats read through the four
//   A row pointers and through w, clamped, and stored through the four C row
//   pointers.
//   - kc is consumed as a byte count: 16 bytes (4 floats) of each A row per
//     main-loop or epilogue pass, then optional 8-byte and 4-byte remainders.
//   - w and acc are read strictly sequentially and are never rewound:
//     32 bytes of w per float of A, 128 bytes of acc per tile.
//   - After a full tile the A pointers are rewound by kc and the C pointers
//     advance by cn_stride.
//   - Clamp bounds: the first float at params is the lower bound (FMAX), the
//     second float is the upper bound (FMIN).
//   - Rows at or beyond mr reuse the pointer of the previous row.
//   - The scalar register x4 holds a_stride only during pointer setup; after
//     that it is reused as a staging register for 64-bit loads that are
//     moved into the upper half of a vector register with INS.

# void xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3

# x4 temporary vector shadow register

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
#  x7 c5
# A4  v2     v5
# A5  v2[1]  v5[1]
# C   v28 v29
# C   v30 v31

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

        # Load acc, params pointer
        // Stack arguments at entry: [sp] cn_stride, [sp + 8] acc, [sp + 16] params.
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        // ADD does not set flags, so LS below still tests the CMP x0, 2 above.
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        # Load min/max values
        // v6 = first float at params (all lanes), v7 = second float (all lanes).
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // sp moves down by 32 bytes, so cn_stride is found at [sp, 32] from here on.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

        // Start of one 8-column tile (re-entered from the full-tile store).
0:
        # Load initial accumulators
        // 8 q registers = 128 bytes of acc per tile; x15 keeps advancing.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
        PRFM PLDL1KEEP,  [x3, 64]
        PRFM PLDL1KEEP,  [x9,  0]
        PRFM PLDL1KEEP,  [x9, 64]
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        PRFM PLDL1KEEP, [x5,  64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        // Loads 2 floats per A row (a0/a1 packed in v0, a2/a3 packed in v1)
        // and 64 bytes of B; the last 8 bytes of B stay in x4 until BLOCK 0.
        LDR   d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32         // b
        LDR   d1, [x10], 8             // a2
        LD1  {v0.d}[1],  [x9], 8       // a1
        LD1  {v1.d}[1], [x11], 8       // a3
        SUBS x0, x0, 16
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR  x4, [x5], 8   // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        // Tests the flags of the SUBS above; the LDRs in between leave them intact.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
1:
        # First group of 16 FMA, Second group loads
        // Consumes v0/v1 (A) and v16-v19 (B); loads v3/v4 (A) and v12-v15 (B).
        // BLOCK 0
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s,  v0.s[2]
        INS v19.d[1], x4               // b from second group
        FMLA v24.4s, v16.4s,  v1.s[0]
        LDR  x4, [x9], 8               // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  d12, [x5]
        FMLA v21.4s, v17.4s,  v0.s[0]
        INS v3.d[1], x4                // a1 ins
        FMLA v23.4s, v17.4s,  v0.s[2]
        LDR  x4, [x5, 8]   // b

        // BLOCK 2
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR   d4, [x10], 8             // a2
        FMLA v27.4s, v17.4s,  v1.s[2]
        INS v12.d[1], x4  // b  ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR  x4, [x11], 8              // a3

        // BLOCK 3
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  d13, [x5, 16]
        FMLA v24.4s, v18.4s,  v1.s[1]
        INS v4.d[1], x4                // a3 ins
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR  x4, [x5, 24]

        // BLOCK 4
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  d14, [x5, 32]
        FMLA v23.4s, v19.4s,  v0.s[3]
        INS v13.d[1], x4  // b
        FMLA v25.4s, v19.4s,  v1.s[1]
        LDR  x4, [x5, 40]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR  d15, [x5, 48]
        NOP
        INS v14.d[1], x4  // b from previous
        SUBS x0, x0, 16   // loop counter; flags are consumed by B.HS 1b below
        LDR x4, [x5, 56]

        # Second group of 16 FMA, First group of loads
        // Consumes v3/v4 (A) and v12-v15 (B); reloads v0/v1 (A) and v16-v19 (B).
        // BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR   d0, [x3], 8              // a0
        FMLA v22.4s, v12.4s,  v3.s[2]
        INS v15.d[1], x4  // b from previous
        FMLA v24.4s, v12.4s,  v4.s[0]
        LDR  x4, [x9], 8               // a1

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR  d16, [x5, 64]
        FMLA v21.4s, v13.4s,  v3.s[0]
        INS v0.d[1], x4                // a1 ins
        FMLA v23.4s, v13.4s,  v3.s[2]
        LDR  x4, [x5, 72]  // b

        // BLOCK 2
        FMLA v25.4s, v13.4s,  v4.s[0]
        LDR   d1, [x10], 8             // a2
        FMLA v27.4s, v13.4s,  v4.s[2]
        INS v16.d[1], x4  // b
        FMLA v20.4s, v14.4s,  v3.s[1]
        LDR  x4, [x11], 8              // a3

        // BLOCK 3
        FMLA v22.4s, v14.4s,  v3.s[3]
        LDR  d17, [x5, 80]
        FMLA v24.4s, v14.4s,  v4.s[1]
        INS v1.d[1], x4                // a3 ins
        FMLA v26.4s, v14.4s,  v4.s[3]
        LDR  x4, [x5, 88]

        // BLOCK 4
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR  d18, [x5, 96]
        FMLA v23.4s, v15.4s,  v3.s[3]
        INS v17.d[1], x4  // b
        FMLA v25.4s, v15.4s,  v4.s[1]
        LDR  x4, [x5, 104]

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        FMLA v27.4s, v15.4s,  v4.s[3]
        LDR  d19, [x5, 112]
        INS v18.d[1], x4
        LDR  x4, [x5, 120]             // upper half of v19; inserted in the next BLOCK 0
        ADD x5, x5, 128                // 128 bytes of B consumed per iteration
        B.HS 1b                        // flags from SUBS x0, x0, 16 in BLOCK 5 of the first group

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR   d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s,  v0.s[2]
        INS v19.d[1], x4               // b from second group
        FMLA v24.4s, v16.4s,  v1.s[0]
        LDR  x4, [x9], 8               // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  d12, [x5]
        FMLA v21.4s, v17.4s,  v0.s[0]
        INS v3.d[1], x4                // a1 ins
        FMLA v23.4s, v17.4s,  v0.s[2]
        LDR  x4, [x5, 8]   // b

        // BLOCK 2
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR   d4, [x10], 8             // a2
        FMLA v27.4s, v17.4s,  v1.s[2]
        INS v12.d[1], x4  // b  ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR  x4, [x11], 8              // a3

        // BLOCK 3
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  d13, [x5, 16]
        FMLA v24.4s, v18.4s,  v1.s[1]
        INS v4.d[1], x4                // a3 ins
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR  x4, [x5, 24]

        // BLOCK 4
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  d14, [x5, 32]
        FMLA v23.4s, v19.4s,  v0.s[3]
        INS v13.d[1], x4  // b
        FMLA v25.4s, v19.4s,  v1.s[1]
        LDR  x4, [x5, 40]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR  d15, [x5, 48]
        NOP // fma
        INS v14.d[1], x4
        NOP
        LDR x4, [x5, 56]

        # Second group of 16 FMA, no loads
        // BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        INS v15.d[1], x4  // b from previous
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        // Only multiples of 16 were subtracted from x0, so its low 4 bits
        // equal those of kc. Flags are consumed by B.NE 4f below.
        TST x0, 15

        // BLOCK 4
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        ADD x5, x5, 64                 // skip the 64 bytes of B read with offsets in this epilogue

        // BLOCK 5
        FMLA v27.4s, v15.4s,  v4.s[3]

        # Is there a remainder? - 2 floats of A (8 bytes) or less
        B.NE 4f

3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // Originally at [sp]; 32 bytes were pushed for d12-d15.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        SUBS x1, x1, 8                 // nc -= 8; flags drive B.LO 6f and B.HI 0b below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 6f                        // fewer than 8 columns were left: partial store

        // Rows are stored from c3 down to c0; each C pointer advances by cn_stride.
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v20.16b, v21.16b},  [x6], x0
        SUB x11, x11, x2 // a3 -= kc

        // ST1 and SUB leave the flags alone, so this still tests SUBS x1, x1, 8:
        // loop again while columns remain.
        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

4:
        # Is there a remainder? - 2 floats of A (8 bytes)
        // Bits 3 and 2 of x0 match bits 3 and 2 of kc on both entry paths
        // (x0 differs from kc by a multiple of 16).
        TBZ x0, 3, 5f

        # Remainder - 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LD1   {v0.d}[1], [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR  q17, [x5], 16
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]

        # Is there a remainder? - 1 float of A (4 bytes)
        TBZ x0, 2, 3b

5:
        # Remainder - 1 float of A (4 bytes)
        // a0/a2 go to lane 0 and a1/a3 to lane 2, the same lanes the FMLAs
        // of the other paths index for the first float of each row.
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LD1   {v0.s}[2], [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR  q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        B 3b

        # Store odd width
        // x1 holds nc - 8 here; subtracting 8 leaves bits 2..0 unchanged, so
        // they give the number of remaining columns (4, 2 and 1 at a time).
6:
        TBZ x1, 2, 7f
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b

7:
        TBZ x1, 1, 8f
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

8:
        TBZ x1, 0, 9f
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif