• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> (x0)
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
21$else:
22  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31
32# C pointers
33#  x6 c0
34# x16 c1
35# x17 c2
36# x14 c3
37
38# x4 temporary GPR (reuses the a_stride register) that stages the upper 64 bits of a vector register before INS
39
40# Vector register usage
41# A0  v0     v3
42# A1  v0[1]  v3[1]
43# A2  v1     v4
44# A3  v1[1]  v4[1]
45
46# B   v12 v13 v14 v15 second set of B
47# B   v16 v17 v18 v19 first set
48# C   v20 v21
49# C   v22 v23
50# C   v24 v25
51# C   v26 v27
52# Clamp v6 v7
53
54# unused A   v8 v9 v10 v11
55# x12 a4
56# x13 c4
57#  x7 c5
58# A4  v2     v5
59# A5  v2[1]  v5[1]
60# C   v28 v29
61# C   v30 v31
62
// GEMM micro-kernel template: each pass of outer loop 0 produces a tile of up to
// 4 rows x 8 columns of C. Accumulators v20-v27 are seeded (from acc when INC,
// otherwise from the bias at the head of w), updated with FMLA products of A
// (x3/x9/x10/x11) and the packed weights read through x5, clamped with
// FMAX against v6 and FMIN against v7, and stored through x6/x16/x17/x14.
// x0 is consumed as mr during pointer setup, then reused as the k byte counter
// and, at label 3, as cn_stride. x4 (a_stride) is reused as a scratch GPR once
// the row pointers have been formed.
63BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
64
65        $if INC:
66          # Load acc and params pointers from the stack arguments at [sp + 8] and [sp + 16]
67          LDP x15, x8, [sp, 8]
68        $else:
69          # Load params pointer from the stack argument at [sp + 8]
70          LDR x8, [sp, 8]
71
72        # Clamp A and C pointers: a row at index >= mr aliases the previous row
73        CMP x0, 2                // if mr < 2
74        ADD x9, x3, x4           // a1 = a0 + a_stride
75        ADD x16, x6, x7          // c1 = c0 + cm_stride
76        CSEL x9, x3, x9, LO      //   a1 = a0
77        CSEL x16, x6, x16, LO    //   c1 = c0
78
79        ADD x10, x9, x4          // a2 = a1 + a_stride
80        ADD x17, x16, x7         // c2 = c1 + cm_stride
81                                 // if mr <= 2 (flags still from CMP x0, 2)
82        CSEL x10, x9, x10, LS    //   a2 = a1
83        CSEL x17, x16, x17, LS   //   c2 = c1
84
85        CMP x0, 4                // if mr < 4
86        ADD x11, x10, x4         // a3 = a2 + a_stride
87        ADD x14, x17, x7         // c3 = c2 + cm_stride
88        CSEL x11, x10, x11, LO   //   a3 = a2
89        CSEL x14, x17, x14, LO   //   c3 = c2
90
91        # Load min/max values: two consecutive 32-bit elements at params, each replicated across a vector (v6 feeds FMAX, v7 feeds FMIN)
92        LD2R {v6.4s, v7.4s}, [x8]
93
94        // Save d12-d15 on stack (sp drops by 32, so the stack arguments are now 32 bytes further from sp)
95        STP d12, d13, [sp, -32]!
96        STP d14, d15, [sp, 16]
97
        // Outer loop: one pass per block of 8 output columns (nc is decremented by 8 at label 3)
980:
99        $if INC:
100          # Load initial accumulators from acc (x15 advances by 128 bytes per pass)
101          LDP q20, q21, [x15], 32
102          LDP q22, q23, [x15], 32
103          LDP q24, q25, [x15], 32
104          LDP q26, q27, [x15], 32
105          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
106          PRFM PLDL1KEEP,  [x3, 64]
107          PRFM PLDL1KEEP,  [x9,  0]
108          PRFM PLDL1KEEP,  [x9, 64]
109          PRFM PLDL1KEEP, [x10,  0]
110          PRFM PLDL1KEEP, [x10, 64]
111          PRFM PLDL1KEEP, [x11,  0]
112          PRFM PLDL1KEEP, [x11, 64]
113          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
114          PRFM PLDL1KEEP, [x5,  64]
115          PRFM PLDL1KEEP, [x5, 128]
116          PRFM PLDL1KEEP, [x5, 192]
117        $else:
118          # Load initial bias from w into accumulators (rows 1-3 are copies of row 0)
119          LDP q20, q21, [x5], 32
120          MOV v22.16b, v20.16b
121          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
122          PRFM PLDL1KEEP,  [x3, 64]
123          MOV v23.16b, v21.16b
124          PRFM PLDL1KEEP,  [x9,  0]
125          PRFM PLDL1KEEP,  [x9, 64]
126          MOV v24.16b, v20.16b
127          PRFM PLDL1KEEP, [x10,  0]
128          PRFM PLDL1KEEP, [x10, 64]
129          MOV v25.16b, v21.16b
130          PRFM PLDL1KEEP, [x11,  0]
131          PRFM PLDL1KEEP, [x11, 64]
132          MOV v26.16b, v20.16b
133          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
134          MOV v27.16b, v21.16b
135          PRFM PLDL1KEEP, [x5,  64]
136          PRFM PLDL1KEEP, [x5, 128]
137          PRFM PLDL1KEEP, [x5, 192]
138
139        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
140        SUBS x0, x2, 16  // k = kc - 16
141        B.LO 4f          // kc < 16 bytes: everything is handled by the remainder code
142
143        # Prologue - First group loads, no FMA
144        LDR   d0, [x3], 8              // a0
145        LDP q16, q17, [x5], 32         // b
146        LDR   d1, [x10], 8             // a2
147        LD1  {v0.d}[1],  [x9], 8       // a1
148        LD1  {v1.d}[1], [x11], 8       // a3
149        SUBS x0, x0, 16                // k -= 16; flags are consumed by B.LO 2f below (the loads in between do not set flags)
150        LDR  q18, [x5], 16
151        LDR  d19, [x5], 8              // low half of v19
152        LDR  x4, [x5], 8   // upper half of v19; ins is in BLOCK 0
153
154        # Are there at least 4 floats (16 bytes) for main loop?
155        B.LO 2f                        // no: skip the main loop and run the epilogue only
156
157        # Main loop - 4 floats of A (16 bytes)
158        # 32 FMA + 8 LD64 A + 8 LDR B
        // Each 128-bit operand is assembled from a 64-bit LDR into the low half of
        // the vector plus a 64-bit LDR into x4 that a later INS moves to the high half.
1591:
160        # First group of 16 FMA, Second group loads
161        // BLOCK 0
162        FMLA v20.4s, v16.4s,  v0.s[0]
163        LDR   d3, [x3], 8              // a0
164        FMLA v22.4s, v16.4s,  v0.s[2]
165        INS v19.d[1], x4               // b from second group
166        FMLA v24.4s, v16.4s,  v1.s[0]
167        LDR  x4, [x9], 8               // a1
168
169        // BLOCK 1
170        FMLA v26.4s, v16.4s,  v1.s[2]
171        LDR  d12, [x5]
172        FMLA v21.4s, v17.4s,  v0.s[0]
173        INS v3.d[1], x4                // a1 ins
174        FMLA v23.4s, v17.4s,  v0.s[2]
175        LDR  x4, [x5, 8]   // b
176
177        // BLOCK 2
178        FMLA v25.4s, v17.4s,  v1.s[0]
179        LDR   d4, [x10], 8             // a2
180        FMLA v27.4s, v17.4s,  v1.s[2]
181        INS v12.d[1], x4  // b  ins
182        FMLA v20.4s, v18.4s,  v0.s[1]
183        LDR  x4, [x11], 8              // a3
184
185        // BLOCK 3
186        FMLA v22.4s, v18.4s,  v0.s[3]
187        LDR  d13, [x5, 16]
188        FMLA v24.4s, v18.4s,  v1.s[1]
189        INS v4.d[1], x4                // a3 ins
190        FMLA v26.4s, v18.4s,  v1.s[3]
191        LDR  x4, [x5, 24]
192
193        // BLOCK 4
194        FMLA v21.4s, v19.4s,  v0.s[1]
195        LDR  d14, [x5, 32]
196        FMLA v23.4s, v19.4s,  v0.s[3]
197        INS v13.d[1], x4  // b
198        FMLA v25.4s, v19.4s,  v1.s[1]
199        LDR  x4, [x5, 40]
200
201        // BLOCK 5
202        // NOPs to ensure 4 cycle LDR lands on next LDR
203        FMLA v27.4s, v19.4s,  v1.s[3]
204        LDR  d15, [x5, 48]
205        NOP
206        INS v14.d[1], x4  // b from previous
207        SUBS x0, x0, 16   // k -= 16; flags are consumed by B.HS 1b at the end of the loop body
208        LDR x4, [x5, 56]
209
210        # Second group of 16 FMA, First group of loads
211        // BLOCK 0
212        FMLA v20.4s, v12.4s,  v3.s[0]
213        LDR   d0, [x3], 8              // a0
214        FMLA v22.4s, v12.4s,  v3.s[2]
215        INS v15.d[1], x4  // b from previous
216        FMLA v24.4s, v12.4s,  v4.s[0]
217        LDR  x4, [x9], 8               // a1
218
219        // BLOCK 1
220        FMLA v26.4s, v12.4s,  v4.s[2]
221        LDR  d16, [x5, 64]
222        FMLA v21.4s, v13.4s,  v3.s[0]
223        INS v0.d[1], x4                // a1 ins
224        FMLA v23.4s, v13.4s,  v3.s[2]
225        LDR  x4, [x5, 72]  // b
226
227        // BLOCK 2
228        FMLA v25.4s, v13.4s,  v4.s[0]
229        LDR   d1, [x10], 8             // a2
230        FMLA v27.4s, v13.4s,  v4.s[2]
231        INS v16.d[1], x4  // b
232        FMLA v20.4s, v14.4s,  v3.s[1]
233        LDR  x4, [x11], 8              // a3
234
235        // BLOCK 3
236        FMLA v22.4s, v14.4s,  v3.s[3]
237        LDR  d17, [x5, 80]
238        FMLA v24.4s, v14.4s,  v4.s[1]
239        INS v1.d[1], x4                // a3 ins
240        FMLA v26.4s, v14.4s,  v4.s[3]
241        LDR  x4, [x5, 88]
242
243        // BLOCK 4
244        FMLA v21.4s, v15.4s,  v3.s[1]
245        LDR  d18, [x5, 96]
246        FMLA v23.4s, v15.4s,  v3.s[3]
247        INS v17.d[1], x4  // b
248        FMLA v25.4s, v15.4s,  v4.s[1]
249        LDR  x4, [x5, 104]
250
251        // BLOCK 5
252        // NOTE that block needs to be 4 cycles for LDR not to stall
253        FMLA v27.4s, v15.4s,  v4.s[3]
254        LDR  d19, [x5, 112]
255        INS v18.d[1], x4
256        LDR  x4, [x5, 120]             // upper half of v19; ins is in BLOCK 0 of the next pass or of the epilogue
257        ADD x5, x5, 128                // advance w past the 128 bytes read at offsets 0..120 (ADD leaves the flags alone)
258        B.HS 1b                        // loop while the SUBS in BLOCK 5 of the first group did not borrow
259
260        # Epilogue - 4 floats of A (16 bytes)
261        # 32 FMA + 8 LD64 A + 8 LDR B
2622:
263        # First group of 16 FMA, Second group loads
264        // BLOCK 0
265        FMLA v20.4s, v16.4s,  v0.s[0]
266        LDR   d3, [x3], 8              // a0
267        FMLA v22.4s, v16.4s,  v0.s[2]
268        INS v19.d[1], x4               // b from second group
269        FMLA v24.4s, v16.4s,  v1.s[0]
270        LDR  x4, [x9], 8               // a1
271
272        // BLOCK 1
273        FMLA v26.4s, v16.4s,  v1.s[2]
274        LDR  d12, [x5]
275        FMLA v21.4s, v17.4s,  v0.s[0]
276        INS v3.d[1], x4                // a1 ins
277        FMLA v23.4s, v17.4s,  v0.s[2]
278        LDR  x4, [x5, 8]   // b
279
280        // BLOCK 2
281        FMLA v25.4s, v17.4s,  v1.s[0]
282        LDR   d4, [x10], 8             // a2
283        FMLA v27.4s, v17.4s,  v1.s[2]
284        INS v12.d[1], x4  // b  ins
285        FMLA v20.4s, v18.4s,  v0.s[1]
286        LDR  x4, [x11], 8              // a3
287
288        // BLOCK 3
289        FMLA v22.4s, v18.4s,  v0.s[3]
290        LDR  d13, [x5, 16]
291        FMLA v24.4s, v18.4s,  v1.s[1]
292        INS v4.d[1], x4                // a3 ins
293        FMLA v26.4s, v18.4s,  v1.s[3]
294        LDR  x4, [x5, 24]
295
296        // BLOCK 4
297        FMLA v21.4s, v19.4s,  v0.s[1]
298        LDR  d14, [x5, 32]
299        FMLA v23.4s, v19.4s,  v0.s[3]
300        INS v13.d[1], x4  // b
301        FMLA v25.4s, v19.4s,  v1.s[1]
302        LDR  x4, [x5, 40]
303
304        // BLOCK 5
305        // NOPs to ensure 4 cycle LDR lands on next LDR
306        FMLA v27.4s, v19.4s,  v1.s[3]
307        LDR  d15, [x5, 48]
308        NOP // fma
309        INS v14.d[1], x4
310        NOP
311        LDR x4, [x5, 56]
312
313        # Second group of 16 FMA, no loads
314        // BLOCK 0
315        FMLA v20.4s, v12.4s,  v3.s[0]
316        FMLA v22.4s, v12.4s,  v3.s[2]
317        INS v15.d[1], x4  // b from previous
318        FMLA v24.4s, v12.4s,  v4.s[0]
319
320        // BLOCK 1
321        FMLA v26.4s, v12.4s,  v4.s[2]
322        FMLA v21.4s, v13.4s,  v3.s[0]
323        FMLA v23.4s, v13.4s,  v3.s[2]
324
325        // BLOCK 2
326        FMLA v25.4s, v13.4s,  v4.s[0]
327        FMLA v27.4s, v13.4s,  v4.s[2]
328        FMLA v20.4s, v14.4s,  v3.s[1]
329
330        // BLOCK 3
331        FMLA v22.4s, v14.4s,  v3.s[3]
332        FMLA v24.4s, v14.4s,  v4.s[1]
333        FMLA v26.4s, v14.4s,  v4.s[3]
334        TST x0, 15                     // low 4 bits of k = leftover bytes below 16; flags are consumed by B.NE 4f
335
336        // BLOCK 4
337        FMLA v21.4s, v15.4s,  v3.s[1]
338        FMLA v23.4s, v15.4s,  v3.s[3]
339        FMLA v25.4s, v15.4s,  v4.s[1]
340        ADD x5, x5, 64                 // advance w past the 64 bytes the epilogue read at offsets 0..56
341
342        // BLOCK 5
343        FMLA v27.4s, v15.4s,  v4.s[3]
344
345        # Is there a remainder? - handled at 4: as 2 floats of A (8 bytes) and/or 1 float (4 bytes)
346        B.NE 4f
347
3483:
349        # Clamp
350        FMAX v20.4s, v20.4s, v6.4s
351        # Load cn_stride (passed at [sp] on entry; now at [sp + 32] because d12-d15 were pushed)
352        LDR x0, [sp, 32]
353        FMAX v21.4s, v21.4s, v6.4s
354        FMAX v22.4s, v22.4s, v6.4s
355        FMAX v23.4s, v23.4s, v6.4s
356        FMAX v24.4s, v24.4s, v6.4s
357        FMAX v25.4s, v25.4s, v6.4s
358        FMAX v26.4s, v26.4s, v6.4s
359        FMAX v27.4s, v27.4s, v6.4s
360        SUBS x1, x1, 8                 // nc -= 8; flags are consumed by B.LO 6f and by B.HI 0b below
361        FMIN v20.4s, v20.4s, v7.4s
362        FMIN v21.4s, v21.4s, v7.4s
363        FMIN v22.4s, v22.4s, v7.4s
364        FMIN v23.4s, v23.4s, v7.4s
365        FMIN v24.4s, v24.4s, v7.4s
366        FMIN v25.4s, v25.4s, v7.4s
367        FMIN v26.4s, v26.4s, v7.4s
368        FMIN v27.4s, v27.4s, v7.4s
369
370        # Store full 4 x 8, unless fewer than 8 columns were left (then store odd width at 6)
371        B.LO 6f
372
373        $if INC:
374          ST1 {v26.16b, v27.16b}, [x14], x0   // each C pointer advances by cn_stride
375          SUB  x3,  x3, x2 // a0 -= kc
376          ST1 {v24.16b, v25.16b}, [x17], x0
377          SUB  x9,  x9, x2 // a1 -= kc
378          ST1 {v22.16b, v23.16b}, [x16], x0
379          SUB x10, x10, x2 // a2 -= kc
380          ST1 {v20.16b, v21.16b},  [x6], x0
381          SUB x11, x11, x2 // a3 -= kc
382        $else:
383          ST1 {v20.16b, v21.16b},  [x6], x0   // each C pointer advances by cn_stride
384          SUB  x3,  x3, x2 // a0 -= kc
385          ST1 {v22.16b, v23.16b}, [x16], x0
386          SUB  x9,  x9, x2 // a1 -= kc
387          ST1 {v24.16b, v25.16b}, [x17], x0
388          SUB x10, x10, x2 // a2 -= kc
389          ST1 {v26.16b, v27.16b}, [x14], x0
390          SUB x11, x11, x2 // a3 -= kc
391
392        B.HI 0b                        // columns remain (nc was > 8 before the SUBS): next block of 8
393
394        // Restore d12-d15 from stack
395        LDP d14, d15, [sp, 16]
396        LDP d12, d13, [sp], 32
397        RET
398
3994:
400        # Is there a remainder?- 2 floats of A (8 bytes)
401        TBZ x0, 3, 5f                  // bit 3 of k clear: no 8-byte step, try the 4-byte step
402
403        # Remainder- 2 floats of A (8 bytes)
404        LDR   d0,  [x3], 8
405        LDR  q16, [x5], 16
406        LD1   {v0.d}[1], [x9], 8
407        LDR   d1, [x10], 8
408        LD1   {v1.d}[1], [x11], 8
409        LDR  q17, [x5], 16
410        LDR  q18, [x5], 16
411        LDR  q19, [x5], 16
412        FMLA v20.4s, v16.4s,  v0.s[0]
413        FMLA v22.4s, v16.4s,  v0.s[2]
414        FMLA v24.4s, v16.4s,  v1.s[0]
415        FMLA v26.4s, v16.4s,  v1.s[2]
416        FMLA v21.4s, v17.4s,  v0.s[0]
417        FMLA v23.4s, v17.4s,  v0.s[2]
418        FMLA v25.4s, v17.4s,  v1.s[0]
419        FMLA v27.4s, v17.4s,  v1.s[2]
420
421        FMLA v20.4s, v18.4s,  v0.s[1]
422        FMLA v22.4s, v18.4s,  v0.s[3]
423        FMLA v24.4s, v18.4s,  v1.s[1]
424        FMLA v26.4s, v18.4s,  v1.s[3]
425        FMLA v21.4s, v19.4s,  v0.s[1]
426        FMLA v23.4s, v19.4s,  v0.s[3]
427        FMLA v25.4s, v19.4s,  v1.s[1]
428        FMLA v27.4s, v19.4s,  v1.s[3]
429
430        # Is there a remainder?- 1 float of A (4 bytes)
431        TBZ x0, 2, 3b                  // bit 2 of k clear: done accumulating, go clamp and store
432
4335:
434        # Remainder- 1 float of A (4 bytes)
435        LDR   s0,  [x3], 4
436        LDR  q16, [x5], 16
437        LD1   {v0.s}[2], [x9], 4       // a1 goes to lane 2 so the FMLA lane indices match the wider paths
438        LDR   s1, [x10], 4
439        LD1   {v1.s}[2], [x11], 4
440        LDR  q17, [x5], 16
441
442        FMLA v20.4s, v16.4s,  v0.s[0]
443        FMLA v22.4s, v16.4s,  v0.s[2]
444        FMLA v24.4s, v16.4s,  v1.s[0]
445        FMLA v26.4s, v16.4s,  v1.s[2]
446        FMLA v21.4s, v17.4s,  v0.s[0]
447        FMLA v23.4s, v17.4s,  v0.s[2]
448        FMLA v25.4s, v17.4s,  v1.s[0]
449        FMLA v27.4s, v17.4s,  v1.s[2]
450        B 3b
451
452        # Store odd width: bits 2, 1 and 0 of x1 select 4-, 2- and 1-float stores per row
4536:
454        TBZ x1, 2, 7f
455        $if INC:
456          STR q26, [x14], 16
457          MOV v26.16b, v27.16b         // move the not-yet-stored columns down into the low register
458          STR q24, [x17], 16
459          MOV v24.16b, v25.16b
460          STR q22, [x16], 16
461          MOV v22.16b, v23.16b
462          STR q20,  [x6], 16
463          MOV v20.16b, v21.16b
464        $else:
465          STR q20,  [x6], 16
466          MOV v20.16b, v21.16b         // move the not-yet-stored columns down into the low register
467          STR q22, [x16], 16
468          MOV v22.16b, v23.16b
469          STR q24, [x17], 16
470          MOV v24.16b, v25.16b
471          STR q26, [x14], 16
472          MOV v26.16b, v27.16b
473
4747:
475        TBZ x1, 1, 8f
476        $if INC:
477          STR d26, [x14], 8
478          DUP d26, v26.d[1]            // move the upper 64 bits down for the final 1-float store
479          STR d24, [x17], 8
480          DUP d24, v24.d[1]
481          STR d22, [x16], 8
482          DUP d22, v22.d[1]
483          STR d20,  [x6], 8
484          DUP d20, v20.d[1]
485        $else:
486          STR d20,  [x6], 8
487          DUP d20, v20.d[1]            // move the upper 64 bits down for the final 1-float store
488          STR d22, [x16], 8
489          DUP d22, v22.d[1]
490          STR d24, [x17], 8
491          DUP d24, v24.d[1]
492          STR d26, [x14], 8
493          DUP d26, v26.d[1]
494
4958:
496        TBZ x1, 0, 9f
497        $if INC:
498          STR s26, [x14]
499          STR s24, [x17]
500          STR s22, [x16]
501          STR s20,  [x6]
502        $else:
503          STR s20,  [x6]
504          STR s22, [x16]
505          STR s24, [x17]
506          STR s26, [x14]
5079:
508        // Restore d12-d15 from stack
509        LDP d14, d15, [sp, 16]
510        LDP d12, d13, [sp], 32
511        RET
512
513END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55
514
// On ELF targets, emit an empty .note.GNU-stack section (no flags) so GNU toolchains do not mark the stack executable for this object.
515#ifdef __ELF__
516.section ".note.GNU-stack","",%progbits
517#endif
518