• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> (x0)
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
21$else:
22  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27#  x3 a0
28#  x9 a1
29# x10 a2
30# x11 a3
31
32# C pointers
33#  x6 c0
34# x16 c1
35# x17 c2
36# x14 c3
37
38# x4 temporary vector shadow register (holds a_stride on entry, until the A pointers are set up)
39
40# Vector register usage
41# A0  v0     v3
42# A1  v0[1]  v3[1]
43# A2  v1     v4
44# A3  v1[1]  v4[1]
45
46# B   v12 v13 v14 v15 second set of B
47# B   v16 v17 v18 v19 first set
48# C   v20 v21
49# C   v22 v23
50# C   v24 v25
51# C   v26 v27
52# Clamp v6 v7
53
54# unused A   v8 v9 v10 v11
55# x12 a4
56# x13 c4
57#  x7 c5
58# A4  v2     v5
59# A5  v2[1]  v5[1]
60# C   v28 v29
61# C   v30 v31
62
63BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53
        // Overview: accumulates a 4-row x 8-column tile of floats in v20-v27 from
        // the four A row pointers (x3, x9, x10, x11) and the packed w stream (x5),
        // clamps the tile with the two values loaded from params, stores it through
        // the four C row pointers (x6, x16, x17, x14), and repeats while columns
        // remain in x1.
64
65        $if INC:
66          # Load acc and params pointers from the stack arguments
67          LDP x15, x8, [sp, 8]           // x15 = acc, x8 = params
68        $else:
69          # Load params pointer from the stack arguments
70          LDR x8, [sp, 8]                // x8 = params
71
72        # Clamp A and C pointers so that rows >= mr alias the previous row
73        CMP x0, 2                // if mr < 2
74        ADD x9, x3, x4           // a1 = a0 + a_stride
75        ADD x16, x6, x7          // c1 = c0 + cm_stride
76        CSEL x9, x3, x9, LO      //   a1 = a0
77        CSEL x16, x6, x16, LO    //   c1 = c0
78
79        ADD x10, x9, x4          // a2 = a1 + a_stride
80        ADD x17, x16, x7         // c2 = c1 + cm_stride
81                                 // if mr <= 2 (flags still from CMP x0, 2; ADD does not set flags)
82        CSEL x10, x9, x10, LS    //   a2 = a1
83        CSEL x17, x16, x17, LS   //   c2 = c1
84
85        CMP x0, 4                // if mr < 4
86        ADD x11, x10, x4         // a3 = a2 + a_stride
87        ADD x14, x17, x7         // c3 = c2 + cm_stride
88        CSEL x11, x10, x11, LO   //   a3 = a2
89        CSEL x14, x17, x14, LO   //   c3 = c2
90
91        # Load min/max values
        // v6 = first float at params in every lane (lower bound, used with FMAX),
        // v7 = second float at params in every lane (upper bound, used with FMIN)
92        LD2R {v6.4s, v7.4s}, [x8]
93
94        // Save d12-d15 on stack (v12-v15 hold the second set of B below).
        // SP moves down by 32 bytes, so the caller's [sp] is at [sp, 32] from here on.
95        STP d12, d13, [sp, -32]!
96        STP d14, d15, [sp, 16]
97
        // Outer loop: one iteration per block of up to 8 output columns
980:
99        $if INC:
100          # Load initial accumulators
101          LDP q20, q21, [x15], 32
102          LDP q22, q23, [x15], 32
103          LDP q24, q25, [x15], 32
104          LDP q26, q27, [x15], 32
105          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
106          PRFM PLDL1KEEP,  [x3, 64]
107          PRFM PLDL1KEEP,  [x9,  0]
108          PRFM PLDL1KEEP,  [x9, 64]
109          PRFM PLDL1KEEP, [x10,  0]
110          PRFM PLDL1KEEP, [x10, 64]
111          PRFM PLDL1KEEP, [x11,  0]
112          PRFM PLDL1KEEP, [x11, 64]
113          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
114          PRFM PLDL1KEEP, [x5,  64]
115          PRFM PLDL1KEEP, [x5, 128]
116          PRFM PLDL1KEEP, [x5, 192]
117        $else:
118          # Load initial bias from w into accumulators (row 0), then copy it to rows 1-3
119          LDP q20, q21, [x5], 32
120          MOV v22.16b, v20.16b
121          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
122          PRFM PLDL1KEEP,  [x3, 64]
123          MOV v23.16b, v21.16b
124          PRFM PLDL1KEEP,  [x9,  0]
125          PRFM PLDL1KEEP,  [x9, 64]
126          MOV v24.16b, v20.16b
127          PRFM PLDL1KEEP, [x10,  0]
128          PRFM PLDL1KEEP, [x10, 64]
129          MOV v25.16b, v21.16b
130          PRFM PLDL1KEEP, [x11,  0]
131          PRFM PLDL1KEEP, [x11, 64]
132          MOV v26.16b, v20.16b
133          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
134          MOV v27.16b, v21.16b
135          PRFM PLDL1KEEP, [x5,  64]
136          PRFM PLDL1KEEP, [x5, 128]
137          PRFM PLDL1KEEP, [x5, 192]
138
139        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
140        SUBS x0, x2, 16  // k = kc - 16
141        B.LO 4f          // no: kc < 16, go straight to the remainder code
142
143        # Prologue - First group loads, no FMA
        // v0 = {a0, a1} and v1 = {a2, a3}, two floats per row; v16-v18 = b.
        // v19 is loaded in halves: d19 now, upper half via x4 and INS in BLOCK 0.
144        LDR   d0, [x3], 8              // a0
145        LDP q16, q17, [x5], 32         // b
146        LDR   d1, [x10], 8             // a2
147        LD1  {v0.d}[1],  [x9], 8       // a1
148        LD1  {v1.d}[1], [x11], 8       // a3
149        SUBS x0, x0, 16                // k -= 16; flags consumed by B.LO 2f below
150        LDR  q18, [x5], 16
151        LDR  d19, [x5], 8
152        LDR  x4, [x5], 8   // ins is in BLOCK 0
153
154        # Are there at least 4 floats (16 bytes) for main loop?
155        B.LO 2f
156
157        # Main loop - 4 floats of A (16 bytes)
158        # 32 FMA + 8 LD64 A + 8 LDR B
        // 128-bit vectors are assembled from two 64-bit loads: LDR d fills the
        // low half, LDR x4 fetches the high half into a general register, and a
        // later INS moves x4 into lane d[1].
1591:
160        # First group of 16 FMA, Second group loads
161        // BLOCK 0
162        LDR   d3, [x3], 8              // a0
163        INS v19.d[1], x4               // b from second group
164        FMLA v20.4s, v16.4s,  v0.s[0]
165        LDR  x4, [x9], 8               // a1
166        FMLA v22.4s, v16.4s,  v0.s[2]
167        FMLA v24.4s, v16.4s,  v1.s[0]
168
169        // BLOCK 1
170        LDR  d12, [x5]
171        INS v3.d[1], x4                // a1 ins
172        FMLA v26.4s, v16.4s,  v1.s[2]
173        LDR  x4, [x5, 8]   // b
174        FMLA v21.4s, v17.4s,  v0.s[0]
175        FMLA v23.4s, v17.4s,  v0.s[2]
176
177        // BLOCK 2
178        LDR   d4, [x10], 8             // a2
179        INS v12.d[1], x4  // b  ins
180        FMLA v25.4s, v17.4s,  v1.s[0]
181        LDR  x4, [x11], 8              // a3
182        FMLA v27.4s, v17.4s,  v1.s[2]
183        FMLA v20.4s, v18.4s,  v0.s[1]
184
185        // BLOCK 3
186        LDR  d13, [x5, 16]
187        INS v4.d[1], x4                // a3 ins
188        FMLA v22.4s, v18.4s,  v0.s[3]
189        LDR  x4, [x5, 24]
190        FMLA v24.4s, v18.4s,  v1.s[1]
191        FMLA v26.4s, v18.4s,  v1.s[3]
192
193        // BLOCK 4
194        LDR  d14, [x5, 32]
195        INS v13.d[1], x4  // b
196        FMLA v21.4s, v19.4s,  v0.s[1]
197        LDR  x4, [x5, 40]
198        FMLA v23.4s, v19.4s,  v0.s[3]
199        FMLA v25.4s, v19.4s,  v1.s[1]
200
201        // BLOCK 5
202        // NOPs to ensure 4 cycle LDR lands on next LDR
203        LDR  d15, [x5, 48]
204        INS v14.d[1], x4  // b from previous
205        FMLA v27.4s, v19.4s,  v1.s[3]
206        LDR x4, [x5, 56]
207        NOP
208        NOP
209        NOP
210        NOP
211
212        # Second group of 16 FMA, First group of loads
213        // BLOCK 0
214        LDR   d0, [x3], 8              // a0
215        INS v15.d[1], x4  // b from previous
216        FMLA v20.4s, v12.4s,  v3.s[0]
217        LDR  x4, [x9], 8               // a1
218        FMLA v22.4s, v12.4s,  v3.s[2]
219        FMLA v24.4s, v12.4s,  v4.s[0]
220        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0
221
222        // BLOCK 1
223        LDR  d16, [x5, 64]
224        INS v0.d[1], x4                // a1 ins
225        FMLA v26.4s, v12.4s,  v4.s[2]
226        LDR  x4, [x5, 72]  // b
227        FMLA v21.4s, v13.4s,  v3.s[0]
228        FMLA v23.4s, v13.4s,  v3.s[2]
229        PRFM PLDL1KEEP, [x9, 128]      // Prefetch A1
230
231        // BLOCK 2
232        LDR   d1, [x10], 8             // a2
233        INS v16.d[1], x4  // b
234        FMLA v25.4s, v13.4s,  v4.s[0]
235        LDR  x4, [x11], 8              // a3
236        FMLA v27.4s, v13.4s,  v4.s[2]
237        FMLA v20.4s, v14.4s,  v3.s[1]
238        PRFM PLDL1KEEP, [x10, 128]     // Prefetch A2
239
240        // BLOCK 3
241        LDR  d17, [x5, 80]
242        INS v1.d[1], x4                // a3 ins
243        FMLA v22.4s, v14.4s,  v3.s[3]
244        LDR  x4, [x5, 88]
245        FMLA v24.4s, v14.4s,  v4.s[1]
246        FMLA v26.4s, v14.4s,  v4.s[3]
247        PRFM PLDL1KEEP, [x11, 128]     // Prefetch A3
248
249        // BLOCK 4
250        LDR  d18, [x5, 96]
251        INS v17.d[1], x4  // b
252        FMLA v21.4s, v15.4s,  v3.s[1]
253        LDR  x4, [x5, 104]
254        FMLA v23.4s, v15.4s,  v3.s[3]
255        FMLA v25.4s, v15.4s,  v4.s[1]
256        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B
257
258        // BLOCK 5
259        // NOTE that block needs to be 4 cycles for LDR not to stall
260        LDR  d19, [x5, 112]
261        INS v18.d[1], x4
262        FMLA v27.4s, v15.4s,  v4.s[3]
263        LDR  x4, [x5, 120]             // upper half of v19; its INS is in the next BLOCK 0
264        SUBS x0, x0, 16                // k -= 16; flags consumed by B.HS below
265        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
266        ADD x5, x5, 128                // w += 128 bytes of B read at offsets 0..127 this iteration
267        B.HS 1b                        // loop while the subtraction did not borrow
268
269        # Epilogue - 4 floats of A (16 bytes)
270        # 32 FMA + 8 LD64 A + 8 LDR B
2712:
272        # First group of 16 FMA, Second group loads
273        // BLOCK 0
274        LDR   d3, [x3], 8              // a0
275        INS v19.d[1], x4               // b from second group
276        FMLA v20.4s, v16.4s,  v0.s[0]
277        LDR  x4, [x9], 8               // a1
278        FMLA v22.4s, v16.4s,  v0.s[2]
279        FMLA v24.4s, v16.4s,  v1.s[0]
280
281        // BLOCK 1
282        LDR  d12, [x5]
283        INS v3.d[1], x4                // a1 ins
284        FMLA v26.4s, v16.4s,  v1.s[2]
285        LDR  x4, [x5, 8]   // b
286        FMLA v21.4s, v17.4s,  v0.s[0]
287        FMLA v23.4s, v17.4s,  v0.s[2]
288
289        // BLOCK 2
290        LDR   d4, [x10], 8             // a2
291        INS v12.d[1], x4  // b  ins
292        FMLA v25.4s, v17.4s,  v1.s[0]
293        LDR  x4, [x11], 8              // a3
294        FMLA v27.4s, v17.4s,  v1.s[2]
295        FMLA v20.4s, v18.4s,  v0.s[1]
296
297        // BLOCK 3
298        LDR  d13, [x5, 16]
299        INS v4.d[1], x4                // a3 ins
300        FMLA v22.4s, v18.4s,  v0.s[3]
301        LDR  x4, [x5, 24]
302        FMLA v24.4s, v18.4s,  v1.s[1]
303        FMLA v26.4s, v18.4s,  v1.s[3]
304
305        // BLOCK 4
306        LDR  d14, [x5, 32]
307        INS v13.d[1], x4  // b
308        FMLA v21.4s, v19.4s,  v0.s[1]
309        LDR  x4, [x5, 40]
310        FMLA v23.4s, v19.4s,  v0.s[3]
311        FMLA v25.4s, v19.4s,  v1.s[1]
312
313        // BLOCK 5
314        // NOPs to ensure 4 cycle LDR lands on next LDR
315        LDR  d15, [x5, 48]
316        INS v14.d[1], x4
317        FMLA v27.4s, v19.4s,  v1.s[3]
318        LDR x4, [x5, 56]
319        NOP // fma
320        NOP
321        NOP // fma
322        NOP
323
324        # Second group of 16 FMA, no loads
325        // BLOCK 0
326        INS v15.d[1], x4  // b from previous
327        FMLA v20.4s, v12.4s,  v3.s[0]
328        FMLA v22.4s, v12.4s,  v3.s[2]
329        FMLA v24.4s, v12.4s,  v4.s[0]
330
331        // BLOCK 1
332        FMLA v26.4s, v12.4s,  v4.s[2]
333        FMLA v21.4s, v13.4s,  v3.s[0]
334        FMLA v23.4s, v13.4s,  v3.s[2]
335
336        // BLOCK 2
337        FMLA v25.4s, v13.4s,  v4.s[0]
338        FMLA v27.4s, v13.4s,  v4.s[2]
339        FMLA v20.4s, v14.4s,  v3.s[1]
340
341        // BLOCK 3
342        FMLA v22.4s, v14.4s,  v3.s[3]
343        FMLA v24.4s, v14.4s,  v4.s[1]
344        FMLA v26.4s, v14.4s,  v4.s[3]
345        TST x0, 15                     // low 4 bits of k equal those of kc; flags consumed by B.NE below
346
347        // BLOCK 4
348        FMLA v21.4s, v15.4s,  v3.s[1]
349        FMLA v23.4s, v15.4s,  v3.s[3]
350        FMLA v25.4s, v15.4s,  v4.s[1]
351        ADD x5, x5, 64                 // w += 64 bytes of B read at offsets 0..63 in this epilogue
352
353        // BLOCK 5
354        FMLA v27.4s, v15.4s,  v4.s[3]
355
356        # Is there a remainder?- 2 floats of A (8 bytes) or less
357        B.NE 4f
358
3593:
360        # Clamp
361        FMAX v20.4s, v20.4s, v6.4s
362        # Load cn_stride (caller's [sp], now at [sp, 32]); x0 is free, k is recomputed from x2 at label 0
363        LDR x0, [sp, 32]
364        FMAX v21.4s, v21.4s, v6.4s
365        FMAX v22.4s, v22.4s, v6.4s
366        FMAX v23.4s, v23.4s, v6.4s
367        FMAX v24.4s, v24.4s, v6.4s
368        FMAX v25.4s, v25.4s, v6.4s
369        FMAX v26.4s, v26.4s, v6.4s
370        FMAX v27.4s, v27.4s, v6.4s
371        SUBS x1, x1, 8                 // nc -= 8; flags consumed by B.LO and B.HI below
372        FMIN v20.4s, v20.4s, v7.4s
373        FMIN v21.4s, v21.4s, v7.4s
374        FMIN v22.4s, v22.4s, v7.4s
375        FMIN v23.4s, v23.4s, v7.4s
376        FMIN v24.4s, v24.4s, v7.4s
377        FMIN v25.4s, v25.4s, v7.4s
378        FMIN v26.4s, v26.4s, v7.4s
379        FMIN v27.4s, v27.4s, v7.4s
380
381        # Store full 4 x 8 (C pointers advance by cn_stride, A pointers rewind by kc)
382        B.LO 6f                        // fewer than 8 columns were left: partial store
383
384        $if INC:
385          ST1 {v26.16b, v27.16b}, [x14], x0
386          SUB  x3,  x3, x2 // a0 -= kc
387          ST1 {v24.16b, v25.16b}, [x17], x0
388          SUB  x9,  x9, x2 // a1 -= kc
389          ST1 {v22.16b, v23.16b}, [x16], x0
390          SUB x10, x10, x2 // a2 -= kc
391          ST1 {v20.16b, v21.16b},  [x6], x0
392          SUB x11, x11, x2 // a3 -= kc
393        $else:
394          ST1 {v20.16b, v21.16b},  [x6], x0
395          SUB  x3,  x3, x2 // a0 -= kc
396          ST1 {v22.16b, v23.16b}, [x16], x0
397          SUB  x9,  x9, x2 // a1 -= kc
398          ST1 {v24.16b, v25.16b}, [x17], x0
399          SUB x10, x10, x2 // a2 -= kc
400          ST1 {v26.16b, v27.16b}, [x14], x0
401          SUB x11, x11, x2 // a3 -= kc
402
403        B.HI 0b                        // columns remain (flags still from SUBS x1, x1, 8)
404
405        // Restore d12-d15 from stack
406        LDP d14, d15, [sp, 16]
407        LDP d12, d13, [sp], 32
408        RET
409
4104:
411        # Is there a remainder?- 2 floats of A (8 bytes)
412        TBZ x0, 3, 5f                  // bit 3 of k clear: no 8-byte remainder
413
414        # Remainder- 2 floats of A (8 bytes)
415        LDR   d0,  [x3], 8
416        LDR  q16, [x5], 16
417        LD1   {v0.d}[1], [x9], 8
418        LDR   d1, [x10], 8
419        LD1   {v1.d}[1], [x11], 8
420        LDR  q17, [x5], 16
421        LDR  q18, [x5], 16
422        LDR  q19, [x5], 16
423        FMLA v20.4s, v16.4s,  v0.s[0]
424        FMLA v22.4s, v16.4s,  v0.s[2]
425        FMLA v24.4s, v16.4s,  v1.s[0]
426        FMLA v26.4s, v16.4s,  v1.s[2]
427        FMLA v21.4s, v17.4s,  v0.s[0]
428        FMLA v23.4s, v17.4s,  v0.s[2]
429        FMLA v25.4s, v17.4s,  v1.s[0]
430        FMLA v27.4s, v17.4s,  v1.s[2]
431
432        FMLA v20.4s, v18.4s,  v0.s[1]
433        FMLA v22.4s, v18.4s,  v0.s[3]
434        FMLA v24.4s, v18.4s,  v1.s[1]
435        FMLA v26.4s, v18.4s,  v1.s[3]
436        FMLA v21.4s, v19.4s,  v0.s[1]
437        FMLA v23.4s, v19.4s,  v0.s[3]
438        FMLA v25.4s, v19.4s,  v1.s[1]
439        FMLA v27.4s, v19.4s,  v1.s[3]
440
441        # Is there a remainder?- 1 float of A (4 bytes)
442        TBZ x0, 2, 3b                  // bit 2 of k clear: nothing left, go clamp and store
443
4445:
445        # Remainder- 1 float of A (4 bytes)
        // Lanes s[0] and s[2] are filled so the same lane indices as above select each row
446        LDR   s0,  [x3], 4
447        LDR  q16, [x5], 16
448        LD1   {v0.s}[2], [x9], 4
449        LDR   s1, [x10], 4
450        LD1   {v1.s}[2], [x11], 4
451        LDR  q17, [x5], 16
452
453        FMLA v20.4s, v16.4s,  v0.s[0]
454        FMLA v22.4s, v16.4s,  v0.s[2]
455        FMLA v24.4s, v16.4s,  v1.s[0]
456        FMLA v26.4s, v16.4s,  v1.s[2]
457        FMLA v21.4s, v17.4s,  v0.s[0]
458        FMLA v23.4s, v17.4s,  v0.s[2]
459        FMLA v25.4s, v17.4s,  v1.s[0]
460        FMLA v27.4s, v17.4s,  v1.s[2]
461        B 3b
462
463        # Store odd width
        // x1 holds nc - 8 here, so its low 3 bits equal those of the remaining nc.
        // Bits 2, 1 and 0 select a 4-, 2- and 1-column store; after each partial
        // store the not-yet-stored columns are moved down to the low lanes.
4646:
465        TBZ x1, 2, 7f
466        $if INC:
467          STR q26, [x14], 16
468          MOV v26.16b, v27.16b
469          STR q24, [x17], 16
470          MOV v24.16b, v25.16b
471          STR q22, [x16], 16
472          MOV v22.16b, v23.16b
473          STR q20,  [x6], 16
474          MOV v20.16b, v21.16b
475        $else:
476          STR q20,  [x6], 16
477          MOV v20.16b, v21.16b
478          STR q22, [x16], 16
479          MOV v22.16b, v23.16b
480          STR q24, [x17], 16
481          MOV v24.16b, v25.16b
482          STR q26, [x14], 16
483          MOV v26.16b, v27.16b
484
4857:
486        TBZ x1, 1, 8f
487        $if INC:
488          STR d26, [x14], 8
489          DUP d26, v26.d[1]
490          STR d24, [x17], 8
491          DUP d24, v24.d[1]
492          STR d22, [x16], 8
493          DUP d22, v22.d[1]
494          STR d20,  [x6], 8
495          DUP d20, v20.d[1]
496        $else:
497          STR d20,  [x6], 8
498          DUP d20, v20.d[1]
499          STR d22, [x16], 8
500          DUP d22, v22.d[1]
501          STR d24, [x17], 8
502          DUP d24, v24.d[1]
503          STR d26, [x14], 8
504          DUP d26, v26.d[1]
505
5068:
507        TBZ x1, 0, 9f
508        $if INC:
509          STR s26, [x14]
510          STR s24, [x17]
511          STR s22, [x16]
512          STR s20,  [x6]
513        $else:
514          STR s20,  [x6]
515          STR s22, [x16]
516          STR s24, [x17]
517          STR s26, [x14]
5189:
519        // Restore d12-d15 from stack
520        LDP d14, d15, [sp, 16]
521        LDP d12, d13, [sp], 32
522        RET
523
524END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53
525
526#ifdef __ELF__
527.section ".note.GNU-stack","",%progbits
528#endif
529