// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// 4x8 f32 GEMM / GEMMINC micro-kernel for AArch64 NEON-FMA, with loads and
// FMAs interleaved for Cortex-A53. The INC template flag selects the variant
// whose accumulators start from `acc` instead of from the bias stored in `w`.
//
// Each pass starting at label 0 accumulates a tile of up to 4 rows x 8 columns
// over kc bytes of A, clamps it with the two values loaded from params, and
// stores it. The pass repeats while columns remain after the 8 just stored.

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
// x18 is the AAPCS64 platform register and is deliberately not used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3 (reuses cm_stride, which is dead once c3 has been computed)

# x4 temporary vector shadow register (a_stride is dead after pointer setup)

# Vector register usage
# A0  v0     v3
# A1  v0[1]  v3[1]
# A2  v1     v4
# A3  v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# Not used by this kernel
# x12 x13
# v2 v5 v8 v9 v10 v11
# v28 v29 v30 v31

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        // Rows at or beyond mr alias the previous row, so they reload the same
        // A data and store to the same C address.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        // c3 is built in place in x7: this is the last read of cm_stride, and
        // it keeps the kernel away from the platform register x18.
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x7, x17, x7, LO     //   c3 = c2

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Load clamping_params values
        // v6 = first float of params in every lane (upper bound, used by FMIN)
        // v7 = second float of params in every lane (lower bound, used by FMAX)
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        // Save d12-d15 on stack
        // All stack arguments were read above, before sp is moved.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          PRFM PLDL1KEEP,  [x3,  0]  // Prefetch A
          PRFM PLDL1KEEP,  [x3, 64]
          PRFM PLDL1KEEP,  [x9,  0]
          PRFM PLDL1KEEP,  [x9, 64]
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          PRFM PLDL1KEEP, [x5,  64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
        $else:
          # Load initial bias from w into accumulators
          // The same 8 bias values seed all four rows.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP,  [x3,  0]    // Prefetch A
          PRFM PLDL1KEEP,  [x3, 64]
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP,  [x9,  0]
          PRFM PLDL1KEEP,  [x9, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x5,  64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        LDR   d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32         // b
        LDR   d1, [x10], 8             // a2
        LD1  {v0.d}[1],  [x9], 8       // a1
        LD1  {v1.d}[1], [x11], 8       // a3
        SUBS x0, x0, 16
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR  x4, [x5], 8   // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        // Flags are still those of the SUBS above; the loads do not change them.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        // 32 FMA + 8 LD64 A + 16 LD64 B
        // The upper half of each 128-bit operand is loaded into x4 and then
        // inserted with INS.
1:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x4               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR  x4, [x9], 8               // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x4                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  x4, [x5, 8]   // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x4  // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR  x4, [x11], 8              // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR  d13, [x5, 16]
        INS v4.d[1], x4                // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR  d14, [x5, 32]
        INS v13.d[1], x4  // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR  d15, [x5, 48]
        INS v14.d[1], x4  // b from previous
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR   d0, [x3], 8              // a0
        INS v15.d[1], x4  // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR  x4, [x9], 8               // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]      // Prefetch A0

        // BLOCK 1
        LDR  d16, [x5, 64]
        INS v0.d[1], x4                // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR  x4, [x5, 72]  // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        PRFM PLDL1KEEP, [x9, 128]      // Prefetch A1

        // BLOCK 2
        LDR   d1, [x10], 8             // a2
        INS v16.d[1], x4  // b
        FMLA v25.4s, v13.4s,  v4.s[0]
        LDR  x4, [x11], 8              // a3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]
        PRFM PLDL1KEEP, [x10, 128]     // Prefetch A2

        // BLOCK 3
        LDR  d17, [x5, 80]
        INS v1.d[1], x4                // a3 ins
        FMLA v22.4s, v14.4s,  v3.s[3]
        LDR  x4, [x5, 88]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        PRFM PLDL1KEEP, [x11, 128]     // Prefetch A3

        // BLOCK 4
        LDR  d18, [x5, 96]
        INS v17.d[1], x4  // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR  x4, [x5, 104]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]      // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR  d19, [x5, 112]
        INS v18.d[1], x4
        FMLA v27.4s, v15.4s,  v4.s[3]
        LDR  x4, [x5, 120]
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]      // Prefetch B
        ADD x5, x5, 128
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        // 32 FMA + 4 LD64 A + 8 LD64 B
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8              // a0
        INS v19.d[1], x4               // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR  x4, [x9], 8               // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR  d12, [x5]
        INS v3.d[1], x4                // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR  x4, [x5, 8]   // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8             // a2
        INS v12.d[1], x4  // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR  x4, [x11], 8              // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR  d13, [x5, 16]
        INS v4.d[1], x4                // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR  x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR  d14, [x5, 32]
        INS v13.d[1], x4  // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR  x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR  d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP // fma
        NOP
        NOP // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x4  // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        TST x0, 15                     // low 4 bits of k = leftover bytes of A

        // BLOCK 4
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        ADD x5, x5, 64                 // step past the B loaded by this epilogue

        // BLOCK 5
        FMLA v27.4s, v15.4s,  v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        // SUBS sets the flags consumed by B.LO / B.HI below; FMIN, FMAX, ST1
        // and SUB leave them untouched.
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 8f

        // Each C pointer advances by cn_stride; each A pointer rewinds by kc.
        $if INC:
          ST1 {v26.16b, v27.16b},  [x7], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x14
          SUB x11, x11, x2 // a3 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b},  [x7], x14
          SUB x11, x11, x2 // a3 -= kc

        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16, [x5], 16
        LD1   {v0.d}[1], [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR  q17, [x5], 16
        LDR  q18, [x5], 16
        LDR  q19, [x5], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

6:
        # Remainder- 1 floats of A (4 bytes)
        // Rows a1 and a3 are loaded into lane 2 so the FMLA lane indices
        // match the 2-float layout used above.
        LDR   s0,  [x3], 4
        LDR  q16, [x5], 16
        LD1   {v0.s}[2], [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR  q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        B 4b

        # Store odd width
        // Bits 2, 1 and 0 of nc select a 4-, 2- and 1-column store; after each
        // partial store the remaining columns are shifted down into place.
8:
        TBZ x1, 2, 9f
        $if INC:
          STR q26,  [x7], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26,  [x7], 16
          MOV v26.16b, v27.16b

9:
        TBZ x1, 1, 10f
        $if INC:
          STR d26,  [x7], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26,  [x7], 8
          DUP d26, v26.d[1]

10:
        TBZ x1, 0, 11f
        $if INC:
          STR s26,  [x7]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26,  [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section; GNU toolchains read
// it as "this object does not need an executable stack".
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
