// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8

# Overview, as implemented below:
#   For each tile of 12 output floats (nc loop, label 0) the three
#   accumulators are seeded with 12 floats read from w.  For each of the
#   ks / 8 pointers read from a (ks loop, label 1), kc bytes of floats from
#   that row are multiplied against 12-float rows of w and accumulated.
#   The result is clamped and stored to c, which then advances by cn_stride.
#   kc is consumed 4 floats at a time, with 2-float and 1-float remainders.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is deliberately not used: it is the AAPCS64 platform register and is
# reserved on several platforms.  x13 (caller-saved, otherwise unused in this
# function) serves as the GPR shadow instead.

# A pointer
# x8  a0           (x8 first holds the params pointer, until the clamp
#                   bounds have been loaded into v30 / v31)

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4   x14 x15 x16  first set of B
# B   v5  v6  v7   x17 x13 x7
# B  v23 v24 v25   x14 x15 x16  second set of B (same x as first set)
# B  v17 v18 v19   x17 x13 x7
# C  v20 v21 v22
# Clamp bounds  v30 (upper, applied with FMIN)  v31 (lower, applied with FMAX)

# Each B vector is fetched as two 64-bit halves: the low half with an FP
# load (LDR d) and the high half with a GPR load (LDR x) into its shadow
# register, which a later INS merges into the upper lane.
# NOTE(review): presumably done to suit Cortex-A53 issue rules - confirm.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load clamping_params values
        # First float at params is replicated into v30, second into v31.
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x5, 256]
        PRFM PLDL1KEEP, [x5, 320]

        MOV x9, x3  // p = ks

1:
        # Load next A pointer
        LDR x8, [x4], 8

        CMP x8, x12           // if a0 == zero
        ADD x8, x8, x11       // a0 += a_offset
        CSEL x8, x12, x8, EQ  //   a0 = zero, else keep a0 + a_offset

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x8], 8    // a0

        LDR d2, [x5]        // vb0x0123
        LDR x14, [x5, 8]

        LDR d3, [x5, 16]    // vb0x4567
        LDR x15, [x5, 24]

        LDR d4, [x5, 32]   // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]

        LDR d7, [x5, 80]   // vb1x89AB
        LDR x7, [x5, 88]
        INS v2.d[1], x14
        ADD x5, x5, 96

        # Are there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16  // 4 floats for main loop
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
2:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x8], 8         // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x14   // v23 was loaded in block 2
        LDR x7, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x8], 8         // a0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]        // vb0x0123
        INS v17.d[1], x17
        LDR x14, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]    // vb0x4567
        INS v18.d[1], x13
        LDR x15, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]   // vb0x89AB
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]   // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]   // vb1x4567
        LDR x13, [x5, 168]
        SUBS x0, x0, 16
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]   // vb1x89AB
        INS v2.d[1], x14
        LDR x7, [x5, 184]
        ADD x5, x5, 192
        B.HS 2b             // flags still from SUBS in block 6

        # Epilogue
        # First block same as main loop.  Second block has no loads.
3:
        # BLOCK 0
        LDR d1, [x8], 8         // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x14   // v23 was loaded in block 2
        LDR x7, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x13

        # BLOCK 4
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        TST x0, 15          // low 4 bits of k = leftover bytes of A (0, 4, 8 or 12)

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.NE 1b

        # Clamp
        FMIN v20.4s, v20.4s, v30.4s
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        SUBS x1, x1, 12
        B.LO 8f

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x10
        SUB x4, x4, x3  // a -= ks

        # nc loop
        # ST1 and SUB do not set flags, so this tests the SUBS x1 result above.
        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x8], 8  // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x8], 4  // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

8:
        ADD x1, x1, 12      // undo the SUBS above: x1 = remaining nc (less than 12)
        # Store odd channels
        # Bits 3..0 of nc select stores of 8, 4, 2 and 1 floats in turn; after
        # each store the not yet written data is moved down into v20.
        TBZ x1, 3, 9f
        STP q20, q21, [x6]
        ADD x6, x6, 32
        MOV v20.16b, v22.16b

9:
        TBZ x1, 2, 10f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

10:
        TBZ x1, 1, 11f
        STR d20, [x6], 8
        DUP d20, v20.d[1]

11:
        TBZ x1, 0, 12f
        STR s20, [x6]
12:
        RET

END_FUNCTION xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53

// On ELF targets, emit an empty .note.GNU-stack section so that this object
// does not request an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
