• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
# void xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                         (x0) - unused.  mr = 1
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  (x7) - unused
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

# Purpose (as implemented below): for each group of up to 12 output channels,
# seed three 4-float accumulators (v20-v22) from w, then for every A pointer
# taken from the indirection buffer accumulate a0[k] * w[k][0..11] over kc
# bytes of A, clamp the result to [v30, v31] and store one row of C.
#
# Units: kc is consumed in bytes (16 bytes = 4 floats per main-loop pass) and
# ks is consumed 8 bytes (one pointer) at a time.
# NOTE(review): the remainder code only tests bits 3 and 2 of k, so kc is
# presumably a non-zero multiple of 4 bytes - confirm against the caller.
#
# Argument registers x0 (mr) and x7 (cm_stride) are not read as arguments;
# they are recycled as the k counter and as a GPR shadow for B respectively.
# x8 first holds the params pointer and is then recycled as the a0 pointer.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of those registers is touched here and the stack is only read, so the
# function has no prologue/epilogue.
# Clobbered: x0, x1, x4-x17, v0-v7, v17-v25, v30, v31, condition flags.

# A pointer
# x8  a0

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# Each 16-byte B vector is assembled from two 8-byte loads: the low half goes
# straight into the d register, the high half goes through the listed x
# register and is merged later with INS.
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4   x14 x15 x16  first set of B
# B   v5  v6  v7   x17 x13 x7
# B  v23 v24 v25   x14 x15 x16  second set of B (same x as first set)
# B  v17 v18 v19   x17 x13 x7
# C  v20 v21 v22
# Clamp bounds: v30 (lower, used with FMAX), v31 (upper, used with FMIN)

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values: two consecutive floats at params are broadcast,
        # the first to every lane of v30 and the second to every lane of v31.
        LD2R {v30.4s, v31.4s}, [x8]

        # nc loop: one pass per group of 12 output channels
0:
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x5, 256]
        PRFM PLDL1KEEP, [x5, 320]

        MOV x9, x3  // p = ks

        # ks loop: one pass per A pointer
1:
        # Load next A pointer
        LDR x8, [x4], 8

        # The compare is done before the ADD so that the zero pointer is
        # recognised and left without a_offset applied.
        CMP x8, x12           // if a0 == zero
        ADD x8, x8, x11       // a0 += a_offset
        CSEL x8, x12, x8, EQ  // a0 = (a0 == zero) ? zero : a0 + a_offset

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - loads for first group of 6 fma
        # Consumes 2 floats of A and 96 bytes (2 rows of 12 floats) of w.

        # Read first block of 1 A.
        LDR d0, [x8], 8    // a0

        LDR d2, [x5]        // vb0x0123
        LDR x14, [x5, 8]

        LDR d3, [x5, 16]    // vb0x4567
        LDR x15, [x5, 24]

        LDR d4, [x5, 32]   // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]

        LDR d7, [x5, 80]   // vb1x89AB
        LDR x7, [x5, 88]
        INS v2.d[1], x14
        ADD x5, x5, 96

        # Are there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16  // 4 floats for main loop
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # On entry to every pass: v0 and d2-d7 / x15,x16,x17,x13,x7 hold the
        # first set of A and B, with v2 already merged.
2:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x8], 8         // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x14   // v23 was loaded in block 2
        LDR x7, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x8], 8         // a0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]        // vb0x0123
        INS v17.d[1], x17
        LDR x14, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]    // vb0x4567
        INS v18.d[1], x13
        LDR x15, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]   // vb0x89AB
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]   // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]   // vb1x4567
        LDR x13, [x5, 168]
        SUBS x0, x0, 16     // flags stay live until B.HS: nothing below sets them
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]   // vb1x89AB
        INS v2.d[1], x14
        LDR x7, [x5, 184]
        ADD x5, x5, 192
        B.HS 2b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
3:
        # BLOCK 0
        LDR d1, [x8], 8         // a0
        INS v3.d[1], x15
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 192]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 256]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x14, [x5, 8]
        PRFM PLDL1KEEP, [x5, 320]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x13
        LDR x15, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x7
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x14   // v23 was loaded in block 2
        LDR x7, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x15
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x13

        # BLOCK 4
        INS v19.d[1], x7
        FMLA v20.4s, v17.4s, v1.s[1]
        TST x0, 15          // low 4 bits of k = leftover bytes of A; used by B.NE below

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 8  // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v30.4s
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        SUBS x1, x1, 12
        B.LO 7f

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x10
        SUB x4, x4, x3  // a -= ks

        # nc loop
        # Flags still come from SUBS x1 above (ST1 and SUB do not set them).
        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x8], 8  // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        # Bit 2 clear means no single trailing float: back to the ks loop
        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x8], 4  // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

7:
        # Undo the subtraction so that x1 is the leftover channel count
        # (fewer than 12); its bits 3..0 select stores of 8, 4, 2 and 1 floats.
        ADD x1, x1, 12
        # Store odd channels
        TBZ x1, 3, 8f
        STP q20, q21, [x6]
        ADD x6, x6, 32
        MOV v20.16b, v22.16b    // remaining channels now start in v20

8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]       // move the upper two floats down

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
11:
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_1x12__aarch64_neonfma_cortex_a53
360
// On ELF targets emit an empty .note.GNU-stack section, the GNU toolchain
// convention for declaring that this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
364