// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8

# Summary
#   For each tile of up to 12 output floats:
#     c0[0..11] = clamp(acc[0..11] + sum over k of a0[k] * w[k][0..11])
#   kc is counted in bytes of A (4 bytes per float); nc is counted in floats.
#   The accumulators are seeded from acc, not from w.
#   x0 is a working copy of kc: it is decremented by 16 per 4 floats of A
#   consumed, so its low 4 bits always equal the low 4 bits of kc and are
#   used for the remainder tests.
#   The incoming x0 (mr) and x7 (cm_stride) values are never read, so both
#   registers are reused as scratch.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# None of them are used by this kernel, so nothing is saved on the stack.
# x18 is the platform register (reserved on some OS ABIs); it is not used.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# Each 128-bit B vector is loaded as two 64-bit halves: the low half goes
# directly into the vector register (LDR d), the high half goes into a GPR
# shadow (LDR x) and is merged later with INS.
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4    x7 x10 x16  first set of B
# B   v5  v6  v7   x17 x11  x9
# B  v23 v24 v25    x7 x10 x16  second set of B (same x as first set)
# B  v17 v18 v19   x17 x11  x9
# C  v20 v21 v22
# Clamp  v30 (bound applied with FMIN)  v31 (bound applied with FMAX)

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Load clamping_params values.
        # LD2R broadcasts the first float of params to v30 and the second to v31.
        LD2R {v30.4s, v31.4s}, [x8]

        # Start of one 1x12 output tile.  Re-entered from label 4 while nc remains.
0:
        # Load initial accumulators (12 floats) and advance acc
        LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x3], 8     // a0

        LDR d2, [x5]       // vb0x0123
        LDR x7, [x5, 8]

        LDR d3, [x5, 16]   // vb0x4567
        LDR x10, [x5, 24]

        LDR d4, [x5, 32]   // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]   // vb1x4567
        LDR x11, [x5, 72]

        LDR d7, [x5, 80]   // vb1x89AB
        LDR x9, [x5, 88]
        INS v2.d[1], x7
        ADD x5, x5, 96

        # Are there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # Software pipelined: each group of 6 fma consumes the B set loaded
        # during the previous group while loading the B set for the next one.
        # The instruction order inside the BLOCKs is schedule-sensitive; keep it.
1:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x11
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x11, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x7   // v23 was loaded in block 2
        LDR x9, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x3], 8          // a0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]        // vb0x0123
        INS v17.d[1], x17
        LDR x7, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]    // vb0x4567
        INS v18.d[1], x11
        LDR x10, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]   // vb0x89AB
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]   // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]   // vb1x4567
        LDR x11, [x5, 168]
        SUBS x0, x0, 16          // k -= 16; flags are consumed by B.HS below
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]   // vb1x89AB
        INS v2.d[1], x7
        LDR x9, [x5, 184]
        ADD x5, x5, 192          // ADD does not alter the flags set by SUBS
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x11
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x11, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x7   // v23 was loaded in block 2
        LDR x9, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x11

        # BLOCK 4
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        TST x0, 15               // low 4 bits of k == kc bytes left over (0, 4, 8 or 12)

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp: FMIN against v30, then FMAX against v31.
        # SUBS sets the flags used by both B.LO 7f and B.HI 0b below; none of
        # the instructions in between modify the flags.
        FMIN v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 12
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s

        # Store full 1 x 12 (fewer than 12 columns left goes to the tail store)
        B.LO 7f

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14   // c0 += cn_stride
        SUB  x3,  x3, x2 // a0 -= kc
        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 1 A.
        LDR d0, [x3], 8   // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        # Done unless one more float of A (4 bytes) is left
        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4   // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

7:
        # Tail store.  Undo the SUBS so that x1 holds the remaining column
        # count (less than 12), then store 8, 4, 2 and 1 floats according to
        # its bits, shifting the not yet stored values down into v20.
        ADD x1, x1, 12
        # Store odd channels
        TBZ x1, 3, 8f
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b

8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
11:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53

// On ELF targets emit an empty .note.GNU-stack section so that this object
// does not request an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif