• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/1x12-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                (x0) - unused.  mr = 1
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          (x4) - unused
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         (x7) - unused
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Purpose
#   Single-row (mr = 1) f32 GEMM micro-kernel producing 12 output columns per
#   outer iteration.  For every group of 12 columns it:
#     - loads 12 floats from w as the initial accumulator values (bias),
#     - accumulates a[k] * b[k][0..11] with FMLA, reading 48 bytes of w per k,
#     - clamps the accumulators with FMIN against v30 and FMAX against v31,
#     - stores 12 floats to c, then advances c by cn_stride bytes and rewinds
#       a by kc bytes.
#   When fewer than 12 columns remain, the tail is stored in pieces of
#   8, 4, 2 and 1 floats selected by the bits of the remaining nc.
#
#   kc is a byte count (the loop counter steps by 16 for 4 floats).  Only
#   bits 3 and 2 of kc are examined for the remainder, so kc is assumed to be
#   a nonzero multiple of 4 bytes - TODO confirm against callers.
#
# Clobbers
#   x0, x1, x3, x5, x6, x7, x9, x10, x13, x14, x16, x17, flags,
#   v0-v7, v17-v25, v30, v31.
#   No callee-saved register is written and nothing is stored to the stack.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register (reserved on several platforms), so
# this kernel does not touch it; x13 is used as the scratch register instead.

# A pointer
# x3  a0

# C pointer
# x6  c0

# Vector register usage and GPR shadows
# Each B vector is loaded as two 64-bit halves: the low half straight into
# the d register, the high half into the listed x register, which is then
# inserted into lane d[1] with INS.
# a0  v0           first set of A
# a0  v1           second set of A
# B   v2  v3  v4    x7 x10 x16  first set of B
# B   v5  v6  v7   x17 x13  x9
# B  v23 v24 v25    x7 x10 x16  second set of B (same x as first set)
# B  v17 v18 v19   x17 x13  x9
# C  v20 v21 v22
# Clamp bounds  v30 (FMIN operand)  v31 (FMAX operand)

BEGIN_FUNCTION xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, params pointer
        LDP x14, x8, [sp]

        # Load clamping_params values
        # LD2R broadcasts the first float at [x8] to v30 and the second to v31.
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Outer loop entry: one pass per group of 12 output columns.
        # Load initial bias from w into accumulators
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48

        PRFM PLDL1KEEP, [x5]
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # Subtracting 16 leaves the low 4 bits of kc intact in x0, which the
        # remainder code at labels 5 and 6 relies on.
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - loads for first group of 6 fma

        # Read first block of 1 A.
        LDR d0, [x3], 8     // a0

        LDR d2, [x5]       // vb0x0123
        LDR x7, [x5, 8]

        LDR d3, [x5, 16]   // vb0x4567
        LDR x10, [x5, 24]

        LDR d4, [x5, 32]   // vb0x89AB
        LDR x16, [x5, 40]

        LDR d5, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]

        LDR d6, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]  // high half of vb1x4567 (x13, not the reserved x18)

        LDR d7, [x5, 80]   // vb1x89AB
        LDR x9, [x5, 88]
        INS v2.d[1], x7
        ADD x5, x5, 96

        # Is there at least 4 floats (16 bytes) for main loop?
        SUBS x0, x0, 16
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # Loop invariant on entry: v0 holds 2 floats of A, v2 is complete, and
        # v3-v7 have their low halves loaded with the high halves pending in
        # x10, x16, x17, x13, x9.
1:
        # First group of 6 fma.
        # A is loaded for 2nd group into v1

        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x13
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x7   // v23 was loaded in block 2
        LDR x9, [x5, 88]

        # Second group of 6 fma.
        # A is loaded for 1st group into v0

        # BLOCK 0
        LDR d0, [x3], 8          // a0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        LDR d2, [x5, 96]        // vb0x0123
        INS v17.d[1], x17
        LDR x7, [x5, 104]
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        LDR d3, [x5, 112]    // vb0x4567
        INS v18.d[1], x13
        LDR x10, [x5, 120]

        # BLOCK 4
        LDR d4, [x5, 128]   // vb0x89AB
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        LDR x16, [x5, 136]

        # BLOCK 5
        LDR d5, [x5, 144]   // vb1x0123
        LDR x17, [x5, 152]
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        LDR d6, [x5, 160]   // vb1x4567
        LDR x13, [x5, 168]
        SUBS x0, x0, 16     // flags consumed by B.HS below; nothing between sets flags
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        LDR d7, [x5, 176]   // vb1x89AB
        INS v2.d[1], x7
        LDR x9, [x5, 184]
        ADD x5, x5, 192     // 4 rows of B consumed: 4 * 48 bytes
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d1, [x3], 8          // a0
        INS v3.d[1], x10
        FMLA v20.4s, v2.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 96]

        # BLOCK 1
        INS v4.d[1], x16
        FMLA v21.4s, v3.4s, v0.s[0]
        PRFM PLDL1KEEP, [x5, 128]

        # BLOCK 2
        LDR d23, [x5]       // vb0x0123
        INS v5.d[1], x17
        LDR x7, [x5, 8]
        FMLA v22.4s, v4.4s, v0.s[0]

        # BLOCK 3
        LDR d24, [x5, 16]   // vb0x4567
        INS v6.d[1], x13
        LDR x10, [x5, 24]

        # BLOCK 4
        LDR d25, [x5, 32]   // vb0x89AB
        INS v7.d[1], x9
        FMLA v20.4s, v5.4s, v0.s[1]
        LDR x16, [x5, 40]

        # BLOCK 5
        LDR d17, [x5, 48]   // vb1x0123
        LDR x17, [x5, 56]
        FMLA v21.4s, v6.4s, v0.s[1]

        # BLOCK 6
        LDR d18, [x5, 64]   // vb1x4567
        LDR x13, [x5, 72]
        FMLA v22.4s, v7.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]   // vb1x89AB
        INS v23.d[1], x7   // v23 was loaded in block 2
        LDR x9, [x5, 88]
        ADD x5, x5, 96

        # Second group of 6 fma.  8 blocks of 4 cycles.
        # Epilogue version does no loads

        # BLOCK 0
        INS v24.d[1], x10
        FMLA v20.4s, v23.4s, v1.s[0]

        # BLOCK 1
        INS v25.d[1], x16
        FMLA v21.4s, v24.4s, v1.s[0]

        # BLOCK 2
        INS v17.d[1], x17
        FMLA v22.4s, v25.4s, v1.s[0]

        # BLOCK 3
        INS v18.d[1], x13

        # BLOCK 4
        INS v19.d[1], x9
        FMLA v20.4s, v17.4s, v1.s[1]
        TST x0, 15          // low 4 bits of x0 still equal the low 4 bits of kc

        # BLOCK 5
        FMLA v21.4s, v18.4s, v1.s[1]

        # BLOCK 6
        FMLA v22.4s, v19.4s, v1.s[1]

        # BLOCK 7
        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        # SUBS is interleaved here; FMIN/FMAX/ST1/SUB leave the flags alone, so
        # its result drives both B.LO and B.HI below.
        FMIN v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 12
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s

        # Store full 1 x 12
        # Fewer than 12 columns left: take the partial-store path instead.
        B.LO 7f

        ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
        SUB  x3,  x3, x2 // a0 -= kc
        B.HI 0b             // more columns remain (nc - 12 > 0)
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 1 A.
        LDR d0, [x3], 8   // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48
        LD1 {v5.16b, v6.16b, v7.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]

        # Second block of 3 B
        FMLA v20.4s, v5.4s, v0.s[1]
        FMLA v21.4s, v6.4s, v0.s[1]
        FMLA v22.4s, v7.4s, v0.s[1]

        # No single trailing float: go clamp and store.
        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4   // a0
        LD1 {v2.16b, v3.16b, v4.16b}, [x5], 48

        FMLA v20.4s, v2.4s, v0.s[0]
        FMLA v21.4s, v3.4s, v0.s[0]
        FMLA v22.4s, v4.4s, v0.s[0]
        B 4b

7:
        # Undo the SUBS at label 4 so x1 is the true remaining column count
        # (1..11); its bits 3..0 select the 8/4/2/1-float stores below.
        ADD x1, x1, 12
        # Store odd channels
        TBZ x1, 3, 8f
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b   // columns 8..11 become the next data to store

8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b   // columns 4..7 become the next data to store

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]      // move the upper 2 floats down for the final store

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
11:
        RET

END_FUNCTION xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53
339
// On ELF targets, emit an empty .note.GNU-stack section with no flags, which
// declares that this object does not require an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
343