• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

// Overview
//   Each pass of the outer loop (label 0) produces one tile of up to 6 rows
//   by 8 columns of float output:
//     1. The 12 accumulators are initialised from 8 floats read at w, and
//        every row starts from the same 8 values.
//     2. For every float of A per row, 8 floats of B are read from w and
//        accumulated with FMLA. kc is counted in bytes (16 bytes = 4 floats
//        per main-loop iteration).
//     3. The tile is clamped with FMIN against v6 and FMAX against v7.
//     4. A full 8-column tile is stored, C pointers advance by cn_stride,
//        A pointers are rewound by kc, and the loop repeats while columns
//        remain. A final partial tile stores 4, 2 and 1 columns according
//        to bits 2, 1 and 0 of nc.
//   Rows at index >= mr alias the previous row's A and C pointers, so the
//   kernel always computes 6 rows and the extra rows write duplicate data.
//   x0 holds mr only during pointer setup and is then reused as the
//   remaining-k byte counter.
//   NOTE(review): the remainder paths only test bits 3 and 2 of k, so kc is
//   presumably a non-zero multiple of 4 bytes - confirm against callers.
//   NOTE(review): the file header says this file is generated from a
//   template; mirror any change made here in that template.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
#  x7 c5
// x18 is deliberately not used: AAPCS64 designates it as the platform
// register, and Apple and Windows reserve it. x15 is a caller-saved
// temporary, so it needs no save/restore.

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128

        # Clamp A and C pointers
        // Each CMP feeds two pairs of CSELs: the LO pair right after it and
        // the LS pair that follows. ADD does not modify the flags, so the
        // comparison result is still live for the second pair.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x15, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x15, x17, x15, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x15, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x15, x13, LS   //   c4 = c3

        # Load params pointer
        LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values
        // LD2R broadcasts the first float at params into v6 (upper bound,
        // applied with FMIN) and the second into v7 (lower bound, FMAX).
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP,  [x3]    // Prefetch A
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP,  [x9]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x10]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x11]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x12]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP,  [x4]

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
1:
        LDR   q0,  [x3], 16
        LDP  q16,  q17, [x5], 32
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        LDP  q18,  q19, [x5], 32
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        SUBS x0, x0, 16
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        B.HS 1b

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        // x0 has wrapped below zero here, but its low 4 bits still equal
        // kc mod 16, i.e. the number of leftover bytes of A per row.
        TST x0, 15
        B.NE 5f

4:
        # Clamp
        // The flags set by this SUBS are consumed by B.LO 7f and B.HI 0b
        // below; none of the instructions in between modify the flags.
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f

        // Rows are stored c0..c5; each C pointer advances by cn_stride and
        // each A pointer is rewound by kc for the next column tile.
        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x15], x14
        SUB x11, x11, x2 // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x14
        SUB x12, x12, x2 // a4 -= kc
        ST1 {v30.16b, v31.16b},  [x7], x14
        SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        // Label 5 is only reached with a non-zero remainder, so when bit 3
        // is clear the 1-float path at label 6 is taken without a further
        // test (presumes kc is a multiple of 4 bytes).
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
6:
        LDR   s0,  [x3], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 4b

        # Store odd width
        // x1 is nc - 8 (negative) here; its low 3 bits equal those of the
        // remaining column count, so bits 2, 1 and 0 select the 4-, 2- and
        // 1-column stores. After each partial store the not-yet-written
        // lanes are moved down into the low part of the even accumulators.
7:
        TBZ x1, 2, 8f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x15], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x15], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30,  [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x15]
        STR s28, [x13]
        STR s30,  [x7]
10:
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128
357
// On ELF targets, emit an empty .note.GNU-stack section so the linker does
// not mark the stack executable because of this object file.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
361