• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
#
# Summary of what the code below does:
#   For each group of 8 output columns, a 6x8 tile of float accumulators is
#   loaded from acc, each of the 6 A-row scalars is multiply-accumulated
#   against 8 floats of w per k step, the tile is clamped against the two
#   floats read from params, and the result is stored through c0..c5.
#   kc is consumed in bytes (16 per main-loop iteration, then 8 and 4 byte
#   remainders). Rows at index >= mr alias the previous row's A and C pointers.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register (reserved on Apple and Windows targets) and is
# deliberately not used by this kernel.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x8 c3  (x8 first carries the params pointer; it is dead after the LD2R)
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

# NOTE(review): this file is generated from the template named in the file
# header; the same changes should be mirrored in that template.

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128

        # Load acc, params pointer
        # Done first so that x8 can be recycled as c3 once params is consumed.
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        # Load clamping_params values
        # v6 = params[0] broadcast (upper bound, used with FMIN),
        # v7 = params[1] broadcast (lower bound, used with FMAX).
        # This is the last use of x8 as the params pointer.
        LD2R {v6.4s, v7.4s}, [x8]

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x8, x17, x7          // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x8, x17, x8, LO     //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x8, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x8, x13, LS    //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp]

        # Outer loop: one iteration per group of 8 output columns.
0:
        # Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP,  [x3]    // Prefetch A
        PRFM PLDL1KEEP,  [x9]
        PRFM PLDL1KEEP, [x10]
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]
        PRFM PLDL1KEEP,  [x4]

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
1:
        LDR   q0,  [x3], 16
        LDP  q16,  q17, [x5], 32
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        LDP  q18,  q19, [x5], 32
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        SUBS x0, x0, 16          // k -= 16; flags consumed by B.HS below
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        B.HS 1b

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # The low 4 bits of x0 equal the low 4 bits of kc.
        TST x0, 15
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO 7f and B.HI 0b below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                  // fewer than 8 columns left: partial store

        ST1 {v30.16b, v31.16b},  [x7], x14
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x14
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v26.16b, v27.16b},  [x8], x14
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x11, x11, x2 // a3 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB x12, x12, x2 // a4 -= kc
        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b                  // more columns remain
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
6:
        LDR   s0,  [x3], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 4b

        # Store odd width
        # x1 holds nc - 8 here; its low 3 bits are the number of columns left.
7:
        TBZ x1, 2, 8f            // 4 columns?
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b     // shift the upper 4 columns down for the next step
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26,  [x8], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b

8:
        TBZ x1, 1, 9f            // 2 columns?
        STR d30,  [x7], 8
        DUP d30, v30.d[1]        // move the upper 2 floats into the low half
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26,  [x8], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f           // 1 column?
        STR s30,  [x7]
        STR s28, [x13]
        STR s26,  [x8]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
10:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128
353
// On ELF targets, emit an empty .note.GNU-stack section: the conventional
// marker that this object does not require an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
357