• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes a 6 row by 8 column tile of C per pass over kc bytes of A, then
# advances to the next 8 columns until nc is exhausted. x0 carries mr on entry
# and is reused as the k byte counter once the row pointers are set up.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is not used. AAPCS64 designates it as the platform register and some
# operating systems reserve it.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x8 c3
# x13 c4
#  x7 c5
# x8 first carries the params pointer. It becomes c3 once the clamp values
# have been loaded into v6 and v7.

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        # Load clamping_params values: v6 takes the first 32-bit element
        # (used with FMIN), v7 the second (used with FMAX). This is the last
        # use of the params pointer, so x8 is free to hold c3 from here on.
        LD2R {v6.4s, v7.4s}, [x8]

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x8, x17, x7          // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x8, x17, x8, LO     //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x8, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x8, x13, LS    //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride (x4 no longer holds a_stride)
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7 no longer holds cm_stride)
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp]

        # Top of the per-tile loop: one pass per group of 8 output columns
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          PRFM PLDL1KEEP,  [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP,  [x4]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP,  [x4]

        # Are there at least 2 floats (8 bytes) for main loop?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 4f

        # Main loop - 2 floats of A (8 bytes)
        # 24 FMA + 6 LD64 A + 2 LDP B
1:
        LDR   d0,  [x3], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        SUBS x0, x0, 8           // k -= 8; flags feed B.HS below (FMLA leaves them alone)
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]
        B.HS 1b

        # Is there a remainder? - 1 float of A (4 bytes)
        # k now equals the remaining byte count minus 8, so bit 2 is set
        # when 4 bytes are left and clear when none are.
        # NOTE(review): relies on kc being a multiple of 4 bytes - confirm with callers.
        TBNZ x0, 2, 4f
3:
        # Clamp: FMIN against v6, then FMAX against v7.
        # The SUBS on nc is interleaved with the clamp. None of the vector
        # ops, stores or plain SUBs that follow modify the flags, so both
        # B.LO 5f and B.HI 0b below act on this subtraction.
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8 (fewer than 8 columns left goes to the odd width path)
        B.LO 5f

        # Each store advances its C pointer by cn_stride and each A pointer is
        # rewound by kc so the next tile rereads the same rows of A.
        # In the INC variant rows are written from c5 down to c0, so when
        # clamped C pointers alias, the lowest-numbered row is stored last.
        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b},  [x8], x14
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x14
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b},  [x8], x14
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b                  // more columns remain (nc was > 8)
        RET

4:
        # Remainder - 1 float of A (4 bytes)
        LDR   s0,  [x3], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 3b

        # Store odd width
        # x1 holds nc - 8 here. Subtracting 8 leaves bits 0-2 unchanged, so
        # they still encode how many of the 8 columns remain: bit 2 selects
        # 4 floats, bit 1 selects 2 floats, bit 0 selects 1 float.
5:
        TBZ x1, 2, 6f
        # Store 4 floats per row, then move the upper 4 columns down
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26,  [x8], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26,  [x8], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        # Store 2 floats per row, then move the upper 2 floats into the low half
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26,  [x8], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26,  [x8], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        # Store the final single float of each row
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26,  [x8]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26,  [x8]
          STR s28, [x13]
          STR s30,  [x7]
8:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64

// Emit an empty .note.GNU-stack section on ELF targets (conventionally marks
// the object as not requiring an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif