// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <xnnpack/assembly.h>
7
# F32 GEMM micro-kernel producing up to 6 rows x 8 columns of C per pass.
# The accumulators start from the 8 floats at the front of w (or from acc in
# the INC variant), accumulate A scalars times 8-wide rows of w over kc bytes
# of A, are clamped against the two floats at params, and are stored to C.
#
# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is deliberately not used: it is a platform-reserved register on some
# ABIs (e.g. Apple and Windows arm64).

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x8 c3 (x8 first carries the params pointer; it is reused once the clamp
#         values have been loaded)
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld128

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Load clamping_params values
        # v6 = first float at params (upper bound, used by FMIN)
        # v7 = second float at params (lower bound, used by FMAX)
        LD2R {v6.4s, v7.4s}, [x8]
        # The params pointer is dead from here on; x8 is reused below as c3.

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so they read and write
        # valid memory.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x8, x17, x7          // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x8, x17, x8, LO     //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x8, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x8, x13, LS    //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp]

        # Outer loop: one pass per group of up to 8 output columns
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          PRFM PLDL1KEEP,  [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP,  [x4]
        $else:
          # Load initial bias from w into accumulators
          # The same 8 bias floats are copied to every row's accumulators.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP,  [x4]

        # Are there at least 4 floats (16 bytes)?
        # x0 (mr) is no longer needed and becomes the k byte counter.
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
1:
        LDR   q0,  [x3], 16
        LDP  q16,  q17, [x5], 32
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        LDP  q18,  q19, [x5], 32
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        SUBS x0, x0, 16
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        B.HS 1b

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # x0 has gone below zero; its low 4 bits are the leftover byte count.
        TST x0, 15
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO / B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8, unless fewer than 8 columns remained
        B.LO 7f

        # Each store advances its C pointer by cn_stride; the A pointers are
        # rewound by kc so the next column group rereads the same rows of A.
        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b},  [x8], x14
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x14
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b},  [x8], x14
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x4,  x4, x2 // a5 -= kc

        # Flags are still those of SUBS x1, x1, 8: loop while columns remain
        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
6:
        LDR   s0,  [x3], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 4b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-float store per row;
        # after each store the not-yet-written lanes are moved down to lane 0.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26,  [x8], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26,  [x8], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26,  [x8], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26,  [x8], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26,  [x8]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26,  [x8]
          STR s28, [x13]
          STR s30,  [x7]
10:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld128
430
// On ELF targets, emit an empty .note.GNU-stack section (no flags), the GNU
// toolchain convention for an object that does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
434