// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Overview of the code below:
#   Label 0 begins one tile of C, 6 rows by 8 floats, accumulated in v20-v31.
#   The INC variant seeds the tile from acc (x15); otherwise the 8 floats at
#   w (x5) seed every row. Labels 1, 3 and 4 then consume kc bytes of each A
#   row in steps of 16, 8 and 4 bytes, reading 32 bytes of w per float of A.
#   Label 2 clamps the tile with v6/v7 and either stores all 8 columns and
#   loops back to label 0, or stores the final 4/2/1 columns at labels 5-8.
#   w and acc are read sequentially and never rewound; the A pointers are
#   rewound by kc after each full store.
#
# Register reuse: x0 holds mr on entry, becomes the k byte counter once the
# pointers are clamped, and is reloaded with cn_stride at label 2.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_ld128

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        # ADD and CSEL leave the flags untouched, so every CMP below serves
        # both the LO selects (mr < n) and the following LS selects (mr <= n).
        # A row at or beyond mr aliases the previous row pointer.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # From here x4 and x7 stop being strides and become a5 and c5.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        # LD2R broadcasts the two consecutive floats at x8: the first into v6
        # (lower bound, applied with FMAX) and the second into v7 (upper
        # bound, applied with FMIN).
        LD2R {v6.4s, v7.4s}, [x8]

        # Outer loop over nc: one pass per tile of 8 output columns
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          PRFM PLDL1KEEP,  [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP,  [x4]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP,  [x4]

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
        # B alternates between v16/v17 and v18/v19 so the next pair is loaded
        # while the current pair is still being consumed.
1:
        LDR   q0,  [x3], 16
        LDP  q16,  q17, [x5], 32
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        LDP  q18,  q19, [x5], 32
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        SUBS x0, x0, 16          // k -= 16; flags consumed by B.HS below
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        B.HS 1b                  // another 16 bytes of A were available

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # x0 has gone below zero here, but its low 4 bits still equal
        # kc modulo 16, which is the number of A bytes left per row.
        TST x0, 15
        B.NE 3f

2:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags feed B.LO 5f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Fewer than 8 columns were left: take the partial store path
        B.LO 5f

        # Store full 6 x 8
        # Each ST1 advances its C pointer by cn_stride (x0); each SUB rewinds
        # an A pointer by kc so the next tile rereads the same rows of A.
        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x4,  x4, x2 // a5 -= kc

        # Flags are still those of SUBS x1 above: loop while columns remain
        B.HI 0b
        RET

3:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 4f            // bit 3 of k clear: no 8 byte step

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 2b            // bit 2 of k clear: nothing left, go clamp

        # Remainder- 1 float of A (4 bytes)
4:
        LDR   s0,  [x3], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 2b

        # Store odd width
        # x1 is nc - 8 here (below zero); its low 3 bits equal those of the
        # remaining column count, so bits 2, 1 and 0 select 4, 2 and 1 floats.
        # After each partial store the unwritten part of a row is shifted down
        # into the low lanes of its first register.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30,  [x7]
8:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_ld128

# Emit an empty .note.GNU-stack section on ELF targets (by GNU toolchain
# convention this marks the object as not needing an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif