• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
9#     size_t mr,                x0
10#     size_t nc,                x1
11#     size_t kc,                x2 / x0
12#     const uint8_t*restrict a, x3
13#     size_t a_stride,          x4
14#     const void*restrict w,    x5
15#     uint8_t*restrict c,       x6
16#     size_t cm_stride,         x7
17#     size_t cn_stride,         [sp] -> x14
18$if INC:
19  #     const float*restrict acc,  [sp + 8] -> x15
20  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
21$else:
22  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8
23
24# d8-d15 need to be preserved if used.
25# x19-30 need to be preserved if used.
26
27# A pointers
28# x3  a0
29# x11 a1
30# x12 a2
31# x4  a3 / a_stride
32
33# C pointers
34# x6  c0
35# x9  c1
36# x10 c2
37# x7  c3 / cm_stride
38
39# Vector register usage
40# A0  v0  v4
41# A1  v1  v5
42# A2  v2  v6
43# A3  v3  v7
44# B   v8  v9 v10 v11
45# B  v12 v13 v14 v15
46# B  v20 v21 v22 v23
47# B  v24 v25 v26 v27
48# C  v16 v17
49# C  v18 v19
50# C  v28 v29
51# C  v30 v31
52# Clamp v4 v5 (shared with the second A0/A1 block above; reloaded from params in the epilogue)
53
54BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
        # Overview: produces a 4-row x 8-column tile of f32 outputs per pass.
        # Accumulators v16-v19 / v28-v31 start from the bias read through w
        # (or from acc when INC), collect A*B products with FMLA over kc bytes
        # of each A row, are clamped against the two values loaded from params,
        # and are stored through c0-c3.  The tile loop repeats while nc > 8.
55
56        $if INC:
57          # Load cn_stride, acc
58          LDP x14, x15, [sp]
59          # Load params pointer
60          LDR x8, [sp, 16]
61        $else:
62          # Load cn_stride, params pointer
63          LDP x14, x8, [sp]
64
65        # Load clamping_params values: the two consecutive floats at [x8] are
        # broadcast to every lane of v4 (FMIN operand below) and v5 (FMAX operand).
66        LD2R {v4.4s, v5.4s}, [x8]
67
68        # Save d8-d15 on stack (v8-v15 hold B values in the main loop).
        # The stack arguments were read above, before sp is moved here.
69        STP  d8,  d9, [sp, -64]!
70        STP d10, d11, [sp, 16]
71        STP d12, d13, [sp, 32]
72        STP d14, d15, [sp, 48]
73
74        # Clamp A and C pointers: a row at index >= mr reuses the previous
        # row's pointers instead of advancing by the stride.
75        CMP x0, 2                // if mr < 2
76        ADD x11, x3, x4          // a1 = a0 + a_stride
77        ADD x9, x6, x7           // c1 = c0 + cm_stride
78        CSEL x11, x3, x11, LO    //   a1 = a0
79        CSEL x9, x6, x9, LO      //   c1 = c0
80
81        ADD x12, x11, x4         // a2 = a1 + a_stride
82        ADD x10, x9, x7          // c2 = c1 + cm_stride
83                                 // if mr <= 2 (flags still from CMP x0, 2; ADD does not set flags)
84        CSEL x12, x11, x12, LS   //   a2 = a1
85        CSEL x10, x9, x10, LS    //   c2 = c1
86
87        CMP x0, 4                // if mr < 4
88        ADD x4, x12, x4          // a3 = a2 + a_stride
89        ADD x7, x10, x7          // c3 = c2 + cm_stride
90        CSEL x4, x12, x4, LO     //   a3 = a2
91        CSEL x7, x10, x7, LO     //   c3 = c2
92
        # Start of one 8-column output tile; re-entered through B.HI 0b below.
930:
94        $if INC:
95          # Load initial accumulators
96          LDP q16, q17, [x15], 32
97          LDP q18, q19, [x15], 32
98          LDP q28, q29, [x15], 32
99          LDP q30, q31, [x15], 32
100        $else:
101          # Load initial bias from w into accumulators
102          LDP q16, q17, [x5], 32
103          MOV v18.16b, v16.16b   // rows 1-3 start from the same bias as row 0
104          MOV v19.16b, v17.16b
105          MOV v28.16b, v16.16b
106          MOV v29.16b, v17.16b
107          MOV v30.16b, v16.16b
108          MOV v31.16b, v17.16b
109
110        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 (mr) is no longer needed after the pointer clamping above and is
        # reused as the remaining-bytes counter k.
111        SUBS x0, x2, 32  // k = kc - 32
112        B.LO 3f
113
114        # Prologue (16 bytes of A per row)
115        # Read first block of 4 A and B.
116        LDR q0,  [x3], 16
117        LDP q20, q21, [x5], 32
118        LDR q1, [x11], 16
119        LDR q2, [x12], 16
120        LDR q3,  [x4], 16
121        LDP q22, q23, [x5], 32
122        LDP q24, q25, [x5], 32
123        LDP q26, q27, [x5], 32
124
125        # Are there at least 32 more bytes of A?  If yes, do the main loop.
126        SUBS x0, x0, 32
127        B.LO 2f
128
129        # Main loop - 8 floats of A (32 bytes)
1301:
131        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
132        FMLA v16.4s, v20.4s, v0.s[0]
133        LDP q8, q9, [x5], 32
134        FMLA v17.4s, v21.4s, v0.s[0]
135        FMLA v18.4s, v20.4s, v1.s[0]
136        LDP q10, q11, [x5], 32
137        FMLA v19.4s, v21.4s, v1.s[0]
138        FMLA v28.4s, v20.4s, v2.s[0]
139        LDP q12, q13, [x5], 32
140        FMLA v29.4s, v21.4s, v2.s[0]
141        FMLA v30.4s, v20.4s, v3.s[0]
142        LDP q14, q15, [x5], 32
143        FMLA v31.4s, v21.4s, v3.s[0]
144        FMLA v16.4s, v22.4s, v0.s[1]
145        LDR q4, [x3], 16          // overwrites the clamp value in v4 (reloaded in the epilogue)
146        FMLA v17.4s, v23.4s, v0.s[1]
147        FMLA v18.4s, v22.4s, v1.s[1]
148        LDR q5, [x11], 16         // overwrites the clamp value in v5 (reloaded in the epilogue)
149        FMLA v19.4s, v23.4s, v1.s[1]
150        FMLA v28.4s, v22.4s, v2.s[1]
151        LDR q6, [x12], 16
152        FMLA v29.4s, v23.4s, v2.s[1]
153        FMLA v30.4s, v22.4s, v3.s[1]
154        LDR q7, [x4], 16
155        FMLA v31.4s, v23.4s, v3.s[1]
156        FMLA v16.4s, v24.4s, v0.s[2]
157        $if PREFETCH:
158          PRFM PLDL1KEEP, [x5, 128]   // prefetch hint ahead of the w pointer
159        FMLA v17.4s, v25.4s, v0.s[2]
160        FMLA v18.4s, v24.4s, v1.s[2]
161        $if PREFETCH:
162          PRFM PLDL1KEEP, [x5, 192]
163        FMLA v19.4s, v25.4s, v1.s[2]
164        FMLA v28.4s, v24.4s, v2.s[2]
165        $if PREFETCH:
166          PRFM PLDL1KEEP, [x5, 256]
167        FMLA v29.4s, v25.4s, v2.s[2]
168        FMLA v30.4s, v24.4s, v3.s[2]
169        $if PREFETCH:
170          PRFM PLDL1KEEP, [x5, 320]
171        FMLA v31.4s, v25.4s, v3.s[2]
172        FMLA v16.4s, v26.4s, v0.s[3]
173        FMLA v17.4s, v27.4s, v0.s[3]
174        FMLA v18.4s, v26.4s, v1.s[3]
175        FMLA v19.4s, v27.4s, v1.s[3]
176        FMLA v28.4s, v26.4s, v2.s[3]
177        FMLA v29.4s, v27.4s, v2.s[3]
178        FMLA v30.4s, v26.4s, v3.s[3]
179        FMLA v31.4s, v27.4s, v3.s[3]
180
181        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
182        FMLA v16.4s, v8.4s, v4.s[0]
183        LDP q20, q21, [x5], 32
184        FMLA v17.4s, v9.4s, v4.s[0]
185        FMLA v18.4s, v8.4s, v5.s[0]
186        LDP q22, q23, [x5], 32
187        FMLA v19.4s, v9.4s, v5.s[0]
188        FMLA v28.4s, v8.4s, v6.s[0]
189        LDP q24, q25, [x5], 32
190        FMLA v29.4s, v9.4s, v6.s[0]
191        FMLA v30.4s, v8.4s, v7.s[0]
192        LDP q26, q27, [x5], 32
193        FMLA v31.4s, v9.4s, v7.s[0]
194        FMLA v16.4s, v10.4s, v4.s[1]
195        LDR q0, [x3], 16
196        FMLA v17.4s, v11.4s, v4.s[1]
197        FMLA v18.4s, v10.4s, v5.s[1]
198        LDR q1, [x11], 16
199        FMLA v19.4s, v11.4s, v5.s[1]
200        FMLA v28.4s, v10.4s, v6.s[1]
201        LDR q2, [x12], 16
202        FMLA v29.4s, v11.4s, v6.s[1]
203        FMLA v30.4s, v10.4s, v7.s[1]
204        LDR q3, [x4], 16
205        FMLA v31.4s, v11.4s, v7.s[1]
206        FMLA v16.4s, v12.4s, v4.s[2]
207        FMLA v17.4s, v13.4s, v4.s[2]
208        FMLA v18.4s, v12.4s, v5.s[2]
209        FMLA v19.4s, v13.4s, v5.s[2]
210        FMLA v28.4s, v12.4s, v6.s[2]
211        FMLA v29.4s, v13.4s, v6.s[2]
212        FMLA v30.4s, v12.4s, v7.s[2]
213        FMLA v31.4s, v13.4s, v7.s[2]
214        FMLA v16.4s, v14.4s, v4.s[3]
215        FMLA v17.4s, v15.4s, v4.s[3]
216        FMLA v18.4s, v14.4s, v5.s[3]
217        FMLA v19.4s, v15.4s, v5.s[3]
218        FMLA v28.4s, v14.4s, v6.s[3]
219        FMLA v29.4s, v15.4s, v6.s[3]
220        SUBS x0, x0, 32           // k -= 32; these flags are consumed by B.HS below
221        FMLA v30.4s, v14.4s, v7.s[3]
222        FMLA v31.4s, v15.4s, v7.s[3]
223        B.HS 1b
224
2252:
226        # Epilogue
227        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
228        FMLA v16.4s, v20.4s, v0.s[0]
229        LDP q8, q9, [x5], 32
230        FMLA v17.4s, v21.4s, v0.s[0]
231        FMLA v18.4s, v20.4s, v1.s[0]
232        LDP q10, q11, [x5], 32
233        FMLA v19.4s, v21.4s, v1.s[0]
234        FMLA v28.4s, v20.4s, v2.s[0]
235        LDP q12, q13, [x5], 32
236        FMLA v29.4s, v21.4s, v2.s[0]
237        FMLA v30.4s, v20.4s, v3.s[0]
238        LDP q14, q15, [x5], 32
239        FMLA v31.4s, v21.4s, v3.s[0]
240        FMLA v16.4s, v22.4s, v0.s[1]
241        LDR q4, [x3], 16
242        FMLA v17.4s, v23.4s, v0.s[1]
243        FMLA v18.4s, v22.4s, v1.s[1]
244        LDR q5, [x11], 16
245        FMLA v19.4s, v23.4s, v1.s[1]
246        FMLA v28.4s, v22.4s, v2.s[1]
247        LDR q6, [x12], 16
248        FMLA v29.4s, v23.4s, v2.s[1]
249        FMLA v30.4s, v22.4s, v3.s[1]
250        LDR q7, [x4], 16
251        FMLA v31.4s, v23.4s, v3.s[1]
252        FMLA v16.4s, v24.4s, v0.s[2]
253        FMLA v17.4s, v25.4s, v0.s[2]
254        FMLA v18.4s, v24.4s, v1.s[2]
255        FMLA v19.4s, v25.4s, v1.s[2]
256        FMLA v28.4s, v24.4s, v2.s[2]
257        FMLA v29.4s, v25.4s, v2.s[2]
258        FMLA v30.4s, v24.4s, v3.s[2]
259        FMLA v31.4s, v25.4s, v3.s[2]
260        FMLA v16.4s, v26.4s, v0.s[3]
261        FMLA v17.4s, v27.4s, v0.s[3]
262        FMLA v18.4s, v26.4s, v1.s[3]
263        FMLA v19.4s, v27.4s, v1.s[3]
264        FMLA v28.4s, v26.4s, v2.s[3]
265        FMLA v29.4s, v27.4s, v2.s[3]
266        FMLA v30.4s, v26.4s, v3.s[3]
267        FMLA v31.4s, v27.4s, v3.s[3]
268
269        # Second block of 4.  FMA for second 4, no loads.
270        FMLA v16.4s, v8.4s, v4.s[0]
271        FMLA v17.4s, v9.4s, v4.s[0]
272        FMLA v18.4s, v8.4s, v5.s[0]
273        FMLA v19.4s, v9.4s, v5.s[0]
274        FMLA v28.4s, v8.4s, v6.s[0]
275        FMLA v29.4s, v9.4s, v6.s[0]
276        FMLA v30.4s, v8.4s, v7.s[0]
277        FMLA v31.4s, v9.4s, v7.s[0]
278
279        FMLA v16.4s, v10.4s, v4.s[1]
280        FMLA v17.4s, v11.4s, v4.s[1]
281        FMLA v18.4s, v10.4s, v5.s[1]
282        FMLA v19.4s, v11.4s, v5.s[1]
283        FMLA v28.4s, v10.4s, v6.s[1]
284        FMLA v29.4s, v11.4s, v6.s[1]
285        FMLA v30.4s, v10.4s, v7.s[1]
286        FMLA v31.4s, v11.4s, v7.s[1]
287
288        FMLA v16.4s, v12.4s, v4.s[2]
289        FMLA v17.4s, v13.4s, v4.s[2]
290        FMLA v18.4s, v12.4s, v5.s[2]
291        FMLA v19.4s, v13.4s, v5.s[2]
292        FMLA v28.4s, v12.4s, v6.s[2]
293        FMLA v29.4s, v13.4s, v6.s[2]
294        FMLA v30.4s, v12.4s, v7.s[2]
295        FMLA v31.4s, v13.4s, v7.s[2]
296
297        FMLA v16.4s, v14.4s, v4.s[3]
298        FMLA v17.4s, v15.4s, v4.s[3]
299        FMLA v18.4s, v14.4s, v5.s[3]
300        FMLA v19.4s, v15.4s, v5.s[3]
301
302        # Load clamping_params values again: v4/v5 were overwritten by A loads,
        # and the four FMLAs just above are their last use as A data.
303        LD2R {v4.4s, v5.4s}, [x8]
304
305        FMLA v28.4s, v14.4s, v6.s[3]
306        FMLA v29.4s, v15.4s, v6.s[3]
307        FMLA v30.4s, v14.4s, v7.s[3]
308        FMLA v31.4s, v15.4s, v7.s[3]
309
3103:
311        # Remainder- 4 floats of A (16 bytes)
        # x0 differs from kc by a multiple of 32, so bits 4..2 of x0 equal
        # bits 4..2 of kc and select the 16/8/4-byte remainder steps.
312        TBZ x0, 4, 4f
313
314        LDR q0,  [x3], 16
315        LDP q20, q21, [x5], 32
316        LDR q1, [x11], 16
317        LDR q2, [x12], 16
318        LDR q3,  [x4], 16
319        FMLA v16.4s, v20.4s, v0.s[0]
320        FMLA v17.4s, v21.4s, v0.s[0]
321        LDP q22, q23, [x5], 32
322        FMLA v18.4s, v20.4s, v1.s[0]
323        FMLA v19.4s, v21.4s, v1.s[0]
324        LDP q24, q25, [x5], 32
325        FMLA v28.4s, v20.4s, v2.s[0]
326        FMLA v29.4s, v21.4s, v2.s[0]
327        LDP q26, q27, [x5], 32
328        FMLA v30.4s, v20.4s, v3.s[0]
329        FMLA v31.4s, v21.4s, v3.s[0]
330        FMLA v16.4s, v22.4s, v0.s[1]
331        FMLA v17.4s, v23.4s, v0.s[1]
332        FMLA v18.4s, v22.4s, v1.s[1]
333        FMLA v19.4s, v23.4s, v1.s[1]
334        FMLA v28.4s, v22.4s, v2.s[1]
335        FMLA v29.4s, v23.4s, v2.s[1]
336        FMLA v30.4s, v22.4s, v3.s[1]
337        FMLA v31.4s, v23.4s, v3.s[1]
338        FMLA v16.4s, v24.4s, v0.s[2]
339        FMLA v17.4s, v25.4s, v0.s[2]
340        FMLA v18.4s, v24.4s, v1.s[2]
341        FMLA v19.4s, v25.4s, v1.s[2]
342        FMLA v28.4s, v24.4s, v2.s[2]
343        FMLA v29.4s, v25.4s, v2.s[2]
344        FMLA v30.4s, v24.4s, v3.s[2]
345        FMLA v31.4s, v25.4s, v3.s[2]
346        FMLA v16.4s, v26.4s, v0.s[3]
347        FMLA v17.4s, v27.4s, v0.s[3]
348        FMLA v18.4s, v26.4s, v1.s[3]
349        FMLA v19.4s, v27.4s, v1.s[3]
350        FMLA v28.4s, v26.4s, v2.s[3]
351        FMLA v29.4s, v27.4s, v2.s[3]
352        FMLA v30.4s, v26.4s, v3.s[3]
353        FMLA v31.4s, v27.4s, v3.s[3]
354
3554:
356        # Remainder- 2 floats of A (8 bytes)
357        TBZ x0, 3, 5f
358
359        LDR d0,  [x3], 8
360        LDP q20, q21, [x5], 32
361        LDR d1, [x11], 8
362        LDR d2, [x12], 8
363        LDR d3,  [x4], 8
364        FMLA v16.4s, v20.4s, v0.s[0]
365        FMLA v17.4s, v21.4s, v0.s[0]
366        LDP q22, q23, [x5], 32
367        FMLA v18.4s, v20.4s, v1.s[0]
368        FMLA v19.4s, v21.4s, v1.s[0]
369        FMLA v28.4s, v20.4s, v2.s[0]
370        FMLA v29.4s, v21.4s, v2.s[0]
371        FMLA v30.4s, v20.4s, v3.s[0]
372        FMLA v31.4s, v21.4s, v3.s[0]
373        FMLA v16.4s, v22.4s, v0.s[1]
374        FMLA v17.4s, v23.4s, v0.s[1]
375        FMLA v18.4s, v22.4s, v1.s[1]
376        FMLA v19.4s, v23.4s, v1.s[1]
377        FMLA v28.4s, v22.4s, v2.s[1]
378        FMLA v29.4s, v23.4s, v2.s[1]
379        FMLA v30.4s, v22.4s, v3.s[1]
380        FMLA v31.4s, v23.4s, v3.s[1]
381
3825:
383        # Remainder- 1 float of A (4 bytes)
384        TBZ x0, 2, 6f
385
386        LDR s0,  [x3], 4
387        LDP q20, q21, [x5], 32
388        LDR s1, [x11], 4
389        LDR s2, [x12], 4
390        LDR s3,  [x4], 4
391        FMLA v16.4s, v20.4s, v0.s[0]
392        FMLA v17.4s, v21.4s, v0.s[0]
393        FMLA v18.4s, v20.4s, v1.s[0]
394        FMLA v19.4s, v21.4s, v1.s[0]
395        FMLA v28.4s, v20.4s, v2.s[0]
396        FMLA v29.4s, v21.4s, v2.s[0]
397        FMLA v30.4s, v20.4s, v3.s[0]
398        FMLA v31.4s, v21.4s, v3.s[0]
399
4006:
401        # Clamp: FMIN against v4 bounds results from above, FMAX against v5
        # bounds them from below.
402        FMIN v16.4s, v16.4s, v4.4s
403        SUBS x1, x1, 8            // nc -= 8; flags are consumed by B.LO 7f and B.HI 0b below
404        FMIN v17.4s, v17.4s, v4.4s
405        FMIN v18.4s, v18.4s, v4.4s
406        FMIN v19.4s, v19.4s, v4.4s
407        FMIN v28.4s, v28.4s, v4.4s
408        FMIN v29.4s, v29.4s, v4.4s
409        FMIN v30.4s, v30.4s, v4.4s
410        FMIN v31.4s, v31.4s, v4.4s
411        FMAX v16.4s, v16.4s, v5.4s
412        FMAX v17.4s, v17.4s, v5.4s
413        FMAX v18.4s, v18.4s, v5.4s
414        FMAX v19.4s, v19.4s, v5.4s
415        FMAX v28.4s, v28.4s, v5.4s
416        FMAX v29.4s, v29.4s, v5.4s
417        FMAX v30.4s, v30.4s, v5.4s
418        FMAX v31.4s, v31.4s, v5.4s
419
420        # Store full 4 x 8 (skipped when fewer than 8 columns remained).
        # Each C pointer then advances by cn_stride and each A pointer is
        # rewound by kc for the next tile.
421        B.LO 7f
422
423        $if INC:
424          STP q30, q31,  [x7]
425          SUB  x3,  x3, x2 // a0 -= kc
426          ADD  x7,  x7, x14
427          STP q28, q29, [x10]
428          SUB x11, x11, x2 // a1 -= kc
429          ADD x10, x10, x14
430          STP q18, q19,  [x9]
431          SUB x12, x12, x2 // a2 -= kc
432          ADD  x9,  x9, x14
433          STP q16, q17,  [x6]
434          SUB  x4,  x4, x2 // a3 -= kc
435          ADD  x6,  x6, x14
436        $else:
437          STP q16, q17,  [x6]
438          SUB  x3,  x3, x2 // a0 -= kc
439          ADD  x6,  x6, x14
440          STP q18, q19,  [x9]
441          SUB x11, x11, x2 // a1 -= kc
442          ADD  x9,  x9, x14
443          STP q28, q29, [x10]
444          SUB x12, x12, x2 // a2 -= kc
445          ADD x10, x10, x14
446          STP q30, q31,  [x7]
447          SUB  x4,  x4, x2 // a3 -= kc
448          ADD  x7,  x7, x14
449
450        B.HI 0b                   // nc was > 8: more columns remain (flags from SUBS x1 above)
451
452        # Restore d8-d15 from stack
453        LDP d14, d15, [sp, 48]
454        LDP d12, d13, [sp, 32]
455        LDP d10, d11, [sp, 16]
456        LDP  d8,  d9, [sp], 64
457        RET
458
459        # Store odd width: x1 now holds nc - 8, whose low 3 bits equal those
        # of the remaining column count; bits 2, 1, 0 select 4-, 2- and
        # 1-float stores.
4607:
461        TBZ x1, 2, 8f
462        $if INC:
463          STR q30, [x7], 16
464          MOV v30.16b, v31.16b   // move the upper 4 columns down for the narrower stores
465          STR q28, [x10], 16
466          MOV v28.16b, v29.16b
467          STR q18, [x9], 16
468          MOV v18.16b, v19.16b
469          STR q16, [x6], 16
470          MOV v16.16b, v17.16b
471        $else:
472          STR q16, [x6], 16
473          MOV v16.16b, v17.16b   // move the upper 4 columns down for the narrower stores
474          STR q18, [x9], 16
475          MOV v18.16b, v19.16b
476          STR q28, [x10], 16
477          MOV v28.16b, v29.16b
478          STR q30, [x7], 16
479          MOV v30.16b, v31.16b
480
4818:
482        TBZ x1, 1, 9f
483        $if INC:
484          STR d30, [x7], 8
485          DUP d30, v30.d[1]      // bring the upper 2 floats into the low half
486          STR d28, [x10], 8
487          DUP d28, v28.d[1]
488          STR d18, [x9], 8
489          DUP d18, v18.d[1]
490          STR d16, [x6], 8
491          DUP d16, v16.d[1]
492        $else:
493          STR d16, [x6], 8
494          DUP d16, v16.d[1]      // bring the upper 2 floats into the low half
495          STR d18, [x9], 8
496          DUP d18, v18.d[1]
497          STR d28, [x10], 8
498          DUP d28, v28.d[1]
499          STR d30, [x7], 8
500          DUP d30, v30.d[1]
501
5029:
503        TBZ x1, 0, 10f
504        $if INC:
505          STR s30,  [x7]
506          STR s28, [x10]
507          STR s18,  [x9]
508          STR s16,  [x6]
509        $else:
510          STR s16,  [x6]
511          STR s18,  [x9]
512          STR s28, [x10]
513          STR s30,  [x7]
51410:
515        # Restore d8-d15 from stack
516        LDP d14, d15, [sp, 48]
517        LDP d12, d13, [sp, 32]
518        LDP d10, d11, [sp, 16]
519        LDP  d8,  d9, [sp], 64
520        RET
521
522
523END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
524
// On ELF targets, emit an empty .note.GNU-stack section (by GNU toolchain
// convention this marks the object as not requiring an executable stack).
525#ifdef __ELF__
526.section ".note.GNU-stack","",%progbits
527#endif
528