// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <xnnpack/assembly.h>
7
# void xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8
#
# Computes a 4 row x 8 column tile of float outputs per nc iteration:
# accumulators start from 8 floats read from w, then for each group of
# 4 A pointers (ks loop) kc bytes of each A row are multiplied against the
# packed B values that follow in w.  Results are clamped and stored.
#
# Register reuse:
#   x0 holds mr only while the C pointers are clamped; afterwards it is the
#      k byte counter (kc minus bytes consumed), whose low bits select the
#      4 / 2 / 1 float remainders.
#   x9 is the working copy of ks, decremented by 32 bytes per 4 A pointers.
#   x7 holds cm_stride on entry and becomes the c3 pointer.

# d8-d15 need to be preserved if used.  They are spilled below because
# v8-v15 hold B values.
# x19-30 need to be preserved if used.  Only x20 is used (a0) and saved.

# A pointers
# x20 a0
# x13 a1
# x14 a2
# x15 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

# Vector register usage
# A0  v0  v4
# A1  v1  v5
# A2  v2  v6
# A3  v3  v7
# B   v8  v9 v10 v11
# B  v12 v13 v14 v15
# B  v20 v21 v22 v23
# B  v24 v25 v26 v27
# C  v16 v17
# C  v18 v19
# C  v28 v29
# C  v30 v31
# Clamp v4 v5
#   v4/v5 are shared with the A1/A0 second-block loads, so the clamp values
#   are reloaded from x8 in the epilogue after the last use as A data.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load clamping_params values
        # v4 = first float at x8 (upper bound, applied with FMIN)
        # v5 = second float at x8 (lower bound, applied with FMAX)
        LD2R {v4.4s, v5.4s}, [x8]

        # Save x20 on stack (allocates an 80 byte frame: x20 at 0, d8-d15 at 16..79)
        STR x20, [sp, -80]!

        # Save d8-d15 on stack
        STP  d8,  d9, [sp, 16]
        STP d10, d11, [sp, 32]
        STP d12, d13, [sp, 48]
        STP d14, d15, [sp, 64]

        # Clamp C pointers
        # Rows at or beyond mr alias the previous row, so their stores land on
        # memory that a valid row also writes.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO     //   c3 = c2

0:
        # nc loop entry: one 4x8 output tile per iteration.
        # Load initial bias from w into accumulators
        # (the same 8 floats seed all 4 rows)
        LDP q16, q17, [x5], 32
        MOV v18.16b, v16.16b
        MOV v19.16b, v17.16b
        MOV v28.16b, v16.16b
        MOV v29.16b, v17.16b
        MOV v30.16b, v16.16b
        MOV v31.16b, v17.16b

        MOV x9, x3  // p = ks

1:
        # ks loop entry.
        # Load next 4 A pointers
        LDP x20, x13, [x4], 16
        LDP x14, x15, [x4], 16

        # A pointer equal to the zero buffer is used as is; any other pointer
        # gets a_offset added.  The ADD is done unconditionally and CSEL picks
        # the result, which keeps this sequence branch free.
        CMP x20, x12            // if a0 == zero
        ADD x20, x20, x11       // a0 += a_offset
        CSEL x20, x12, x20, EQ  //   a0 = zero, else a0 += a_offset
        CMP x13, x12            // if a1 == zero
        ADD x13, x13, x11       // a1 += a_offset
        CSEL x13, x12, x13, EQ  //   a1 = zero, else a1 += a_offset
        CMP x14, x12            // if a2 == zero
        ADD x14, x14, x11       // a2 += a_offset
        CSEL x14, x12, x14, EQ  //   a2 = zero, else a2 += a_offset
        CMP x15, x12            // if a3 == zero
        ADD x15, x15, x11       // a3 += a_offset
        CSEL x15, x12, x15, EQ  //   a3 = zero, else a3 += a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 stops being mr here and becomes the k byte counter.
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # 16 prologue
        # Read first block of 4 A and B.
        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        LDP q22, q23, [x5], 32
        LDP q24, q25, [x5], 32
        LDP q26, q27, [x5], 32

        # Are there at least 32 more bytes (8 floats) of A?  If so, run the
        # main loop; otherwise go straight to the epilogue.
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A
        # Loads for the following block are interleaved with the FMLAs of the
        # current block; keep the instruction order unless re-benchmarking.
2:
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 320]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, loads for the next 1st block of 4.
        FMLA v16.4s, v8.4s, v4.s[0]
        LDP q20, q21, [x5], 32
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        LDP q22, q23, [x5], 32
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        LDP q24, q25, [x5], 32
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        LDP q26, q27, [x5], 32
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        LDR q0, [x20], 16
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        LDR q1, [x13], 16
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        LDR q2, [x14], 16
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        LDR q3, [x15], 16
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]
        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

        B.HS 2b

3:
        # Epilogue
        # Consumes the last 8 floats of A that the prologue / main loop set up.
        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
        FMLA v16.4s, v20.4s, v0.s[0]
        LDP q8, q9, [x5], 32
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        LDP q10, q11, [x5], 32
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        LDP q12, q13, [x5], 32
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        LDP q14, q15, [x5], 32
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        LDR q4, [x20], 16
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        LDR q5, [x13], 16
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        LDR q6, [x14], 16
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        LDR q7, [x15], 16
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

        # Second block of 4.  FMA for second 4, no loads
        FMLA v16.4s, v8.4s, v4.s[0]
        FMLA v17.4s, v9.4s, v4.s[0]
        FMLA v18.4s, v8.4s, v5.s[0]
        FMLA v19.4s, v9.4s, v5.s[0]
        FMLA v28.4s, v8.4s, v6.s[0]
        FMLA v29.4s, v9.4s, v6.s[0]
        FMLA v30.4s, v8.4s, v7.s[0]
        FMLA v31.4s, v9.4s, v7.s[0]
        FMLA v16.4s, v10.4s, v4.s[1]
        FMLA v17.4s, v11.4s, v4.s[1]
        FMLA v18.4s, v10.4s, v5.s[1]
        FMLA v19.4s, v11.4s, v5.s[1]
        FMLA v28.4s, v10.4s, v6.s[1]
        FMLA v29.4s, v11.4s, v6.s[1]
        FMLA v30.4s, v10.4s, v7.s[1]
        FMLA v31.4s, v11.4s, v7.s[1]
        FMLA v16.4s, v12.4s, v4.s[2]
        FMLA v17.4s, v13.4s, v4.s[2]
        FMLA v18.4s, v12.4s, v5.s[2]
        FMLA v19.4s, v13.4s, v5.s[2]
        FMLA v28.4s, v12.4s, v6.s[2]
        FMLA v29.4s, v13.4s, v6.s[2]
        FMLA v30.4s, v12.4s, v7.s[2]
        FMLA v31.4s, v13.4s, v7.s[2]

        # Last uses of v4 / v5 as A data come first so the clamp values can be
        # reloaded into them while the remaining FMLAs (v6 / v7) execute.
        FMLA v16.4s, v14.4s, v4.s[3]
        FMLA v17.4s, v15.4s, v4.s[3]
        FMLA v18.4s, v14.4s, v5.s[3]
        FMLA v19.4s, v15.4s, v5.s[3]

        # Load clamping_params values
        LD2R {v4.4s, v5.4s}, [x8]

        FMLA v28.4s, v14.4s, v6.s[3]
        FMLA v29.4s, v15.4s, v6.s[3]
        FMLA v30.4s, v14.4s, v7.s[3]
        FMLA v31.4s, v15.4s, v7.s[3]

4:
        # Remainder- 4 floats of A
        # x0 may be negative here; only bit 4 (16 bytes left) is tested.
        TBZ x0, 4, 5f

        LDR q0, [x20], 16
        LDP q20, q21, [x5], 32
        LDR q1, [x13], 16
        LDR q2, [x14], 16
        LDR q3, [x15], 16
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        LDP q24, q25, [x5], 32
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        LDP q26, q27, [x5], 32
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]
        FMLA v16.4s, v24.4s, v0.s[2]
        FMLA v17.4s, v25.4s, v0.s[2]
        FMLA v18.4s, v24.4s, v1.s[2]
        FMLA v19.4s, v25.4s, v1.s[2]
        FMLA v28.4s, v24.4s, v2.s[2]
        FMLA v29.4s, v25.4s, v2.s[2]
        FMLA v30.4s, v24.4s, v3.s[2]
        FMLA v31.4s, v25.4s, v3.s[2]
        FMLA v16.4s, v26.4s, v0.s[3]
        FMLA v17.4s, v27.4s, v0.s[3]
        FMLA v18.4s, v26.4s, v1.s[3]
        FMLA v19.4s, v27.4s, v1.s[3]
        FMLA v28.4s, v26.4s, v2.s[3]
        FMLA v29.4s, v27.4s, v2.s[3]
        FMLA v30.4s, v26.4s, v3.s[3]
        FMLA v31.4s, v27.4s, v3.s[3]

5:
        # Remainder- 2 floats of A
        # Bit 3 of x0 set means 8 bytes left.
        TBZ x0, 3, 6f

        LDR d0, [x20], 8
        LDP q20, q21, [x5], 32
        LDR d1, [x13], 8
        LDR d2, [x14], 8
        LDR d3, [x15], 8
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        LDP q22, q23, [x5], 32
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]
        FMLA v16.4s, v22.4s, v0.s[1]
        FMLA v17.4s, v23.4s, v0.s[1]
        FMLA v18.4s, v22.4s, v1.s[1]
        FMLA v19.4s, v23.4s, v1.s[1]
        FMLA v28.4s, v22.4s, v2.s[1]
        FMLA v29.4s, v23.4s, v2.s[1]
        FMLA v30.4s, v22.4s, v3.s[1]
        FMLA v31.4s, v23.4s, v3.s[1]

6:
        # Remainder- 1 float of A
        # Bit 2 of x0 set means 4 bytes left.
        TBZ x0, 2, 7f

        LDR s0, [x20], 4
        LDP q20, q21, [x5], 32
        LDR s1, [x13], 4
        LDR s2, [x14], 4
        LDR s3, [x15], 4
        FMLA v16.4s, v20.4s, v0.s[0]
        FMLA v17.4s, v21.4s, v0.s[0]
        FMLA v18.4s, v20.4s, v1.s[0]
        FMLA v19.4s, v21.4s, v1.s[0]
        FMLA v28.4s, v20.4s, v2.s[0]
        FMLA v29.4s, v21.4s, v2.s[0]
        FMLA v30.4s, v20.4s, v3.s[0]
        FMLA v31.4s, v21.4s, v3.s[0]

7:
        # ks loop
        SUBS x9, x9, 32  // p -= MR * sizeof(void*)  (4 pointers of 8 bytes)
        B.NE 1b

        # Clamp
        # v4 is the upper bound, v5 the lower bound.
        FMIN v16.4s, v16.4s, v4.4s
        FMIN v17.4s, v17.4s, v4.4s
        FMIN v18.4s, v18.4s, v4.4s
        FMIN v19.4s, v19.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v16.4s, v16.4s, v5.4s
        FMAX v17.4s, v17.4s, v5.4s
        FMAX v18.4s, v18.4s, v5.4s
        FMAX v19.4s, v19.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 8
        # Rows are written c3 first and c0 last, so when rows alias (mr < 4)
        # the lowest valid row is the final writer.
        SUBS x1, x1, 8
        B.LO 8f

        STP q30, q31,  [x7]
        ADD  x7,  x7, x10
        STP q28, q29, [x17]
        ADD x17, x17, x10
        STP q18, q19, [x16]
        ADD x16, x16, x10
        STP q16, q17,  [x6]
        ADD  x6,  x6, x10

        SUB x4, x4, x3  // a -= ks

        # nc loop
        # Flags are still those of SUBS x1, x1, 8: ADD / SUB above do not set them.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP  d8,  d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

        # Store odd width
        # x1 is negative here (nc - 8); its low 3 bits equal the columns left.
8:
        TBZ x1, 2, 9f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x17], 16
        MOV v28.16b, v29.16b
        STR q18, [x16], 16
        MOV v18.16b, v19.16b
        STR q16, [x6], 16
        MOV v16.16b, v17.16b

9:
        TBZ x1, 1, 10f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x17], 8
        DUP d28, v28.d[1]
        STR d18, [x16], 8
        DUP d18, v18.d[1]
        STR d16, [x6], 8
        DUP d16, v16.d[1]

10:
        TBZ x1, 0, 11f
        STR s30,  [x7]
        STR s28, [x17]
        STR s18, [x16]
        STR s16,  [x6]
11:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 64]
        LDP d12, d13, [sp, 48]
        LDP d10, d11, [sp, 32]
        LDP  d8,  d9, [sp, 16]

        # Restore x20 from stack
        LDR x20, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
496
// On ELF targets emit an empty .note.GNU-stack section, which tells the GNU
// toolchain this object does not need an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
500