• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2019 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6#include <xnnpack/assembly.h>
7
8# void xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
9#     size_t mr,                         x0
10#     size_t nc,                         x1
11#     size_t kc,                         x2 / x0
12#     size_t ks,                         x3 / x9
13#     const float**restrict a,           x4
14#     const float*restrict w,            x5
15#     float*restrict c,                  x6
16#     size_t cm_stride,                  x7
17#     size_t cn_stride,                  [sp] -> x10
18#     size_t a_offset,                   [sp + 8] -> x11
19#     const float* zero,                 [sp + 16] -> x12
20#     const xnn_f32_minmax_params params [sp + 24] -> x8
21
22# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
23
24# A pointers
25# x20 a0
26# x13 a1
27# x14 a2
28# x15 a3
29
30# C pointers
31# x6  c0
32# x16 c1
33# x17 c2
34# x7  c3 / cm_stride
35
36# Vector register usage
37# A0  v0  v4
38# A1  v1  v5
39# A2  v2  v6
40# A3  v3  v7
41# B   v8  v9 v10 v11
42# B  v12 v13 v14 v15
43# B  v20 v21 v22 v23
44# B  v24 v25 v26 v27
45# C  v16 v17
46# C  v18 v19
47# C  v28 v29
48# C  v30 v31
49# Clamp v4 (min, applied with FMAX)  v5 (max, applied with FMIN)
50
# Indirect GEMM micro-kernel producing a tile of up to 4 rows x 8 columns of C.
# Loop structure (numeric local labels):
#   0: nc loop  - one pass per block of 8 output columns; loads bias from w.
#   1: ks loop  - one pass per group of 4 A pointers (32 bytes of the a array).
#   2: main loop over k, 8 floats (32 bytes) of each A row per iteration.
#   3: epilogue - last 8 floats of the software-pipelined main loop.
#   4/5/6: k remainders of 4, 2 and 1 floats.
#   7: end of ks pass; after the last one: clamp and store.
#   8-11: partial-width store (fewer than 8 columns left) and return.
# w (x5) is only ever post-incremented; a (x4) is rewound by ks after each
# full-width store.
51BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
52
53        # Load cn_stride, a_offset
        # (stack arguments are read here, before sp is moved by the save below)
54        LDP x10, x11, [sp]
55
56        # Load zero, params pointer
57        LDP x12, x8, [sp, 16]
58
59        # Load min/max values
        # LD2R reads two consecutive floats at [x8]: the first is broadcast to
        # all lanes of v4, the second to all lanes of v5.
60        LD2R {v4.4s, v5.4s}, [x8]
61
62        # Save x20 on stack
        # Pre-indexed store: sp -= 80, x20 at [sp], d8-d15 at [sp + 16 .. sp + 79].
63        STR x20, [sp, -80]!
64
65        # Save d8-d15 on stack
66        STP  d8,  d9, [sp, 16]
67        STP d10, d11, [sp, 32]
68        STP d12, d13, [sp, 48]
69        STP d14, d15, [sp, 64]
70
71        # Clamp C pointers
        # Rows at or beyond mr alias the previous row's pointer.
        # ADD does not set flags, so the flags of the first CMP serve both the
        # LO (mr < 2) and the LS (mr <= 2) selects.
72        CMP x0, 2                // if mr < 2
73        ADD x16, x6, x7          // c1 = c0 + cm_stride
74        CSEL x16, x6, x16, LO    //   c1 = c0
75
76        ADD x17, x16, x7         // c2 = c1 + cm_stride
77                                 // if mr <= 2
78        CSEL x17, x16, x17, LS   //   c2 = c1
79
80        CMP x0, 4                // if mr < 4
81        ADD x7, x17, x7          // c3 = c2 + cm_stride
82        CSEL x7, x17, x7, LO     //   c3 = c2
83
        # nc loop entry: one pass per block of 8 output columns.
840:
85        # Load initial bias from w into accumulators
        # The same 8 bias floats seed all four rows of accumulators.
86        LDP q16, q17, [x5], 32
87        MOV v18.16b, v16.16b
88        MOV v19.16b, v17.16b
89        MOV v28.16b, v16.16b
90        MOV v29.16b, v17.16b
91        MOV v30.16b, v16.16b
92        MOV v31.16b, v17.16b
93
94        MOV x9, x3  // p = ks
95
        # ks loop entry: one pass per group of 4 A pointers.
961:
97        # Load next 4 A pointers
98        LDP x20, x13, [x4], 16
99        LDP x14, x15, [x4], 16
100
        # A pointer equal to `zero` is used as is; any other pointer gets
        # a_offset added.
101        CMP x20, x12            // if a0 == zero
102        ADD x20, x20, x11       // a0 += a_offset
103        CSEL x20, x12, x20, EQ  //   a0 = zero, else a0 + a_offset
104        CMP x13, x12            // if a1 == zero
105        ADD x13, x13, x11       // a1 += a_offset
106        CSEL x13, x12, x13, EQ  //   a1 = zero, else a1 + a_offset
107        CMP x14, x12            // if a2 == zero
108        ADD x14, x14, x11       // a2 += a_offset
109        CSEL x14, x12, x14, EQ  //   a2 = zero, else a2 + a_offset
110        CMP x15, x12            // if a3 == zero
111        ADD x15, x15, x11       // a3 += a_offset
112        CSEL x15, x12, x15, EQ  //   a3 = zero, else a3 + a_offset
113
114        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # x0 (mr) is no longer needed and is reused as the k byte counter.
115        SUBS x0, x2, 32  // k = kc - 32
116        B.LO 4f
117
118        # Prologue: preload the first 4 floats (16 bytes) of each A row
119        # and the matching 4 x 8 block of B (q20-q27).
120        LDR q0, [x20], 16
121        LDP q20, q21, [x5], 32
122        LDR q1, [x13], 16
123        LDR q2, [x14], 16
124        LDR q3, [x15], 16
125        LDP q22, q23, [x5], 32
126        LDP q24, q25, [x5], 32
127        LDP q26, q27, [x5], 32
128
129        # Are there at least 32 more bytes of k?  If yes run the main loop, else go straight to the epilogue
130        SUBS x0, x0, 32
131        B.LO 3f
132
133        # Main loop - 8 floats of A
        # v4-v7 receive A data here, overwriting the min/max values in v4/v5;
        # the epilogue reloads them.  The PRFM lines are emitted only when the
        # template variable PREFETCH is set.
1342:
135        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
136        FMLA v16.4s, v20.4s, v0.s[0]
137        LDP q8, q9, [x5], 32
138        FMLA v17.4s, v21.4s, v0.s[0]
139        FMLA v18.4s, v20.4s, v1.s[0]
140        LDP q10, q11, [x5], 32
141        FMLA v19.4s, v21.4s, v1.s[0]
142        FMLA v28.4s, v20.4s, v2.s[0]
143        LDP q12, q13, [x5], 32
144        FMLA v29.4s, v21.4s, v2.s[0]
145        FMLA v30.4s, v20.4s, v3.s[0]
146        LDP q14, q15, [x5], 32
147        FMLA v31.4s, v21.4s, v3.s[0]
148        FMLA v16.4s, v22.4s, v0.s[1]
149        LDR q4, [x20], 16
150        FMLA v17.4s, v23.4s, v0.s[1]
151        FMLA v18.4s, v22.4s, v1.s[1]
152        LDR q5, [x13], 16
153        FMLA v19.4s, v23.4s, v1.s[1]
154        FMLA v28.4s, v22.4s, v2.s[1]
155        LDR q6, [x14], 16
156        FMLA v29.4s, v23.4s, v2.s[1]
157        FMLA v30.4s, v22.4s, v3.s[1]
158        LDR q7, [x15], 16
159        FMLA v31.4s, v23.4s, v3.s[1]
160        FMLA v16.4s, v24.4s, v0.s[2]
161        $if PREFETCH:
162          PRFM PLDL1KEEP, [x5, 128]
163        FMLA v17.4s, v25.4s, v0.s[2]
164        FMLA v18.4s, v24.4s, v1.s[2]
165        $if PREFETCH:
166          PRFM PLDL1KEEP, [x5, 192]
167        FMLA v19.4s, v25.4s, v1.s[2]
168        FMLA v28.4s, v24.4s, v2.s[2]
169        $if PREFETCH:
170          PRFM PLDL1KEEP, [x5, 256]
171        FMLA v29.4s, v25.4s, v2.s[2]
172        FMLA v30.4s, v24.4s, v3.s[2]
173        $if PREFETCH:
174          PRFM PLDL1KEEP, [x5, 320]
175        FMLA v31.4s, v25.4s, v3.s[2]
176        FMLA v16.4s, v26.4s, v0.s[3]
177        FMLA v17.4s, v27.4s, v0.s[3]
178        FMLA v18.4s, v26.4s, v1.s[3]
179        FMLA v19.4s, v27.4s, v1.s[3]
180        FMLA v28.4s, v26.4s, v2.s[3]
181        FMLA v29.4s, v27.4s, v2.s[3]
182        FMLA v30.4s, v26.4s, v3.s[3]
183        FMLA v31.4s, v27.4s, v3.s[3]
184
185        # Second block of 4.  FMA for second 4, loads for 1st block of 4 of the next pass.
186        FMLA v16.4s, v8.4s, v4.s[0]
187        LDP q20, q21, [x5], 32
188        FMLA v17.4s, v9.4s, v4.s[0]
189        FMLA v18.4s, v8.4s, v5.s[0]
190        LDP q22, q23, [x5], 32
191        FMLA v19.4s, v9.4s, v5.s[0]
192        FMLA v28.4s, v8.4s, v6.s[0]
193        LDP q24, q25, [x5], 32
194        FMLA v29.4s, v9.4s, v6.s[0]
195        FMLA v30.4s, v8.4s, v7.s[0]
196        LDP q26, q27, [x5], 32
197        FMLA v31.4s, v9.4s, v7.s[0]
198        FMLA v16.4s, v10.4s, v4.s[1]
199        LDR q0, [x20], 16
200        FMLA v17.4s, v11.4s, v4.s[1]
201        FMLA v18.4s, v10.4s, v5.s[1]
202        LDR q1, [x13], 16
203        FMLA v19.4s, v11.4s, v5.s[1]
204        FMLA v28.4s, v10.4s, v6.s[1]
205        LDR q2, [x14], 16
206        FMLA v29.4s, v11.4s, v6.s[1]
207        FMLA v30.4s, v10.4s, v7.s[1]
208        LDR q3, [x15], 16
209        FMLA v31.4s, v11.4s, v7.s[1]
210        FMLA v16.4s, v12.4s, v4.s[2]
211        FMLA v17.4s, v13.4s, v4.s[2]
212        FMLA v18.4s, v12.4s, v5.s[2]
213        FMLA v19.4s, v13.4s, v5.s[2]
214        FMLA v28.4s, v12.4s, v6.s[2]
215        FMLA v29.4s, v13.4s, v6.s[2]
216        FMLA v30.4s, v12.4s, v7.s[2]
217        FMLA v31.4s, v13.4s, v7.s[2]
218        FMLA v16.4s, v14.4s, v4.s[3]
219        FMLA v17.4s, v15.4s, v4.s[3]
220        FMLA v18.4s, v14.4s, v5.s[3]
221        FMLA v19.4s, v15.4s, v5.s[3]
222        FMLA v28.4s, v14.4s, v6.s[3]
223        FMLA v29.4s, v15.4s, v6.s[3]
224        SUBS x0, x0, 32          // k -= 32; flags are consumed by B.HS below
225        FMLA v30.4s, v14.4s, v7.s[3]
226        FMLA v31.4s, v15.4s, v7.s[3]
227
228        B.HS 2b
229
2303:
231        # Epilogue
        # Consumes the block preloaded by the prologue or the last main-loop
        # pass, plus one more block of 4 that is loaded here.
232        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
233        FMLA v16.4s, v20.4s, v0.s[0]
234        LDP q8, q9, [x5], 32
235        FMLA v17.4s, v21.4s, v0.s[0]
236        FMLA v18.4s, v20.4s, v1.s[0]
237        LDP q10, q11, [x5], 32
238        FMLA v19.4s, v21.4s, v1.s[0]
239        FMLA v28.4s, v20.4s, v2.s[0]
240        LDP q12, q13, [x5], 32
241        FMLA v29.4s, v21.4s, v2.s[0]
242        FMLA v30.4s, v20.4s, v3.s[0]
243        LDP q14, q15, [x5], 32
244        FMLA v31.4s, v21.4s, v3.s[0]
245        FMLA v16.4s, v22.4s, v0.s[1]
246        LDR q4, [x20], 16
247        FMLA v17.4s, v23.4s, v0.s[1]
248        FMLA v18.4s, v22.4s, v1.s[1]
249        LDR q5, [x13], 16
250        FMLA v19.4s, v23.4s, v1.s[1]
251        FMLA v28.4s, v22.4s, v2.s[1]
252        LDR q6, [x14], 16
253        FMLA v29.4s, v23.4s, v2.s[1]
254        FMLA v30.4s, v22.4s, v3.s[1]
255        LDR q7, [x15], 16
256        FMLA v31.4s, v23.4s, v3.s[1]
257        FMLA v16.4s, v24.4s, v0.s[2]
258        FMLA v17.4s, v25.4s, v0.s[2]
259        FMLA v18.4s, v24.4s, v1.s[2]
260        FMLA v19.4s, v25.4s, v1.s[2]
261        FMLA v28.4s, v24.4s, v2.s[2]
262        FMLA v29.4s, v25.4s, v2.s[2]
263        FMLA v30.4s, v24.4s, v3.s[2]
264        FMLA v31.4s, v25.4s, v3.s[2]
265        FMLA v16.4s, v26.4s, v0.s[3]
266        FMLA v17.4s, v27.4s, v0.s[3]
267        FMLA v18.4s, v26.4s, v1.s[3]
268        FMLA v19.4s, v27.4s, v1.s[3]
269        FMLA v28.4s, v26.4s, v2.s[3]
270        FMLA v29.4s, v27.4s, v2.s[3]
271        FMLA v30.4s, v26.4s, v3.s[3]
272        FMLA v31.4s, v27.4s, v3.s[3]
273
274        # Second block of 4.  FMA for second 4, noloads
275        FMLA v16.4s, v8.4s, v4.s[0]
276        FMLA v17.4s, v9.4s, v4.s[0]
277        FMLA v18.4s, v8.4s, v5.s[0]
278        FMLA v19.4s, v9.4s, v5.s[0]
279        FMLA v28.4s, v8.4s, v6.s[0]
280        FMLA v29.4s, v9.4s, v6.s[0]
281        FMLA v30.4s, v8.4s, v7.s[0]
282        FMLA v31.4s, v9.4s, v7.s[0]
283        FMLA v16.4s, v10.4s, v4.s[1]
284        FMLA v17.4s, v11.4s, v4.s[1]
285        FMLA v18.4s, v10.4s, v5.s[1]
286        FMLA v19.4s, v11.4s, v5.s[1]
287        FMLA v28.4s, v10.4s, v6.s[1]
288        FMLA v29.4s, v11.4s, v6.s[1]
289        FMLA v30.4s, v10.4s, v7.s[1]
290        FMLA v31.4s, v11.4s, v7.s[1]
291        FMLA v16.4s, v12.4s, v4.s[2]
292        FMLA v17.4s, v13.4s, v4.s[2]
293        FMLA v18.4s, v12.4s, v5.s[2]
294        FMLA v19.4s, v13.4s, v5.s[2]
295        FMLA v28.4s, v12.4s, v6.s[2]
296        FMLA v29.4s, v13.4s, v6.s[2]
297        FMLA v30.4s, v12.4s, v7.s[2]
298        FMLA v31.4s, v13.4s, v7.s[2]
299
300        FMLA v16.4s, v14.4s, v4.s[3]
301        FMLA v17.4s, v15.4s, v4.s[3]
302        FMLA v18.4s, v14.4s, v5.s[3]
303        FMLA v19.4s, v15.4s, v5.s[3]
304
305        # Load min/max values
        # The four FMLAs above are the last readers of v4/v5 as A data, so the
        # clamp bounds can be restored here; the FMLAs below only read v6/v7.
306        LD2R {v4.4s, v5.4s}, [x8]
307
308        FMLA v28.4s, v14.4s, v6.s[3]
309        FMLA v29.4s, v15.4s, v6.s[3]
310        FMLA v30.4s, v14.4s, v7.s[3]
311        FMLA v31.4s, v15.4s, v7.s[3]
312
3134:
314        # Remainder- 4 floats of A
        # x0 is negative here, but only multiples of 32 were subtracted from
        # kc, so bits 4, 3 and 2 of x0 still equal those of kc.
315        TBZ x0, 4, 5f
316
317        LDR q0, [x20], 16
318        LDP q20, q21, [x5], 32
319        LDR q1, [x13], 16
320        LDR q2, [x14], 16
321        LDR q3, [x15], 16
322        FMLA v16.4s, v20.4s, v0.s[0]
323        FMLA v17.4s, v21.4s, v0.s[0]
324        LDP q22, q23, [x5], 32
325        FMLA v18.4s, v20.4s, v1.s[0]
326        FMLA v19.4s, v21.4s, v1.s[0]
327        LDP q24, q25, [x5], 32
328        FMLA v28.4s, v20.4s, v2.s[0]
329        FMLA v29.4s, v21.4s, v2.s[0]
330        LDP q26, q27, [x5], 32
331        FMLA v30.4s, v20.4s, v3.s[0]
332        FMLA v31.4s, v21.4s, v3.s[0]
333        FMLA v16.4s, v22.4s, v0.s[1]
334        FMLA v17.4s, v23.4s, v0.s[1]
335        FMLA v18.4s, v22.4s, v1.s[1]
336        FMLA v19.4s, v23.4s, v1.s[1]
337        FMLA v28.4s, v22.4s, v2.s[1]
338        FMLA v29.4s, v23.4s, v2.s[1]
339        FMLA v30.4s, v22.4s, v3.s[1]
340        FMLA v31.4s, v23.4s, v3.s[1]
341        FMLA v16.4s, v24.4s, v0.s[2]
342        FMLA v17.4s, v25.4s, v0.s[2]
343        FMLA v18.4s, v24.4s, v1.s[2]
344        FMLA v19.4s, v25.4s, v1.s[2]
345        FMLA v28.4s, v24.4s, v2.s[2]
346        FMLA v29.4s, v25.4s, v2.s[2]
347        FMLA v30.4s, v24.4s, v3.s[2]
348        FMLA v31.4s, v25.4s, v3.s[2]
349        FMLA v16.4s, v26.4s, v0.s[3]
350        FMLA v17.4s, v27.4s, v0.s[3]
351        FMLA v18.4s, v26.4s, v1.s[3]
352        FMLA v19.4s, v27.4s, v1.s[3]
353        FMLA v28.4s, v26.4s, v2.s[3]
354        FMLA v29.4s, v27.4s, v2.s[3]
355        FMLA v30.4s, v26.4s, v3.s[3]
356        FMLA v31.4s, v27.4s, v3.s[3]
357
3585:
359        # Remainder- 2 floats of A
360        TBZ x0, 3, 6f
361
362        LDR d0, [x20], 8
363        LDP q20, q21, [x5], 32
364        LDR d1, [x13], 8
365        LDR d2, [x14], 8
366        LDR d3, [x15], 8
367        FMLA v16.4s, v20.4s, v0.s[0]
368        FMLA v17.4s, v21.4s, v0.s[0]
369        LDP q22, q23, [x5], 32
370        FMLA v18.4s, v20.4s, v1.s[0]
371        FMLA v19.4s, v21.4s, v1.s[0]
372        FMLA v28.4s, v20.4s, v2.s[0]
373        FMLA v29.4s, v21.4s, v2.s[0]
374        FMLA v30.4s, v20.4s, v3.s[0]
375        FMLA v31.4s, v21.4s, v3.s[0]
376        FMLA v16.4s, v22.4s, v0.s[1]
377        FMLA v17.4s, v23.4s, v0.s[1]
378        FMLA v18.4s, v22.4s, v1.s[1]
379        FMLA v19.4s, v23.4s, v1.s[1]
380        FMLA v28.4s, v22.4s, v2.s[1]
381        FMLA v29.4s, v23.4s, v2.s[1]
382        FMLA v30.4s, v22.4s, v3.s[1]
383        FMLA v31.4s, v23.4s, v3.s[1]
384
3856:
386        # Remainder- 1 float of A
387        TBZ x0, 2, 7f
388
389        LDR s0, [x20], 4
390        LDP q20, q21, [x5], 32
391        LDR s1, [x13], 4
392        LDR s2, [x14], 4
393        LDR s3, [x15], 4
394        FMLA v16.4s, v20.4s, v0.s[0]
395        FMLA v17.4s, v21.4s, v0.s[0]
396        FMLA v18.4s, v20.4s, v1.s[0]
397        FMLA v19.4s, v21.4s, v1.s[0]
398        FMLA v28.4s, v20.4s, v2.s[0]
399        FMLA v29.4s, v21.4s, v2.s[0]
400        FMLA v30.4s, v20.4s, v3.s[0]
401        FMLA v31.4s, v21.4s, v3.s[0]
402
4037:
404        # ks loop
        # 32 = 4 A pointers of 8 bytes each consumed per pass.
405        SUBS x9, x9, 32  // ks -= MR * sizeof(void*)
406        B.HI 1b
407
408        # Clamp
        # v4 is the lower bound (FMAX), v5 the upper bound (FMIN).
409        FMAX v16.4s, v16.4s, v4.4s
410        FMAX v17.4s, v17.4s, v4.4s
411        FMAX v18.4s, v18.4s, v4.4s
412        FMAX v19.4s, v19.4s, v4.4s
413        FMAX v28.4s, v28.4s, v4.4s
414        FMAX v29.4s, v29.4s, v4.4s
415        FMAX v30.4s, v30.4s, v4.4s
416        FMAX v31.4s, v31.4s, v4.4s
417        FMIN v16.4s, v16.4s, v5.4s
418        FMIN v17.4s, v17.4s, v5.4s
419        FMIN v18.4s, v18.4s, v5.4s
420        FMIN v19.4s, v19.4s, v5.4s
421        FMIN v28.4s, v28.4s, v5.4s
422        FMIN v29.4s, v29.4s, v5.4s
423        FMIN v30.4s, v30.4s, v5.4s
424        FMIN v31.4s, v31.4s, v5.4s
425
426        # Store full 4 x 8
        # The flags set by this SUBS are used twice: by B.LO right below and by
        # B.HI at the end of the store sequence (STP/ADD/SUB leave flags alone).
427        SUBS x1, x1, 8
428        B.LO 8f
429
        # Each C row pointer advances by cn_stride (x10) after its store.
430        STP q30, q31,  [x7]
431        ADD  x7,  x7, x10
432        STP q28, q29, [x17]
433        ADD x17, x17, x10
434        STP q18, q19, [x16]
435        ADD x16, x16, x10
436        STP q16, q17,  [x6]
437        ADD  x6,  x6, x10
438
        # Rewind the A pointer array for the next column block.
439        SUB x4, x4, x3  // a -= ks
440
441        # nc loop
442        B.HI 0b
443
444        # Restore d8-d15 from stack
445        LDP d14, d15, [sp, 64]
446        LDP d12, d13, [sp, 48]
447        LDP d10, d11, [sp, 32]
448        LDP  d8,  d9, [sp, 16]
449
450        # Restore x20 from stack
        # Post-indexed load also releases the 80-byte frame.
451        LDR x20, [sp], 80
452        RET
453
454        # Store odd width
        # x1 = nc - 8 is negative here; its low 3 bits still equal those of nc,
        # so bits 2, 1 and 0 select stores of 4, 2 and 1 columns.  After each
        # partial store the remaining columns are shifted down into the low
        # part of the first register of the row.
4558:
456        TBZ x1, 2, 9f
457        STR q30, [x7], 16
458        MOV v30.16b, v31.16b
459        STR q28, [x17], 16
460        MOV v28.16b, v29.16b
461        STR q18, [x16], 16
462        MOV v18.16b, v19.16b
463        STR q16, [x6], 16
464        MOV v16.16b, v17.16b
465
4669:
467        TBZ x1, 1, 10f
468        STR d30, [x7], 8
469        DUP d30, v30.d[1]
470        STR d28, [x17], 8
471        DUP d28, v28.d[1]
472        STR d18, [x16], 8
473        DUP d18, v18.d[1]
474        STR d16, [x6], 8
475        DUP d16, v16.d[1]
476
47710:
478        TBZ x1, 0, 11f
479        STR s30,  [x7]
480        STR s28, [x17]
481        STR s18, [x16]
482        STR s16,  [x6]
48311:
484        # Restore d8-d15 from stack
485        LDP d14, d15, [sp, 64]
486        LDP d12, d13, [sp, 48]
487        LDP d10, d11, [sp, 32]
488        LDP  d8,  d9, [sp, 16]
489
490        # Restore x20 from stack
491        LDR x20, [sp], 80
492        RET
493
494END_FUNCTION xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}
495
496#ifdef __ELF__
497.section ".note.GNU-stack","",%progbits
498#endif
499