• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
23
24# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
25
26# A pointers
27# x3  a0
28# x11 a1
29# x12 a2
30# x4  a3 / a_stride
31
32# C pointers
33# x6  c0
34# x9  c1
35# x10 c2
36# x7  c3 / cm_stride
37
38# Vector register usage
39# A0  v0  v4
40# A1  v1  v5
41# A2  v2  v6
42# A3  v3  v7
43# B   v8  v9 v10 v11
44# B  v12 v13 v14 v15
45# B  v20 v21 v22 v23
46# B  v24 v25 v26 v27
47# C  v16 v17
48# C  v18 v19
49# C  v28 v29
50# C  v30 v31
51# Clamp v4 v5
52
53BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75
        # One pass of the loop at label 0 produces a 4-row x 8-column tile of C:
        # the accumulators start from 8 bias floats read from w, every float of
        # A (one per row) scales 8 floats of B read from w via FMLA, and the
        # sums are clamped with FMAX/FMIN before being stored.  Rows at or
        # beyond mr reuse the previous row's A and C pointers.  x0 holds mr only
        # until the pointers are set up; afterwards it is the k byte counter.
54
55        # Load cn_stride, params pointer
56        LDP x14, x8, [sp]
57
58        # Load min/max values
59        LD2R {v4.4s, v5.4s}, [x8]  // v4 = lower bound (FMAX operand), v5 = upper bound (FMIN operand)
60
61        # Save d8-d15 on stack
62        STP  d8,  d9, [sp, -64]!  // pre-decrement sp by 64 bytes for the four pairs
63        STP d10, d11, [sp, 16]
64        STP d12, d13, [sp, 32]
65        STP d14, d15, [sp, 48]
66
67        # Clamp A and C pointers
68        CMP x0, 2                // if mr < 2
69        ADD x11, x3, x4          // a1 = a0 + a_stride
70        ADD x9, x6, x7           // c1 = c0 + cm_stride
71        CSEL x11, x3, x11, LO    //   a1 = a0
72        CSEL x9, x6, x9, LO      //   c1 = c0
73
74        ADD x12, x11, x4         // a2 = a1 + a_stride
75        ADD x10, x9, x7          // c2 = c1 + cm_stride
76                                 // if mr <= 2 (flags still from CMP x0, 2; ADD leaves them intact)
77        CSEL x12, x11, x12, LS   //   a2 = a1
78        CSEL x10, x9, x10, LS    //   c2 = c1
79
80        CMP x0, 4                // if mr < 4
81        ADD x4, x12, x4          // a3 = a2 + a_stride (x4 stops being a_stride here)
82        ADD x7, x10, x7          // c3 = c2 + cm_stride (x7 stops being cm_stride here)
83        CSEL x4, x12, x4, LO     //   a3 = a2
84        CSEL x7, x10, x7, LO     //   c3 = c2
85
860:
87        # Load initial bias from w into accumulators (all 4 rows start from the same 8 floats)
88        LDP q16, q17, [x5], 32
89        MOV v18.16b, v16.16b
90        MOV v19.16b, v17.16b
91        MOV v28.16b, v16.16b
92        MOV v29.16b, v17.16b
93        MOV v30.16b, v16.16b
94        MOV v31.16b, v17.16b
95
96        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
97        SUBS x0, x2, 32  // k = kc - 32
98        B.LO 3f          // kc < 32 bytes: only the remainder blocks run
99
100        # Prologue - 4 floats (16 bytes) of A per row
101        # Read first block of 4 A and B.
102        LDR q0,  [x3], 16
103        LDP q20, q21, [x5], 32
104        LDR q1, [x11], 16
105        LDR q2, [x12], 16
106        LDR q3,  [x4], 16
107        LDP q22, q23, [x5], 32
108        LDP q24, q25, [x5], 32
109        LDP q26, q27, [x5], 32
110
111        # Is there at least 32 more bytes?  If yes, do main loop
112        SUBS x0, x0, 32
113        B.LO 2f          // otherwise go straight to the epilogue
114
115        # Main loop - 8 floats of A (32 bytes)
1161:
117        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
118        FMLA v16.4s, v20.4s, v0.s[0]
119        LDP q8, q9, [x5], 32
120        FMLA v17.4s, v21.4s, v0.s[0]
121        FMLA v18.4s, v20.4s, v1.s[0]
122        LDP q10, q11, [x5], 32
123        FMLA v19.4s, v21.4s, v1.s[0]
124        FMLA v28.4s, v20.4s, v2.s[0]
125        LDP q12, q13, [x5], 32
126        FMLA v29.4s, v21.4s, v2.s[0]
127        FMLA v30.4s, v20.4s, v3.s[0]
128        LDP q14, q15, [x5], 32
129        FMLA v31.4s, v21.4s, v3.s[0]
130        FMLA v16.4s, v22.4s, v0.s[1]
131        LDR q4, [x3], 16  // overwrites the clamp value in v4; it is reloaded in the epilogue
132        FMLA v17.4s, v23.4s, v0.s[1]
133        FMLA v18.4s, v22.4s, v1.s[1]
134        LDR q5, [x11], 16  // overwrites the clamp value in v5; it is reloaded in the epilogue
135        FMLA v19.4s, v23.4s, v1.s[1]
136        FMLA v28.4s, v22.4s, v2.s[1]
137        LDR q6, [x12], 16
138        FMLA v29.4s, v23.4s, v2.s[1]
139        FMLA v30.4s, v22.4s, v3.s[1]
140        LDR q7, [x4], 16
141        FMLA v31.4s, v23.4s, v3.s[1]
142        FMLA v16.4s, v24.4s, v0.s[2]
143        PRFM PLDL1KEEP, [x5, 128]  // prefetch hints for w ahead of the current read pointer
144        FMLA v17.4s, v25.4s, v0.s[2]
145        FMLA v18.4s, v24.4s, v1.s[2]
146        PRFM PLDL1KEEP, [x5, 192]
147        FMLA v19.4s, v25.4s, v1.s[2]
148        FMLA v28.4s, v24.4s, v2.s[2]
149        PRFM PLDL1KEEP, [x5, 256]
150        FMLA v29.4s, v25.4s, v2.s[2]
151        FMLA v30.4s, v24.4s, v3.s[2]
152        PRFM PLDL1KEEP, [x5, 320]
153        FMLA v31.4s, v25.4s, v3.s[2]
154        FMLA v16.4s, v26.4s, v0.s[3]
155        FMLA v17.4s, v27.4s, v0.s[3]
156        FMLA v18.4s, v26.4s, v1.s[3]
157        FMLA v19.4s, v27.4s, v1.s[3]
158        FMLA v28.4s, v26.4s, v2.s[3]
159        FMLA v29.4s, v27.4s, v2.s[3]
160        FMLA v30.4s, v26.4s, v3.s[3]
161        FMLA v31.4s, v27.4s, v3.s[3]
162
163        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
164        FMLA v16.4s, v8.4s, v4.s[0]
165        LDP q20, q21, [x5], 32
166        FMLA v17.4s, v9.4s, v4.s[0]
167        FMLA v18.4s, v8.4s, v5.s[0]
168        LDP q22, q23, [x5], 32
169        FMLA v19.4s, v9.4s, v5.s[0]
170        FMLA v28.4s, v8.4s, v6.s[0]
171        LDP q24, q25, [x5], 32
172        FMLA v29.4s, v9.4s, v6.s[0]
173        FMLA v30.4s, v8.4s, v7.s[0]
174        LDP q26, q27, [x5], 32
175        FMLA v31.4s, v9.4s, v7.s[0]
176        FMLA v16.4s, v10.4s, v4.s[1]
177        LDR q0, [x3], 16
178        FMLA v17.4s, v11.4s, v4.s[1]
179        FMLA v18.4s, v10.4s, v5.s[1]
180        LDR q1, [x11], 16
181        FMLA v19.4s, v11.4s, v5.s[1]
182        FMLA v28.4s, v10.4s, v6.s[1]
183        LDR q2, [x12], 16
184        FMLA v29.4s, v11.4s, v6.s[1]
185        FMLA v30.4s, v10.4s, v7.s[1]
186        LDR q3, [x4], 16
187        FMLA v31.4s, v11.4s, v7.s[1]
188        FMLA v16.4s, v12.4s, v4.s[2]
189        FMLA v17.4s, v13.4s, v4.s[2]
190        FMLA v18.4s, v12.4s, v5.s[2]
191        FMLA v19.4s, v13.4s, v5.s[2]
192        FMLA v28.4s, v12.4s, v6.s[2]
193        FMLA v29.4s, v13.4s, v6.s[2]
194        FMLA v30.4s, v12.4s, v7.s[2]
195        FMLA v31.4s, v13.4s, v7.s[2]
196        FMLA v16.4s, v14.4s, v4.s[3]
197        FMLA v17.4s, v15.4s, v4.s[3]
198        FMLA v18.4s, v14.4s, v5.s[3]
199        FMLA v19.4s, v15.4s, v5.s[3]
200        FMLA v28.4s, v14.4s, v6.s[3]
201        FMLA v29.4s, v15.4s, v6.s[3]
202        SUBS x0, x0, 32  // k -= 32; the FMLAs below leave the flags intact for B.HS
203        FMLA v30.4s, v14.4s, v7.s[3]
204        FMLA v31.4s, v15.4s, v7.s[3]
205        B.HS 1b
206
2072:
208        # Epilogue - consumes the block already loaded plus one more block of 4
209        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
210        FMLA v16.4s, v20.4s, v0.s[0]
211        LDP q8, q9, [x5], 32
212        FMLA v17.4s, v21.4s, v0.s[0]
213        FMLA v18.4s, v20.4s, v1.s[0]
214        LDP q10, q11, [x5], 32
215        FMLA v19.4s, v21.4s, v1.s[0]
216        FMLA v28.4s, v20.4s, v2.s[0]
217        LDP q12, q13, [x5], 32
218        FMLA v29.4s, v21.4s, v2.s[0]
219        FMLA v30.4s, v20.4s, v3.s[0]
220        LDP q14, q15, [x5], 32
221        FMLA v31.4s, v21.4s, v3.s[0]
222        FMLA v16.4s, v22.4s, v0.s[1]
223        LDR q4, [x3], 16  // overwrites the clamp value in v4; reloaded by LD2R below
224        FMLA v17.4s, v23.4s, v0.s[1]
225        FMLA v18.4s, v22.4s, v1.s[1]
226        LDR q5, [x11], 16  // overwrites the clamp value in v5; reloaded by LD2R below
227        FMLA v19.4s, v23.4s, v1.s[1]
228        FMLA v28.4s, v22.4s, v2.s[1]
229        LDR q6, [x12], 16
230        FMLA v29.4s, v23.4s, v2.s[1]
231        FMLA v30.4s, v22.4s, v3.s[1]
232        LDR q7, [x4], 16
233        FMLA v31.4s, v23.4s, v3.s[1]
234        FMLA v16.4s, v24.4s, v0.s[2]
235        FMLA v17.4s, v25.4s, v0.s[2]
236        FMLA v18.4s, v24.4s, v1.s[2]
237        FMLA v19.4s, v25.4s, v1.s[2]
238        FMLA v28.4s, v24.4s, v2.s[2]
239        FMLA v29.4s, v25.4s, v2.s[2]
240        FMLA v30.4s, v24.4s, v3.s[2]
241        FMLA v31.4s, v25.4s, v3.s[2]
242        FMLA v16.4s, v26.4s, v0.s[3]
243        FMLA v17.4s, v27.4s, v0.s[3]
244        FMLA v18.4s, v26.4s, v1.s[3]
245        FMLA v19.4s, v27.4s, v1.s[3]
246        FMLA v28.4s, v26.4s, v2.s[3]
247        FMLA v29.4s, v27.4s, v2.s[3]
248        FMLA v30.4s, v26.4s, v3.s[3]
249        FMLA v31.4s, v27.4s, v3.s[3]
250
251        # Second block of 4.  FMA for second 4, no loads
252        FMLA v16.4s, v8.4s, v4.s[0]
253        FMLA v17.4s, v9.4s, v4.s[0]
254        FMLA v18.4s, v8.4s, v5.s[0]
255        FMLA v19.4s, v9.4s, v5.s[0]
256        FMLA v28.4s, v8.4s, v6.s[0]
257        FMLA v29.4s, v9.4s, v6.s[0]
258        FMLA v30.4s, v8.4s, v7.s[0]
259        FMLA v31.4s, v9.4s, v7.s[0]
260
261        FMLA v16.4s, v10.4s, v4.s[1]
262        FMLA v17.4s, v11.4s, v4.s[1]
263        FMLA v18.4s, v10.4s, v5.s[1]
264        FMLA v19.4s, v11.4s, v5.s[1]
265        FMLA v28.4s, v10.4s, v6.s[1]
266        FMLA v29.4s, v11.4s, v6.s[1]
267        FMLA v30.4s, v10.4s, v7.s[1]
268        FMLA v31.4s, v11.4s, v7.s[1]
269
270        FMLA v16.4s, v12.4s, v4.s[2]
271        FMLA v17.4s, v13.4s, v4.s[2]
272        FMLA v18.4s, v12.4s, v5.s[2]
273        FMLA v19.4s, v13.4s, v5.s[2]
274        FMLA v28.4s, v12.4s, v6.s[2]
275        FMLA v29.4s, v13.4s, v6.s[2]
276        FMLA v30.4s, v12.4s, v7.s[2]
277        FMLA v31.4s, v13.4s, v7.s[2]
278
279        FMLA v16.4s, v14.4s, v4.s[3]
280        FMLA v17.4s, v15.4s, v4.s[3]
281        FMLA v18.4s, v14.4s, v5.s[3]
282        FMLA v19.4s, v15.4s, v5.s[3]
283
284        # Reload min/max values: v4/v5 held A0/A1 above and are not read as A again below
285        LD2R {v4.4s, v5.4s}, [x8]
286
287        FMLA v28.4s, v14.4s, v6.s[3]
288        FMLA v29.4s, v15.4s, v6.s[3]
289        FMLA v30.4s, v14.4s, v7.s[3]
290        FMLA v31.4s, v15.4s, v7.s[3]
291
2923:
293        # Remainder- 4 floats of A (16 bytes)
294        TBZ x0, 4, 4f  // only multiples of 32 were subtracted from kc, so bit 4 of x0 is bit 4 of kc
295
296        LDR q0,  [x3], 16
297        LDP q20, q21, [x5], 32
298        LDR q1, [x11], 16
299        LDR q2, [x12], 16
300        LDR q3,  [x4], 16
301        FMLA v16.4s, v20.4s, v0.s[0]
302        FMLA v17.4s, v21.4s, v0.s[0]
303        LDP q22, q23, [x5], 32
304        FMLA v18.4s, v20.4s, v1.s[0]
305        FMLA v19.4s, v21.4s, v1.s[0]
306        LDP q24, q25, [x5], 32
307        FMLA v28.4s, v20.4s, v2.s[0]
308        FMLA v29.4s, v21.4s, v2.s[0]
309        LDP q26, q27, [x5], 32
310        FMLA v30.4s, v20.4s, v3.s[0]
311        FMLA v31.4s, v21.4s, v3.s[0]
312        FMLA v16.4s, v22.4s, v0.s[1]
313        FMLA v17.4s, v23.4s, v0.s[1]
314        FMLA v18.4s, v22.4s, v1.s[1]
315        FMLA v19.4s, v23.4s, v1.s[1]
316        FMLA v28.4s, v22.4s, v2.s[1]
317        FMLA v29.4s, v23.4s, v2.s[1]
318        FMLA v30.4s, v22.4s, v3.s[1]
319        FMLA v31.4s, v23.4s, v3.s[1]
320        FMLA v16.4s, v24.4s, v0.s[2]
321        FMLA v17.4s, v25.4s, v0.s[2]
322        FMLA v18.4s, v24.4s, v1.s[2]
323        FMLA v19.4s, v25.4s, v1.s[2]
324        FMLA v28.4s, v24.4s, v2.s[2]
325        FMLA v29.4s, v25.4s, v2.s[2]
326        FMLA v30.4s, v24.4s, v3.s[2]
327        FMLA v31.4s, v25.4s, v3.s[2]
328        FMLA v16.4s, v26.4s, v0.s[3]
329        FMLA v17.4s, v27.4s, v0.s[3]
330        FMLA v18.4s, v26.4s, v1.s[3]
331        FMLA v19.4s, v27.4s, v1.s[3]
332        FMLA v28.4s, v26.4s, v2.s[3]
333        FMLA v29.4s, v27.4s, v2.s[3]
334        FMLA v30.4s, v26.4s, v3.s[3]
335        FMLA v31.4s, v27.4s, v3.s[3]
336
3374:
338        # Remainder- 2 floats of A (8 bytes)
339        TBZ x0, 3, 5f  // skip unless bit 3 (8 bytes) of k is set
340
341        LDR d0,  [x3], 8
342        LDP q20, q21, [x5], 32
343        LDR d1, [x11], 8
344        LDR d2, [x12], 8
345        LDR d3,  [x4], 8
346        FMLA v16.4s, v20.4s, v0.s[0]
347        FMLA v17.4s, v21.4s, v0.s[0]
348        LDP q22, q23, [x5], 32
349        FMLA v18.4s, v20.4s, v1.s[0]
350        FMLA v19.4s, v21.4s, v1.s[0]
351        FMLA v28.4s, v20.4s, v2.s[0]
352        FMLA v29.4s, v21.4s, v2.s[0]
353        FMLA v30.4s, v20.4s, v3.s[0]
354        FMLA v31.4s, v21.4s, v3.s[0]
355        FMLA v16.4s, v22.4s, v0.s[1]
356        FMLA v17.4s, v23.4s, v0.s[1]
357        FMLA v18.4s, v22.4s, v1.s[1]
358        FMLA v19.4s, v23.4s, v1.s[1]
359        FMLA v28.4s, v22.4s, v2.s[1]
360        FMLA v29.4s, v23.4s, v2.s[1]
361        FMLA v30.4s, v22.4s, v3.s[1]
362        FMLA v31.4s, v23.4s, v3.s[1]
363
3645:
365        # Remainder- 1 float of A (4 bytes)
366        TBZ x0, 2, 6f  // skip unless bit 2 (4 bytes) of k is set
367
368        LDR s0,  [x3], 4
369        LDP q20, q21, [x5], 32
370        LDR s1, [x11], 4
371        LDR s2, [x12], 4
372        LDR s3,  [x4], 4
373        FMLA v16.4s, v20.4s, v0.s[0]
374        FMLA v17.4s, v21.4s, v0.s[0]
375        FMLA v18.4s, v20.4s, v1.s[0]
376        FMLA v19.4s, v21.4s, v1.s[0]
377        FMLA v28.4s, v20.4s, v2.s[0]
378        FMLA v29.4s, v21.4s, v2.s[0]
379        FMLA v30.4s, v20.4s, v3.s[0]
380        FMLA v31.4s, v21.4s, v3.s[0]
381
3826:
383        # Clamp
384        FMAX v16.4s, v16.4s, v4.4s
385        SUBS x1, x1, 8  // nc -= 8; these flags feed both B.LO 7f and B.HI 0b below
386        FMAX v17.4s, v17.4s, v4.4s
387        FMAX v18.4s, v18.4s, v4.4s
388        FMAX v19.4s, v19.4s, v4.4s
389        FMAX v28.4s, v28.4s, v4.4s
390        FMAX v29.4s, v29.4s, v4.4s
391        FMAX v30.4s, v30.4s, v4.4s
392        FMAX v31.4s, v31.4s, v4.4s
393        FMIN v16.4s, v16.4s, v5.4s
394        FMIN v17.4s, v17.4s, v5.4s
395        FMIN v18.4s, v18.4s, v5.4s
396        FMIN v19.4s, v19.4s, v5.4s
397        FMIN v28.4s, v28.4s, v5.4s
398        FMIN v29.4s, v29.4s, v5.4s
399        FMIN v30.4s, v30.4s, v5.4s
400        FMIN v31.4s, v31.4s, v5.4s
401
402        # Store full 4 x 8
403        B.LO 7f  // fewer than 8 columns were left: take the partial-width store
404
405        STP q16, q17,  [x6]
406        SUB  x3,  x3, x2 // a0 -= kc
407        ADD  x6,  x6, x14  // c0 += cn_stride
408        STP q18, q19,  [x9]
409        SUB x11, x11, x2 // a1 -= kc
410        ADD  x9,  x9, x14  // c1 += cn_stride
411        STP q28, q29, [x10]
412        SUB x12, x12, x2 // a2 -= kc
413        ADD x10, x10, x14  // c2 += cn_stride
414        STP q30, q31,  [x7]
415        SUB  x4,  x4, x2 // a3 -= kc
416        ADD  x7,  x7, x14  // c3 += cn_stride
417
418        B.HI 0b  // columns remain after this tile: run another pass
419
420        # Restore d8-d15 from stack
421        LDP d14, d15, [sp, 48]
422        LDP d12, d13, [sp, 32]
423        LDP d10, d11, [sp, 16]
424        LDP  d8,  d9, [sp], 64
425        RET
426
427        # Store odd width
4287:
429        TBZ x1, 2, 8f  // x1 = nc - 8 here; its low 3 bits equal those of the remaining nc
430        STR q16, [x6], 16
431        MOV v16.16b, v17.16b  // move the not-yet-stored upper 4 floats down
432        STR q18, [x9], 16
433        MOV v18.16b, v19.16b
434        STR q28, [x10], 16
435        MOV v28.16b, v29.16b
436        STR q30, [x7], 16
437        MOV v30.16b, v31.16b
438
4398:
440        TBZ x1, 1, 9f
441        STR d16, [x6], 8
442        DUP d16, v16.d[1]  // move the not-yet-stored upper 2 floats down
443        STR d18, [x9], 8
444        DUP d18, v18.d[1]
445        STR d28, [x10], 8
446        DUP d28, v28.d[1]
447        STR d30, [x7], 8
448        DUP d30, v30.d[1]
449
4509:
451        TBZ x1, 0, 10f
452        STR s16,  [x6]
453        STR s18,  [x9]
454        STR s28, [x10]
455        STR s30,  [x7]
45610:
457        # Restore d8-d15 from stack
458        LDP d14, d15, [sp, 48]
459        LDP d12, d13, [sp, 32]
460        LDP d10, d11, [sp, 16]
461        LDP  d8,  d9, [sp], 64
462        RET
463
464
465END_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75
466
// On ELF targets only, emit an empty .note.GNU-stack section.  By GNU toolchain
// convention this presumably marks the object as not needing an executable
// stack - confirm against the linker documentation.
467#ifdef __ELF__
468.section ".note.GNU-stack","",%progbits
469#endif
470