• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const float*restrict acc,  [sp + 8] -> x15
23#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
24
25# d8-d15 need to be preserved if used.
26# x19-30 need to be preserved if used.
27
28# A pointers
29# x3  a0
30# x11 a1
31# x12 a2
32# x4  a3 / a_stride
33
34# C pointers
35# x6  c0
36# x9  c1
37# x10 c2
38# x7  c3 / cm_stride
39
40# Vector register usage
41# A0  v0  v4
42# A1  v1  v5
43# A2  v2  v6
44# A3  v3  v7
45# B   v8  v9 v10 v11
46# B  v12 v13 v14 v15
47# B  v20 v21 v22 v23
48# B  v24 v25 v26 v27
49# C  v16 v17
50# C  v18 v19
51# C  v28 v29
52# C  v30 v31
53# Clamp v4 v5  (shared with the A0/A1 second-block registers above; reloaded from params in the epilogue)
54
55BEGIN_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57
56
        # Overview (as implemented below): for each group of 8 output columns
        # (label 0) the 4x8 accumulator tile v16-v19 / v28-v31 is seeded from
        # acc (x15), updated with FMLA over kc bytes of each A row against the
        # B values streamed from w (x5), clamped with v4/v5 and stored through
        # c0-c3.
        #
        # NOTE(review): the decimal number fused to the start of each original
        # line (e.g. "900:" = line 90 + label "0:") looks like line-number
        # residue from a source viewer rather than part of the assembly -
        # confirm against the upstream file.

57        # Load cn_stride, acc
        # Both stack arguments are read before sp is moved by the STP below.
58        LDP x14, x15, [sp]
59        # Load params pointer
60        LDR x8, [sp, 16]
61
62        # Load clamping_params values
        # LD2R reads two consecutive floats at x8 and broadcasts the first to
        # every lane of v4 and the second to every lane of v5.  At label 6 v4
        # is the FMIN operand (upper bound) and v5 the FMAX operand (lower
        # bound).
63        LD2R {v4.4s, v5.4s}, [x8]
64
65        # Save d8-d15 on stack
        # v8-v15 receive B values in the main loop and epilogue, so d8-d15 are
        # saved here and restored before each RET.
66        STP  d8,  d9, [sp, -64]!
67        STP d10, d11, [sp, 16]
68        STP d12, d13, [sp, 32]
69        STP d14, d15, [sp, 48]
70
71        # Clamp A and C pointers
        # A row whose index is >= mr reuses the pointers of the previous row
        # (a1 = a0, c1 = c0, ...), so all four rows are always computed and
        # stored.
72        CMP x0, 2                // if mr < 2
73        ADD x11, x3, x4          // a1 = a0 + a_stride
74        ADD x9, x6, x7           // c1 = c0 + cm_stride
75        CSEL x11, x3, x11, LO    //   a1 = a0
76        CSEL x9, x6, x9, LO      //   c1 = c0
77
        # The ADDs do not write the flags, so the LS selects below still test
        # the CMP x0, 2 above.
78        ADD x12, x11, x4         // a2 = a1 + a_stride
79        ADD x10, x9, x7          // c2 = c1 + cm_stride
80                                 // if mr <= 2
81        CSEL x12, x11, x12, LS   //   a2 = a1
82        CSEL x10, x9, x10, LS    //   c2 = c1
83
        # From here on x4 and x7 hold a3 and c3; the stride values are gone.
84        CMP x0, 4                // if mr < 4
85        ADD x4, x12, x4          // a3 = a2 + a_stride
86        ADD x7, x10, x7          // c3 = c2 + cm_stride
87        CSEL x4, x12, x4, LO     //   a3 = a2
88        CSEL x7, x10, x7, LO     //   c3 = c2
89
900:
        # Top of the loop over nc; re-entered through B.HI 0b after a full
        # 4x8 store.
91        # Load initial accumulators
        # 8 q registers (128 bytes) are read from acc with post-increment, in
        # the order v16,v17 / v18,v19 / v28,v29 / v30,v31.
92        LDP q16, q17, [x15], 32
93        LDP q18, q19, [x15], 32
94        LDP q28, q29, [x15], 32
95        LDP q30, q31, [x15], 32
96
97        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # B.LO is taken when the subtraction borrows, i.e. kc < 32; x0 then
        # holds the wrapped value, which the remainder code only bit-tests.
98        SUBS x0, x2, 32  // k = kc - 32
99        B.LO 3f
100
101        # 16 prologue
        # Loads 16 bytes (4 floats) from each A row into v0-v3 and 8 q
        # registers of B into v20-v27, i.e. the inputs of the first FMLA block.
102        # Read first block of 4 A and B.
103        LDR q0,  [x3], 16
104        LDP q20, q21, [x5], 32
105        LDR q1, [x11], 16
106        LDR q2, [x12], 16
107        LDR q3,  [x4], 16
108        LDP q22, q23, [x5], 32
109        LDP q24, q25, [x5], 32
110        LDP q26, q27, [x5], 32
111
112        # Is there at least 32.  yes do main loop
113        SUBS x0, x0, 32
114        B.LO 2f
115
116        # Main loop - 8 floats of A (32 bytes)
        # Per iteration: 64 FMLAs, 16 q registers of B loaded from x5 and two
        # q registers loaded from each A row.  The first block consumes
        # v0-v3 / v20-v27 while v4-v7 / v8-v15 are being loaded; the second
        # block consumes those while v0-v3 / v20-v27 are reloaded for the next
        # pass.  The A loads into v4/v5 overwrite the clamp values; they are
        # reloaded in the epilogue.
1171:
118        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
119        FMLA v16.4s, v20.4s, v0.s[0]
120        LDP q8, q9, [x5], 32
121        FMLA v17.4s, v21.4s, v0.s[0]
122        FMLA v18.4s, v20.4s, v1.s[0]
123        LDP q10, q11, [x5], 32
124        FMLA v19.4s, v21.4s, v1.s[0]
125        FMLA v28.4s, v20.4s, v2.s[0]
126        LDP q12, q13, [x5], 32
127        FMLA v29.4s, v21.4s, v2.s[0]
128        FMLA v30.4s, v20.4s, v3.s[0]
129        LDP q14, q15, [x5], 32
130        FMLA v31.4s, v21.4s, v3.s[0]
131        FMLA v16.4s, v22.4s, v0.s[1]
132        LDR q4, [x3], 16
133        FMLA v17.4s, v23.4s, v0.s[1]
134        FMLA v18.4s, v22.4s, v1.s[1]
135        LDR q5, [x11], 16
136        FMLA v19.4s, v23.4s, v1.s[1]
137        FMLA v28.4s, v22.4s, v2.s[1]
138        LDR q6, [x12], 16
139        FMLA v29.4s, v23.4s, v2.s[1]
140        FMLA v30.4s, v22.4s, v3.s[1]
141        LDR q7, [x4], 16
142        FMLA v31.4s, v23.4s, v3.s[1]
143        FMLA v16.4s, v24.4s, v0.s[2]
144        FMLA v17.4s, v25.4s, v0.s[2]
145        FMLA v18.4s, v24.4s, v1.s[2]
146        FMLA v19.4s, v25.4s, v1.s[2]
147        FMLA v28.4s, v24.4s, v2.s[2]
148        FMLA v29.4s, v25.4s, v2.s[2]
149        FMLA v30.4s, v24.4s, v3.s[2]
150        FMLA v31.4s, v25.4s, v3.s[2]
151        FMLA v16.4s, v26.4s, v0.s[3]
152        FMLA v17.4s, v27.4s, v0.s[3]
153        FMLA v18.4s, v26.4s, v1.s[3]
154        FMLA v19.4s, v27.4s, v1.s[3]
155        FMLA v28.4s, v26.4s, v2.s[3]
156        FMLA v29.4s, v27.4s, v2.s[3]
157        FMLA v30.4s, v26.4s, v3.s[3]
158        FMLA v31.4s, v27.4s, v3.s[3]
159
160        # Second block of 4.  FMA for second 4, loads for 1st block of 4.
161        FMLA v16.4s, v8.4s, v4.s[0]
162        LDP q20, q21, [x5], 32
163        FMLA v17.4s, v9.4s, v4.s[0]
164        FMLA v18.4s, v8.4s, v5.s[0]
165        LDP q22, q23, [x5], 32
166        FMLA v19.4s, v9.4s, v5.s[0]
167        FMLA v28.4s, v8.4s, v6.s[0]
168        LDP q24, q25, [x5], 32
169        FMLA v29.4s, v9.4s, v6.s[0]
170        FMLA v30.4s, v8.4s, v7.s[0]
171        LDP q26, q27, [x5], 32
172        FMLA v31.4s, v9.4s, v7.s[0]
173        FMLA v16.4s, v10.4s, v4.s[1]
174        LDR q0, [x3], 16
175        FMLA v17.4s, v11.4s, v4.s[1]
176        FMLA v18.4s, v10.4s, v5.s[1]
177        LDR q1, [x11], 16
178        FMLA v19.4s, v11.4s, v5.s[1]
179        FMLA v28.4s, v10.4s, v6.s[1]
180        LDR q2, [x12], 16
181        FMLA v29.4s, v11.4s, v6.s[1]
182        FMLA v30.4s, v10.4s, v7.s[1]
183        LDR q3, [x4], 16
184        FMLA v31.4s, v11.4s, v7.s[1]
185        FMLA v16.4s, v12.4s, v4.s[2]
186        FMLA v17.4s, v13.4s, v4.s[2]
187        FMLA v18.4s, v12.4s, v5.s[2]
188        FMLA v19.4s, v13.4s, v5.s[2]
189        FMLA v28.4s, v12.4s, v6.s[2]
190        FMLA v29.4s, v13.4s, v6.s[2]
191        FMLA v30.4s, v12.4s, v7.s[2]
192        FMLA v31.4s, v13.4s, v7.s[2]
193        FMLA v16.4s, v14.4s, v4.s[3]
194        FMLA v17.4s, v15.4s, v4.s[3]
195        FMLA v18.4s, v14.4s, v5.s[3]
196        FMLA v19.4s, v15.4s, v5.s[3]
197        FMLA v28.4s, v14.4s, v6.s[3]
198        FMLA v29.4s, v15.4s, v6.s[3]
        # The loop counter is updated two FMLAs ahead of the branch; FMLA does
        # not write the condition flags, so B.HS still sees this SUBS.
199        SUBS x0, x0, 32
200        FMLA v30.4s, v14.4s, v7.s[3]
201        FMLA v31.4s, v15.4s, v7.s[3]
202        B.HS 1b
203
2042:
205        # Epilogue
        # Same FMLA sequence as one main-loop pass; the first block still
        # loads the second block's inputs, but the second block issues no
        # loads for a following pass.
206        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
207        FMLA v16.4s, v20.4s, v0.s[0]
208        LDP q8, q9, [x5], 32
209        FMLA v17.4s, v21.4s, v0.s[0]
210        FMLA v18.4s, v20.4s, v1.s[0]
211        LDP q10, q11, [x5], 32
212        FMLA v19.4s, v21.4s, v1.s[0]
213        FMLA v28.4s, v20.4s, v2.s[0]
214        LDP q12, q13, [x5], 32
215        FMLA v29.4s, v21.4s, v2.s[0]
216        FMLA v30.4s, v20.4s, v3.s[0]
217        LDP q14, q15, [x5], 32
218        FMLA v31.4s, v21.4s, v3.s[0]
219        FMLA v16.4s, v22.4s, v0.s[1]
220        LDR q4, [x3], 16
221        FMLA v17.4s, v23.4s, v0.s[1]
222        FMLA v18.4s, v22.4s, v1.s[1]
223        LDR q5, [x11], 16
224        FMLA v19.4s, v23.4s, v1.s[1]
225        FMLA v28.4s, v22.4s, v2.s[1]
226        LDR q6, [x12], 16
227        FMLA v29.4s, v23.4s, v2.s[1]
228        FMLA v30.4s, v22.4s, v3.s[1]
229        LDR q7, [x4], 16
230        FMLA v31.4s, v23.4s, v3.s[1]
231        FMLA v16.4s, v24.4s, v0.s[2]
232        FMLA v17.4s, v25.4s, v0.s[2]
233        FMLA v18.4s, v24.4s, v1.s[2]
234        FMLA v19.4s, v25.4s, v1.s[2]
235        FMLA v28.4s, v24.4s, v2.s[2]
236        FMLA v29.4s, v25.4s, v2.s[2]
237        FMLA v30.4s, v24.4s, v3.s[2]
238        FMLA v31.4s, v25.4s, v3.s[2]
239        FMLA v16.4s, v26.4s, v0.s[3]
240        FMLA v17.4s, v27.4s, v0.s[3]
241        FMLA v18.4s, v26.4s, v1.s[3]
242        FMLA v19.4s, v27.4s, v1.s[3]
243        FMLA v28.4s, v26.4s, v2.s[3]
244        FMLA v29.4s, v27.4s, v2.s[3]
245        FMLA v30.4s, v26.4s, v3.s[3]
246        FMLA v31.4s, v27.4s, v3.s[3]
247
248        # Second block of 4.  FMA for second 4, noloads
249        FMLA v16.4s, v8.4s, v4.s[0]
250        FMLA v17.4s, v9.4s, v4.s[0]
251        FMLA v18.4s, v8.4s, v5.s[0]
252        FMLA v19.4s, v9.4s, v5.s[0]
253        FMLA v28.4s, v8.4s, v6.s[0]
254        FMLA v29.4s, v9.4s, v6.s[0]
255        FMLA v30.4s, v8.4s, v7.s[0]
256        FMLA v31.4s, v9.4s, v7.s[0]
257
258        FMLA v16.4s, v10.4s, v4.s[1]
259        FMLA v17.4s, v11.4s, v4.s[1]
260        FMLA v18.4s, v10.4s, v5.s[1]
261        FMLA v19.4s, v11.4s, v5.s[1]
262        FMLA v28.4s, v10.4s, v6.s[1]
263        FMLA v29.4s, v11.4s, v6.s[1]
264        FMLA v30.4s, v10.4s, v7.s[1]
265        FMLA v31.4s, v11.4s, v7.s[1]
266
267        FMLA v16.4s, v12.4s, v4.s[2]
268        FMLA v17.4s, v13.4s, v4.s[2]
269        FMLA v18.4s, v12.4s, v5.s[2]
270        FMLA v19.4s, v13.4s, v5.s[2]
271        FMLA v28.4s, v12.4s, v6.s[2]
272        FMLA v29.4s, v13.4s, v6.s[2]
273        FMLA v30.4s, v12.4s, v7.s[2]
274        FMLA v31.4s, v13.4s, v7.s[2]
275
276        FMLA v16.4s, v14.4s, v4.s[3]
277        FMLA v17.4s, v15.4s, v4.s[3]
278        FMLA v18.4s, v14.4s, v5.s[3]
279        FMLA v19.4s, v15.4s, v5.s[3]
280
281        # Load clamping_params values
        # v4/v5 were overwritten by the A loads above.  The FMLAs that remain
        # only read v6/v7, so the clamp values can be restored here.
282        LD2R {v4.4s, v5.4s}, [x8]
283
284        FMLA v28.4s, v14.4s, v6.s[3]
285        FMLA v29.4s, v15.4s, v6.s[3]
286        FMLA v30.4s, v14.4s, v7.s[3]
287        FMLA v31.4s, v15.4s, v7.s[3]
288
2893:
        # Only multiples of 32 have been subtracted from kc to form x0, so
        # bits 4, 3 and 2 of x0 still equal those bits of kc; each TBZ below
        # skips its remainder step when the corresponding bit is clear.
290        # Remainder- 4 floats of A (16 bytes)
291        TBZ x0, 4, 4f
292
293        LDR q0,  [x3], 16
294        LDP q20, q21, [x5], 32
295        LDR q1, [x11], 16
296        LDR q2, [x12], 16
297        LDR q3,  [x4], 16
298        FMLA v16.4s, v20.4s, v0.s[0]
299        FMLA v17.4s, v21.4s, v0.s[0]
300        LDP q22, q23, [x5], 32
301        FMLA v18.4s, v20.4s, v1.s[0]
302        FMLA v19.4s, v21.4s, v1.s[0]
303        LDP q24, q25, [x5], 32
304        FMLA v28.4s, v20.4s, v2.s[0]
305        FMLA v29.4s, v21.4s, v2.s[0]
306        LDP q26, q27, [x5], 32
307        FMLA v30.4s, v20.4s, v3.s[0]
308        FMLA v31.4s, v21.4s, v3.s[0]
309        FMLA v16.4s, v22.4s, v0.s[1]
310        FMLA v17.4s, v23.4s, v0.s[1]
311        FMLA v18.4s, v22.4s, v1.s[1]
312        FMLA v19.4s, v23.4s, v1.s[1]
313        FMLA v28.4s, v22.4s, v2.s[1]
314        FMLA v29.4s, v23.4s, v2.s[1]
315        FMLA v30.4s, v22.4s, v3.s[1]
316        FMLA v31.4s, v23.4s, v3.s[1]
317        FMLA v16.4s, v24.4s, v0.s[2]
318        FMLA v17.4s, v25.4s, v0.s[2]
319        FMLA v18.4s, v24.4s, v1.s[2]
320        FMLA v19.4s, v25.4s, v1.s[2]
321        FMLA v28.4s, v24.4s, v2.s[2]
322        FMLA v29.4s, v25.4s, v2.s[2]
323        FMLA v30.4s, v24.4s, v3.s[2]
324        FMLA v31.4s, v25.4s, v3.s[2]
325        FMLA v16.4s, v26.4s, v0.s[3]
326        FMLA v17.4s, v27.4s, v0.s[3]
327        FMLA v18.4s, v26.4s, v1.s[3]
328        FMLA v19.4s, v27.4s, v1.s[3]
329        FMLA v28.4s, v26.4s, v2.s[3]
330        FMLA v29.4s, v27.4s, v2.s[3]
331        FMLA v30.4s, v26.4s, v3.s[3]
332        FMLA v31.4s, v27.4s, v3.s[3]
333
3344:
335        # Remainder- 2 floats of A (8 bytes)
336        TBZ x0, 3, 5f
337
        # Only lanes 0 and 1 of v0-v3 are used after these 8-byte loads.
338        LDR d0,  [x3], 8
339        LDP q20, q21, [x5], 32
340        LDR d1, [x11], 8
341        LDR d2, [x12], 8
342        LDR d3,  [x4], 8
343        FMLA v16.4s, v20.4s, v0.s[0]
344        FMLA v17.4s, v21.4s, v0.s[0]
345        LDP q22, q23, [x5], 32
346        FMLA v18.4s, v20.4s, v1.s[0]
347        FMLA v19.4s, v21.4s, v1.s[0]
348        FMLA v28.4s, v20.4s, v2.s[0]
349        FMLA v29.4s, v21.4s, v2.s[0]
350        FMLA v30.4s, v20.4s, v3.s[0]
351        FMLA v31.4s, v21.4s, v3.s[0]
352        FMLA v16.4s, v22.4s, v0.s[1]
353        FMLA v17.4s, v23.4s, v0.s[1]
354        FMLA v18.4s, v22.4s, v1.s[1]
355        FMLA v19.4s, v23.4s, v1.s[1]
356        FMLA v28.4s, v22.4s, v2.s[1]
357        FMLA v29.4s, v23.4s, v2.s[1]
358        FMLA v30.4s, v22.4s, v3.s[1]
359        FMLA v31.4s, v23.4s, v3.s[1]
360
3615:
362        # Remainder- 1 float of A (4 bytes)
363        TBZ x0, 2, 6f
364
        # Only lane 0 of v0-v3 is used after these 4-byte loads.
365        LDR s0,  [x3], 4
366        LDP q20, q21, [x5], 32
367        LDR s1, [x11], 4
368        LDR s2, [x12], 4
369        LDR s3,  [x4], 4
370        FMLA v16.4s, v20.4s, v0.s[0]
371        FMLA v17.4s, v21.4s, v0.s[0]
372        FMLA v18.4s, v20.4s, v1.s[0]
373        FMLA v19.4s, v21.4s, v1.s[0]
374        FMLA v28.4s, v20.4s, v2.s[0]
375        FMLA v29.4s, v21.4s, v2.s[0]
376        FMLA v30.4s, v20.4s, v3.s[0]
377        FMLA v31.4s, v21.4s, v3.s[0]
378
3796:
380        # Clamp
        # Every accumulator is limited with FMIN against v4, then FMAX against
        # v5.  The SUBS (nc -= 8) is interleaved here; nothing between it and
        # the B.LO / B.HI below writes the flags, so both branches test it.
381        FMIN v16.4s, v16.4s, v4.4s
382        SUBS x1, x1, 8
383        FMIN v17.4s, v17.4s, v4.4s
384        FMIN v18.4s, v18.4s, v4.4s
385        FMIN v19.4s, v19.4s, v4.4s
386        FMIN v28.4s, v28.4s, v4.4s
387        FMIN v29.4s, v29.4s, v4.4s
388        FMIN v30.4s, v30.4s, v4.4s
389        FMIN v31.4s, v31.4s, v4.4s
390        FMAX v16.4s, v16.4s, v5.4s
391        FMAX v17.4s, v17.4s, v5.4s
392        FMAX v18.4s, v18.4s, v5.4s
393        FMAX v19.4s, v19.4s, v5.4s
394        FMAX v28.4s, v28.4s, v5.4s
395        FMAX v29.4s, v29.4s, v5.4s
396        FMAX v30.4s, v30.4s, v5.4s
397        FMAX v31.4s, v31.4s, v5.4s
398
399        # Store full 4 x 8
        # Taken when fewer than 8 columns were left (the SUBS borrowed).
400        B.LO 7f
401
        # Rows are written from c3 down to c0, so when row pointers alias
        # (mr < 4) the lowest row's data is the last written.  Each A pointer
        # is rewound by kc and each C pointer advanced by cn_stride for the
        # next group of columns; x5 (w) is not rewound.
402        STP q30, q31,  [x7]
403        SUB  x3,  x3, x2 // a0 -= kc
404        ADD  x7,  x7, x14
405        STP q28, q29, [x10]
406        SUB x11, x11, x2 // a1 -= kc
407        ADD x10, x10, x14
408        STP q18, q19,  [x9]
409        SUB x12, x12, x2 // a2 -= kc
410        ADD  x9,  x9, x14
411        STP q16, q17,  [x6]
412        SUB  x4,  x4, x2 // a3 -= kc
413        ADD  x6,  x6, x14
414
        # Loop again while columns remain (nc - 8 is non-zero).
415        B.HI 0b
416
417        # Restore d8-d15 from stack
418        LDP d14, d15, [sp, 48]
419        LDP d12, d13, [sp, 32]
420        LDP d10, d11, [sp, 16]
421        LDP  d8,  d9, [sp], 64
422        RET
423
424        # Store odd width
        # x1 holds nc - 8 (wrapped) here; subtracting 8 leaves bits 2..0
        # unchanged, so they still give the number of columns left: bit 2 -> 4
        # columns, bit 1 -> 2, bit 0 -> 1.  After each partial store the
        # not-yet-written upper part of the row is moved down into the
        # register that was just stored.
4257:
426        TBZ x1, 2, 8f
427        STR q30, [x7], 16
428        MOV v30.16b, v31.16b
429        STR q28, [x10], 16
430        MOV v28.16b, v29.16b
431        STR q18, [x9], 16
432        MOV v18.16b, v19.16b
433        STR q16, [x6], 16
434        MOV v16.16b, v17.16b
435
4368:
437        TBZ x1, 1, 9f
438        STR d30, [x7], 8
439        DUP d30, v30.d[1]
440        STR d28, [x10], 8
441        DUP d28, v28.d[1]
442        STR d18, [x9], 8
443        DUP d18, v18.d[1]
444        STR d16, [x6], 8
445        DUP d16, v16.d[1]
446
4479:
448        TBZ x1, 0, 10f
449        STR s30,  [x7]
450        STR s28, [x10]
451        STR s18,  [x9]
452        STR s16,  [x6]
45310:
454        # Restore d8-d15 from stack
455        LDP d14, d15, [sp, 48]
456        LDP d12, d13, [sp, 32]
457        LDP d10, d11, [sp, 16]
458        LDP  d8,  d9, [sp], 64
459        RET
460
461
462END_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57
463
// On ELF targets only, emit an empty .note.GNU-stack section (conventionally
// the marker that this object does not need an executable stack).
464#ifdef __ELF__
465.section ".note.GNU-stack","",%progbits
466#endif
467