• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Auto-generated file. Do not edit!
2//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a75.S.in
3//   Generator: tools/xngen
4//
5// Copyright 2019 Google LLC
6//
7// This source code is licensed under the BSD-style license found in the
8// LICENSE file in the root directory of this source tree.
9
10#include <xnnpack/assembly.h>
11
12# void xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75(
13#     size_t mr,                x0
14#     size_t nc,                x1
15#     size_t kc,                x2 / x0
16#     const uint8_t*restrict a, x3
17#     size_t a_stride,          x4
18#     const void*restrict w,    x5
19#     uint8_t*restrict c,       x6
20#     size_t cm_stride,         x7
21#     size_t cn_stride,         [sp] -> x14
22#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8
23
24# d8-d15 need to be preserved if used.
25# x19-30 need to be preserved if used.
26
27# A pointers
28# x3  a0
29# x11 a1
30# x12 a2
31# x4  a3 / a_stride
32
33# C pointers
34# x6  c0
35# x9  c1
36# x10 c2
37# x7  c3 / cm_stride
38
39# Vector register usage
40# A0  v0  v4
41# A1  v1  v5
42# A2  v2  v6
43# A3  v3  v7
44# B   v8  v9 v10 v11
45# B  v12 v13 v14 v15
46# B  v20 v21 v22 v23
47# B  v24 v25 v26 v27
48# C  v16 v17
49# C  v18 v19
50# C  v28 v29
51# C  v30 v31
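52# Clamp v4 (upper bound, FMIN) v5 (lower bound, FMAX); shared with A0/A1, so reloaded from params after the epilogue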
53
54BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75
55        # Accumulates bias + A*B into a 4-row x 8-column f32 tile (v16-v19, v28-v31), clamps it, and stores it to c0-c3; repeats per block of 8 columns while nc remains.
56        # Load stack arguments before sp is moved: x14 = cn_stride, x8 = params pointer
57        LDP x14, x8, [sp]
58
59        # Load clamping_params values: the two floats at [x8] are broadcast to v4 (FMIN operand below) and v5 (FMAX operand below)
60        LD2R {v4.4s, v5.4s}, [x8]
61
62        # Save d8-d15 on stack (q8-q15 are overwritten with B values in the main loop and epilogue)
63        STP  d8,  d9, [sp, -64]!
64        STP d10, d11, [sp, 16]
65        STP d12, d13, [sp, 32]
66        STP d14, d15, [sp, 48]
67
68        # Clamp A and C pointers: a row whose index is >= mr reuses the pointers of the previous row
69        CMP x0, 2                // if mr < 2
70        ADD x11, x3, x4          // a1 = a0 + a_stride
71        ADD x9, x6, x7           // c1 = c0 + cm_stride
72        CSEL x11, x3, x11, LO    //   a1 = a0
73        CSEL x9, x6, x9, LO      //   c1 = c0
74
75        ADD x12, x11, x4         // a2 = a1 + a_stride
76        ADD x10, x9, x7          // c2 = c1 + cm_stride
77                                 // if mr <= 2 (flags are still those of CMP x0, 2 above; ADD and CSEL do not set flags)
78        CSEL x12, x11, x12, LS   //   a2 = a1
79        CSEL x10, x9, x10, LS    //   c2 = c1
80
81        CMP x0, 4                // if mr < 4
82        ADD x4, x12, x4          // a3 = a2 + a_stride
83        ADD x7, x10, x7          // c3 = c2 + cm_stride
84        CSEL x4, x12, x4, LO     //   a3 = a2
85        CSEL x7, x10, x7, LO     //   c3 = c2
86
870:
88        # Outer loop, one pass per block of 8 columns. Load initial bias (8 floats) from w and copy it into the accumulators of all four rows
89        LDP q16, q17, [x5], 32
90        MOV v18.16b, v16.16b
91        MOV v19.16b, v17.16b
92        MOV v28.16b, v16.16b
93        MOV v29.16b, v17.16b
94        MOV v30.16b, v16.16b
95        MOV v31.16b, v17.16b
96
97        # Is there at least 8 floats (32 bytes) for prologue + epilogue?  Otherwise skip to the remainder code at label 3
98        SUBS x0, x2, 32  // k = kc - 32
99        B.LO 3f
100
101        # Prologue: first 4 floats (16 bytes) of each A row
102        # Read first block of 4 A and B (4 LDP of 32 bytes = 128 bytes of w).
103        LDR q0,  [x3], 16
104        LDP q20, q21, [x5], 32
105        LDR q1, [x11], 16
106        LDR q2, [x12], 16
107        LDR q3,  [x4], 16
108        LDP q22, q23, [x5], 32
109        LDP q24, q25, [x5], 32
110        LDP q26, q27, [x5], 32
111
112        # Are there at least 32 more bytes of k?  When yes, do the main loop; otherwise go straight to the epilogue
113        SUBS x0, x0, 32
114        B.LO 2f
115
116        # Main loop - 8 floats of A (32 bytes) per iteration
1171:
118        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
119        FMLA v16.4s, v20.4s, v0.s[0]
120        LDP q8, q9, [x5], 32
121        FMLA v17.4s, v21.4s, v0.s[0]
122        FMLA v18.4s, v20.4s, v1.s[0]
123        LDP q10, q11, [x5], 32
124        FMLA v19.4s, v21.4s, v1.s[0]
125        FMLA v28.4s, v20.4s, v2.s[0]
126        LDP q12, q13, [x5], 32
127        FMLA v29.4s, v21.4s, v2.s[0]
128        FMLA v30.4s, v20.4s, v3.s[0]
129        LDP q14, q15, [x5], 32
130        FMLA v31.4s, v21.4s, v3.s[0]
131        FMLA v16.4s, v22.4s, v0.s[1]
132        LDR q4, [x3], 16
133        FMLA v17.4s, v23.4s, v0.s[1]
134        FMLA v18.4s, v22.4s, v1.s[1]
135        LDR q5, [x11], 16
136        FMLA v19.4s, v23.4s, v1.s[1]
137        FMLA v28.4s, v22.4s, v2.s[1]
138        LDR q6, [x12], 16
139        FMLA v29.4s, v23.4s, v2.s[1]
140        FMLA v30.4s, v22.4s, v3.s[1]
141        LDR q7, [x4], 16
142        FMLA v31.4s, v23.4s, v3.s[1]
143        FMLA v16.4s, v24.4s, v0.s[2]
144        PRFM PLDL1KEEP, [x5, 128]  // prefetch w ahead of the current read position (x5)
145        FMLA v17.4s, v25.4s, v0.s[2]
146        FMLA v18.4s, v24.4s, v1.s[2]
147        PRFM PLDL1KEEP, [x5, 192]
148        FMLA v19.4s, v25.4s, v1.s[2]
149        FMLA v28.4s, v24.4s, v2.s[2]
150        PRFM PLDL1KEEP, [x5, 256]
151        FMLA v29.4s, v25.4s, v2.s[2]
152        FMLA v30.4s, v24.4s, v3.s[2]
153        PRFM PLDL1KEEP, [x5, 320]
154        FMLA v31.4s, v25.4s, v3.s[2]
155        FMLA v16.4s, v26.4s, v0.s[3]
156        FMLA v17.4s, v27.4s, v0.s[3]
157        FMLA v18.4s, v26.4s, v1.s[3]
158        FMLA v19.4s, v27.4s, v1.s[3]
159        FMLA v28.4s, v26.4s, v2.s[3]
160        FMLA v29.4s, v27.4s, v2.s[3]
161        FMLA v30.4s, v26.4s, v3.s[3]
162        FMLA v31.4s, v27.4s, v3.s[3]
163
164        # Second block of 4.  FMA for second 4, loads for 1st block of 4 of the next iteration (or of the epilogue).
165        FMLA v16.4s, v8.4s, v4.s[0]
166        LDP q20, q21, [x5], 32
167        FMLA v17.4s, v9.4s, v4.s[0]
168        FMLA v18.4s, v8.4s, v5.s[0]
169        LDP q22, q23, [x5], 32
170        FMLA v19.4s, v9.4s, v5.s[0]
171        FMLA v28.4s, v8.4s, v6.s[0]
172        LDP q24, q25, [x5], 32
173        FMLA v29.4s, v9.4s, v6.s[0]
174        FMLA v30.4s, v8.4s, v7.s[0]
175        LDP q26, q27, [x5], 32
176        FMLA v31.4s, v9.4s, v7.s[0]
177        FMLA v16.4s, v10.4s, v4.s[1]
178        LDR q0, [x3], 16
179        FMLA v17.4s, v11.4s, v4.s[1]
180        FMLA v18.4s, v10.4s, v5.s[1]
181        LDR q1, [x11], 16
182        FMLA v19.4s, v11.4s, v5.s[1]
183        FMLA v28.4s, v10.4s, v6.s[1]
184        LDR q2, [x12], 16
185        FMLA v29.4s, v11.4s, v6.s[1]
186        FMLA v30.4s, v10.4s, v7.s[1]
187        LDR q3, [x4], 16
188        FMLA v31.4s, v11.4s, v7.s[1]
189        FMLA v16.4s, v12.4s, v4.s[2]
190        FMLA v17.4s, v13.4s, v4.s[2]
191        FMLA v18.4s, v12.4s, v5.s[2]
192        FMLA v19.4s, v13.4s, v5.s[2]
193        FMLA v28.4s, v12.4s, v6.s[2]
194        FMLA v29.4s, v13.4s, v6.s[2]
195        FMLA v30.4s, v12.4s, v7.s[2]
196        FMLA v31.4s, v13.4s, v7.s[2]
197        FMLA v16.4s, v14.4s, v4.s[3]
198        FMLA v17.4s, v15.4s, v4.s[3]
199        FMLA v18.4s, v14.4s, v5.s[3]
200        FMLA v19.4s, v15.4s, v5.s[3]
201        FMLA v28.4s, v14.4s, v6.s[3]
202        FMLA v29.4s, v15.4s, v6.s[3]
203        SUBS x0, x0, 32  // k -= 32 bytes (8 floats); flags are consumed by B.HS below (FMLA does not set flags)
204        FMLA v30.4s, v14.4s, v7.s[3]
205        FMLA v31.4s, v15.4s, v7.s[3]
206        B.HS 1b
207
2082:
209        # Epilogue: same 8 floats of A as one main-loop iteration, but the second block loads nothing for a following iteration
210        # First block of 4.  FMA for first 4, loads for 2nd block of 4.
211        FMLA v16.4s, v20.4s, v0.s[0]
212        LDP q8, q9, [x5], 32
213        FMLA v17.4s, v21.4s, v0.s[0]
214        FMLA v18.4s, v20.4s, v1.s[0]
215        LDP q10, q11, [x5], 32
216        FMLA v19.4s, v21.4s, v1.s[0]
217        FMLA v28.4s, v20.4s, v2.s[0]
218        LDP q12, q13, [x5], 32
219        FMLA v29.4s, v21.4s, v2.s[0]
220        FMLA v30.4s, v20.4s, v3.s[0]
221        LDP q14, q15, [x5], 32
222        FMLA v31.4s, v21.4s, v3.s[0]
223        FMLA v16.4s, v22.4s, v0.s[1]
224        LDR q4, [x3], 16
225        FMLA v17.4s, v23.4s, v0.s[1]
226        FMLA v18.4s, v22.4s, v1.s[1]
227        LDR q5, [x11], 16
228        FMLA v19.4s, v23.4s, v1.s[1]
229        FMLA v28.4s, v22.4s, v2.s[1]
230        LDR q6, [x12], 16
231        FMLA v29.4s, v23.4s, v2.s[1]
232        FMLA v30.4s, v22.4s, v3.s[1]
233        LDR q7, [x4], 16
234        FMLA v31.4s, v23.4s, v3.s[1]
235        FMLA v16.4s, v24.4s, v0.s[2]
236        FMLA v17.4s, v25.4s, v0.s[2]
237        FMLA v18.4s, v24.4s, v1.s[2]
238        FMLA v19.4s, v25.4s, v1.s[2]
239        FMLA v28.4s, v24.4s, v2.s[2]
240        FMLA v29.4s, v25.4s, v2.s[2]
241        FMLA v30.4s, v24.4s, v3.s[2]
242        FMLA v31.4s, v25.4s, v3.s[2]
243        FMLA v16.4s, v26.4s, v0.s[3]
244        FMLA v17.4s, v27.4s, v0.s[3]
245        FMLA v18.4s, v26.4s, v1.s[3]
246        FMLA v19.4s, v27.4s, v1.s[3]
247        FMLA v28.4s, v26.4s, v2.s[3]
248        FMLA v29.4s, v27.4s, v2.s[3]
249        FMLA v30.4s, v26.4s, v3.s[3]
250        FMLA v31.4s, v27.4s, v3.s[3]
251
252        # Second block of 4.  FMA for second 4, no loads
253        FMLA v16.4s, v8.4s, v4.s[0]
254        FMLA v17.4s, v9.4s, v4.s[0]
255        FMLA v18.4s, v8.4s, v5.s[0]
256        FMLA v19.4s, v9.4s, v5.s[0]
257        FMLA v28.4s, v8.4s, v6.s[0]
258        FMLA v29.4s, v9.4s, v6.s[0]
259        FMLA v30.4s, v8.4s, v7.s[0]
260        FMLA v31.4s, v9.4s, v7.s[0]
261
262        FMLA v16.4s, v10.4s, v4.s[1]
263        FMLA v17.4s, v11.4s, v4.s[1]
264        FMLA v18.4s, v10.4s, v5.s[1]
265        FMLA v19.4s, v11.4s, v5.s[1]
266        FMLA v28.4s, v10.4s, v6.s[1]
267        FMLA v29.4s, v11.4s, v6.s[1]
268        FMLA v30.4s, v10.4s, v7.s[1]
269        FMLA v31.4s, v11.4s, v7.s[1]
270
271        FMLA v16.4s, v12.4s, v4.s[2]
272        FMLA v17.4s, v13.4s, v4.s[2]
273        FMLA v18.4s, v12.4s, v5.s[2]
274        FMLA v19.4s, v13.4s, v5.s[2]
275        FMLA v28.4s, v12.4s, v6.s[2]
276        FMLA v29.4s, v13.4s, v6.s[2]
277        FMLA v30.4s, v12.4s, v7.s[2]
278        FMLA v31.4s, v13.4s, v7.s[2]
279
280        FMLA v16.4s, v14.4s, v4.s[3]
281        FMLA v17.4s, v15.4s, v4.s[3]
282        FMLA v18.4s, v14.4s, v5.s[3]
283        FMLA v19.4s, v15.4s, v5.s[3]
284
285        # Reload clamping_params values: v4/v5 were overwritten with A data above, and the remaining FMLAs read only v6/v7
286        LD2R {v4.4s, v5.4s}, [x8]
287
288        FMLA v28.4s, v14.4s, v6.s[3]
289        FMLA v29.4s, v15.4s, v6.s[3]
290        FMLA v30.4s, v14.4s, v7.s[3]
291        FMLA v31.4s, v15.4s, v7.s[3]
292
2933:
294        # Remainder- 4 floats of A (16 bytes), taken when bit 4 of x0 is set; only multiples of 32 were subtracted, so bits 4..0 of x0 equal those of kc
295        TBZ x0, 4, 4f
296
297        LDR q0,  [x3], 16
298        LDP q20, q21, [x5], 32
299        LDR q1, [x11], 16
300        LDR q2, [x12], 16
301        LDR q3,  [x4], 16
302        FMLA v16.4s, v20.4s, v0.s[0]
303        FMLA v17.4s, v21.4s, v0.s[0]
304        LDP q22, q23, [x5], 32
305        FMLA v18.4s, v20.4s, v1.s[0]
306        FMLA v19.4s, v21.4s, v1.s[0]
307        LDP q24, q25, [x5], 32
308        FMLA v28.4s, v20.4s, v2.s[0]
309        FMLA v29.4s, v21.4s, v2.s[0]
310        LDP q26, q27, [x5], 32
311        FMLA v30.4s, v20.4s, v3.s[0]
312        FMLA v31.4s, v21.4s, v3.s[0]
313        FMLA v16.4s, v22.4s, v0.s[1]
314        FMLA v17.4s, v23.4s, v0.s[1]
315        FMLA v18.4s, v22.4s, v1.s[1]
316        FMLA v19.4s, v23.4s, v1.s[1]
317        FMLA v28.4s, v22.4s, v2.s[1]
318        FMLA v29.4s, v23.4s, v2.s[1]
319        FMLA v30.4s, v22.4s, v3.s[1]
320        FMLA v31.4s, v23.4s, v3.s[1]
321        FMLA v16.4s, v24.4s, v0.s[2]
322        FMLA v17.4s, v25.4s, v0.s[2]
323        FMLA v18.4s, v24.4s, v1.s[2]
324        FMLA v19.4s, v25.4s, v1.s[2]
325        FMLA v28.4s, v24.4s, v2.s[2]
326        FMLA v29.4s, v25.4s, v2.s[2]
327        FMLA v30.4s, v24.4s, v3.s[2]
328        FMLA v31.4s, v25.4s, v3.s[2]
329        FMLA v16.4s, v26.4s, v0.s[3]
330        FMLA v17.4s, v27.4s, v0.s[3]
331        FMLA v18.4s, v26.4s, v1.s[3]
332        FMLA v19.4s, v27.4s, v1.s[3]
333        FMLA v28.4s, v26.4s, v2.s[3]
334        FMLA v29.4s, v27.4s, v2.s[3]
335        FMLA v30.4s, v26.4s, v3.s[3]
336        FMLA v31.4s, v27.4s, v3.s[3]
337
3384:
339        # Remainder- 2 floats of A (8 bytes), taken when bit 3 of x0 is set
340        TBZ x0, 3, 5f
341
342        LDR d0,  [x3], 8
343        LDP q20, q21, [x5], 32
344        LDR d1, [x11], 8
345        LDR d2, [x12], 8
346        LDR d3,  [x4], 8
347        FMLA v16.4s, v20.4s, v0.s[0]
348        FMLA v17.4s, v21.4s, v0.s[0]
349        LDP q22, q23, [x5], 32
350        FMLA v18.4s, v20.4s, v1.s[0]
351        FMLA v19.4s, v21.4s, v1.s[0]
352        FMLA v28.4s, v20.4s, v2.s[0]
353        FMLA v29.4s, v21.4s, v2.s[0]
354        FMLA v30.4s, v20.4s, v3.s[0]
355        FMLA v31.4s, v21.4s, v3.s[0]
356        FMLA v16.4s, v22.4s, v0.s[1]
357        FMLA v17.4s, v23.4s, v0.s[1]
358        FMLA v18.4s, v22.4s, v1.s[1]
359        FMLA v19.4s, v23.4s, v1.s[1]
360        FMLA v28.4s, v22.4s, v2.s[1]
361        FMLA v29.4s, v23.4s, v2.s[1]
362        FMLA v30.4s, v22.4s, v3.s[1]
363        FMLA v31.4s, v23.4s, v3.s[1]
364
3655:
366        # Remainder- 1 float of A (4 bytes), taken when bit 2 of x0 is set
367        TBZ x0, 2, 6f
368
369        LDR s0,  [x3], 4
370        LDP q20, q21, [x5], 32
371        LDR s1, [x11], 4
372        LDR s2, [x12], 4
373        LDR s3,  [x4], 4
374        FMLA v16.4s, v20.4s, v0.s[0]
375        FMLA v17.4s, v21.4s, v0.s[0]
376        FMLA v18.4s, v20.4s, v1.s[0]
377        FMLA v19.4s, v21.4s, v1.s[0]
378        FMLA v28.4s, v20.4s, v2.s[0]
379        FMLA v29.4s, v21.4s, v2.s[0]
380        FMLA v30.4s, v20.4s, v3.s[0]
381        FMLA v31.4s, v21.4s, v3.s[0]
382
3836:
384        # Clamp all eight accumulators: FMIN against v4 first, then FMAX against v5
385        FMIN v16.4s, v16.4s, v4.4s
386        SUBS x1, x1, 8  // nc -= 8; these flags feed both B.LO 7f and B.HI 0b below (nothing in between sets flags)
387        FMIN v17.4s, v17.4s, v4.4s
388        FMIN v18.4s, v18.4s, v4.4s
389        FMIN v19.4s, v19.4s, v4.4s
390        FMIN v28.4s, v28.4s, v4.4s
391        FMIN v29.4s, v29.4s, v4.4s
392        FMIN v30.4s, v30.4s, v4.4s
393        FMIN v31.4s, v31.4s, v4.4s
394        FMAX v16.4s, v16.4s, v5.4s
395        FMAX v17.4s, v17.4s, v5.4s
396        FMAX v18.4s, v18.4s, v5.4s
397        FMAX v19.4s, v19.4s, v5.4s
398        FMAX v28.4s, v28.4s, v5.4s
399        FMAX v29.4s, v29.4s, v5.4s
400        FMAX v30.4s, v30.4s, v5.4s
401        FMAX v31.4s, v31.4s, v5.4s
402
403        # Store full 4 x 8 (only when nc was >= 8 before the subtraction; otherwise branch to the partial store at label 7)
404        B.LO 7f
405
406        STP q16, q17,  [x6]
407        SUB  x3,  x3, x2 // a0 -= kc (rewind for the next block of columns)
408        ADD  x6,  x6, x14 // c0 += cn_stride
409        STP q18, q19,  [x9]
410        SUB x11, x11, x2 // a1 -= kc
411        ADD  x9,  x9, x14 // c1 += cn_stride
412        STP q28, q29, [x10]
413        SUB x12, x12, x2 // a2 -= kc
414        ADD x10, x10, x14 // c2 += cn_stride
415        STP q30, q31,  [x7]
416        SUB  x4,  x4, x2 // a3 -= kc
417        ADD  x7,  x7, x14 // c3 += cn_stride
418
419        B.HI 0b  // loop while columns remain (nc was > 8, so nc - 8 is non-zero)
420
421        # Restore d8-d15 from stack
422        LDP d14, d15, [sp, 48]
423        LDP d12, d13, [sp, 32]
424        LDP d10, d11, [sp, 16]
425        LDP  d8,  d9, [sp], 64
426        RET
427
428        # Store odd width: fewer than 8 columns remain; bits 2, 1 and 0 of x1 select the 4-, 2- and 1-float stores
4297:
430        TBZ x1, 2, 8f
431        STR q16, [x6], 16
432        MOV v16.16b, v17.16b  // move the upper 4 columns down so the narrower stores below read them from the low lanes
433        STR q18, [x9], 16
434        MOV v18.16b, v19.16b
435        STR q28, [x10], 16
436        MOV v28.16b, v29.16b
437        STR q30, [x7], 16
438        MOV v30.16b, v31.16b
439
4408:
441        TBZ x1, 1, 9f
442        STR d16, [x6], 8
443        DUP d16, v16.d[1]  // move the upper 2 floats down for the single-float store below
444        STR d18, [x9], 8
445        DUP d18, v18.d[1]
446        STR d28, [x10], 8
447        DUP d28, v28.d[1]
448        STR d30, [x7], 8
449        DUP d30, v30.d[1]
450
4519:
452        TBZ x1, 0, 10f
453        STR s16,  [x6]
454        STR s18,  [x9]
455        STR s28, [x10]
456        STR s30,  [x7]
45710:
458        # Restore d8-d15 from stack
459        LDP d14, d15, [sp, 48]
460        LDP d12, d13, [sp, 32]
461        LDP d10, d11, [sp, 16]
462        LDP  d8,  d9, [sp], 64
463        RET
464
465
466END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75
467
468#ifdef __ELF__
469.section ".note.GNU-stack","",%progbits
470#endif
471