// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64(
//     size_t mr,                x0
//     size_t nc,                x1
//     size_t kc,                x2 / x0
//     const uint8_t*restrict a, x3
//     size_t a_stride,          x4
//     const void*restrict w,    x5
//     uint8_t*restrict c,       x6
//     size_t cm_stride,         x7
//     size_t cn_stride,         [sp] -> x0
//     const float*restrict acc,  [sp + 8] -> x15
//     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
//
// Syntax: GNU as, AArch64, run through the C preprocessor.
//
// For every tile of 8 output columns the kernel:
//   1. loads 6 x 8 initial accumulators from acc (12 q-registers, consumed
//      sequentially; acc keeps advancing from tile to tile),
//   2. adds A * B to them, reading 2 floats per row of A and 2 x 8 floats of
//      B (from w) per main-loop iteration, plus an optional 1-float remainder,
//   3. clamps the result with FMIN against v6 and FMAX against v7,
//   4. stores the tile and advances each C row pointer by cn_stride, or, when
//      fewer than 8 columns remain, stores 4 / 2 / 1 columns and returns.
//
// kc is a byte count: the main loop consumes 8 bytes of each A row per
// iteration and the A pointers are rewound by kc after each full tile.
// NOTE(review): the remainder logic only tests bit 2 of the counter, so kc is
// presumably a non-zero multiple of 4 bytes - confirm against the caller.
//
// Rows at index >= mr reuse the pointers of the previous row (see the CSEL
// chain below). Stores are issued from row 5 down to row 0, so when rows
// alias, the lowest aliased row is the one written last.
//
// x0 is reused three times: it holds mr on entry (only read while the row
// pointers are set up), then the k counter inside the accumulation loop, and
// finally cn_stride while a tile is clamped and stored.

// d8-d15 need to be preserved if used.
// x19-30 need to be preserved if used.
// Neither group is touched here, and x18 (the platform register, reserved on
// some operating systems) is deliberately left alone, so the function needs
// no stack frame and no save/restore code.

// A pointers
//  x3 a0
//  x9 a1
// x10 a2
// x11 a3
// x12 a4
//  x4 a5

// C pointers
//  x6 c0
// x16 c1
// x17 c2
// x14 c3
// x13 c4
//  x7 c5

// Vector register usage
// A0   v0
// A1   v1
// A2   v2
// A3   v3
// A4   v4
// A5   v5
// B   v16 v17 v18 v19
// C   v20 v21
// C   v22 v23
// C   v24 v25
// C   v26 v27
// C   v28 v29
// C   v30 v31
// Clamp v6 v7
// unused A   v8 v9 v10 v11
// unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64

        // Clamp A and C pointers
        // Each CMP feeds two CSEL pairs: LO handles "mr < n" and LS handles
        // "mr <= n"; ADD and CSEL leave the flags untouched in between.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        // Load acc, params pointer
        LDP x15, x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride (a_stride is dead after this)
        ADD x7, x13, x7          // c5 = c4 + cm_stride (cm_stride is dead after this)
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        // Load clamping_params values
        // LD2R replicates the first float at [x8] into every lane of v6 and
        // the second into every lane of v7; v6 is the FMIN operand (upper
        // bound) and v7 the FMAX operand (lower bound) in the clamp below.
        LD2R {v6.4s, v7.4s}, [x8]

        // Start of one 8-column tile. mr is no longer needed from here on, so
        // x0 is free to become the k counter.
0:
        // Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP,  [x3]    // Prefetch A
        PRFM PLDL1KEEP,  [x9]
        PRFM PLDL1KEEP, [x10]
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]
        PRFM PLDL1KEEP,  [x4]

        // Are there at least 2 floats (8 bytes) for the main loop?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 4f         // fewer than 8 bytes: only the 1-float remainder runs

        // Main loop - 2 floats of A (8 bytes)
        // 24 FMA + 6 LD64 A + 2 LDP B
1:
        LDR   d0,  [x3], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        SUBS x0, x0, 8           // k -= 8; flags are consumed by B.HS below
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]
        B.HS 1b

        // Is there a remainder? - 1 float of A (4 bytes)
        // The counter has gone below zero here; bit 2 is set exactly when
        // 4 bytes of A are still left over.
        TBNZ x0, 2, 4f
3:
        // Clamp
        // The k counter is dead from this label on, so x0 is reloaded with
        // cn_stride for the full-tile stores. LDR leaves the flags alone.
        FMIN v20.4s, v20.4s, v6.4s
        LDR x0, [sp]             // Load cn_stride
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO / B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        // Store full 6 x 8
        B.LO 5f                  // fewer than 8 columns were left: partial store

        // Nothing between SUBS x1 above and B.HI below writes the flags
        // (FMIN/FMAX, LDR, ST1 and non-flag-setting SUB only).
        ST1 {v30.16b, v31.16b},  [x7], x0
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x0
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x11, x11, x2 // a3 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x12, x12, x2 // a4 -= kc
        ST1 {v20.16b, v21.16b},  [x6], x0
        SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b                  // columns remain: next tile
        RET

4:
        // Remainder - 1 float of A (4 bytes)
        LDR   s0,  [x3], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 3b

        // Store odd width
        // x1 went negative in the SUBS above, but its low three bits still
        // equal the number of columns left (1..7); they are peeled off as
        // 4, then 2, then 1 column(s). After each partial store the not yet
        // written lanes are moved down into the low part of the register.
5:
        TBZ x1, 2, 6f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b

6:
        TBZ x1, 1, 7f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

7:
        TBZ x1, 0, 8f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
8:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64

// Emit an empty .note.GNU-stack section on ELF targets so the linker does not
// have to assume this object needs an executable stack.
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
