// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> (x8)

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is deliberately left untouched: it is the platform-reserved register
# on several AArch64 targets, so this leaf routine must not overwrite it.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x8 c3  (x8 first carries the params pointer; it is recycled once the
#          clamp values have been loaded into v6/v7)
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 (applied with FMIN) v7 (applied with FMAX)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

// NOTE(review): this file is generated from the template named in the header;
// the x18 -> x8 change for c3 should be mirrored there as well.

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128

        # Load acc, params pointer
        # Done first so that x8 is free again before c3 is computed.
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        # Load clamping_params values
        # Last read through the params pointer; x8 becomes c3 below.
        # LD2R does not modify the flags set by the CMP above.
        LD2R {v6.4s, v7.4s}, [x8]

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x8, x17, x7          // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x8, x17, x8, LO     //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x8, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x8, x13, LS    //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp]

0:
        # Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP,  [x3]    // Prefetch A
        PRFM PLDL1KEEP,  [x9]
        PRFM PLDL1KEEP, [x10]
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]
        PRFM PLDL1KEEP,  [x4]

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 5f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
1:
        LDR   q0,  [x3], 16
        LDP  q16,  q17, [x5], 32
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        LDP  q18,  q19, [x5], 32
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        SUBS x0, x0, 16          // k -= 16; flags feed the B.HS below
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        B.HS 1b

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        TST x0, 15
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags feed B.LO / B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns left (nc went below zero): take the odd-width path.
        B.LO 7f

        ST1 {v30.16b, v31.16b},  [x7], x14
        SUB  x3,  x3, x2         // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x14
        SUB  x9,  x9, x2         // a1 -= kc
        ST1 {v26.16b, v27.16b},  [x8], x14
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB  x4,  x4, x2         // a5 -= kc

        # Flags are still those of the SUBS on nc above: loop while nc > 0.
        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
6:
        LDR   s0,  [x3], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 4b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-float partial store per row.
7:
        TBZ x1, 2, 8f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26,  [x8], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b

8:
        TBZ x1, 1, 9f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26,  [x8], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26,  [x8]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
10:
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif