// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): this file is generated; the c3 register change below (x18 -> x7)
// presumably has to be mirrored in the template named above - confirm.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8

# Summary of what the code below does:
#   - accumulators v20-v27 (4 rows x 8 floats) are seeded from acc (x15),
#   - they are updated with FMLA of the four A rows against the B data at x5,
#     consuming kc bytes of each A row (16 bytes per main-loop iteration),
#   - the result is clamped with FMIN against v6 and FMAX against v7,
#   - the tile is stored to the four C rows, 8 floats wide, or 4/2/1 floats
#     wide for the final partial tile, then the loop repeats while nc remains.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# Only d12-d15 are touched here; they are spilled to the stack and restored.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 (takes over the cm_stride register once c3 has been computed)

# x18 is deliberately not used: AAPCS64 designates it the platform register
# and several platforms reserve it.

# x4 temporary vector shadow register

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
# A4  v2     v5
# A5  v2[1]  v5[1]
# C   v28 v29
# C   v30 v31

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so every load and store
        // below stays on a valid row.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        // Last read of cm_stride: x7 is overwritten with c3 from here on.
        ADD x7, x17, x7          // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x7, x17, x7, LO     //   c3 = c2

        // Stack arguments are read before sp is moved by the spill below.
        # Load acc, params pointer
        LDP x15, x8, [sp, 8]

        # Load clamping_params values
        // v6 = first float at params (FMIN operand), v7 = second (FMAX operand)
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        // Save d12-d15 on stack
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        PRFM PLDL1KEEP, [x3,  0]    // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        PRFM PLDL1KEEP, [x9,  0]
        PRFM PLDL1KEEP, [x9, 64]
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        PRFM PLDL1KEEP, [x5,   0]   // Prefetch B
        PRFM PLDL1KEEP, [x5,  64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LD1 {v0.d}[1],  [x9], 8      // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Upper vector halves are loaded through x4 and merged with INS one
        // block later; do not reorder instructions within or across blocks.
1:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4             // b from previous
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x3], 8              // a0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]    // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x4              // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR x4, [x5, 72]             // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        PRFM PLDL1KEEP, [x9, 128]    // Prefetch A1

        // BLOCK 2
        LDR d1, [x10], 8             // a2
        INS v16.d[1], x4             // b
        FMLA v25.4s, v13.4s,  v4.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]
        PRFM PLDL1KEEP, [x10, 128]   // Prefetch A2

        // BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x4              // a3 ins
        FMLA v22.4s, v14.4s,  v3.s[3]
        LDR x4, [x5, 88]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        PRFM PLDL1KEEP, [x11, 128]   // Prefetch A3

        // BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x4             // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR x4, [x5, 104]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]    // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x4
        FMLA v27.4s, v15.4s,  v4.s[3]
        LDR x4, [x5, 120]
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]    // Prefetch B
        ADD x5, x5, 128
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Same schedule as the main loop, except that the second group does
        // not load a further first group.
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP  // fma
        NOP
        NOP  // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        // Only multiples of 16 were subtracted from x0, so its low 4 bits are
        // still those of kc: the leftover byte count. Flags are consumed by
        // the B.NE below; the FMLA and ADD in between do not modify them.
        TST x0, 15

        // BLOCK 4
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        ADD x5, x5, 64               // step over the second set of B read above

        // BLOCK 5
        FMLA v27.4s, v15.4s,  v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8               // nc -= 8; flags drive B.LO / B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 8f

        // Each C pointer advances by cn_stride; each A pointer is rewound by
        // kc so the next column tile rereads the same A rows.
        ST1 {v26.16b, v27.16b},  [x7], x14
        SUB  x3,  x3, x2             // a0 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB  x9,  x9, x2             // a1 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB x10, x10, x2             // a2 -= kc
        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB x11, x11, x2             // a3 -= kc

        B.HI 0b                      // more columns left (nc was > 8)

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0,  [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

6:
        # Remainder- 1 floats of A (4 bytes)
        LDR s0,  [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        B 4b

        # Store odd width
        // x1 = nc - 8 here; subtracting 8 leaves bits 0-2 unchanged, so they
        // still select the 4-, 2- and 1-float partial stores.
8:
        TBZ x1, 2, 9f
        STR q26, [x7], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b

9:
        TBZ x1, 1, 10f
        STR d26, [x7], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s26,  [x7]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif