// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
#  x13 a0
#  x14 a1
#  x15 a2
#   x8 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
#  x7 c3

# x19 temporary vector shadow register

# Vector register usage
# A0   v0  v3
# A1   v0[1]  v3[1]
# A2   v1  v4
# A3   v1[1]  v4[1]

# B    v12 v13 v14 v15  second set of B
# B    v16 v17 v18 v19  first set
# C    v20 v21
# C    v22 v23
# C    v24 v25
# C    v26 v27
# Clamp v6 v7

# unused A  v8 v9 v10 v11
# x12 a4
#  x4 a5
# x13 c4
#  x7 c5
# A4   v2  v5
# A5   v2[1]  v5[1]
# C    v28 v29
# C    v30 v31

BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp C pointers
        CMP x0, 2                   // if mr < 2
        ADD x16, x6, x7             // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO       //   c1 = c0

        ADD x17, x16, x7            // c2 = c1 + cm_stride
                                    // if mr <= 2
        CSEL x17, x16, x17, LS      //   c2 = c1

        CMP x0, 4                   // if mr < 4
        ADD x7, x17, x7             // c3 = c2 + cm_stride
        CSEL x7, x17, x7, LO        //   c3 = c2

        # Load cn_stride, a_offset
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        // Save x19, d12-d15 on stack
        STP d12, d13, [sp, -48]!
        STP d14, d15, [sp, 16]
        STP x19, x19, [sp, 32]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x13, 0]    // Prefetch A
        PRFM PLDL1KEEP, [x13, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x14, 0]
        PRFM PLDL1KEEP, [x14, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x15, 0]
        PRFM PLDL1KEEP, [x15, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x8, 0]
        PRFM PLDL1KEEP, [x8, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        MOV x9, x3                  // p = ks

1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x8, [x4], 16

        CMP x13, x12                // if a0 == zero
        ADD x13, x13, x11           // a0 += a_offset
        CSEL x13, x12, x13, EQ      //   a0 = zero, else a0 += a_offset
        CMP x14, x12                // if a1 == zero
        ADD x14, x14, x11           // a1 += a_offset
        CSEL x14, x12, x14, EQ      //   a1 = zero, else a1 += a_offset
        CMP x15, x12                // if a2 == zero
        ADD x15, x15, x11           // a2 += a_offset
        CSEL x15, x12, x15, EQ      //   a2 = zero, else a2 += a_offset
        CMP x8, x12                 // if a3 == zero
        ADD x8, x8, x11             // a3 += a_offset
        CSEL x8, x12, x8, EQ        //   a3 = zero, else a3 += a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
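        # The loop is software-pipelined: the prologue only issues the first
        # loads (no FMA) and the epilogue only drains the last FMAs (no loads),
        # so together they account for one 16-byte block of K; each main loop
        # iteration then consumes a further 16 bytes.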
        SUBS x0, x2, 16             // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR d0, [x13], 8            // a0
        LDP q16, q17, [x5], 32      // b
        LDR d1, [x15], 8            // a2
        LD1 {v0.d}[1], [x14], 8     // a1
        LD1 {v1.d}[1], [x8], 8      // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x19, [x5], 8            // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x13], 8            // a0
        INS v19.d[1], x19           // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8           // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19            // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]            // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR d4, [x15], 8            // a2
        INS v12.d[1], x19           // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8            // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19            // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19           // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19           // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x13], 8            // a0
        INS v15.d[1], x19           // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x19, [x14], 8           // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PLDL1KEEP, [x13, 128]  // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x19            // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x19, [x5, 72]           // b
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        PRFM PLDL1KEEP, [x14, 128]  // Prefetch A1

        // BLOCK 2
        LDR d1, [x15], 8            // a2
        INS v16.d[1], x19           // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x19, [x8], 8            // a3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]
        PRFM PLDL1KEEP, [x15, 128]  // Prefetch A2

        // BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x19            // a3 ins
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR x19, [x5, 88]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        PRFM PLDL1KEEP, [x8, 128]   // Prefetch A3

        // BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x19           // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x19, [x5, 104]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]   // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x19
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR x19, [x5, 120]
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]   // Prefetch B
        ADD x5, x5, 128
        B.HS 2b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
3:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x13], 8            // a0
        INS v19.d[1], x19           // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x14], 8           // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19            // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]            // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR d4, [x15], 8            // a2
        INS v12.d[1], x19           // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x19, [x8], 8            // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x19            // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x19, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x19           // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x19
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x19, [x5, 56]
        NOP                         // fma
        NOP
        NOP                         // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x19           // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64

        // BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

4:
        # Is there a remainder? - 2 floats of A (8 bytes)
        TBNZ x0, 3, 6f
        # Is there a remainder? - 1 float of A (4 bytes)
        TBNZ x0, 2, 7f
5:
        # ks loop
        SUBS x9, x9, 32             // ks -= MR * sizeof(void*)
        B.NE 1b

        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        SUBS x1, x1, 8
        B.LO 8f

        STP q26, q27, [x7]
        ADD x7, x7, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3              // a -= ks

        # nc loop
        B.HI 0b

        // Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

        # Remainder - 2 floats of A (8 bytes)
        # 16 FMA + 4 LD64 A + 2 LDP B
6:
        LDR d0, [x13], 8
        LDP q16, q17, [x5], 32
        LD1 {v0.d}[1], [x14], 8
        LDR d1, [x15], 8
        LD1 {v1.d}[1], [x8], 8
        LDP q18, q19, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder? - 1 float of A (4 bytes)
        TBZ x0, 2, 5b

7:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x13], 4
        LDP q16, q17, [x5], 32
        LD1 {v0.s}[2], [x14], 4
        LDR s1, [x15], 4
        LD1 {v1.s}[2], [x8], 4

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 5b

        # Store odd width
8:
        TBZ x1, 2, 9f
        STR q26, [x7], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d26, [x7], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s26, [x7]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        // Restore x19, d12-d15 from stack
        LDR x19, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif
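
// ----------------------------------------------------------------------------
// Reference sketch (comment only, not assembled): a plain-C approximation of
// the computation this microkernel performs, to make the register choreography
// above easier to follow.  The name reference_igemm_4x8 is hypothetical, and
// the sketch relies on assumptions implied by the comments in this file: kc,
// ks, a_offset and the strides are byte counts (strides multiples of
// sizeof(float)), w holds 8 bias floats followed by packed 8-wide weight
// panels, A row pointers equal to `zero` skip the a_offset adjustment, and
// params supplies the max then min clamp values (as loaded by LD2R above).
//
//   #include <stddef.h>
//   #include <stdint.h>
//
//   void reference_igemm_4x8(
//       size_t mr, size_t nc, size_t kc, size_t ks,
//       const float** a, const float* w, float* c,
//       size_t cm_stride, size_t cn_stride, size_t a_offset,
//       const float* zero, float max, float min)
//   {
//     do {
//       const size_t n = nc < 8 ? nc : 8;
//       float acc[4][8];
//       for (size_t i = 0; i < 4; i++)            // broadcast the bias row
//         for (size_t j = 0; j < 8; j++) acc[i][j] = w[j];
//       w += 8;
//       const float** ap = a;
//       for (size_t p = ks; p != 0; p -= 4 * sizeof(void*)) {
//         const float* a_row[4];
//         for (size_t i = 0; i < 4; i++) {        // next group of 4 A pointers
//           a_row[i] = *ap++;
//           if (a_row[i] != zero)                 // zero rows skip a_offset
//             a_row[i] = (const float*) ((uintptr_t) a_row[i] + a_offset);
//         }
//         for (size_t k = 0; k < kc / sizeof(float); k++) {
//           for (size_t i = 0; i < 4; i++)        // the FMLA lattice above
//             for (size_t j = 0; j < 8; j++)
//               acc[i][j] += a_row[i][k] * w[j];
//           w += 8;
//         }
//       }
//       for (size_t i = 0; i < mr && i < 4; i++) {
//         float* c_row = c + i * (cm_stride / sizeof(float));
//         for (size_t j = 0; j < n; j++) {        // clamp, then store
//           float v = acc[i][j] < max ? acc[i][j] : max;
//           c_row[j] = v > min ? v : min;
//         }
//       }
//       c += cn_stride / sizeof(float);
//       nc -= n;
//     } while (nc != 0);
//   }
// ----------------------------------------------------------------------------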