// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Callee-saved state: d8-d15 and x19-x30 must be preserved when used, and x18
# is reserved by the OS. Of those, this kernel touches d12-d15 only.

# General purpose registers
#   x0   mr on entry, then the k counter derived from kc, then cn_stride
#   x2   kc in bytes, kept intact to rewind the A pointers
#   x4   a_stride on entry, afterwards scratch for the upper 64 bits of vectors
#   x7   cm_stride, read only while the row pointers are formed

# A pointers
#   x3   a0
#   x9   a1
#   x10  a2
#   x11  a3

# C pointers
#   x6   c0
#   x16  c1
#   x17  c2
#   x14  c3

# Vector register usage (first set / second set)
#   A0     v0      v3
#   A1     v0[1]   v3[1]
#   A2     v1      v4
#   A3     v1[1]   v4[1]
#   B      v16 v17 v18 v19    first set
#   B      v12 v13 v14 v15    second set
#   C      v20 v21            row 0
#   C      v22 v23            row 1
#   C      v24 v25            row 2
#   C      v26 v27            row 3
#   Clamp  v6 (min)  v7 (max)

# Not touched by this kernel: v2 v5 v8 v9 v10 v11 v28 v29 v30 v31 x12 x13

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

        $if INC:
          # Fetch the acc and params pointers from the caller stack
          LDP     x15, x8, [sp, 8]
        $else:
          # Fetch the params pointer from the caller stack
          LDR     x8, [sp, 8]

        # Form the A and C row pointers. A row at or beyond mr aliases the
        # previous row, so all four rows can always be processed.
        CMP     x0, 2                   // mr < 2 ?
        ADD     x9, x3, x4              // a1 = a0 + a_stride
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x9, x3, x9, LO          //   yes: a1 = a0
        CSEL    x16, x6, x16, LO        //   yes: c1 = c0

        ADD     x10, x9, x4             // a2 = a1 + a_stride
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // mr <= 2 ? (flags still from CMP x0, 2)
        CSEL    x10, x9, x10, LS        //   yes: a2 = a1
        CSEL    x17, x16, x17, LS       //   yes: c2 = c1

        CMP     x0, 4                   // mr < 4 ?
        ADD     x11, x10, x4            // a3 = a2 + a_stride
        ADD     x14, x17, x7            // c3 = c2 + cm_stride
        CSEL    x11, x10, x11, LO       //   yes: a3 = a2
        CSEL    x14, x17, x14, LO       //   yes: c3 = c2

        # Broadcast the clamping bounds: v6 = min, v7 = max
        LD2R    {v6.4s, v7.4s}, [x8]

        // Spill d12-d15. sp drops by 32 bytes, so the stack arguments of the
        // caller sit 32 bytes higher from here on.
        STP     d12, d13, [sp, -32]!
        STP     d14, d15, [sp, 16]

// Start of one 4 x 8 output tile
0:
        $if INC:
          # Seed the accumulators from acc
          LDP     q20, q21, [x15], 32
          LDP     q22, q23, [x15], 32
          LDP     q24, q25, [x15], 32
          LDP     q26, q27, [x15], 32
          PRFM    PLDL1KEEP, [x3, 0]      // Prefetch A
          PRFM    PLDL1KEEP, [x3, 64]
          PRFM    PLDL1KEEP, [x9, 0]
          PRFM    PLDL1KEEP, [x9, 64]
          PRFM    PLDL1KEEP, [x10, 0]
          PRFM    PLDL1KEEP, [x10, 64]
          PRFM    PLDL1KEEP, [x11, 0]
          PRFM    PLDL1KEEP, [x11, 64]
          PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
          PRFM    PLDL1KEEP, [x5, 64]
          PRFM    PLDL1KEEP, [x5, 128]
          PRFM    PLDL1KEEP, [x5, 192]
        $else:
          # Seed the accumulators with the bias stored at the front of w;
          # every row starts from the same 8 values.
          LDP     q20, q21, [x5], 32
          MOV     v22.16b, v20.16b
          PRFM    PLDL1KEEP, [x3, 0]      // Prefetch A
          PRFM    PLDL1KEEP, [x3, 64]
          MOV     v23.16b, v21.16b
          PRFM    PLDL1KEEP, [x9, 0]
          PRFM    PLDL1KEEP, [x9, 64]
          MOV     v24.16b, v20.16b
          PRFM    PLDL1KEEP, [x10, 0]
          PRFM    PLDL1KEEP, [x10, 64]
          MOV     v25.16b, v21.16b
          PRFM    PLDL1KEEP, [x11, 0]
          PRFM    PLDL1KEEP, [x11, 64]
          MOV     v26.16b, v20.16b
          PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
          MOV     v27.16b, v21.16b
          PRFM    PLDL1KEEP, [x5, 64]
          PRFM    PLDL1KEEP, [x5, 128]
          PRFM    PLDL1KEEP, [x5, 192]

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS    x0, x2, 16              // k = kc - 16
        B.LO    4f                      // no: remainder handling only

        # Prologue - loads of the first group, no FMA yet
        LDR     d0, [x3], 8             // a0
        LDP     q16, q17, [x5], 32      // b
        LDR     d1, [x10], 8            // a2
        LD1     {v0.d}[1], [x9], 8      // a1
        LD1     {v1.d}[1], [x11], 8     // a3
        SUBS    x0, x0, 16              // k -= 16, flags feed the B.LO below
        LDR     q18, [x5], 16
        LDR     d19, [x5], 8            // low half of v19
        LDR     x4, [x5], 8             // high half of v19, its INS is in BLOCK 0

        # Are there at least 4 more floats (16 bytes) for the main loop?
        B.LO    2f

        # Main loop - 4 floats of A (16 bytes) per iteration
        # 32 FMA + 8 LD64 A + 8 LDR B
1:
        # First group of 16 FMA, loads for the second group
        // BLOCK 0
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     d3, [x3], 8             // a0
        FMLA    v22.4s, v16.4s, v0.s[2]
        INS     v19.d[1], x4            // b, completes v19 of the first group
        FMLA    v24.4s, v16.4s, v1.s[0]
        LDR     x4, [x9], 8             // a1

        // BLOCK 1
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     d12, [x5]
        FMLA    v21.4s, v17.4s, v0.s[0]
        INS     v3.d[1], x4             // a1 ins
        FMLA    v23.4s, v17.4s, v0.s[2]
        LDR     x4, [x5, 8]             // b

        // BLOCK 2
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     d4, [x10], 8            // a2
        FMLA    v27.4s, v17.4s, v1.s[2]
        INS     v12.d[1], x4            // b ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x4, [x11], 8            // a3

        // BLOCK 3
        FMLA    v22.4s, v18.4s, v0.s[3]
        LDR     d13, [x5, 16]
        FMLA    v24.4s, v18.4s, v1.s[1]
        INS     v4.d[1], x4             // a3 ins
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x4, [x5, 24]

        // BLOCK 4
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     d14, [x5, 32]
        FMLA    v23.4s, v19.4s, v0.s[3]
        INS     v13.d[1], x4            // b
        FMLA    v25.4s, v19.4s, v1.s[1]
        LDR     x4, [x5, 40]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA    v27.4s, v19.4s, v1.s[3]
        LDR     d15, [x5, 48]
        NOP
        INS     v14.d[1], x4            // b from previous
        SUBS    x0, x0, 16              // k -= 16, flags feed B.HS at the loop end
        LDR     x4, [x5, 56]

        # Second group of 16 FMA, loads for the next first group
        // BLOCK 0
        FMLA    v20.4s, v12.4s, v3.s[0]
        LDR     d0, [x3], 8             // a0
        FMLA    v22.4s, v12.4s, v3.s[2]
        INS     v15.d[1], x4            // b from previous
        FMLA    v24.4s, v12.4s, v4.s[0]
        LDR     x4, [x9], 8             // a1

        // BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        LDR     d16, [x5, 64]
        FMLA    v21.4s, v13.4s, v3.s[0]
        INS     v0.d[1], x4             // a1 ins
        FMLA    v23.4s, v13.4s, v3.s[2]
        LDR     x4, [x5, 72]            // b

        // BLOCK 2
        FMLA    v25.4s, v13.4s, v4.s[0]
        LDR     d1, [x10], 8            // a2
        FMLA    v27.4s, v13.4s, v4.s[2]
        INS     v16.d[1], x4            // b
        FMLA    v20.4s, v14.4s, v3.s[1]
        LDR     x4, [x11], 8            // a3

        // BLOCK 3
        FMLA    v22.4s, v14.4s, v3.s[3]
        LDR     d17, [x5, 80]
        FMLA    v24.4s, v14.4s, v4.s[1]
        INS     v1.d[1], x4             // a3 ins
        FMLA    v26.4s, v14.4s, v4.s[3]
        LDR     x4, [x5, 88]

        // BLOCK 4
        FMLA    v21.4s, v15.4s, v3.s[1]
        LDR     d18, [x5, 96]
        FMLA    v23.4s, v15.4s, v3.s[3]
        INS     v17.d[1], x4            // b
        FMLA    v25.4s, v15.4s, v4.s[1]
        LDR     x4, [x5, 104]

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        FMLA    v27.4s, v15.4s, v4.s[3]
        LDR     d19, [x5, 112]
        INS     v18.d[1], x4
        LDR     x4, [x5, 120]           // high half of v19, its INS is in BLOCK 0
        ADD     x5, x5, 128             // w += 128 bytes consumed per iteration
        B.HS    1b                      // flags from the SUBS in the first group

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, loads for the second group
        // BLOCK 0
        FMLA    v20.4s, v16.4s, v0.s[0]
        LDR     d3, [x3], 8             // a0
        FMLA    v22.4s, v16.4s, v0.s[2]
        INS     v19.d[1], x4            // b, completes v19 of the first group
        FMLA    v24.4s, v16.4s, v1.s[0]
        LDR     x4, [x9], 8             // a1

        // BLOCK 1
        FMLA    v26.4s, v16.4s, v1.s[2]
        LDR     d12, [x5]
        FMLA    v21.4s, v17.4s, v0.s[0]
        INS     v3.d[1], x4             // a1 ins
        FMLA    v23.4s, v17.4s, v0.s[2]
        LDR     x4, [x5, 8]             // b

        // BLOCK 2
        FMLA    v25.4s, v17.4s, v1.s[0]
        LDR     d4, [x10], 8            // a2
        FMLA    v27.4s, v17.4s, v1.s[2]
        INS     v12.d[1], x4            // b ins
        FMLA    v20.4s, v18.4s, v0.s[1]
        LDR     x4, [x11], 8            // a3

        // BLOCK 3
        FMLA    v22.4s, v18.4s, v0.s[3]
        LDR     d13, [x5, 16]
        FMLA    v24.4s, v18.4s, v1.s[1]
        INS     v4.d[1], x4             // a3 ins
        FMLA    v26.4s, v18.4s, v1.s[3]
        LDR     x4, [x5, 24]

        // BLOCK 4
        FMLA    v21.4s, v19.4s, v0.s[1]
        LDR     d14, [x5, 32]
        FMLA    v23.4s, v19.4s, v0.s[3]
        INS     v13.d[1], x4            // b
        FMLA    v25.4s, v19.4s, v1.s[1]
        LDR     x4, [x5, 40]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA    v27.4s, v19.4s, v1.s[3]
        LDR     d15, [x5, 48]
        NOP                             // fma
        INS     v14.d[1], x4
        NOP
        LDR     x4, [x5, 56]

        # Second group of 16 FMA, no loads
        // BLOCK 0
        FMLA    v20.4s, v12.4s, v3.s[0]
        FMLA    v22.4s, v12.4s, v3.s[2]
        INS     v15.d[1], x4            // b from previous
        FMLA    v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA    v26.4s, v12.4s, v4.s[2]
        FMLA    v21.4s, v13.4s, v3.s[0]
        FMLA    v23.4s, v13.4s, v3.s[2]

        // BLOCK 2
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v4.s[2]
        FMLA    v20.4s, v14.4s, v3.s[1]

        // BLOCK 3
        FMLA    v22.4s, v14.4s, v3.s[3]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v4.s[3]
        TST     x0, 15                  // any of the low 4 bits of k set?

        // BLOCK 4
        FMLA    v21.4s, v15.4s, v3.s[1]
        FMLA    v23.4s, v15.4s, v3.s[3]
        FMLA    v25.4s, v15.4s, v4.s[1]
        ADD     x5, x5, 64              // w += 64 bytes consumed by the epilogue

        // BLOCK 5
        FMLA    v27.4s, v15.4s, v4.s[3]

        # Is there a remainder? - 2 floats of A (8 bytes) or less
        B.NE    4f                      // flags from the TST in BLOCK 3

3:
        # Clamp the accumulators to [min, max]
        FMAX    v20.4s, v20.4s, v6.4s
        # Reload cn_stride: caller [sp] is at [sp, 32] after the d12-d15 spill
        LDR     x0, [sp, 32]
        FMAX    v21.4s, v21.4s, v6.4s
        FMAX    v22.4s, v22.4s, v6.4s
        FMAX    v23.4s, v23.4s, v6.4s
        FMAX    v24.4s, v24.4s, v6.4s
        FMAX    v25.4s, v25.4s, v6.4s
        FMAX    v26.4s, v26.4s, v6.4s
        FMAX    v27.4s, v27.4s, v6.4s
        SUBS    x1, x1, 8               // nc -= 8, flags feed B.LO and B.HI below
        FMIN    v20.4s, v20.4s, v7.4s
        FMIN    v21.4s, v21.4s, v7.4s
        FMIN    v22.4s, v22.4s, v7.4s
        FMIN    v23.4s, v23.4s, v7.4s
        FMIN    v24.4s, v24.4s, v7.4s
        FMIN    v25.4s, v25.4s, v7.4s
        FMIN    v26.4s, v26.4s, v7.4s
        FMIN    v27.4s, v27.4s, v7.4s

        # Fewer than 8 columns left: take the partial store path
        B.LO    6f

        # Store the full 4 x 8 tile, step each C row by cn_stride and rewind
        # each A row by kc for the next column tile.
        $if INC:
          ST1     {v26.16b, v27.16b}, [x14], x0
          SUB     x3, x3, x2              // a0 -= kc
          ST1     {v24.16b, v25.16b}, [x17], x0
          SUB     x9, x9, x2              // a1 -= kc
          ST1     {v22.16b, v23.16b}, [x16], x0
          SUB     x10, x10, x2            // a2 -= kc
          ST1     {v20.16b, v21.16b}, [x6], x0
          SUB     x11, x11, x2            // a3 -= kc
        $else:
          ST1     {v20.16b, v21.16b}, [x6], x0
          SUB     x3, x3, x2              // a0 -= kc
          ST1     {v22.16b, v23.16b}, [x16], x0
          SUB     x9, x9, x2              // a1 -= kc
          ST1     {v24.16b, v25.16b}, [x17], x0
          SUB     x10, x10, x2            // a2 -= kc
          ST1     {v26.16b, v27.16b}, [x14], x0
          SUB     x11, x11, x2            // a3 -= kc

        B.HI    0b                      // columns remain: next tile

        // Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

4:
        # Is there a remainder of 2 floats of A (8 bytes)? Bit 3 of k decides.
        TBZ     x0, 3, 5f

        # Remainder - 2 floats of A (8 bytes)
        LDR     d0, [x3], 8             // a0
        LDR     q16, [x5], 16
        LD1     {v0.d}[1], [x9], 8      // a1
        LDR     d1, [x10], 8            // a2
        LD1     {v1.d}[1], [x11], 8     // a3
        LDR     q17, [x5], 16
        LDR     q18, [x5], 16
        LDR     q19, [x5], 16
        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]

        FMLA    v20.4s, v18.4s, v0.s[1]
        FMLA    v22.4s, v18.4s, v0.s[3]
        FMLA    v24.4s, v18.4s, v1.s[1]
        FMLA    v26.4s, v18.4s, v1.s[3]
        FMLA    v21.4s, v19.4s, v0.s[1]
        FMLA    v23.4s, v19.4s, v0.s[3]
        FMLA    v25.4s, v19.4s, v1.s[1]
        FMLA    v27.4s, v19.4s, v1.s[3]

        # Is there a remainder of 1 float of A (4 bytes)? Bit 2 of k decides.
        TBZ     x0, 2, 3b

5:
        # Remainder - 1 float of A (4 bytes)
        LDR     s0, [x3], 4             // a0
        LDR     q16, [x5], 16
        LD1     {v0.s}[2], [x9], 4      // a1
        LDR     s1, [x10], 4            // a2
        LD1     {v1.s}[2], [x11], 4     // a3
        LDR     q17, [x5], 16

        FMLA    v20.4s, v16.4s, v0.s[0]
        FMLA    v22.4s, v16.4s, v0.s[2]
        FMLA    v24.4s, v16.4s, v1.s[0]
        FMLA    v26.4s, v16.4s, v1.s[2]
        FMLA    v21.4s, v17.4s, v0.s[0]
        FMLA    v23.4s, v17.4s, v0.s[2]
        FMLA    v25.4s, v17.4s, v1.s[0]
        FMLA    v27.4s, v17.4s, v1.s[2]
        B       3b

        # Store odd width: bits 2, 1 and 0 of nc select 4, 2 and 1 columns
6:
        TBZ     x1, 2, 7f
        $if INC:
          STR     q26, [x14], 16
          MOV     v26.16b, v27.16b        // shift the upper 4 columns down
          STR     q24, [x17], 16
          MOV     v24.16b, v25.16b
          STR     q22, [x16], 16
          MOV     v22.16b, v23.16b
          STR     q20, [x6], 16
          MOV     v20.16b, v21.16b
        $else:
          STR     q20, [x6], 16
          MOV     v20.16b, v21.16b        // shift the upper 4 columns down
          STR     q22, [x16], 16
          MOV     v22.16b, v23.16b
          STR     q24, [x17], 16
          MOV     v24.16b, v25.16b
          STR     q26, [x14], 16
          MOV     v26.16b, v27.16b

7:
        TBZ     x1, 1, 8f
        $if INC:
          STR     d26, [x14], 8
          DUP     d26, v26.d[1]           // shift the upper 2 columns down
          STR     d24, [x17], 8
          DUP     d24, v24.d[1]
          STR     d22, [x16], 8
          DUP     d22, v22.d[1]
          STR     d20, [x6], 8
          DUP     d20, v20.d[1]
        $else:
          STR     d20, [x6], 8
          DUP     d20, v20.d[1]           // shift the upper 2 columns down
          STR     d22, [x16], 8
          DUP     d22, v22.d[1]
          STR     d24, [x17], 8
          DUP     d24, v24.d[1]
          STR     d26, [x14], 8
          DUP     d26, v26.d[1]

8:
        TBZ     x1, 0, 9f
        $if INC:
          STR     s26, [x14]
          STR     s24, [x17]
          STR     s22, [x16]
          STR     s20, [x6]
        $else:
          STR     s20, [x6]
          STR     s22, [x16]
          STR     s24, [x17]
          STR     s26, [x14]
9:
        // Restore d12-d15 from stack
        LDP     d14, d15, [sp, 16]
        LDP     d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif