// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Overview
//
// f32 GEMM micro-kernel template producing an output tile of up to 4 rows by
// 8 columns per pass, clamped to the [min, max] range read from params.
//
//  - Rows: a0-a3 / c0-c3 are derived from a / c with a_stride / cm_stride.
//    When mr is below 4, the surplus row pointers alias the previous row, so
//    the kernel always computes 4 rows and the aliased rows are written to the
//    same location more than once.
//  - Columns: the outer loop (label 0) handles 8 columns per pass and repeats
//    while nc stays above 8; a final group narrower than 8 is stored in pieces
//    of 4, 2 and 1 floats (labels 6-8).
//  - Depth: kc is counted in bytes. The software-pipelined main loop (label 1)
//    and epilogue (label 2) consume 16 bytes (4 floats) of every A row per
//    pass; leftovers of 8 and 4 bytes are handled at labels 4 and 5.
//    NOTE(review): only bits 3 and 2 of the byte count are tested for the
//    remainder, so kc is presumably required to be a multiple of 4 bytes -
//    confirm against the callers.
//  - Accumulators start from the 8 bias floats at the head of w (copied to all
//    4 rows), or from the acc buffer when the template is generated with INC.
//
// Register lifetimes worth knowing before reading the body:
//  - x0 holds mr only during pointer setup, then the remaining-k byte counter,
//    and finally cn_stride once the clamp/store stage (label 3) is reached.
//  - x4 holds a_stride only during pointer setup; afterwards it is the scratch
//    general-purpose register through which the upper 64 bits of a vector
//    register are loaded (LDR x4 followed by INS vN.d[1], x4).

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3

# x4 temporary vector shadow register

# Vector register usage
# A0   v0           v3
# A1   v0[1]        v3[1]
# A2   v1           v4
# A3   v1[1]        v4[1]
// Each A register packs two rows: the low 64 bits hold 2 consecutive floats of
// the even row and the high 64 bits hold 2 consecutive floats of the odd row.
// Lanes s[0]/s[1] therefore address a0 (or a2) and lanes s[2]/s[3] address
// a1 (or a3).

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

// The registers listed below are not given these roles anywhere in this
// kernel. NOTE(review): the list presumably carries over from a variant with
// more rows - confirm. In particular x7 is not a C pointer here; it holds
// cm_stride during pointer setup.
# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
#  x7 c5
# A4  v2           v5
# A5  v2[1]        v5[1]
# C   v28 v29
# C   v30 v31

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

        // Stack arguments are read before sp is adjusted below, so the offsets
        // here are relative to the incoming sp.
        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        // Rows beyond mr alias the previous row. ADD and CSEL leave the flags
        // untouched, so the single CMP x0, 2 serves both the LO (mr < 2) and
        // the LS (mr <= 2) selections.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      // a1 = a0
        CSEL x16, x6, x16, LO    // c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    // a2 = a1
        CSEL x17, x16, x17, LS   // c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   // a3 = a2
        CSEL x14, x17, x14, LO   // c3 = c2

        # Load min/max values
        // Two consecutive floats are read from params and each is replicated
        // across a register: v6 is the lower bound (applied with FMAX) and v7
        // is the upper bound (applied with FMIN).
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // The kernel uses v12-v15 for the second set of B. This push moves sp
        // down by 32 bytes, which is why cn_stride is later read from [sp, 32].
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

// Outer loop: one pass per group of 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          // 4 rows x 8 floats are read from acc, advancing x15 by 128 bytes.
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          PRFM PLDL1KEEP, [x3, 0]  // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          PRFM PLDL1KEEP, [x9, 0]
          PRFM PLDL1KEEP, [x9, 64]
          PRFM PLDL1KEEP, [x10, 0]
          PRFM PLDL1KEEP, [x10, 64]
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
        $else:
          # Load initial bias from w into accumulators
          // The 8 bias floats land in row 0 (v20, v21) and are copied to the
          // other three rows; the copies are interleaved with the prefetches.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x3, 0]  // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x9, 0]
          PRFM PLDL1KEEP, [x9, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x10, 0]
          PRFM PLDL1KEEP, [x10, 64]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        // x0 becomes the remaining-k byte counter, biased by -16. Subtracting
        // multiples of 16 never changes its low 4 bits, so the remainder tests
        // at labels 2 and 4 can inspect x0 directly.
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        // Fills the first register set (v0, v1, v16-v19) with 2 floats per A
        // row and 16 floats of B. The upper half of v19 is left in x4 and is
        // inserted by the first instruction pair of the loop or epilogue.
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LD1 {v0.d}[1],  [x9], 8      // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        // Consumes the flags of the SUBS above; the loads in between do not
        // modify them.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Software pipelined: each group of 16 FMLA consumes one register set
        // while the other set is being loaded. Every 128-bit operand arrives
        // as a 64-bit vector load (LDR dN), a 64-bit general-purpose load
        // (LDR x4) and an INS into the upper half.
        // NOTE(review): the block layout and NOP padding are tuned for the
        // issue rules of the target core per the in-line notes; do not reorder
        // instructions without re-measuring.
1:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b  ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4             // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // Consumes v3, v4, v12-v15 and reloads v0, v1, v16-v19 for the next
        // pass. B is addressed at fixed offsets from x5, which is advanced by
        // 128 bytes once at the end of the pass.
        // BLOCK 0
        LDR d0, [x3], 8              // a0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]    // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x4              // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x4, [x5, 72]             // b
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        PRFM PLDL1KEEP, [x9, 128]    // Prefetch A1

        // BLOCK 2
        LDR d1, [x10], 8             // a2
        INS v16.d[1], x4             // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]
        PRFM PLDL1KEEP, [x10, 128]   // Prefetch A2

        // BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x4              // a3 ins
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR x4, [x5, 88]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        PRFM PLDL1KEEP, [x11, 128]   // Prefetch A3

        // BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x4             // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x4, [x5, 104]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]    // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x4
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR x4, [x5, 120]
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]    // Prefetch B
        ADD x5, x5, 128
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // Same first group as the main loop, but the second group only drains
        // the registers already loaded, so nothing is read past the final full
        // 16-byte step of A.
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b  ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP  // fma
        NOP
        NOP  // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        // Sets Z when no k remainder (8 or 4 bytes) is left. The flags are
        // consumed by the B.NE below; the FMLA and ADD in between leave them
        // untouched.
        TST x0, 15

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        // Step over the 64 bytes of B that the epilogue read at fixed offsets.
        ADD x5, x5, 64

        // BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f

// Clamp the 4x8 tile and store it. Reached by fall-through and from the
// remainder paths at labels 4 and 5.
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // x0 is free now that the k loop is finished. The offset is 32 rather
        // than 0 because of the 32-byte d12-d15 save area pushed at entry.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        // nc -= 8. The resulting flags drive both B.LO 6f (fewer than 8
        // columns were left) and B.HI 0b (more columns remain); FMIN, ST1 and
        // SUB do not modify them.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        // Fewer than 8 columns left: take the partial-width store instead.
        B.LO 6f

        // Each store advances its C pointer by cn_stride, and each A pointer
        // is rewound by kc so the next column group re-reads the same rows.
        $if INC:
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB x11, x11, x2 // a3 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc

        // Loop while columns remain (flags from SUBS x1, x1, 8 above).
        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

// k remainder. Reached when kc is below 16 bytes, or after the epilogue when
// kc is not a multiple of 16 bytes.
4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        // Same register layout as the main loop: 2 floats per A row and 4 B
        // vectors (v16, v17 for the first k step, v18, v19 for the second).
        LDR d0,  [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 3b

5:
        # Remainder- 1 floats of A (4 bytes)
        // The odd rows are loaded into lane 2 so the s[0] / s[2] lane indexing
        // used by the wider paths still applies.
        LDR s0,  [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 3b

        # Store odd width
        // x1 has already been reduced by 8 here; that leaves its low 3 bits
        // unchanged, so bits 2, 1 and 0 select the 4-, 2- and 1-float pieces.
        // After each piece the not-yet-stored values are shifted down into the
        // low part of the first register of every row.
6:
        TBZ x1, 2, 7f
        $if INC:
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b

7:
        TBZ x1, 1, 8f
        $if INC:
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]

8:
        TBZ x1, 0, 9f
        $if INC:
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif