// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes a 4 row x 8 column tile of C per pass over kc bytes of A,
# clamps the tile against the two values loaded from params, and stores
# it; repeats while nc columns remain.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register (reserved on Apple and Windows targets)
# and is deliberately left untouched by this kernel.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride (cm_stride is dead once c3 has been derived)

# x4 temporary vector shadow register (a_stride is dead after pointer setup)

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# Unused general registers: x12 x13
# Unused vector registers:  v2 v5 v8 v9 v10 v11 v28 v29 v30 v31

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        # Rows at index >= mr alias the previous row, so loads and stores
        # for them stay inside valid memory.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x7, x17, x7          // c3 = c2 + cm_stride (last use of cm_stride)
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x7, x17, x7, LO     //   c3 = c2

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        // Save d12-d15 on stack
        // Stack arguments above were read before sp is moved.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          PRFM PLDL1KEEP, [x3,  0]   // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          PRFM PLDL1KEEP, [x9,  0]
          PRFM PLDL1KEEP, [x9, 64]
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          PRFM PLDL1KEEP, [x5,  64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x3,  0]   // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x9,  0]
          PRFM PLDL1KEEP, [x9, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x5,  64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LD1 {v0.d}[1], [x9], 8       // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
1:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4             // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x3], 8              // a0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]    // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x4              // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x4, [x5, 72]             // b
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        PRFM PLDL1KEEP, [x9, 128]    // Prefetch A1

        // BLOCK 2
        LDR d1, [x10], 8             // a2
        INS v16.d[1], x4             // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]
        PRFM PLDL1KEEP, [x10, 128]   // Prefetch A2

        // BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x4              // a3 ins
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR x4, [x5, 88]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        PRFM PLDL1KEEP, [x11, 128]   // Prefetch A3

        // BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x4             // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x4, [x5, 104]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]    // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x4
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR x4, [x5, 120]
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]    // Prefetch B
        ADD x5, x5, 128
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v20.4s, v18.4s, v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR x4, [x5, 56]
        NOP                          // fma
        NOP
        NOP                          // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        TST x0, 15                   // flags consumed by B.NE below; FMLA/ADD leave them intact

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64

        // BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8               // nc -= 8; flags consumed by B.LO / B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 8f

        $if INC:
          ST1 {v26.16b, v27.16b}, [x7], x14
          SUB x3, x3, x2             // a0 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x9, x9, x2             // a1 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x10, x10, x2           // a2 -= kc
          ST1 {v20.16b, v21.16b}, [x6], x14
          SUB x11, x11, x2           // a3 -= kc
        $else:
          ST1 {v20.16b, v21.16b}, [x6], x14
          SUB x3, x3, x2             // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x9, x9, x2             // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x10, x10, x2           // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x7], x14
          SUB x11, x11, x2           // a3 -= kc

        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

6:
        # Remainder- 1 floats of A (4 bytes)
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 4b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select stores of 4, 2 and 1 floats per row.
8:
        TBZ x1, 2, 9f
        $if INC:
          STR q26, [x7], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x7], 16
          MOV v26.16b, v27.16b

9:
        TBZ x1, 1, 10f
        $if INC:
          STR d26, [x7], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x7], 8
          DUP d26, v26.d[1]

10:
        TBZ x1, 0, 11f
        $if INC:
          STR s26, [x7]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif