// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

// Summary of what the code below does:
//   For each group of 8 output columns, the 6x8 accumulator tile v20-v31 is
//   initialised from the first 8 floats at w (the bias), then FMLA-accumulates
//   products of A elements (read through a0-a5) with 8-float rows of B read
//   from w, consuming kc bytes of each A row. The tile is clamped with
//   FMAX against v6 and FMIN against v7 and stored through c0-c5.
//   After a full 8-column store the C pointers advance by cn_stride and the
//   A pointers are rewound by kc; the loop repeats while nc > 8 columns
//   remained before the store. A final partial tile (nc & 7 columns) is
//   stored in 4/2/1-float pieces.
//
// Register reuse to be aware of:
//   x0: mr on entry; then the remaining-k byte counter (kc - 16, stepping
//       by 16); then cn_stride while storing.
//   x8: params pointer until LD2R has consumed it; afterwards a general
//       purpose staging register for the upper 64 bits of 128-bit vectors
//       (LDR x8, [...] followed later by INS vN.d[1], x8).
//   x4: a_stride on entry; becomes the a5 pointer.
//   x7: cm_stride on entry; becomes the c5 pointer.
//
// The remainder paths test bits 3 and 2 of the k counter only, so kc is
// presumably always a non-zero multiple of 4 bytes (one float) - this is not
// checked here; confirm against the caller contract.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# x8 temporary vector shadow register

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]
# A4   v2     v5
# A5   v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

        # Load params pointer
        LDR x8, [sp, 8]

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so the kernel always
        // reads and writes through 6 valid pointers. ADD does not set flags,
        // so each CMP feeds both the LO pair and the following LS pair.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        // LD2R broadcasts the first float at params into v6 and the second
        // into v7; v6 is later used as the lower bound (FMAX) and v7 as the
        // upper bound (FMIN). x8 is free for reuse after this load.
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // (v12-v15 hold the second set of B below.) This moves sp down by
        // 32 bytes, so the stack argument cn_stride is at [sp, 32] from here.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial bias from w into accumulators
        // The two bias vectors are copied into all 6 rows of the tile. The
        // flags from SUBS are consumed by B.LO 4f below; the MOV and PRFM
        // instructions in between do not modify them.
        LDP q20, q21, [x5], 32
        SUBS x0, x2, 16         // k = kc - 16
        PRFM PLDL1KEEP, [x3, 0]  // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x9, 0]
        PRFM PLDL1KEEP, [x9, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x10, 0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x11, 0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x12, 0]
        PRFM PLDL1KEEP, [x12, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x4, 0]
        PRFM PLDL1KEEP, [x4, 64]
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 320]

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        B.LO 4f

        # Prologue - First group loads, no FMA
        // Loads 2 floats per A row (two rows packed per vector) and the
        // first set of B (v16-v19). The upper half of v19 is left in x8 and
        // inserted in BLOCK 0 of the main loop or of the epilogue.
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LDR d2, [x12], 8             // a4
        LD1 {v0.d}[1], [x9], 8       // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        LD1 {v2.d}[1], [x4], 8       // a5
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x8, [x5], 8              // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        // Scheduling note for the main loop and the epilogue: every 128-bit
        // operand is assembled from a 64-bit LDR into the d register plus a
        // 64-bit LDR into x8 and a later INS, interleaved with FMLAs.
        // NOTE(review): this is presumably tuned for the Cortex-A55 pipeline
        // named in the kernel title; the file itself does not state the
        // rationale. Do not reorder instructions without re-checking the x8
        // producer/consumer pairs.

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x8             // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x8, [x9], 8              // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s, v2.s[0]
        INS v3.d[1], x8              // a1 ins
        FMLA v30.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 8]              // b

        // BLOCK 2
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR d4, [x10], 8             // a2
        FMLA v23.4s, v17.4s, v0.s[2]
        INS v12.d[1], x8             // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x8, [x11], 8             // a3

        // BLOCK 3
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR d5, [x12], 8             // a4
        FMLA v29.4s, v17.4s, v2.s[0]
        INS v4.d[1], x8              // a3 ins
        FMLA v31.4s, v17.4s, v2.s[2]
        LDR x8, [x4], 8              // a5

        // BLOCK 4
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s, v0.s[3]
        INS v5.d[1], x8              // a5 ins
        FMLA v24.4s, v18.4s, v1.s[1]
        LDR x8, [x5, 24]

        // BLOCK 5
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s, v2.s[1]
        INS v13.d[1], x8             // b
        FMLA v30.4s, v18.4s, v2.s[3]
        LDR x8, [x5, 40]

        // BLOCK 6
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d15, [x5, 48]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v14.d[1], x8             // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x8, [x5, 56]

        // BLOCK 7
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        INS v15.d[1], x8
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR d0, [x3], 8              // a0
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        LDR x8, [x9], 8              // a1

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR d16, [x5, 64]
        FMLA v28.4s, v12.4s, v5.s[0]
        INS v0.d[1], x8              // a1 ins
        FMLA v30.4s, v12.4s, v5.s[2]
        LDR x8, [x5, 72]             // b

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        LDR d1, [x10], 8             // a2
        FMLA v23.4s, v13.4s, v3.s[2]
        INS v16.d[1], x8             // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x8, [x11], 8             // a3

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        LDR d2, [x12], 8             // a4
        FMLA v29.4s, v13.4s, v5.s[0]
        INS v1.d[1], x8              // a3 ins
        FMLA v31.4s, v13.4s, v5.s[2]
        LDR x8, [x4], 8              // a5

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR d17, [x5, 80]
        FMLA v22.4s, v14.4s, v3.s[3]
        INS v2.d[1], x8              // a5 ins
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR x8, [x5, 88]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR d18, [x5, 96]
        FMLA v28.4s, v14.4s, v5.s[1]
        INS v17.d[1], x8             // b
        FMLA v30.4s, v14.4s, v5.s[3]
        LDR x8, [x5, 104]

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR d19, [x5, 112]
        FMLA v23.4s, v15.4s, v3.s[3]
        INS v18.d[1], x8             // b
        FMLA v25.4s, v15.4s, v4.s[1]
        LDR x8, [x5, 120]            // upper half of v19; inserted in the next BLOCK 0

        // BLOCK 7
        // w advances by 128 bytes per iteration: 64 for the second set of B
        // (offsets 0-63) and 64 for the next first set (offsets 64-127).
        FMLA v27.4s, v15.4s, v4.s[3]
        SUBS x0, x0, 16
        FMLA v29.4s, v15.4s, v5.s[1]
        ADD x5, x5, 128
        FMLA v31.4s, v15.4s, v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Same first group as the main loop; the second group issues no
        // further A/B loads and prefetches the C rows for the store instead.
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x8             // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x8, [x9], 8              // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s, v2.s[0]
        INS v3.d[1], x8              // a1 ins
        FMLA v30.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 8]              // b

        // BLOCK 2
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR d4, [x10], 8             // a2
        FMLA v23.4s, v17.4s, v0.s[2]
        INS v12.d[1], x8             // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x8, [x11], 8             // a3

        // BLOCK 3
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR d5, [x12], 8             // a4
        FMLA v29.4s, v17.4s, v2.s[0]
        INS v4.d[1], x8              // a3 ins
        FMLA v31.4s, v17.4s, v2.s[2]
        LDR x8, [x4], 8              // a5

        // BLOCK 4
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s, v0.s[3]
        INS v5.d[1], x8              // a5 ins
        FMLA v24.4s, v18.4s, v1.s[1]
        LDR x8, [x5, 24]

        // BLOCK 5
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s, v2.s[1]
        INS v13.d[1], x8             // b
        FMLA v30.4s, v18.4s, v2.s[3]
        LDR x8, [x5, 40]

        // BLOCK 6
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d15, [x5, 48]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v14.d[1], x8             // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x8, [x5, 56]

        // BLOCK 7
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        INS v15.d[1], x8             // b
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        PRFM PSTL1KEEP, [x6]         // Prefetch C0
        FMLA v22.4s, v12.4s, v3.s[2]
        PRFM PSTL1KEEP, [x16]        // Prefetch C1
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PSTL1KEEP, [x17]        // Prefetch C2

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        PRFM PSTL1KEEP, [x14]        // Prefetch C3
        FMLA v28.4s, v12.4s, v5.s[0]
        PRFM PSTL1KEEP, [x13]        // Prefetch C4
        FMLA v30.4s, v12.4s, v5.s[2]
        PRFM PSTL1KEEP, [x7]         // Prefetch C5

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]
        TST x0, 15                   // flags consumed by B.NE 4f below

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64               // skip the second set of B read at [x5, 0..63]

        // BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        FMLA v31.4s, v15.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // cn_stride was at [sp] on entry; 32 bytes were pushed for d12-d15.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        // Flags from this SUBS drive both B.LO 6f (fewer than 8 columns
        // left) and B.HI 0b (more columns remain after this tile); the
        // FMIN, ST1 and SUB instructions in between do not modify flags.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 6f

        // Each C pointer is post-incremented by cn_stride (x0); each A
        // pointer is rewound by kc so the next column group re-reads A.
        ST1 {v20.16b, v21.16b},  [x6], x0
        SUB x3,  x3, x2             // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x9,  x9, x2             // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x10, x10, x2            // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x11, x11, x2            // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x0
        SUB x12, x12, x2            // a4 -= kc
        ST1 {v30.16b, v31.16b},  [x7], x0
        SUB x4,  x4, x2             // a5 -= kc

        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

4:
        // Remainder handling. x0 differs from kc by a multiple of 16, so
        // bits 3 and 2 of x0 equal bits 3 and 2 of kc.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0,  [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR d2, [x12], 8
        LD1 {v2.d}[1], [x4], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 3b
5:
        # Remainder- 1 floats of A (4 bytes)
        // Odd rows go into lane 2 so the same s[0]/s[2] lane selection as
        // the other paths applies.
        LDR s0,  [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR s2, [x12], 4
        LD1 {v2.s}[2], [x4], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        B 3b

        # Store odd width
        // x1 is (remaining nc) - 8 here; subtracting 8 leaves bits 2..0
        // unchanged, so they still encode the 4 / 2 / 1 column pieces.
        // After each piece the unsaved columns are shifted down into the
        // low lanes of the even-numbered accumulators.
6:
        TBZ x1, 2, 7f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b

7:
        TBZ x1, 1, 8f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30,  [x7], 8
        DUP d30, v30.d[1]

8:
        TBZ x1, 0, 9f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x14]
        STR s28, [x13]
        STR s30,  [x7]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif