// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): this file is generated, so the register change described
// below (c3 held in x7 instead of x18) presumably has to be mirrored in the
// template named above or it will be lost on regeneration - confirm.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

// Overview
//   Computes up to 4 rows by 8 columns of float output per pass of the outer
//   loop (label 0). The eight accumulators v20-v27 start as a copy of the
//   first 8 floats read from w, are updated with FMLA from A (4 rows) and the
//   following B data in w, are clamped with FMIN against v6 and FMAX against
//   v7, and are then stored through c0-c3.
//
//   mr   selects how many distinct rows exist: row pointers beyond mr are
//        aliased onto the previous row, so all four rows are always computed
//        and stored.
//   nc   is decremented by 8 per pass; a final pass with fewer than 8 columns
//        left stores 4, 2 and/or 1 floats per row according to the low three
//        bits of nc.
//   kc   is a byte count (the A pointers are rewound by kc after every pass).
//        The remainder code only tests bits 3 and 2 of k, so kc is presumably
//        a nonzero multiple of 4 bytes - TODO confirm against the caller.
//   w    is consumed sequentially: 8 floats for the accumulators, then the B
//        data for that pass.
//   params supplies two floats, replicated by LD2R into v6 (upper bound) and
//        v7 (lower bound).
//
//   The function is a leaf. It saves and restores d12-d15, the only
//   callee-saved registers it writes.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x7  c3 / cm_stride

// x18 is deliberately not used: AAPCS64 makes it the platform register and
// some platforms (for example Apple and Windows on ARM64) reserve it. The
// cm_stride value in x7 is dead once c3 has been computed, so x7 is reused
// as the c3 pointer.

# x4 temporary vector shadow register

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]

# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# Clamp v6 v7

# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
# A4  v2           v5
# A5  v2[1]        v5[1]
# C   v28 v29
# C   v30 v31

BEGIN_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x7, x17, x7          // c3 = c2 + cm_stride (last use of cm_stride)
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x7, x17, x7, LO     //   c3 = c2

        # Load params pointer
        // Stack arguments are read before sp is moved by the save below.
        LDR x8, [sp, 8]

        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        // Save d12-d15 on stack
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x3,  0]  // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x9,  0]
        PRFM PLDL1KEEP, [x9, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x10,  0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x11,  0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5,  64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LD1 {v0.d}[1],  [x9], 8      // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
1:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4             // b from previous
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP
        NOP
        NOP
        NOP

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x3], 8              // a0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]    // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x4              // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR x4, [x5, 72]             // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        PRFM PLDL1KEEP, [x9, 128]    // Prefetch A1

        // BLOCK 2
        LDR d1, [x10], 8             // a2
        INS v16.d[1], x4             // b
        FMLA v25.4s, v13.4s,  v4.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]
        PRFM PLDL1KEEP, [x10, 128]   // Prefetch A2

        // BLOCK 3
        LDR d17, [x5, 80]
        INS v1.d[1], x4              // a3 ins
        FMLA v22.4s, v14.4s,  v3.s[3]
        LDR x4, [x5, 88]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        PRFM PLDL1KEEP, [x11, 128]   // Prefetch A3

        // BLOCK 4
        LDR d18, [x5, 96]
        INS v17.d[1], x4             // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR x4, [x5, 104]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 192]    // Prefetch B

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        LDR d19, [x5, 112]
        INS v18.d[1], x4
        FMLA v27.4s, v15.4s,  v4.s[3]
        LDR x4, [x5, 120]
        SUBS x0, x0, 16
        PRFM PLDL1KEEP, [x5, 256]    // Prefetch B
        ADD x5, x5, 128
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x4             // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR x4, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x4              // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR x4, [x5, 8]              // b
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x4             // b  ins
        FMLA v25.4s, v17.4s,  v1.s[0]
        LDR x4, [x11], 8             // a3
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v20.4s, v18.4s,  v0.s[1]

        // BLOCK 3
        LDR d13, [x5, 16]
        INS v4.d[1], x4              // a3 ins
        FMLA v22.4s, v18.4s,  v0.s[3]
        LDR x4, [x5, 24]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]

        // BLOCK 4
        LDR d14, [x5, 32]
        INS v13.d[1], x4             // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR x4, [x5, 40]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        LDR d15, [x5, 48]
        INS v14.d[1], x4
        FMLA v27.4s, v19.4s,  v1.s[3]
        LDR x4, [x5, 56]
        NOP  // fma
        NOP
        NOP  // fma
        NOP

        # Second group of 16 FMA, no loads
        // BLOCK 0
        INS v15.d[1], x4             // b from previous
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s,  v4.s[0]
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v20.4s, v14.4s,  v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        FMLA v26.4s, v14.4s,  v4.s[3]
        TST x0, 15                   // flags consumed by B.NE below; FMLA and ADD leave them intact

        // BLOCK 4
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        ADD x5, x5, 64

        // BLOCK 5
        FMLA v27.4s, v15.4s,  v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8               // nc -= 8; flags consumed by B.LO and B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 8f

        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v26.16b, v27.16b},  [x7], x14
        SUB x11, x11, x2 // a3 -= kc

        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0,  [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

6:
        # Remainder- 1 floats of A (4 bytes)
        LDR s0,  [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        B 4b

        # Store odd width
        // x1 holds nc - 8 here; its low three bits select 4, 2 and 1 floats.
8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x7], 16
        MOV v26.16b, v27.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x7], 8
        DUP d26, v26.d[1]

10:
        TBZ x1, 0, 11f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26,  [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif