// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/4x8-aarch64-neonfma-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

// Overview (derived from the code below):
//   For each group of up to 8 output columns the kernel loads a 4x8 tile of
//   initial accumulators from acc (x15, advanced by 128 bytes per tile),
//   accumulates A * B into it with FMLA while consuming kc bytes from each A
//   row, clamps every accumulator with FMAX against v6 and FMIN against v7,
//   and stores the tile through the four C row pointers.
//
//   Register reuse to be aware of:
//     x0  holds mr only while the row pointers are set up, is then the
//         remaining-k byte counter, and finally holds cn_stride (reloaded
//         from the stack at label 3).
//     x4  holds a_stride only while the row pointers are set up; afterwards
//         it is a scratch register carrying the upper 64 bits of a vector
//         that is loaded in two 64-bit halves (LDR d / LDR x / INS).
//
//   Local labels:
//     0  start of one tile (outer loop over nc)
//     1  main loop, 4 floats of A per row per pass
//     2  epilogue, last 4 floats of A per row handled by the pipelined code
//     3  clamp and store
//     4  remainder of 2 floats of A per row
//     5  remainder of 1 float of A per row
//     6/7/8/9  partial-width store (4, 2, 1 columns) and return

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3

# x4 temporary vector shadow register

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]

# B    v12 v13 v14 v15 second set of B
# B    v16 v17 v18 v19 first set
# C    v20 v21
# C    v22 v23
# C    v24 v25
# C    v26 v27
# Clamp v6 v7

// NOTE(review): none of the registers below are used in the listed roles by
// this kernel (x7 is used, but as cm_stride); the list looks like it is
// carried over from the shared template - confirm before relying on it.
# unused A   v8 v9 v10 v11
# x12 a4
# x13 c4
# x7 c5
# A4   v2     v5
# A5   v2[1]  v5[1]
# C    v28 v29
# C    v30 v31

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

        # Load acc, params pointer
        // Read before sp is moved below, so the offsets are relative to the
        // caller's stack arguments: [sp] cn_stride, [sp+8] acc, [sp+16] params.
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so the kernel can always
        // process four rows; the aliased rows simply recompute and rewrite the
        // same data.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        // Still using the flags from CMP x0, 2 above (ADD does not set flags).
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        # Load min/max values
        // Two consecutive floats at params, each replicated to all four lanes:
        // v6 is applied with FMAX (lower bound), v7 with FMIN (upper bound).
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // v12-v15 hold the second set of B, so their callee-saved low halves
        // are spilled. After this push the caller's cn_stride is at [sp, 32].
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial accumulators
        // 4 rows x 8 floats = 128 bytes; x15 is post-incremented so the next
        // tile continues where this one stopped.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        PRFM PLDL1KEEP, [x3, 0]    // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        PRFM PLDL1KEEP, [x9, 0]
        PRFM PLDL1KEEP, [x9, 64]
        PRFM PLDL1KEEP, [x10, 0]
        PRFM PLDL1KEEP, [x10, 64]
        PRFM PLDL1KEEP, [x11, 0]
        PRFM PLDL1KEEP, [x11, 64]
        PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        // From here on x0 is the k counter in bytes, no longer mr.
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        // Loads the first 2 floats of every A row (rows 0/1 packed in v0,
        // rows 2/3 packed in v1) and the first 64 bytes of B. The top half of
        // v19 is left in x4 and inserted in BLOCK 0 of the loop or epilogue.
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LD1 {v0.d}[1], [x9], 8       // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x4, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        // Uses the flags of the SUBS above; the loads in between leave the
        // flags untouched.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // B is fetched as sixteen 64-bit halves (eight 128-bit vectors); each
        // upper half goes through x4 and is merged with INS.
        // The order of instructions inside each BLOCK is significant: the
        // BLOCK 5 notes below describe LDR timing requirements, so do not
        // reorder or remove instructions (including the NOP) without
        // re-validating the schedule.
1:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x4             // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x4, [x9], 8              // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v21.4s, v17.4s, v0.s[0]
        INS v3.d[1], x4              // a1 ins
        FMLA v23.4s, v17.4s, v0.s[2]
        LDR x4, [x5, 8]              // b

        // BLOCK 2
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR d4, [x10], 8             // a2
        FMLA v27.4s, v17.4s, v1.s[2]
        INS v12.d[1], x4             // b ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x4, [x11], 8             // a3

        // BLOCK 3
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR d13, [x5, 16]
        FMLA v24.4s, v18.4s, v1.s[1]
        INS v4.d[1], x4              // a3 ins
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x4, [x5, 24]

        // BLOCK 4
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d14, [x5, 32]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v13.d[1], x4             // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x4, [x5, 40]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR d15, [x5, 48]
        NOP
        INS v14.d[1], x4             // b from previous
        // Sets the flags consumed by B.HS at the bottom of the loop; nothing
        // between here and that branch writes the flags.
        SUBS x0, x0, 16
        LDR x4, [x5, 56]

        # Second group of 16 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR d0, [x3], 8              // a0
        FMLA v22.4s, v12.4s, v3.s[2]
        INS v15.d[1], x4             // b from previous
        FMLA v24.4s, v12.4s, v4.s[0]
        LDR x4, [x9], 8              // a1

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR d16, [x5, 64]
        FMLA v21.4s, v13.4s, v3.s[0]
        INS v0.d[1], x4              // a1 ins
        FMLA v23.4s, v13.4s, v3.s[2]
        LDR x4, [x5, 72]             // b

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR d1, [x10], 8             // a2
        FMLA v27.4s, v13.4s, v4.s[2]
        INS v16.d[1], x4             // b
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x4, [x11], 8             // a3

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        LDR d17, [x5, 80]
        FMLA v24.4s, v14.4s, v4.s[1]
        INS v1.d[1], x4              // a3 ins
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x4, [x5, 88]

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR d18, [x5, 96]
        FMLA v23.4s, v15.4s, v3.s[3]
        INS v17.d[1], x4             // b
        FMLA v25.4s, v15.4s, v4.s[1]
        LDR x4, [x5, 104]

        // BLOCK 5
        // NOTE that block needs to be 4 cycles for LDR not to stall
        FMLA v27.4s, v15.4s, v4.s[3]
        LDR d19, [x5, 112]
        INS v18.d[1], x4
        // Upper half of v19; inserted by BLOCK 0 of the next pass or of the
        // epilogue.
        LDR x4, [x5, 120]
        // 128 bytes of B consumed per pass (offsets 0..120 above).
        ADD x5, x5, 128
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 32 FMA + 8 LD64 A + 8 LDR B
        // NOTE(review): the counts above repeat the main-loop header. As
        // written, the epilogue performs 32 FMLA but only four 64-bit loads of
        // A and eight 64-bit loads of B, because its second group does no
        // loads.
2:
        # First group of 16 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8              // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x4             // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x4, [x9], 8              // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v21.4s, v17.4s, v0.s[0]
        INS v3.d[1], x4              // a1 ins
        FMLA v23.4s, v17.4s, v0.s[2]
        LDR x4, [x5, 8]              // b

        // BLOCK 2
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR d4, [x10], 8             // a2
        FMLA v27.4s, v17.4s, v1.s[2]
        INS v12.d[1], x4             // b ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x4, [x11], 8             // a3

        // BLOCK 3
        FMLA v22.4s, v18.4s, v0.s[3]
        LDR d13, [x5, 16]
        FMLA v24.4s, v18.4s, v1.s[1]
        INS v4.d[1], x4              // a3 ins
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x4, [x5, 24]

        // BLOCK 4
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d14, [x5, 32]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v13.d[1], x4             // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x4, [x5, 40]

        // BLOCK 5
        // NOPs to ensure 4 cycle LDR lands on next LDR
        FMLA v27.4s, v19.4s, v1.s[3]
        LDR d15, [x5, 48]
        NOP                          // fma
        INS v14.d[1], x4
        NOP
        LDR x4, [x5, 56]

        # Second group of 16 FMA, no loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        INS v15.d[1], x4             // b from previous
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]

        // BLOCK 2
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v20.4s, v14.4s, v3.s[1]

        // BLOCK 3
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v4.s[3]
        // Only multiples of 16 were subtracted from kc, so the low four bits
        // of x0 still equal those of kc: non-zero means 1-3 floats of A per
        // row are left. The flags are consumed by B.NE below.
        TST x0, 15

        // BLOCK 4
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        // Skip the 64 bytes of B read with immediate offsets in this epilogue.
        ADD x5, x5, 64

        // BLOCK 5
        FMLA v27.4s, v15.4s, v4.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f

3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // [sp, 32] is the caller's [sp] because 32 bytes were pushed for
        // d12-d15. x0 is no longer needed as the k counter at this point.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        // nc -= 8. These flags drive both B.LO 6f (fewer than 8 columns were
        // left) and B.HI 0b (columns remain after this tile); no instruction
        // in between modifies the flags.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s

        # Store full 4 x 8
        B.LO 6f

        // Each store advances its C row pointer by cn_stride (x0); each A row
        // pointer is rewound by kc so the next tile re-reads the same rows.
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x3, x3, x2               // a0 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x9, x9, x2               // a1 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x10, x10, x2             // a2 -= kc
        ST1 {v20.16b, v21.16b}, [x6], x0
        SUB x11, x11, x2             // a3 -= kc

        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

        // Reached either directly from the kc < 16 check at the top of the
        // tile or from the epilogue when kc is not a multiple of 16 bytes.
        // In both cases bits 3 and 2 of x0 match bits 3 and 2 of kc.
4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 3b

        // When entered from the TBZ at label 4 (bit 3 clear) this path runs
        // without testing bit 2.
        // NOTE(review): that presumes kc is a non-zero multiple of 4 bytes -
        // confirm against the caller's contract.
5:
        # Remainder- 1 floats of A (4 bytes)
        // Lane 0 holds the even row and lane 2 the odd row, matching the
        // s[0] / s[2] selectors used by the FMLAs below.
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        B 3b

        # Store odd width
        // x1 is (remaining nc - 8) here; subtracting 8 leaves bits 2..0
        // unchanged, so they encode the 4 + 2 + 1 column decomposition of the
        // final partial tile. After each partial store the not-yet-stored
        // columns are shifted down into the low part of the register. This
        // path always ends the function, so cn_stride is not applied.
6:
        TBZ x1, 2, 7f
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

7:
        TBZ x1, 1, 8f
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

8:
        TBZ x1, 0, 9f
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55

// On ELF targets emit an empty .note.GNU-stack section (conventionally used
// to mark the object as not requiring an executable stack).
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif