// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// f32 indirect-GEMM (IGEMM) micro-kernel with min/max clamping.
//
// Each pass of the outer loop produces a tile of up to 6 rows x 8 columns of
// f32 output: the accumulators are seeded from 8 floats read from w, products
// of A and the packed B values that follow in w are accumulated with FMLA,
// the result is clamped with FMAX/FMIN and stored through six row pointers.
// The function name indicates AArch64 NEON FMA code tuned for Cortex-A53.
//
// Control flow (numeric local labels):
//   0:  start of one 8-column block: load bias into v20-v31, p = ks
//   1:  load the next 6 A pointers from the indirection buffer
//   2:  main loop, 4 floats of k per iteration (software pipelined)
//   3:  epilogue, finishes the last 4 pipelined floats of k
//   5:  k remainder of 2 floats;  6: k remainder of 1 float
//   4:  ks loop tail, then clamp and store the full 6x8 tile
//   7:/8:/9:  partial-width store of 4 / 2 / 1 columns;  10: return
//
// Arguments and the registers they live in:

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8

// Register reuse visible in the code below: x0 holds mr only while the C
// pointers are clamped, then becomes the running k byte counter, and finally
// holds cn_stride for the stores. x9 is the running copy of ks.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
# x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
# x7 c5

# x19 temporary vector shadow register

// In the main loop and epilogue the upper 64 bits of a vector are loaded into
// x19 with a scalar LDR and moved into the vector with INS, so every load in
// those sections is at most 64 bits wide.
// NOTE(review): presumably this lets loads issue alongside FMLA on
// Cortex-A53 - confirm against the core's software optimization guide.

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]
# A4   v2     v5
# A5   v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Clamp C pointers
        // Rows at or beyond mr get the pointer of the previous row, so every
        // later store goes to a row pointer derived from c0.
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x13, x10, x13, LS   //   c4 = c3


        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7 no longer holds cm_stride)
        CSEL x7, x13, x7, LO     //   c5 = c4

        // The stack arguments are read before sp is moved by the register
        // save below, so the offsets are the ones listed in the signature.
        # Load a_offset
        LDR x11, [sp, 8]

        # Load zero, params pointer
        LDP x12, x8, [sp, 16]

        # Load min/max values
        // LD2R reads two consecutive floats at x8 and broadcasts the first to
        // every lane of v6 and the second to every lane of v7. v6 is used
        // with FMAX (lower bound) and v7 with FMIN (upper bound) below.
        LD2R {v6.4s, v7.4s}, [x8]

        // Save x19-x23, d12-d15 on stack (80-byte frame; the incoming stack
        // arguments are now 80 bytes further from sp)
        STP d12, d13, [sp, -80]!
        STP d14, d15, [sp, 16]
        STP x19, x20, [sp, 32]
        STP x21, x22, [sp, 48]
        STR x23, [sp, 64]

0:
        # Load initial bias from w into accumulators
        // The 8 bias floats in q20/q21 are copied to the accumulators of all
        // six rows; the MOVs are interleaved with prefetches of w.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3  // p = ks

1:
        # Load next 6 A pointers
        // Three post-indexed pair loads advance a (x4) by 48 bytes.
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        // A pointer equal to `zero` is used as is; any other pointer is
        // offset by a_offset.
        CMP x14, x12            // if a0 == zero
        ADD x14, x14, x11       // a0 += a_offset
        CSEL x14, x12, x14, EQ  //   a0 = (a0 == zero) ? zero : a0 + a_offset
        CMP x15, x12            // if a1 == zero
        ADD x15, x15, x11       // a1 += a_offset
        CSEL x15, x12, x15, EQ  //   a1 = (a1 == zero) ? zero : a1 + a_offset
        CMP x20, x12            // if a2 == zero
        ADD x20, x20, x11       // a2 += a_offset
        CSEL x20, x12, x20, EQ  //   a2 = (a2 == zero) ? zero : a2 + a_offset
        CMP x21, x12            // if a3 == zero
        ADD x21, x21, x11       // a3 += a_offset
        CSEL x21, x12, x21, EQ  //   a3 = (a3 == zero) ? zero : a3 + a_offset
        CMP x22, x12            // if a4 == zero
        ADD x22, x22, x11       // a4 += a_offset
        CSEL x22, x12, x22, EQ  //   a4 = (a4 == zero) ? zero : a4 + a_offset
        CMP x23, x12            // if a5 == zero
        ADD x23, x23, x11       // a5 += a_offset
        CSEL x23, x12, x23, EQ  //   a5 = (a5 == zero) ? zero : a5 + a_offset

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f          // kc < 16 bytes: only the remainder code runs

        # Prologue - First group loads, no FMA
        // Each of v0/v1/v2 holds 2 floats of one row in its low half and
        // 2 floats of the next row in its high half.
        LDR d0, [x14], 8              // a0
        LDP q16, q17, [x5], 32        // b
        LDR d1, [x20], 8              // a2
        LDR d2, [x22], 8              // a4
        LD1 {v0.d}[1], [x15], 8       // a1
        LD1 {v1.d}[1], [x21], 8       // a3
        LD1 {v2.d}[1], [x23], 8       // a5
        SUBS x0, x0, 16               // k -= 16; flags are consumed by B.LO 3f below
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x19, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 3f                       // the loads above do not modify the flags

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Software pipelined: each half computes with the registers loaded by
        // the other half while loading the registers the other half will use.
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x14], 8              // a0
        INS v19.d[1], x19             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x15], 8             // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]              // b
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]

        // BLOCK 2
        LDR d4, [x20], 8              // a2
        INS v12.d[1], x19             // b ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x19, [x21], 8             // a3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]

        // BLOCK 3
        LDR d5, [x22], 8              // a4
        INS v4.d[1], x19              // a3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x19, [x23], 8             // a5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x19              // a5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x19, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x19             // b
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x19, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x19             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS v15.d[1], x19
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x14], 8              // a0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x19, [x15], 8             // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PLDL1KEEP, [x14, 128]    // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x19              // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x19, [x5, 72]             // b
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]
        PRFM PLDL1KEEP, [x15, 128]    // Prefetch A1

        // BLOCK 2
        LDR d1, [x20], 8              // a2
        INS v16.d[1], x19             // b
        FMLA v21.4s, v13.4s, v3.s[0]
        LDR x19, [x21], 8             // a3
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]
        PRFM PLDL1KEEP, [x20, 128]    // Prefetch A2

        // BLOCK 3
        LDR d2, [x22], 8              // a4
        INS v1.d[1], x19              // a3 ins
        FMLA v27.4s, v13.4s, v4.s[2]
        LDR x19, [x23], 8             // a5
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]
        PRFM PLDL1KEEP, [x21, 128]    // Prefetch A3

        // BLOCK 4
        LDR d17, [x5, 80]
        INS v2.d[1], x19              // a5 ins
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x19, [x5, 88]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x22, 128]    // Prefetch A4

        // BLOCK 5
        LDR d18, [x5, 96]
        INS v17.d[1], x19             // b
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x19, [x5, 104]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]
        PRFM PLDL1KEEP, [x23, 128]    // Prefetch A5

        // BLOCK 6
        LDR d19, [x5, 112]
        INS v18.d[1], x19             // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x19, [x5, 120]            // upper half of v19; inserted in BLOCK 0 of the next pass or of the epilogue
        FMLA v23.4s, v15.4s, v3.s[3]
        PRFM PLDL1KEEP, [x5, 192]     // Prefetch B
        FMLA v25.4s, v15.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]     // Prefetch B

        // BLOCK 7
        SUBS x0, x0, 16               // k -= 16; LDR lands here
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        ADD x5, x5, 128               // consume the 128 bytes of B read at offsets 0..120 this pass
        FMLA v31.4s, v15.4s, v5.s[3]
        B.HS 2b                       // flags from the SUBS above (FMLA/ADD leave them unchanged)

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Same first group as the main loop, plus prefetches of the six C
        // rows for the upcoming stores. The second group has no loads.
3:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x14], 8              // a0
        INS v19.d[1], x19             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x19, [x15], 8             // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        PRFM PSTL1KEEP, [x6]          // Prefetch C0

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x19              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x19, [x5, 8]              // b
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        PRFM PSTL1KEEP, [x16]         // Prefetch C1

        // BLOCK 2
        LDR d4, [x20], 8              // a2
        INS v12.d[1], x19             // b ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x19, [x21], 8             // a3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        PRFM PSTL1KEEP, [x17]         // Prefetch C2

        // BLOCK 3
        LDR d5, [x22], 8              // a4
        INS v4.d[1], x19              // a3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x19, [x23], 8             // a5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        PRFM PSTL1KEEP, [x10]         // Prefetch C3

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x19              // a5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x19, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        PRFM PSTL1KEEP, [x13]         // Prefetch C4

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x19             // b
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x19, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        PRFM PSTL1KEEP, [x7]          // Prefetch C5

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x19             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x19, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS v15.d[1], x19             // b from previous
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        TST x0, 15                    // any of the low 4 bits of k set? consumed by B.NE 5f below

        // BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        FMLA v31.4s, v15.4s, v5.s[3]
        ADD x5, x5, 64                // consume the 64 bytes of B read at offsets 0..56

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 48  // p -= MR * sizeof(void*)  (6 pointers of 8 bytes)
        B.HI 1b

        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // cn_stride was at [sp] on entry; the 80-byte save frame moved it to
        // [sp, 80]. x0 is free here because the k loop has finished.
        LDR x0, [sp, 80]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8                // nc -= 8; flags feed both B.LO 7f and B.HI 0b below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                       // fewer than 8 columns were left: partial store

        // Rows are written from c5 down to c0. When mr < 6 the clamped
        // pointers alias a lower row, so that row's own store is the last
        // write to the shared address. Each pointer then advances by
        // cn_stride.
        STP q30, q31, [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21, [x6]
        ADD x6, x6, x0

        SUB x4, x4, x3  // a -= ks (rewind the pointer list for the next column block)

        # nc loop
        B.HI 0b                       // flags still from SUBS x1, x1, 8: columns remain

        // Restore x19-x23, d12-d15 from stack
        LDR x23, [sp, 64]
        LDP x21, x22, [sp, 48]
        LDP x19, x20, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 80
        RET

5:
        // k (x0) has only had multiples of 16 subtracted from kc, so bits 3
        // and 2 tested below are the same as in kc.
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x14], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x15], 8
        LDR d1, [x20], 8
        LD1 {v1.d}[1], [x21], 8
        LDR d2, [x22], 8
        LD1 {v2.d}[1], [x23], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 floats of A (4 bytes)
        // The odd rows are loaded into lane 2 so the FMLA lane indices
        // ([0] for even rows, [2] for odd rows) match the code above.
        LDR s0, [x14], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x15], 4
        LDR s1, [x20], 4
        LD1 {v1.s}[2], [x21], 4
        LDR s2, [x22], 4
        LD1 {v2.s}[2], [x23], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        B 4b

        # Store odd width
        // x1 is now nc - 8; subtracting 8 leaves bits 2..0 unchanged, so they
        // still select 4, 2 and 1 remaining columns. After each partial
        // store the not-yet-written lanes are moved down to the low end of
        // the even-numbered accumulator.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        // Restore x19-x23, d12-d15 from stack
        LDR x23, [sp, 64]
        LDP x21, x22, [sp, 48]
        LDP x19, x20, [sp, 32]
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 80
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif