// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Template for an f32 GEMM microkernel with min/max clamping that produces a
// tile of up to 6 rows by 8 columns of C per pass over kc.
// Lines that start with a dollar sign, and the dollar-brace expression inside
// the function name, are template directives and not assembly. INC selects the
// variant that takes its initial accumulators from acc; the other variant
// takes them from the start of w (the bias).
// NOTE(review): the BLOCK 0-7 interleaving of FMLA with 64-bit LDR and INS is
// presumably tuned for Cortex-A55 as the function name suggests - keep the
// instruction order intact when editing.

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel uses d12-d15 (saved and restored below) and none of x19-x30.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# x8 params pointer on entry, then temporary vector shadow register
#    (holds the upper 64 bits of a vector until INS moves them into place)

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]
# A4   v2     v5
# A5   v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        # Rows with index >= mr reuse the pointers of the previous row, because
        # the code below always reads six A rows and writes six C rows.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        # LD2R replicates the first float at params into v6 and the second into
        # v7; v6 is the FMAX operand and v7 the FMIN operand in the clamp below.
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // (sp moves down by 32, so the stack arguments are 32 bytes further away)
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

// Outer loop: one pass per block of 8 output columns
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          SUBS x0, x2, 16  // k = kc - 16
          PRFM PLDL1KEEP, [x3, 0]  // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          PRFM PLDL1KEEP, [x9, 0]
          PRFM PLDL1KEEP, [x9, 64]
          PRFM PLDL1KEEP, [x10, 0]
          PRFM PLDL1KEEP, [x10, 64]
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x5, 256]
          PRFM PLDL1KEEP, [x5, 320]
        $else:
          # Load initial bias from w into accumulators
          # The 8 bias floats in v20/v21 are copied to the other five rows.
          LDP q20, q21, [x5], 32
          SUBS x0, x2, 16  // k = kc - 16
          PRFM PLDL1KEEP, [x3, 0]  // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x9, 0]
          PRFM PLDL1KEEP, [x9, 64]
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x10, 0]
          PRFM PLDL1KEEP, [x10, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 256]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 320]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # (flags come from SUBS x0, x2, 16 above; PRFM and MOV leave them alone)
        B.LO 4f

        # Prologue - First group loads, no FMA
        LDR d0, [x3], 8               // a0
        LDP q16, q17, [x5], 32        // b
        LDR d1, [x10], 8              // a2
        LDR d2, [x12], 8              // a4
        LD1 {v0.d}[1], [x9], 8        // a1
        LD1 {v1.d}[1], [x11], 8       // a3
        LD1 {v2.d}[1], [x4], 8        // a5
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x8, [x5], 8               // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8               // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x8              // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x8, [x9], 8               // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s, v2.s[0]
        INS v3.d[1], x8               // a1 ins
        FMLA v30.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 8]               // b

        // BLOCK 2
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR d4, [x10], 8              // a2
        FMLA v23.4s, v17.4s, v0.s[2]
        INS v12.d[1], x8              // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x8, [x11], 8              // a3

        // BLOCK 3
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR d5, [x12], 8              // a4
        FMLA v29.4s, v17.4s, v2.s[0]
        INS v4.d[1], x8               // a3 ins
        FMLA v31.4s, v17.4s, v2.s[2]
        LDR x8, [x4], 8               // a5

        // BLOCK 4
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s, v0.s[3]
        INS v5.d[1], x8               // a5 ins
        FMLA v24.4s, v18.4s, v1.s[1]
        LDR x8, [x5, 24]

        // BLOCK 5
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s, v2.s[1]
        INS v13.d[1], x8              // b
        FMLA v30.4s, v18.4s, v2.s[3]
        LDR x8, [x5, 40]

        // BLOCK 6
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d15, [x5, 48]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v14.d[1], x8              // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x8, [x5, 56]

        // BLOCK 7
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        INS v15.d[1], x8              // b
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR d0, [x3], 8               // a0
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        LDR x8, [x9], 8               // a1

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR d16, [x5, 64]
        FMLA v28.4s, v12.4s, v5.s[0]
        INS v0.d[1], x8               // a1 ins
        FMLA v30.4s, v12.4s, v5.s[2]
        LDR x8, [x5, 72]              // b

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        LDR d1, [x10], 8              // a2
        FMLA v23.4s, v13.4s, v3.s[2]
        INS v16.d[1], x8              // b
        FMLA v25.4s, v13.4s, v4.s[0]
        LDR x8, [x11], 8              // a3

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        LDR d2, [x12], 8              // a4
        FMLA v29.4s, v13.4s, v5.s[0]
        INS v1.d[1], x8               // a3 ins
        FMLA v31.4s, v13.4s, v5.s[2]
        LDR x8, [x4], 8               // a5

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR d17, [x5, 80]
        FMLA v22.4s, v14.4s, v3.s[3]
        INS v2.d[1], x8               // a5 ins
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR x8, [x5, 88]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR d18, [x5, 96]
        FMLA v28.4s, v14.4s, v5.s[1]
        INS v17.d[1], x8              // b
        FMLA v30.4s, v14.4s, v5.s[3]
        LDR x8, [x5, 104]

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR d19, [x5, 112]
        FMLA v23.4s, v15.4s, v3.s[3]
        INS v18.d[1], x8              // b
        FMLA v25.4s, v15.4s, v4.s[1]
        LDR x8, [x5, 120]             // upper half of v19, inserted in BLOCK 0

        // BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        SUBS x0, x0, 16
        FMLA v29.4s, v15.4s, v5.s[1]
        ADD x5, x5, 128               // both sets of B consumed (2 x 64 bytes)
        FMLA v31.4s, v15.4s, v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR d3, [x3], 8               // a0
        FMLA v22.4s, v16.4s, v0.s[2]
        INS v19.d[1], x8              // b from second group
        FMLA v24.4s, v16.4s, v1.s[0]
        LDR x8, [x9], 8               // a1

        // BLOCK 1
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR d12, [x5]
        FMLA v28.4s, v16.4s, v2.s[0]
        INS v3.d[1], x8               // a1 ins
        FMLA v30.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 8]               // b

        // BLOCK 2
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR d4, [x10], 8              // a2
        FMLA v23.4s, v17.4s, v0.s[2]
        INS v12.d[1], x8              // b ins
        FMLA v25.4s, v17.4s, v1.s[0]
        LDR x8, [x11], 8              // a3

        // BLOCK 3
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR d5, [x12], 8              // a4
        FMLA v29.4s, v17.4s, v2.s[0]
        INS v4.d[1], x8               // a3 ins
        FMLA v31.4s, v17.4s, v2.s[2]
        LDR x8, [x4], 8               // a5

        // BLOCK 4
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR d13, [x5, 16]
        FMLA v22.4s, v18.4s, v0.s[3]
        INS v5.d[1], x8               // a5 ins
        FMLA v24.4s, v18.4s, v1.s[1]
        LDR x8, [x5, 24]

        // BLOCK 5
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR d14, [x5, 32]
        FMLA v28.4s, v18.4s, v2.s[1]
        INS v13.d[1], x8              // b
        FMLA v30.4s, v18.4s, v2.s[3]
        LDR x8, [x5, 40]

        // BLOCK 6
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR d15, [x5, 48]
        FMLA v23.4s, v19.4s, v0.s[3]
        INS v14.d[1], x8              // b
        FMLA v25.4s, v19.4s, v1.s[1]
        LDR x8, [x5, 56]

        // BLOCK 7
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        INS v15.d[1], x8              // b
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, no further loads; the C rows are prefetched
        # for the stores that follow
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        PRFM PSTL1KEEP, [x6]          // Prefetch C0
        FMLA v22.4s, v12.4s, v3.s[2]
        PRFM PSTL1KEEP, [x16]         // Prefetch C1
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PSTL1KEEP, [x17]         // Prefetch C2

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        PRFM PSTL1KEEP, [x14]         // Prefetch C3
        FMLA v28.4s, v12.4s, v5.s[0]
        PRFM PSTL1KEEP, [x13]         // Prefetch C4
        FMLA v30.4s, v12.4s, v5.s[2]
        PRFM PSTL1KEEP, [x7]          // Prefetch C5

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]
        TST x0, 15                    // nonzero when kc is not a multiple of 16 bytes

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        ADD x5, x5, 64                // second set of B consumed

        // BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        FMLA v31.4s, v15.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # (flags come from TST x0, 15 above; ADD and FMLA leave them alone)
        B.NE 4f
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # It was at [sp] on entry; the d12-d15 save area moved sp down by 32.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8                // nc -= 8; flags are used by B.LO and B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 6f                       // nc was below 8: store a partial tile

        # Each ST1 post-increments its C pointer by cn_stride (x0); each SUB
        # rewinds an A pointer by kc for the next block of columns.
        $if INC:
          ST1 {v30.16b, v31.16b}, [x7], x0
          SUB x3, x3, x2  // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x9, x9, x2  // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2  // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2  // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2  // a4 -= kc
          ST1 {v20.16b, v21.16b}, [x6], x0
          SUB x4, x4, x2  // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b}, [x6], x0
          SUB x3, x3, x2  // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x9, x9, x2  // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2  // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2  // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2  // a4 -= kc
          ST1 {v30.16b, v31.16b}, [x7], x0
          SUB x4, x4, x2  // a5 -= kc

        B.HI 0b                       // columns remain (nc still above zero)

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # x0 differs from kc by a multiple of 16 here, so bits 3 and 2 of x0
        # are bits 3 and 2 of kc.
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR d2, [x12], 8
        LD1 {v2.d}[1], [x4], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 3b
5:
        # Remainder- 1 float of A (4 bytes)
        # Odd rows go to lane 2 so the FMLA lane pattern matches the code above.
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR s2, [x12], 4
        LD1 {v2.s}[2], [x4], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        B 3b

        # Store odd width
        # x1 holds nc - 8 here; its low three bits equal those of nc, so
        # bits 2, 1 and 0 select 4, 2 and 1 remaining columns.
6:
        TBZ x1, 2, 7f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

7:
        TBZ x1, 1, 8f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

8:
        TBZ x1, 0, 9f
        $if INC:
          STR s30, [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30, [x7]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif