// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// F32 GEMM microkernel producing a tile of up to 6 rows x 8 columns of C,
// clamped to [min, max], written for AArch64 NEON FMA.
//
// This file is a template: the INC conditionals select between a variant that
// starts from accumulators read through the acc pointer and a variant that
// starts from the bias read from w.
//
// Overall flow (labels are GNU as numeric local labels):
//   setup  clamp the A/C row pointers against mr, load min/max, save d12-d15
//   0:     start of one 8-column tile: initialise accumulators v20-v31
//   1:     main loop, 4 floats of A per row per iteration, software pipelined
//   2:     epilogue: drains the pipeline (last 4 floats, no further A loads)
//   4: 5:  K remainders of 2 floats and 1 float
//   3:     clamp, then store a full 8-wide tile and loop back to 0
//   6: 7: 8:  partial-width stores (4, 2, 1 columns) when fewer than 8 remain
//   9:     restore d12-d15 and return
//
// x0 holds mr on entry.  Once the row pointers are clamped it is reused as the
// K byte counter, and in the store path it is reloaded with cn_stride.

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# x8 temporary vector shadow register

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]
# A4   v2     v5
# A5   v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

        // Stack arguments are read before sp is moved by the d12-d15 save
        // below, so the offsets here are relative to the incoming sp.
        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so every later load and
        // store still uses a valid pointer.  Each CMP feeds two CSEL pairs:
        // LO for the row equal to the compared value minus one, LS for the
        // row equal to the compared value.  ADD does not change the flags.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        // From here on x4 and x7 no longer hold the strides: they become the
        // a5 and c5 row pointers.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        // Two consecutive floats at params are broadcast: the first to every
        // lane of v6 (used with FMAX below), the second to v7 (used with FMIN).
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // v12-v15 hold the second set of B and are callee-saved (low 64 bits).
        // This moves sp down by 32 bytes, so cn_stride is at [sp, 32] later.
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        $if INC:
          # Load initial accumulators
          // x15 advances by 6 * 32 bytes per tile and is not rewound.
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x3, 0]  // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          PRFM PLDL1KEEP, [x9, 0]
          PRFM PLDL1KEEP, [x9, 64]
          PRFM PLDL1KEEP, [x10, 0]
          PRFM PLDL1KEEP, [x10, 64]
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
        $else:
          # Load initial bias from w into accumulators
          // The 8 bias floats are loaded once into row 0 (v20, v21) and copied
          // to rows 1-5; the copies are interleaved with the prefetches.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x3, 0]  // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x9, 0]
          PRFM PLDL1KEEP, [x9, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x10, 0]
          PRFM PLDL1KEEP, [x10, 64]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        // x0 becomes the K byte counter.  Only multiples of 16 are ever
        // subtracted from it, so its low 4 bits always equal those of kc.
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

        # Prologue - First group loads, no FMA
        // Rows are packed two per vector: even rows in the low 64 bits, odd
        // rows in the high 64 bits (see the register usage table above).
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LDR d2, [x12], 8             // a4
        LD1 {v0.d}[1], [x9], 8       // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        LD1 {v2.d}[1], [x4], 8       // a5
        SUBS x0, x0, 16
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x8, [x5], 8              // ins is in BLOCK 0

        # Are there at least 4 floats (16 bytes) for main loop?
        // Consumes the flags of the SUBS above; the loads in between do not
        // change them.
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Every vector operand is assembled from 64-bit pieces: an LDR into
        // the low half (dN) and an LDR into x8 that a later INS moves into the
        // high half.  The instruction order is hand-interleaved; keep it as is.
        // NOTE(review): presumably chosen to suit Cortex-A53 issue rules -
        // confirm against the core documentation before rescheduling.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x8             // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x8              // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x8             // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]

        // BLOCK 3
        LDR d5, [x12], 8             // a4
        INS v4.d[1], x8              // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x8              // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x8             // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x8             // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x3], 8              // a0
        FMLA v20.4s, v12.4s,  v3.s[0]
        LDR x8, [x9], 8              // a1
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]    // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x8              // a1 ins
        FMLA v26.4s, v12.4s,  v4.s[2]
        LDR x8, [x5, 72]             // b
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x9, 128]    // Prefetch A1

        // BLOCK 2
        LDR d1, [x10], 8             // a2
        INS v16.d[1], x8             // b
        FMLA v21.4s, v13.4s,  v3.s[0]
        LDR x8, [x11], 8             // a3
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]
        PRFM PLDL1KEEP, [x10, 128]   // Prefetch A2

        // BLOCK 3
        LDR d2, [x12], 8             // a4
        INS v1.d[1], x8              // a3 ins
        FMLA v27.4s, v13.4s,  v4.s[2]
        LDR x8, [x4], 8              // a5
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]
        PRFM PLDL1KEEP, [x11, 128]   // Prefetch A3

        // BLOCK 4
        LDR d17, [x5, 80]
        INS v2.d[1], x8              // a5 ins
        FMLA v20.4s, v14.4s,  v3.s[1]
        LDR x8, [x5, 88]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x12, 128]   // Prefetch A4

        // BLOCK 5
        LDR d18, [x5, 96]
        INS v17.d[1], x8             // b
        FMLA v26.4s, v14.4s,  v4.s[3]
        LDR x8, [x5, 104]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]
        PRFM PLDL1KEEP, [x4, 128]    // Prefetch A5

        // BLOCK 6
        // The x8 loaded here is the high half of v19; it is inserted by the
        // INS in BLOCK 0 of the next iteration (or of the epilogue).
        LDR d19, [x5, 112]
        INS v18.d[1], x8             // b
        FMLA v21.4s, v15.4s,  v3.s[1]
        LDR x8, [x5, 120]
        FMLA v23.4s, v15.4s,  v3.s[3]
        PRFM PLDL1KEEP, [x5, 192]    // Prefetch B
        FMLA v25.4s, v15.4s,  v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]    // Prefetch B

        // BLOCK 7
        // The SUBS flags are consumed by B.HS; FMLA and ADD leave them intact.
        SUBS x0, x0, 16              // LDR lands here
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        ADD x5, x5, 128              // both B sets (2 x 64 bytes) consumed
        FMLA v31.4s, v15.4s,  v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Same as one main-loop iteration, except that the second group does
        // no loads for a following iteration and the C rows are prefetched
        // for store.
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x8             // b from second group
        FMLA v20.4s, v16.4s,  v0.s[0]
        LDR x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        PRFM PSTL1KEEP, [x6]         // Prefetch C0

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x8              // a1 ins
        FMLA v26.4s, v16.4s,  v1.s[2]
        LDR x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x16]        // Prefetch C1

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x8             // b  ins
        FMLA v21.4s, v17.4s,  v0.s[0]
        LDR x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        PRFM PSTL1KEEP, [x17]        // Prefetch C2

        // BLOCK 3
        LDR d5, [x12], 8             // a4
        INS v4.d[1], x8              // a3 ins
        FMLA v27.4s, v17.4s,  v1.s[2]
        LDR x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        PRFM PSTL1KEEP, [x14]        // Prefetch C3

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x8              // a5 ins
        FMLA v20.4s, v18.4s,  v0.s[1]
        LDR x8, [x5, 24]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        PRFM PSTL1KEEP, [x13]        // Prefetch C4

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x8             // b
        FMLA v26.4s, v18.4s,  v1.s[3]
        LDR x8, [x5, 40]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        PRFM PSTL1KEEP, [x7]         // Prefetch C5

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x8             // b
        FMLA v21.4s, v19.4s,  v0.s[1]
        LDR x8, [x5, 56]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8             // b
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA v20.4s, v12.4s,  v3.s[0]
        FMLA v22.4s, v12.4s,  v3.s[2]
        FMLA v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s,  v4.s[2]
        FMLA v28.4s, v12.4s,  v5.s[0]
        FMLA v30.4s, v12.4s,  v5.s[2]

        // BLOCK 2
        FMLA v21.4s, v13.4s,  v3.s[0]
        FMLA v23.4s, v13.4s,  v3.s[2]
        FMLA v25.4s, v13.4s,  v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s,  v4.s[2]
        FMLA v29.4s, v13.4s,  v5.s[0]
        FMLA v31.4s, v13.4s,  v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s,  v3.s[1]
        FMLA v22.4s, v14.4s,  v3.s[3]
        FMLA v24.4s, v14.4s,  v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s,  v4.s[3]
        FMLA v28.4s, v14.4s,  v5.s[1]
        FMLA v30.4s, v14.4s,  v5.s[3]

        // BLOCK 6
        FMLA v21.4s, v15.4s,  v3.s[1]
        FMLA v23.4s, v15.4s,  v3.s[3]
        FMLA v25.4s, v15.4s,  v4.s[1]
        // Low 4 bits of x0 equal kc & 15: non-zero means 1 to 3 floats of K
        // remain.  The flags are consumed by the B.NE below.
        TST x0, 15

        // BLOCK 7
        FMLA v27.4s, v15.4s,  v4.s[3]
        FMLA v29.4s, v15.4s,  v5.s[1]
        FMLA v31.4s, v15.4s,  v5.s[3]
        ADD x5, x5, 64               // second B set (64 bytes) consumed

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // sp was lowered by 32 for d12-d15, so the first stack argument that
        // was at [sp] on entry is now at [sp, 32].  x0 is free again here.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        // nc -= 8.  These flags drive both B.LO 6f (fewer than 8 columns
        // left) and B.HI 0b (more columns left after this tile); none of the
        // FMIN, ST1 or SUB instructions in between change them.
        SUBS x1, x1, 8
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 6f

        // Each C pointer advances by cn_stride (x0) and each A pointer is
        // rewound by kc so the next tile rereads the same rows of A.
        $if INC:
          // NOTE(review): rows are stored c5 first and c0 last, apparently so
          // that when clamped C pointers alias, the lowest-numbered row is the
          // one written last - confirm.
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        // Bit 3 of x0 is bit 3 of kc: set when 8 more bytes of A per row remain.
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0,  [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR d2, [x12], 8
        LD1 {v2.d}[1], [x4], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 3b
5:
        # Remainder- 1 float of A (4 bytes)
        // Even rows go to lane 0 and odd rows to lane 2, matching the
        // s[0] / s[2] lanes used by the FMLAs below.
        LDR s0,  [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR s2, [x12], 4
        LD1 {v2.s}[2], [x4], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        B 3b

        # Store odd width
        // x1 is now nc - 8 with the original nc below 8, so its low 3 bits
        // equal the remaining column count.  Bits 2, 1 and 0 select a store
        // of 4, 2 and 1 floats per row; after each partial store the
        // not-yet-stored lanes are moved down to the bottom of the register.
6:
        TBZ x1, 2, 7f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

7:
        TBZ x1, 1, 8f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

8:
        TBZ x1, 0, 9f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30,  [x7]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif