// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// F32 GEMM micro-kernel template producing a 6-row by 8-column tile of C per
// pass over nc. Accumulators live in v20-v31 (two 4-float vectors per row).
// kc is consumed in bytes: the main loop and epilogue each handle 16 bytes
// (4 floats) of every A row, followed by optional 8-byte and 4-byte
// remainders. Results are clamped with FMIN against v6 and FMAX against v7
// before being stored.
//
// x18 is deliberately NOT used: AAPCS64 designates it as the platform
// register and some platforms reserve it. The c3 pointer therefore lives in
// x14, and cn_stride is reloaded from the stack into x0 right before the
// stores (x0 is dead at that point).

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x0 (loaded at label 4)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

#  x8 temporary vector shadow register
#  x0 mr on entry, then k counter, then cn_stride while storing

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]
# A4   v2     v5
# A5   v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so the kernel can always
        // process six rows without touching memory beyond the mr valid rows.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values
        // v6 = first float at params (upper bound, used by FMIN)
        // v7 = second float at params (lower bound, used by FMAX)
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // After this push the caller's stack arguments sit 32 bytes higher,
        // so cn_stride is found at [sp, 32].
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x3,   0]  // Prefetch A
          PRFM PLDL1KEEP, [x3,  64]
          PRFM PLDL1KEEP, [x9,   0]
          PRFM PLDL1KEEP, [x9,  64]
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12,  0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP, [x4,   0]
          PRFM PLDL1KEEP, [x4,  64]
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          PRFM PLDL1KEEP, [x5,  64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
        $else:
          # Load initial bias from w into accumulators
          // The same 8 bias floats seed all six rows.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x3,   0]  // Prefetch A
          PRFM PLDL1KEEP, [x3,  64]
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x9,   0]
          PRFM PLDL1KEEP, [x9,  64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x10,  0]
          PRFM PLDL1KEEP, [x10, 64]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x11,  0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x12,  0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x4,   0]
          PRFM PLDL1KEEP, [x4,  64]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x5,   0]  // Prefetch B
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x5,  64]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        LDR   d0, [x3], 8          // a0
        LDP  q16, q17, [x5], 32    // b
        LDR   d1, [x10], 8         // a2
        LDR   d2, [x12], 8         // a4
        LD1  {v0.d}[1], [x9], 8    // a1
        LD1  {v1.d}[1], [x11], 8   // a3
        LD1  {v2.d}[1], [x4], 8    // a5
        SUBS x0, x0, 16
        LDR  q18, [x5], 16
        LDR  d19, [x5], 8
        LDR   x8, [x5], 8          // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Each 128-bit operand is assembled from a 64-bit vector load plus a
        // 64-bit load into x8 that is inserted with INS one block later.
        // NOTE(review): presumably this load form and the block grouping are
        // tuned for Cortex-A53 issue rules - keep the instruction order intact.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8          // a0
        INS   v19.d[1], x8         // b from second group
        FMLA  v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8          // a1
        FMLA  v22.4s, v16.4s,  v0.s[2]
        FMLA  v24.4s, v16.4s,  v1.s[0]

        // BLOCK 1
        LDR   d12, [x5]
        INS   v3.d[1], x8          // a1 ins
        FMLA  v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]          // b
        FMLA  v28.4s, v16.4s,  v2.s[0]
        FMLA  v30.4s, v16.4s,  v2.s[2]

        // BLOCK 2
        LDR   d4, [x10], 8         // a2
        INS   v12.d[1], x8         // b  ins
        FMLA  v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8         // a3
        FMLA  v23.4s, v17.4s,  v0.s[2]
        FMLA  v25.4s, v17.4s,  v1.s[0]

        // BLOCK 3
        LDR   d5, [x12], 8         // a4
        INS   v4.d[1], x8          // a3 ins
        FMLA  v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8          // a5
        FMLA  v29.4s, v17.4s,  v2.s[0]
        FMLA  v31.4s, v17.4s,  v2.s[2]

        // BLOCK 4
        LDR   d13, [x5, 16]
        INS   v5.d[1], x8          // a5 ins
        FMLA  v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA  v22.4s, v18.4s,  v0.s[3]
        FMLA  v24.4s, v18.4s,  v1.s[1]

        // BLOCK 5
        LDR   d14, [x5, 32]
        INS   v13.d[1], x8         // b
        FMLA  v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA  v28.4s, v18.4s,  v2.s[1]
        FMLA  v30.4s, v18.4s,  v2.s[3]

        // BLOCK 6
        LDR   d15, [x5, 48]
        INS   v14.d[1], x8         // b
        FMLA  v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA  v23.4s, v19.4s,  v0.s[3]
        FMLA  v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS   v15.d[1], x8
        FMLA  v27.4s, v19.4s,  v1.s[3]
        FMLA  v29.4s, v19.4s,  v2.s[1]
        FMLA  v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR   d0, [x3], 8          // a0
        FMLA  v20.4s, v12.4s,  v3.s[0]
        LDR   x8, [x9], 8          // a1
        FMLA  v22.4s, v12.4s,  v3.s[2]
        FMLA  v24.4s, v12.4s,  v4.s[0]
        PRFM  PLDL1KEEP, [x3, 128]      // Prefetch A0

        // BLOCK 1
        LDR   d16, [x5, 64]
        INS   v0.d[1], x8          // a1 ins
        FMLA  v26.4s, v12.4s,  v4.s[2]
        LDR   x8, [x5, 72]         // b
        FMLA  v28.4s, v12.4s,  v5.s[0]
        FMLA  v30.4s, v12.4s,  v5.s[2]
        PRFM  PLDL1KEEP, [x9, 128]      // Prefetch A1

        // BLOCK 2
        LDR   d1, [x10], 8         // a2
        INS   v16.d[1], x8         // b
        FMLA  v21.4s, v13.4s,  v3.s[0]
        LDR   x8, [x11], 8         // a3
        FMLA  v23.4s, v13.4s,  v3.s[2]
        FMLA  v25.4s, v13.4s,  v4.s[0]
        PRFM  PLDL1KEEP, [x10, 128]     // Prefetch A2

        // BLOCK 3
        LDR   d2, [x12], 8         // a4
        INS   v1.d[1], x8          // a3 ins
        FMLA  v27.4s, v13.4s,  v4.s[2]
        LDR   x8, [x4], 8          // a5
        FMLA  v29.4s, v13.4s,  v5.s[0]
        FMLA  v31.4s, v13.4s,  v5.s[2]
        PRFM  PLDL1KEEP, [x11, 128]     // Prefetch A3

        // BLOCK 4
        LDR   d17, [x5, 80]
        INS   v2.d[1], x8          // a5 ins
        FMLA  v20.4s, v14.4s,  v3.s[1]
        LDR   x8, [x5, 88]
        FMLA  v22.4s, v14.4s,  v3.s[3]
        FMLA  v24.4s, v14.4s,  v4.s[1]
        PRFM  PLDL1KEEP, [x12, 128]     // Prefetch A4

        // BLOCK 5
        LDR   d18, [x5, 96]
        INS   v17.d[1], x8         // b
        FMLA  v26.4s, v14.4s,  v4.s[3]
        LDR   x8, [x5, 104]
        FMLA  v28.4s, v14.4s,  v5.s[1]
        FMLA  v30.4s, v14.4s,  v5.s[3]
        PRFM  PLDL1KEEP, [x4, 128]      // Prefetch A5

        // BLOCK 6
        LDR   d19, [x5, 112]
        INS   v18.d[1], x8         // b
        FMLA  v21.4s, v15.4s,  v3.s[1]
        LDR   x8, [x5, 120]        // upper half of v19, inserted in next BLOCK 0
        FMLA  v23.4s, v15.4s,  v3.s[3]
        PRFM  PLDL1KEEP, [x5, 192]      // Prefetch B
        FMLA  v25.4s, v15.4s,  v4.s[1]
        PRFM  PLDL1KEEP, [x5, 256]      // Prefetch B

        // BLOCK 7
        SUBS  x0, x0, 16           // LDR lands here
        FMLA  v27.4s, v15.4s,  v4.s[3]
        FMLA  v29.4s, v15.4s,  v5.s[1]
        ADD   x5, x5, 128
        FMLA  v31.4s, v15.4s,  v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Same first group as the main loop, but the second group issues no
        // further loads; the C rows are prefetched for the upcoming stores.
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR   d3, [x3], 8          // a0
        INS   v19.d[1], x8         // b from second group
        FMLA  v20.4s, v16.4s,  v0.s[0]
        LDR   x8, [x9], 8          // a1
        FMLA  v22.4s, v16.4s,  v0.s[2]
        FMLA  v24.4s, v16.4s,  v1.s[0]
        PRFM  PSTL1KEEP, [x6]      // Prefetch C0

        // BLOCK 1
        LDR   d12, [x5]
        INS   v3.d[1], x8          // a1 ins
        FMLA  v26.4s, v16.4s,  v1.s[2]
        LDR   x8, [x5, 8]          // b
        FMLA  v28.4s, v16.4s,  v2.s[0]
        FMLA  v30.4s, v16.4s,  v2.s[2]
        PRFM  PSTL1KEEP, [x16]     // Prefetch C1

        // BLOCK 2
        LDR   d4, [x10], 8         // a2
        INS   v12.d[1], x8         // b  ins
        FMLA  v21.4s, v17.4s,  v0.s[0]
        LDR   x8, [x11], 8         // a3
        FMLA  v23.4s, v17.4s,  v0.s[2]
        FMLA  v25.4s, v17.4s,  v1.s[0]
        PRFM  PSTL1KEEP, [x17]     // Prefetch C2

        // BLOCK 3
        LDR   d5, [x12], 8         // a4
        INS   v4.d[1], x8          // a3 ins
        FMLA  v27.4s, v17.4s,  v1.s[2]
        LDR   x8, [x4], 8          // a5
        FMLA  v29.4s, v17.4s,  v2.s[0]
        FMLA  v31.4s, v17.4s,  v2.s[2]
        PRFM  PSTL1KEEP, [x14]     // Prefetch C3

        // BLOCK 4
        LDR   d13, [x5, 16]
        INS   v5.d[1], x8          // a5 ins
        FMLA  v20.4s, v18.4s,  v0.s[1]
        LDR   x8, [x5, 24]
        FMLA  v22.4s, v18.4s,  v0.s[3]
        FMLA  v24.4s, v18.4s,  v1.s[1]
        PRFM  PSTL1KEEP, [x13]     // Prefetch C4

        // BLOCK 5
        LDR   d14, [x5, 32]
        INS   v13.d[1], x8         // b
        FMLA  v26.4s, v18.4s,  v1.s[3]
        LDR   x8, [x5, 40]
        FMLA  v28.4s, v18.4s,  v2.s[1]
        FMLA  v30.4s, v18.4s,  v2.s[3]
        PRFM  PSTL1KEEP, [x7]      // Prefetch C5

        // BLOCK 6
        LDR   d15, [x5, 48]
        INS   v14.d[1], x8         // b
        FMLA  v21.4s, v19.4s,  v0.s[1]
        LDR   x8, [x5, 56]
        FMLA  v23.4s, v19.4s,  v0.s[3]
        FMLA  v25.4s, v19.4s,  v1.s[1]

        // BLOCK 7
        INS   v15.d[1], x8         // b
        FMLA  v27.4s, v19.4s,  v1.s[3]
        FMLA  v29.4s, v19.4s,  v2.s[1]
        FMLA  v31.4s, v19.4s,  v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        FMLA  v20.4s, v12.4s,  v3.s[0]
        FMLA  v22.4s, v12.4s,  v3.s[2]
        FMLA  v24.4s, v12.4s,  v4.s[0]

        // BLOCK 1
        FMLA  v26.4s, v12.4s,  v4.s[2]
        FMLA  v28.4s, v12.4s,  v5.s[0]
        FMLA  v30.4s, v12.4s,  v5.s[2]

        // BLOCK 2
        FMLA  v21.4s, v13.4s,  v3.s[0]
        FMLA  v23.4s, v13.4s,  v3.s[2]
        FMLA  v25.4s, v13.4s,  v4.s[0]

        // BLOCK 3
        FMLA  v27.4s, v13.4s,  v4.s[2]
        FMLA  v29.4s, v13.4s,  v5.s[0]
        FMLA  v31.4s, v13.4s,  v5.s[2]

        // BLOCK 4
        FMLA  v20.4s, v14.4s,  v3.s[1]
        FMLA  v22.4s, v14.4s,  v3.s[3]
        FMLA  v24.4s, v14.4s,  v4.s[1]

        // BLOCK 5
        FMLA  v26.4s, v14.4s,  v4.s[3]
        FMLA  v28.4s, v14.4s,  v5.s[1]
        FMLA  v30.4s, v14.4s,  v5.s[3]

        // BLOCK 6
        FMLA  v21.4s, v15.4s,  v3.s[1]
        FMLA  v23.4s, v15.4s,  v3.s[3]
        FMLA  v25.4s, v15.4s,  v4.s[1]
        TST   x0, 15               // any of the low 4 bits of k set -> remainder

        // BLOCK 7
        FMLA  v27.4s, v15.4s,  v4.s[3]
        FMLA  v29.4s, v15.4s,  v5.s[1]
        FMLA  v31.4s, v15.4s,  v5.s[3]
        ADD   x5, x5, 64

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f
4:
        # Clamp
        FMIN  v20.4s, v20.4s, v6.4s
        // Load cn_stride. x0 (the k counter) is dead from here until label 0
        // recomputes it, and LDR leaves the flags set by SUBS untouched.
        LDR   x0, [sp, 32]
        SUBS  x1, x1, 8            // nc -= 8; flags consumed by B.LO / B.HI below
        FMIN  v21.4s, v21.4s, v6.4s
        FMIN  v22.4s, v22.4s, v6.4s
        FMIN  v23.4s, v23.4s, v6.4s
        FMIN  v24.4s, v24.4s, v6.4s
        FMIN  v25.4s, v25.4s, v6.4s
        FMIN  v26.4s, v26.4s, v6.4s
        FMIN  v27.4s, v27.4s, v6.4s
        FMIN  v28.4s, v28.4s, v6.4s
        FMIN  v29.4s, v29.4s, v6.4s
        FMIN  v30.4s, v30.4s, v6.4s
        FMIN  v31.4s, v31.4s, v6.4s
        FMAX  v20.4s, v20.4s, v7.4s
        FMAX  v21.4s, v21.4s, v7.4s
        FMAX  v22.4s, v22.4s, v7.4s
        FMAX  v23.4s, v23.4s, v7.4s
        FMAX  v24.4s, v24.4s, v7.4s
        FMAX  v25.4s, v25.4s, v7.4s
        FMAX  v26.4s, v26.4s, v7.4s
        FMAX  v27.4s, v27.4s, v7.4s
        FMAX  v28.4s, v28.4s, v7.4s
        FMAX  v29.4s, v29.4s, v7.4s
        FMAX  v30.4s, v30.4s, v7.4s
        FMAX  v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 8f

        // Each store advances its C pointer by cn_stride (x0); each A pointer
        // is rewound by kc so the next column tile re-reads the same rows.
        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDR  q16,  [x5], 16
        LD1   {v0.d}[1],  [x9], 8
        LDR   d1, [x10], 8
        LD1   {v1.d}[1], [x11], 8
        LDR   d2, [x12], 8
        LD1   {v2.d}[1],  [x4], 8
        LDR  q17,  [x5], 16
        LDR  q18,  [x5], 16
        LDR  q19,  [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v0.s[3]
        FMLA v24.4s, v18.4s,  v1.s[1]
        FMLA v26.4s, v18.4s,  v1.s[3]
        FMLA v28.4s, v18.4s,  v2.s[1]
        FMLA v30.4s, v18.4s,  v2.s[3]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v0.s[3]
        FMLA v25.4s, v19.4s,  v1.s[1]
        FMLA v27.4s, v19.4s,  v1.s[3]
        FMLA v29.4s, v19.4s,  v2.s[1]
        FMLA v31.4s, v19.4s,  v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 floats of A (4 bytes)
        LDR   s0,  [x3], 4
        LDR  q16,  [x5], 16
        LD1   {v0.s}[2],  [x9], 4
        LDR   s1, [x10], 4
        LD1   {v1.s}[2], [x11], 4
        LDR   s2, [x12], 4
        LD1   {v2.s}[2],  [x4], 4
        LDR  q17,  [x5], 16

        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v0.s[2]
        FMLA v24.4s, v16.4s,  v1.s[0]
        FMLA v26.4s, v16.4s,  v1.s[2]
        FMLA v28.4s, v16.4s,  v2.s[0]
        FMLA v30.4s, v16.4s,  v2.s[2]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v0.s[2]
        FMLA v25.4s, v17.4s,  v1.s[0]
        FMLA v27.4s, v17.4s,  v1.s[2]
        FMLA v29.4s, v17.4s,  v2.s[0]
        FMLA v31.4s, v17.4s,  v2.s[2]
        B 4b

        # Store odd width
        // Fewer than 8 columns remain: bits 2, 1 and 0 of nc select a 4-, 2-
        // and 1-float store per row; the unstored upper part is shifted down
        // after each partial store.
8:
        TBZ x1, 2, 9f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

9:
        TBZ x1, 1, 10f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

10:
        TBZ x1, 0, 11f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30,  [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif