// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# F32 GEMM micro-kernel, 4 rows (MR) x 12 columns (NR), software-pipelined
# for the in-order Cortex-A53. The ${"inc" if INC else ""} template variant
# either loads initial accumulators from `acc` (INC) or broadcasts the bias
# from `w` (non-INC).
#
# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# AAPCS64: d8-d15 need to be preserved if used (only low 64 bits; this kernel
# uses d8-d11, d14, d15 and saves/restores exactly those).
# x19-30 need to be preserved if used (none are used here).

# A pointers (one per output row; clamped together for mr < 4)
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers (one per output row; clamped together for mr < 4)
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register
#    (64-bit halves of B/A vectors are loaded through the GPR x8 and inserted
#    with INS; on Cortex-A53 a 64-bit LDR into a GPR dual-issues with NEON FMAs,
#    whereas a 128-bit vector load would stall the in-order pipeline.)

# Vector register usage and GPR shadows
# a0 v0
# a1 v0[1]
# a2 v1
# a3 v1[1]
# a0 v2
# a1 v2[1]
# a2 v3
# a3 v3[1]
# B  v6  v7  v8
# B  v9 v10 v11
# B v14 v15 v16
# B v17 v18 v19
# C v20 v21 v22
# C v23 v24 v25
# C v26 v27 v28
# C v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load clamping_params values
        # LD2R broadcasts params->max into v4 and params->min into v5.
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d11,d14,d15 on stack (callee-saved low halves of v8-v11,v14,v15)
        STP d8, d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Clamp A and C pointers: rows beyond mr alias the previous row, so
        # out-of-range rows compute (and store over) duplicate data harmlessly.
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0
        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1
        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

0:
        $if INC:
          # Load initial accumulators (4 rows x 12 floats from acc)
          LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48
          LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
          LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
          LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
          PRFM PLDL1KEEP, [x3, 0]    // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x5, 256]
          PRFM PLDL1KEEP, [x5, 320]
        $else:
          # Load initial bias from w into accumulators; the same 12 bias values
          # seed all 4 rows. MOVs are interleaved with prefetches for dual-issue.
          LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
          MOV v23.16b, v20.16b
          PRFM PLDL1KEEP, [x3, 0]    // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          MOV v24.16b, v21.16b
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v25.16b, v22.16b
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          MOV v28.16b, v22.16b
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          MOV v29.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 256]
          MOV v30.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 320]
          MOV v31.16b, v22.16b

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Pre-decrement k for the loop-bottom SUBS/B.HS idiom: after this,
        # k >= 0 (HS) means at least one more full 4-float iteration remains.
        SUBS x0, x0, 16

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        # 64-bit scalar loads fill the low halves; LD1 lane loads fill the
        # high halves, packing two rows per vector (see register map above).
        LDR d0, [x3], 8   // a0
        LDR d1, [x12], 8  // a2
        LD1 {v0.d}[1], [x11], 8  // a1
        LD1 {v1.d}[1], [x4], 8   // a3

        # First 6 B vectors; v11's high half arrives via x8 in loop BLOCK 0.
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes) per iteration, 48 FMAs in two
        # groups of 24, each group 8 blocks of ~4 cycles (1 LDR + 3 FMA).
1:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load
        # (the INS consuming x8 is deliberately delayed so the 64-bit LDR has
        # retired on the in-order A53 before its value is inserted)

        # BLOCK 0
        LDR d2, [x3], 8  // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8  // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x3, 128]  // Prefetch A0

        # BLOCK 1
        LDR d3, [x12], 8  // a2
        INS v2.d[1], x8  // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8  // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x11, 128]  // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]  // vb0x0123
        INS v3.d[1], x8  // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x12, 128]  // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]  // vb0x4567
        INS v14.d[1], x8  // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x4, 128]  // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]  // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]  // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]  // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]  // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]  // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]  // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]  // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        LDR d0, [x3], 8  // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x11], 8  // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x12], 8  // a2
        INS v0.d[1], x8  // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x4], 8  // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]  // vb0x0123
        INS v1.d[1], x8  // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]  // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]  // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]  // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]  // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16  // k -= 4 floats; folded into the FMA schedule
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]  // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192  // advance B by 48 floats (offsets above were relative)
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 1b

        # Epilogue - drains the pipeline for the final 4 floats of A.
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d2, [x3], 8  // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8  // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8  // a2
        INS v2.d[1], x8  // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8  // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]  // vb0x0123
        INS v3.d[1], x8  // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]  // vb0x4567
        INS v14.d[1], x8  // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]  // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]  // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]  // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]  // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        TST x0, 15  // test kc remainder bits (k went negative; low bits survive)

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96  // advance B past the first group's 24 floats
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # Clamp all 12 accumulators to [v5, v4] (min/max from params)
        FMIN v20.4s, v20.4s, v4.4s
        SUBS x1, x1, 12  // nc -= 12; folded into the clamp sequence
        FMIN v21.4s, v21.4s, v4.4s
        FMIN v22.4s, v22.4s, v4.4s
        FMIN v23.4s, v23.4s, v4.4s
        FMIN v24.4s, v24.4s, v4.4s
        FMIN v25.4s, v25.4s, v4.4s
        FMIN v26.4s, v26.4s, v4.4s
        FMIN v27.4s, v27.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v20.4s, v20.4s, v5.4s
        FMAX v21.4s, v21.4s, v5.4s
        FMAX v22.4s, v22.4s, v5.4s
        FMAX v23.4s, v23.4s, v5.4s
        FMAX v24.4s, v24.4s, v5.4s
        FMAX v25.4s, v25.4s, v5.4s
        FMAX v26.4s, v26.4s, v5.4s
        FMAX v27.4s, v27.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 12 (B.LO: fewer than 12 columns remain -> partial store)
        B.LO 7f

        # Rows are stored last-to-first in the INC variant and first-to-last
        # otherwise; A pointers are rewound by kc for the next column tile.
        $if INC:
          ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
          SUB x3, x3, x2  // a0 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x11, x11, x2  // a1 -= kc
          ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
          SUB x12, x12, x2  // a2 -= kc
          ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
          SUB x4, x4, x2  // a3 -= kc
        $else:
          ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
          SUB x3, x3, x2  // a0 -= kc
          ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
          SUB x11, x11, x2  // a1 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x12, x12, x2  // a2 -= kc
          ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
          SUB x4, x4, x2  // a3 -= kc

        B.HI 0b  // more columns remain (nc > 0)

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        # (x0 holds kc - 32 here; bit 3 set <=> kc has an 8-byte remainder)
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        LDR d0, [x3], 8   // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x11], 8  // a1
        LDR d2, [x12], 8  // a2
        LDR d3, [x4], 8   // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        # bit 2 clear <=> no 4-byte remainder; go clamp/store
        TBZ x0, 2, 4b
6:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4   // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x11], 4  // a1
        LDR s2, [x12], 4  // a2
        LDR s3, [x4], 4   // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 4b

7:
        # Partial store: nc < 12.  Undo the SUBS so x1 = remaining columns,
        # then store 8 / 4 / 2 / 1 columns by testing bits of x1, shifting
        # surviving data down after each store.
        ADD x1, x1, 12
        # Store odd channels
        TBZ x1, 3, 8f
        $if INC:
          STP q29, q30, [x7], 32
          MOV v29.16b, v31.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q23, q24, [x9], 32
          MOV v23.16b, v25.16b
          STP q20, q21, [x6], 32
          MOV v20.16b, v22.16b
        $else:
          STP q20, q21, [x6], 32
          MOV v20.16b, v22.16b
          STP q23, q24, [x9], 32
          MOV v23.16b, v25.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q29, q30, [x7], 32
          MOV v29.16b, v31.16b

8:
        TBZ x1, 2, 9f
        $if INC:
          STR q29, [x7], 16
          MOV v29.16b, v30.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q23, [x9], 16
          MOV v23.16b, v24.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q23, [x9], 16
          MOV v23.16b, v24.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q29, [x7], 16
          MOV v29.16b, v30.16b

9:
        TBZ x1, 1, 10f
        $if INC:
          STR d29, [x7], 8
          DUP d29, v29.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d23, [x9], 8
          DUP d23, v23.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d23, [x9], 8
          DUP d23, v23.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d29, [x7], 8
          DUP d29, v29.d[1]

10:
        TBZ x1, 0, 11f
        $if INC:
          STR s29, [x7]
          STR s26, [x10]
          STR s23, [x9]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s23, [x9]
          STR s26, [x10]
          STR s29, [x7]
11:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_4x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif