// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# A pointers
# x3  a0
# x11 a1
# x12 a2
# x4  a3 / a_stride

# C pointers
# x6  c0
# x9  c1
# x10 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register.  The high 64 bits of A/B vectors are
# loaded through x8 (LDR x8 then INS vN.d[1], x8) instead of a 128-bit vector
# load, so that loads can dual-issue with FMAs on Cortex-A53.

# Vector register usage and GPR shadows
# a0  v0
# a1  v0[1]
# a2  v1
# a3  v1[1]
# a0  v2
# a1  v2[1]
# a2  v3
# a3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Load min/max values: v4 = all lanes min, v5 = all lanes max
        LD2R {v4.4s, v5.4s}, [x8]

        # Save d8-d11,d14,d15 on stack (only the callee-saved regs we clobber)
        STP d8, d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Clamp A and C pointers so rows beyond mr alias the last valid row
        CMP x0, 2                // if mr < 2
        ADD x11, x3, x4          // a1 = a0 + a_stride
        ADD x9, x6, x7           // c1 = c0 + cm_stride
        CSEL x11, x3, x11, LO    //   a1 = a0
        CSEL x9, x6, x9, LO      //   c1 = c0

        ADD x12, x11, x4         // a2 = a1 + a_stride
        ADD x10, x9, x7          // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x12, x11, x12, LS   //   a2 = a1
        CSEL x10, x9, x10, LS    //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x4, x12, x4          // a3 = a2 + a_stride
        ADD x7, x10, x7          // c3 = c2 + cm_stride
        CSEL x4, x12, x4, LO     //   a3 = a2
        CSEL x7, x10, x7, LO     //   c3 = c2

0:
        $if INC:
          # Load initial accumulators
          LD1 {v20.16b, v21.16b, v22.16b}, [x15], 48
          LD1 {v23.16b, v24.16b, v25.16b}, [x15], 48
          LD1 {v26.16b, v27.16b, v28.16b}, [x15], 48
          LD1 {v29.16b, v30.16b, v31.16b}, [x15], 48
          PRFM PLDL1KEEP, [x3, 0]    // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x5, 256]
          PRFM PLDL1KEEP, [x5, 320]
        $else:
          # Load initial bias from w into accumulators.
          # MOVs are interleaved with prefetches for dual-issue on A53.
          LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
          MOV v23.16b, v20.16b
          PRFM PLDL1KEEP, [x3, 0]    // Prefetch A
          PRFM PLDL1KEEP, [x3, 64]
          MOV v24.16b, v21.16b
          PRFM PLDL1KEEP, [x11, 0]
          PRFM PLDL1KEEP, [x11, 64]
          MOV v25.16b, v22.16b
          PRFM PLDL1KEEP, [x12, 0]
          PRFM PLDL1KEEP, [x12, 64]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x4, 0]
          PRFM PLDL1KEEP, [x4, 64]
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          MOV v28.16b, v22.16b
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          MOV v29.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 256]
          MOV v30.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 320]
          MOV v31.16b, v22.16b

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 4f

        # Bias the counter: the prologue + epilogue together consume one
        # 4-float group, so the main loop only runs while k >= another 16.
        SUBS x0, x0, 16

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        LDR d0, [x3], 8           // a0
        LDR d1, [x12], 8          // a2
        LD1 {v0.d}[1], [x11], 8   // a1
        LD1 {v1.d}[1], [x4], 8    // a3

        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8           // high half of v11, INSed in loop BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
1:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load

        # BLOCK 0
        LDR d2, [x3], 8          // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8         // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x3, 128]   // Prefetch A0

        # BLOCK 1
        LDR d3, [x12], 8         // a2
        INS v2.d[1], x8          // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8          // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x11, 128]  // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]            // vb0x0123
        INS v3.d[1], x8          // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x12, 128]  // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]        // vb0x4567
        INS v14.d[1], x8         // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x4, 128]   // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]        // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]   // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]   // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]   // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1

        # BLOCK 0
        LDR d0, [x3], 8          // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x11], 8         // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x12], 8         // a2
        INS v0.d[1], x8          // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x4], 8          // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]         // vb0x0123
        INS v1.d[1], x8          // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]        // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]        // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]        // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]       // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16          // k -= 4 floats
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]       // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192          // advance B past the 2 groups just loaded
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 1b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
2:
        # BLOCK 0
        LDR d2, [x3], 8          // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x11], 8         // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x12], 8         // a2
        INS v2.d[1], x8          // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x4], 8          // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]            // vb0x0123
        INS v3.d[1], x8          // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]        // vb0x4567
        INS v14.d[1], x8         // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]        // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]        // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]        // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]        // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  No A or B loads:
        # this is the last 2 floats of K, everything needed is already loaded.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        TST x0, 15               // low bits of k = leftover bytes (0..12)

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96           // advance B past the epilogue's single group
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f

3:
        # Clamp
        FMAX v20.4s, v20.4s, v4.4s
        SUBS x1, x1, 12          // nc -= 12; LO/HI flags reused for stores
        FMAX v21.4s, v21.4s, v4.4s
        FMAX v22.4s, v22.4s, v4.4s
        FMAX v23.4s, v23.4s, v4.4s
        FMAX v24.4s, v24.4s, v4.4s
        FMAX v25.4s, v25.4s, v4.4s
        FMAX v26.4s, v26.4s, v4.4s
        FMAX v27.4s, v27.4s, v4.4s
        FMAX v28.4s, v28.4s, v4.4s
        FMAX v29.4s, v29.4s, v4.4s
        FMAX v30.4s, v30.4s, v4.4s
        FMAX v31.4s, v31.4s, v4.4s
        FMIN v20.4s, v20.4s, v5.4s
        FMIN v21.4s, v21.4s, v5.4s
        FMIN v22.4s, v22.4s, v5.4s
        FMIN v23.4s, v23.4s, v5.4s
        FMIN v24.4s, v24.4s, v5.4s
        FMIN v25.4s, v25.4s, v5.4s
        FMIN v26.4s, v26.4s, v5.4s
        FMIN v27.4s, v27.4s, v5.4s
        FMIN v28.4s, v28.4s, v5.4s
        FMIN v29.4s, v29.4s, v5.4s
        FMIN v30.4s, v30.4s, v5.4s
        FMIN v31.4s, v31.4s, v5.4s

        # Store full 4 x 12
        B.LO 6f

        $if INC:
          ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
          SUB x3, x3, x2   // a0 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
          SUB x4, x4, x2   // a3 -= kc
        $else:
          ST1 {v20.16b, v21.16b, v22.16b}, [x6], x14
          SUB x3, x3, x2   // a0 -= kc
          ST1 {v23.16b, v24.16b, v25.16b}, [x9], x14
          SUB x11, x11, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b, v28.16b}, [x10], x14
          SUB x12, x12, x2 // a2 -= kc
          ST1 {v29.16b, v30.16b, v31.16b}, [x7], x14
          SUB x4, x4, x2   // a3 -= kc

        B.HI 0b

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder - 2 floats of A (8 bytes)
        # Read first block of 4 A.
        LDR d0, [x3], 8   // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x11], 8  // a1
        LDR d2, [x12], 8  // a2
        LDR d3, [x4], 8   // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        TBZ x0, 2, 3b
5:
        # Remainder - 1 float of A (4 bytes)
        LDR s0, [x3], 4   // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x11], 4  // a1
        LDR s2, [x12], 4  // a2
        LDR s3, [x4], 4   // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 3b

6:
        ADD x1, x1, 12           // restore nc (SUBS at 3: went negative)
        # Store odd channels: bits 3/2/1/0 of nc select 8/4/2/1 columns
        TBZ x1, 3, 7f
        $if INC:
          STP q29, q30, [x7], 32
          MOV v29.16b, v31.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q23, q24, [x9], 32
          MOV v23.16b, v25.16b
          STP q20, q21, [x6], 32
          MOV v20.16b, v22.16b
        $else:
          STP q20, q21, [x6], 32
          MOV v20.16b, v22.16b
          STP q23, q24, [x9], 32
          MOV v23.16b, v25.16b
          STP q26, q27, [x10], 32
          MOV v26.16b, v28.16b
          STP q29, q30, [x7], 32
          MOV v29.16b, v31.16b

7:
        TBZ x1, 2, 8f
        $if INC:
          STR q29, [x7], 16
          MOV v29.16b, v30.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q23, [x9], 16
          MOV v23.16b, v24.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q23, [x9], 16
          MOV v23.16b, v24.16b
          STR q26, [x10], 16
          MOV v26.16b, v27.16b
          STR q29, [x7], 16
          MOV v29.16b, v30.16b

8:
        TBZ x1, 1, 9f
        $if INC:
          STR d29, [x7], 8
          DUP d29, v29.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d23, [x9], 8
          DUP d23, v23.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d23, [x9], 8
          DUP d23, v23.d[1]
          STR d26, [x10], 8
          DUP d26, v26.d[1]
          STR d29, [x7], 8
          DUP d29, v29.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s29, [x7]
          STR s26, [x10]
          STR s23, [x9]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s23, [x9]
          STR s26, [x10]
          STR s29, [x7]
10:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_4x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif