// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

// Overview (derived from the code below):
//   - Up to 6 rows of A and C are addressed. A row whose index is >= mr
//     aliases the previous row, so its loads and stores repeat a valid row.
//   - For every group of 8 output columns the 6x8 accumulators v20-v31 are
//     loaded from acc, updated with FMLA over kc bytes of each A row, clamped
//     with FMAX against v6 and FMIN against v7, and stored to C.
//   - kc is a byte count: A pointers post-increment by the bytes consumed and
//     are rewound with "SUB a, a, kc" before the next column group.
//   - w is consumed at 32 bytes (8 floats) per float of A and is never
//     rewound; acc is consumed at 192 bytes (48 floats) per column group.
//   - NOTE(review): the remainder paths test only bits 3 and 2 of k, so kc is
//     presumably a non-zero multiple of sizeof(float) - confirm with caller.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// Only d12-d15 are written here (v12-v15 hold B), so only they are spilled.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# x8 temporary vector shadow register
// x8 first holds the params pointer; after LD2R it is reused as the staging
// register for the upper 64 bits of vectors that are completed with INS.

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]
# A4   v2     v5
# A5   v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
// Each A register packs two rows: the even row in d[0] (lanes s[0], s[1]) and
// the odd row in d[1] (lanes s[2], s[3]); s[0]/s[2] are k, s[1]/s[3] are k+1.

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Load acc, params pointer
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        // Each CMP feeds two pairs of CSELs (LO, then LS): the intervening ADDs
        // do not update the flags.
        CMP x0, 2                       // if mr < 2
        ADD x9, x3, x4                  // a1 = a0 + a_stride
        ADD x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO             //   a1 = a0
        CSEL x16, x6, x16, LO           //   c1 = c0

        ADD x10, x9, x4                 // a2 = a1 + a_stride
        ADD x17, x16, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL x10, x9, x10, LS           //   a2 = a1
        CSEL x17, x16, x17, LS          //   c2 = c1

        CMP x0, 4                       // if mr < 4
        ADD x11, x10, x4                // a3 = a2 + a_stride
        ADD x14, x17, x7                // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO          //   a3 = a2
        CSEL x14, x17, x14, LO          //   c3 = c2

        ADD x12, x11, x4                // a4 = a3 + a_stride
        ADD x13, x14, x7                // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL x12, x11, x12, LS          //   a4 = a3
        CSEL x13, x14, x13, LS          //   c4 = c3

        // From here on x4 and x7 no longer hold the strides: they become a5/c5.
        CMP x0, 6                       // if mr < 6
        ADD x4, x12, x4                 // a5 = a4 + a_stride
        ADD x7, x13, x7                 // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO            //   a5 = a4
        CSEL x7, x13, x7, LO            //   c5 = c4

        # Load min/max values
        // v6 = first float at params (lower bound, used by FMAX),
        // v7 = second float (upper bound, used by FMIN), each replicated x4.
        LD2R {v6.4s, v7.4s}, [x8]

        // Save d12-d15 on stack
        // Pre-decrements sp by 32, so the stack arguments move up by 32 bytes
        // (cn_stride is read from [sp, 32] below).
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

// Top of the per-column-group loop (8 columns of C per pass).
0:
        # Load initial accumulators
        // Row-major 6x8 tile: row r lives in v(20+2r) and v(21+2r).
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP, [x3, 0]         // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        PRFM PLDL1KEEP, [x9, 0]
        PRFM PLDL1KEEP, [x9, 64]
        PRFM PLDL1KEEP, [x10, 0]
        PRFM PLDL1KEEP, [x10, 64]
        PRFM PLDL1KEEP, [x11, 0]
        PRFM PLDL1KEEP, [x11, 64]
        PRFM PLDL1KEEP, [x12, 0]
        PRFM PLDL1KEEP, [x12, 64]
        PRFM PLDL1KEEP, [x4, 0]
        PRFM PLDL1KEEP, [x4, 64]
        PRFM PLDL1KEEP, [x5, 0]         // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        // x0 stops being mr here and becomes the remaining-bytes counter k.
        SUBS x0, x2, 16                 // k = kc - 16
        B.LO 4f                         // kc < 16: remainder code only

        # Prologue - First group loads, no FMA
        LDR d0, [x3], 8                 // a0
        LDP q16, q17, [x5], 32          // b
        LDR d1, [x10], 8                // a2
        LDR d2, [x12], 8                // a4
        LD1 {v0.d}[1], [x9], 8          // a1
        LD1 {v1.d}[1], [x11], 8         // a3
        LD1 {v2.d}[1], [x4], 8          // a5
        SUBS x0, x0, 16                 // flags consumed by B.LO 2f below
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x8, [x5], 8                 // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Software pipelined: each half computes 24 FMLAs on one register set
        // while loading the other set. Vectors are assembled from a 64-bit
        // LDR d plus a 64-bit LDR x8 merged with INS. NOTE(review): presumably
        // chosen to suit Cortex-A53 issue rules; the rationale is not stated
        // in this file. Do not reorder instructions within the BLOCKs.
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8                 // a0
        INS v19.d[1], x8                // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x8, [x9], 8                 // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x8                 // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x8, [x5, 8]                 // b
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]

        // BLOCK 2
        LDR d4, [x10], 8                // a2
        INS v12.d[1], x8                // b ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x8, [x11], 8                // a3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]

        // BLOCK 3
        LDR d5, [x12], 8                // a4
        INS v4.d[1], x8                 // a3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x8, [x4], 8                 // a5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x8                 // a5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x8, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x8                // b
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x8, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x8                // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x8, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x3], 8                 // a0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x8, [x9], 8                 // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]       // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x8                 // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x8, [x5, 72]                // b
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]
        PRFM PLDL1KEEP, [x9, 128]       // Prefetch A1

        // BLOCK 2
        LDR d1, [x10], 8                // a2
        INS v16.d[1], x8                // b
        FMLA v21.4s, v13.4s, v3.s[0]
        LDR x8, [x11], 8                // a3
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]
        PRFM PLDL1KEEP, [x10, 128]      // Prefetch A2

        // BLOCK 3
        LDR d2, [x12], 8                // a4
        INS v1.d[1], x8                 // a3 ins
        FMLA v27.4s, v13.4s, v4.s[2]
        LDR x8, [x4], 8                 // a5
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]
        PRFM PLDL1KEEP, [x11, 128]      // Prefetch A3

        // BLOCK 4
        LDR d17, [x5, 80]
        INS v2.d[1], x8                 // a5 ins
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x8, [x5, 88]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x12, 128]      // Prefetch A4

        // BLOCK 5
        LDR d18, [x5, 96]
        INS v17.d[1], x8                // b
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x8, [x5, 104]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]
        PRFM PLDL1KEEP, [x4, 128]       // Prefetch A5

        // BLOCK 6
        LDR d19, [x5, 112]
        INS v18.d[1], x8                // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x8, [x5, 120]               // upper half of v19, inserted at the next BLOCK 0
        FMLA v23.4s, v15.4s, v3.s[3]
        PRFM PLDL1KEEP, [x5, 192]       // Prefetch B
        FMLA v25.4s, v15.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]       // Prefetch B

        // BLOCK 7
        SUBS x0, x0, 16                 // LDR lands here
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        ADD x5, x5, 128                 // 64 bytes per group of B; ADD leaves the SUBS flags intact
        FMLA v31.4s, v15.4s, v5.s[3]
        B.HS 1b                         // k did not borrow: at least 16 more bytes of A

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
        // Same as one main-loop iteration, except that the second half loads
        // nothing further and the first half prefetches the C rows for store.
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8                 // a0
        INS v19.d[1], x8                // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x8, [x9], 8                 // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        PRFM PSTL1KEEP, [x6]            // Prefetch C0

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x8                 // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x8, [x5, 8]                 // b
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        PRFM PSTL1KEEP, [x16]           // Prefetch C1

        // BLOCK 2
        LDR d4, [x10], 8                // a2
        INS v12.d[1], x8                // b ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x8, [x11], 8                // a3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        PRFM PSTL1KEEP, [x17]           // Prefetch C2

        // BLOCK 3
        LDR d5, [x12], 8                // a4
        INS v4.d[1], x8                 // a3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x8, [x4], 8                 // a5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        PRFM PSTL1KEEP, [x14]           // Prefetch C3

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x8                 // a5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x8, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        PRFM PSTL1KEEP, [x13]           // Prefetch C4

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x8                // b
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x8, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        PRFM PSTL1KEEP, [x7]            // Prefetch C5

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x8                // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x8, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8                // b
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, no further loads
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        TST x0, 15                      // low 4 bits of k equal those of kc: leftover bytes

        // BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        FMLA v31.4s, v15.4s, v5.s[3]
        ADD x5, x5, 64                  // skip the 64 bytes of B read at [x5]..[x5, 56]; keeps TST flags

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 4f
3:
        # Clamp
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // Was at [sp] on entry; the 32-byte d12-d15 save area shifted it.
        // x0 is free again because k is not needed past this point.
        LDR x0, [sp, 32]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8                  // nc -= 8; flags drive both B.LO 6f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 6f                         // fewer than 8 columns were left: partial store

        // Rows are stored from c5 down to c0. Each C pointer advances by
        // cn_stride and each A pointer is rewound by kc for the next group.
        ST1 {v30.16b, v31.16b}, [x7], x0
        SUB x3, x3, x2                  // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x0
        SUB x9, x9, x2                  // a1 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x0
        SUB x10, x10, x2                // a2 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x0
        SUB x11, x11, x2                // a3 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x0
        SUB x12, x12, x2                // a4 -= kc
        ST1 {v20.16b, v21.16b}, [x6], x0
        SUB x4, x4, x2                  // a5 -= kc

        B.HI 0b                         // columns remain (flags still from SUBS x1)

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

// Remainder handling: reached with k & 15 != 0 (from the epilogue) or with
// kc < 16 (from the top of the column loop). Bits 3 and 2 of x0 match those
// of kc because only multiples of 16 have been subtracted.
4:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 5f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR d2, [x12], 8
        LD1 {v2.d}[1], [x4], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 3b
5:
        # Remainder- 1 floats of A (4 bytes)
        // Odd rows are loaded into lane s[2] so the lane indices used by the
        // FMLAs match the 2-row packing of the main loop.
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR s2, [x12], 4
        LD1 {v2.s}[2], [x4], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        B 3b

        # Store odd width
        // x1 = nc - 8 here (it borrowed), so its low three bits equal those of
        // the remaining column count. Store 4, then 2, then 1 float per row,
        // shifting the not-yet-stored lanes down after each step.
6:
        TBZ x1, 2, 7f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

7:
        TBZ x1, 1, 8f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

8:
        TBZ x1, 0, 9f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
9:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif