// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a53.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Overview
#   For every group of 8 output columns (outer loop at label 0) the twelve
#   accumulators v20-v31 (6 rows x 8 floats) are seeded from w, updated with
#   FMLA over kc bytes of each A row, clamped against the two values loaded
#   from params, and stored through the six C pointers.
#   Rows at index >= mr reuse the previous row's A and C pointers (see the
#   CSEL sequence below), so the code always processes six rows.
#   kc is counted in bytes: the main loop consumes 16 bytes (4 floats) of each
#   A row per iteration; remainders of 8 and 4 bytes are handled at labels 5/6.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register (reserved on some ABIs) and is not used here.
# NOTE(review): this file is generated; mirror any change in the template
# named at the top of the file.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
# x7  c5

# x8 temporary vector shadow register
#   (64-bit GPR loads into x8 are moved into the upper half of a vector
#   register with INS; presumably this is the Cortex-A53 scheduling trick the
#   kernel name refers to - confirm against the template.)

# Vector register usage
# A0   v0     v3
# A1   v0[1]  v3[1]
# A2   v1     v4
# A3   v1[1]  v4[1]
# A4   v2     v5
# A5   v2[1]  v5[1]
# B   v12 v13 v14 v15 second set of B
# B   v16 v17 v18 v19 first set
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53

        # Clamp A and C pointers
        # Each CMP feeds two CSEL pairs: LO for the row below the compared
        # value and LS for the row at it (ADD does not modify the flags).
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x15, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x15, x17, x15, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x15, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x15, x13, LS   //   c4 = c3

        # Load params pointer
        LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values
        # v6 = first float at params (upper bound, used by FMIN),
        # v7 = second float at params (lower bound, used by FMAX).
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        # Must happen before sp is moved by the STP below.
        LDR x14, [sp]

        // Save d12-d15 on stack
        STP d12, d13, [sp, -32]!
        STP d14, d15, [sp, 16]

0:
        # Load initial bias from w into accumulators
        # v20/v21 are loaded once and copied into the other five rows.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x3, 0]  // Prefetch A
        PRFM PLDL1KEEP, [x3, 64]
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x9, 0]
        PRFM PLDL1KEEP, [x9, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x10, 0]
        PRFM PLDL1KEEP, [x10, 64]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x11, 0]
        PRFM PLDL1KEEP, [x11, 64]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x12, 0]
        PRFM PLDL1KEEP, [x12, 64]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x4, 0]
        PRFM PLDL1KEEP, [x4, 64]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]

        # Is there at least 4 floats (16 bytes) for prologue + epilogue?
        # Subtracting multiples of 16 keeps the low 4 bits of kc in x0, which
        # the remainder code at labels 5/6 tests with TST/TBZ.
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Prologue - First group loads, no FMA
        LDR d0, [x3], 8              // a0
        LDP q16, q17, [x5], 32       // b
        LDR d1, [x10], 8             // a2
        LDR d2, [x12], 8             // a4
        LD1 {v0.d}[1], [x9], 8       // a1
        LD1 {v1.d}[1], [x11], 8      // a3
        LD1 {v2.d}[1], [x4], 8       // a5
        SUBS x0, x0, 16              // flags consumed by B.LO 2f below
        LDR q18, [x5], 16
        LDR d19, [x5], 8
        LDR x8, [x5], 8              // ins is in BLOCK 0

        # Is there at least 4 floats (16 bytes) for main loop?
        B.LO 2f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
1:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x8             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x8              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x8             // b ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]

        // BLOCK 3
        LDR d5, [x12], 8             // a4
        INS v4.d[1], x8              // a3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x8              // a5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x8, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x8             // b
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x8, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x8             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x8, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        // BLOCK 0
        LDR d0, [x3], 8              // a0
        FMLA v20.4s, v12.4s, v3.s[0]
        LDR x8, [x9], 8              // a1
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]
        PRFM PLDL1KEEP, [x3, 128]    // Prefetch A0

        // BLOCK 1
        LDR d16, [x5, 64]
        INS v0.d[1], x8              // a1 ins
        FMLA v26.4s, v12.4s, v4.s[2]
        LDR x8, [x5, 72]             // b
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]
        PRFM PLDL1KEEP, [x9, 128]    // Prefetch A1

        // BLOCK 2
        LDR d1, [x10], 8             // a2
        INS v16.d[1], x8             // b
        FMLA v21.4s, v13.4s, v3.s[0]
        LDR x8, [x11], 8             // a3
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]
        PRFM PLDL1KEEP, [x10, 128]   // Prefetch A2

        // BLOCK 3
        LDR d2, [x12], 8             // a4
        INS v1.d[1], x8              // a3 ins
        FMLA v27.4s, v13.4s, v4.s[2]
        LDR x8, [x4], 8              // a5
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]
        PRFM PLDL1KEEP, [x11, 128]   // Prefetch A3

        // BLOCK 4
        LDR d17, [x5, 80]
        INS v2.d[1], x8              // a5 ins
        FMLA v20.4s, v14.4s, v3.s[1]
        LDR x8, [x5, 88]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x12, 128]   // Prefetch A4

        // BLOCK 5
        LDR d18, [x5, 96]
        INS v17.d[1], x8             // b
        FMLA v26.4s, v14.4s, v4.s[3]
        LDR x8, [x5, 104]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]
        PRFM PLDL1KEEP, [x4, 128]    // Prefetch A5

        // BLOCK 6
        LDR d19, [x5, 112]
        INS v18.d[1], x8             // b
        FMLA v21.4s, v15.4s, v3.s[1]
        LDR x8, [x5, 120]            // upper half of v19; inserted in the next BLOCK 0
        FMLA v23.4s, v15.4s, v3.s[3]
        PRFM PLDL1KEEP, [x5, 192]    // Prefetch B
        FMLA v25.4s, v15.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]    // Prefetch B

        // BLOCK 7
        SUBS x0, x0, 16              // LDR lands here
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        ADD x5, x5, 128              // step w past both B groups read at [x5, 0..127]
        FMLA v31.4s, v15.4s, v5.s[3]
        B.HS 1b

        # Epilogue - 4 floats of A (16 bytes)
        # 48 FMA + 12 LD64 A + 8 LDR B
2:
        # First group of 24 FMA, Second group loads
        // BLOCK 0
        LDR d3, [x3], 8              // a0
        INS v19.d[1], x8             // b from second group
        FMLA v20.4s, v16.4s, v0.s[0]
        LDR x8, [x9], 8              // a1
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        PRFM PSTL1KEEP, [x6]         // Prefetch C0

        // BLOCK 1
        LDR d12, [x5]
        INS v3.d[1], x8              // a1 ins
        FMLA v26.4s, v16.4s, v1.s[2]
        LDR x8, [x5, 8]              // b
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        PRFM PSTL1KEEP, [x16]        // Prefetch C1

        // BLOCK 2
        LDR d4, [x10], 8             // a2
        INS v12.d[1], x8             // b ins
        FMLA v21.4s, v17.4s, v0.s[0]
        LDR x8, [x11], 8             // a3
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        PRFM PSTL1KEEP, [x17]        // Prefetch C2

        // BLOCK 3
        LDR d5, [x12], 8             // a4
        INS v4.d[1], x8              // a3 ins
        FMLA v27.4s, v17.4s, v1.s[2]
        LDR x8, [x4], 8              // a5
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        PRFM PSTL1KEEP, [x15]        // Prefetch C3

        // BLOCK 4
        LDR d13, [x5, 16]
        INS v5.d[1], x8              // a5 ins
        FMLA v20.4s, v18.4s, v0.s[1]
        LDR x8, [x5, 24]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        PRFM PSTL1KEEP, [x13]        // Prefetch C4

        // BLOCK 5
        LDR d14, [x5, 32]
        INS v13.d[1], x8             // b
        FMLA v26.4s, v18.4s, v1.s[3]
        LDR x8, [x5, 40]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        PRFM PSTL1KEEP, [x7]         // Prefetch C5

        // BLOCK 6
        LDR d15, [x5, 48]
        INS v14.d[1], x8             // b
        FMLA v21.4s, v19.4s, v0.s[1]
        LDR x8, [x5, 56]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]

        // BLOCK 7
        INS v15.d[1], x8             // b
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Second group of 24 FMA, First group of loads
        # (the epilogue issues no further loads; only the FMAs remain)
        // BLOCK 0
        FMLA v20.4s, v12.4s, v3.s[0]
        FMLA v22.4s, v12.4s, v3.s[2]
        FMLA v24.4s, v12.4s, v4.s[0]

        // BLOCK 1
        FMLA v26.4s, v12.4s, v4.s[2]
        FMLA v28.4s, v12.4s, v5.s[0]
        FMLA v30.4s, v12.4s, v5.s[2]

        // BLOCK 2
        FMLA v21.4s, v13.4s, v3.s[0]
        FMLA v23.4s, v13.4s, v3.s[2]
        FMLA v25.4s, v13.4s, v4.s[0]

        // BLOCK 3
        FMLA v27.4s, v13.4s, v4.s[2]
        FMLA v29.4s, v13.4s, v5.s[0]
        FMLA v31.4s, v13.4s, v5.s[2]

        // BLOCK 4
        FMLA v20.4s, v14.4s, v3.s[1]
        FMLA v22.4s, v14.4s, v3.s[3]
        FMLA v24.4s, v14.4s, v4.s[1]

        // BLOCK 5
        FMLA v26.4s, v14.4s, v4.s[3]
        FMLA v28.4s, v14.4s, v5.s[1]
        FMLA v30.4s, v14.4s, v5.s[3]

        // BLOCK 6
        FMLA v21.4s, v15.4s, v3.s[1]
        FMLA v23.4s, v15.4s, v3.s[3]
        FMLA v25.4s, v15.4s, v4.s[1]
        TST x0, 15                   // flags consumed by B.NE 5f (FMLA/ADD leave them intact)

        // BLOCK 7
        FMLA v27.4s, v15.4s, v4.s[3]
        FMLA v29.4s, v15.4s, v5.s[1]
        FMLA v31.4s, v15.4s, v5.s[3]
        ADD x5, x5, 64               // step w past the second B group read at [x5, 0..63]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f
4:
        # Clamp
        # The flags set by SUBS are consumed by both B.LO 8f and B.HI 0b below;
        # none of the instructions in between modify them.
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8               // nc -= 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 8f                      // fewer than 8 columns left: partial store

        ST1 {v20.16b, v21.16b}, [x6], x14
        SUB x3, x3, x2               // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB x9, x9, x2               // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x10, x10, x2             // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x15], x14
        SUB x11, x11, x2             // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x14
        SUB x12, x12, x2             // a4 -= kc
        ST1 {v30.16b, v31.16b}, [x7], x14
        SUB x4, x4, x2               // a5 -= kc

        B.HI 0b                      // more columns remain: next group of 8

        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LD1 {v0.d}[1], [x9], 8
        LDR d1, [x10], 8
        LD1 {v1.d}[1], [x11], 8
        LDR d2, [x12], 8
        LD1 {v2.d}[1], [x4], 8
        LDR q17, [x5], 16
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v0.s[3]
        FMLA v24.4s, v18.4s, v1.s[1]
        FMLA v26.4s, v18.4s, v1.s[3]
        FMLA v28.4s, v18.4s, v2.s[1]
        FMLA v30.4s, v18.4s, v2.s[3]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v0.s[3]
        FMLA v25.4s, v19.4s, v1.s[1]
        FMLA v27.4s, v19.4s, v1.s[3]
        FMLA v29.4s, v19.4s, v2.s[1]
        FMLA v31.4s, v19.4s, v2.s[3]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 floats of A (4 bytes)
        # Odd rows are loaded into lane 2 so the s[0]/s[2] indexing used by
        # the FMLAs matches the 2-float layout above.
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LD1 {v0.s}[2], [x9], 4
        LDR s1, [x10], 4
        LD1 {v1.s}[2], [x11], 4
        LDR s2, [x12], 4
        LD1 {v2.s}[2], [x4], 4
        LDR q17, [x5], 16

        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v0.s[2]
        FMLA v24.4s, v16.4s, v1.s[0]
        FMLA v26.4s, v16.4s, v1.s[2]
        FMLA v28.4s, v16.4s, v2.s[0]
        FMLA v30.4s, v16.4s, v2.s[2]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v0.s[2]
        FMLA v25.4s, v17.4s, v1.s[0]
        FMLA v27.4s, v17.4s, v1.s[2]
        FMLA v29.4s, v17.4s, v2.s[0]
        FMLA v31.4s, v17.4s, v2.s[2]
        B 4b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-float store per row;
        # after each store the remaining data is shifted down into the low
        # lanes of the even-numbered accumulator.
8:
        TBZ x1, 2, 9f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x15], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30, [x7], 16
        MOV v30.16b, v31.16b

9:
        TBZ x1, 1, 10f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x15], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

10:
        TBZ x1, 0, 11f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x15]
        STR s28, [x13]
        STR s30, [x7]
11:
        // Restore d12-d15 from stack
        LDP d14, d15, [sp, 16]
        LDP d12, d13, [sp], 32
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif