// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// f32 indirect GEMM (IGEMM) microkernel: computes a tile of up to 4 rows by
// 12 columns of C, clamps it, and stores it. Instruction order is laid out in
// "blocks" of one 64-bit load, one INS and three FMLA (see the main loop), so
// instructions must not be reordered casually.
//
// Behaviour visible in this file:
//   * For every tile of 12 output columns the accumulators v20-v31 are
//     initialised from the first 12 floats at w (the bias); all 4 rows start
//     from the same bias.
//   * a is an array of row pointers consumed 4 at a time (32 bytes of
//     pointers per step of the ks loop). A pointer equal to zero is used
//     as-is; any other pointer has a_offset added to it.
//   * kc is a byte count: the main loop consumes 16 bytes (4 floats) of each
//     A row per iteration, with 8 byte and 4 byte remainder paths.
//   * params points at two consecutive floats. The first is broadcast to v4
//     and applied with FMIN (upper bound); the second is broadcast to v5 and
//     applied with FMAX (lower bound).
//   * When at least 12 columns remain, 12 floats per row are stored and each
//     C pointer advances by cn_stride bytes; otherwise the last nc columns
//     are stored as 8 + 4 + 2 + 1 floats according to the bits of nc.
//   * mr < 4 is handled by aliasing the unused row pointers of C onto the
//     previous row (c1 = c0, c2 = c1, c3 = c2).
//
// NOTE(review): the ks loop terminates on B.NE, so it presumably requires ks
// to be a non-zero multiple of 32 (4 pointers of 8 bytes) - confirm with the
// caller.
// NOTE(review): the remainder paths only test bits 3 and 2 of the byte count,
// so kc is presumably a non-zero multiple of 4 bytes - confirm with the
// caller.
// NOTE(review): x18 is the platform register in AAPCS64; confirm this kernel
// is only built for targets where x18 is free for general use.

# void xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const float*restrict w,            x5
#     float*restrict c,                  x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> (x8)

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x13 a0
# x14 a1
# x15 a2
# x16 a3

# C pointers
# x6  c0
# x17 c1
# x18 c2
# x7  c3 / cm_stride

# x8 temporary vector shadow register

# Vector register usage and GPR shadows
# A0  v0
# A1  v0[1]
# A2  v1
# A3  v1[1]
# A0  v2
# A1  v2[1]
# A2  v3
# A3  v3[1]
# B   v6  v7  v8
# B   v9 v10 v11
# B  v14 v15 v16
# B  v17 v18 v19
# C  v20 v21 v22
# C  v23 v24 v25
# C  v26 v27 v28
# C  v29 v30 v31
# Clamp v4 v5
# v12 to v13 unused.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53

        # Load cn_stride, a_offset
        # (stack arguments are read before sp is moved by the save below)
        LDP x10, x11, [sp]

        # Load zero, clamping params pointer
        LDP x12, x8, [sp, 16]

        # Save d8-d11,d14,d15 on stack
        STP  d8,  d9, [sp, -48]!
        STP d10, d11, [sp, 16]
        STP d14, d15, [sp, 32]

        # Load clamping_params values
        # v4 = first float (used with FMIN), v5 = second float (used with FMAX)
        LD2R {v4.4s, v5.4s}, [x8]

        # Clamp C pointers
        CMP x0, 2                // if mr < 2
        ADD x17, x6, x7          // c1 = c0 + cm_stride
        CSEL x17, x6, x17, LO    //   c1 = c0

        ADD x18, x17, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x18, x17, x18, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x7, x18, x7          // c3 = c2 + cm_stride
        CSEL x7, x18, x7, LO     //   c3 = c2

        # nc loop entry: one pass per tile of 12 output columns
0:
        # Load initial bias from w into accumulators
        # Rows 1-3 (v23-v31) are copies of row 0 (v20-v22).
        LD1 {v20.16b, v21.16b, v22.16b}, [x5], 48
        MOV v23.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v24.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v25.16b, v22.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 256]
        MOV v28.16b, v22.16b
        PRFM PLDL1KEEP, [x5, 320]
        MOV v29.16b, v20.16b
        MOV v30.16b, v21.16b
        MOV v31.16b, v22.16b

        MOV x9, x3  // p = ks

        # ks loop entry: one pass per group of 4 A pointers
1:
        # Load next 4 A pointers
        LDP x13, x14, [x4], 16
        LDP x15, x16, [x4], 16

        CMP x13, x12            // if a0 == zero
        ADD x13, x13, x11       // a0 += a_offset
        CSEL x13, x12, x13, EQ  //   a0 = zero, else a0 + a_offset
        CMP x14, x12            // if a1 == zero
        ADD x14, x14, x11       // a1 += a_offset
        CSEL x14, x12, x14, EQ  //   a1 = zero, else a1 + a_offset
        CMP x15, x12            // if a2 == zero
        ADD x15, x15, x11       // a2 += a_offset
        CSEL x15, x12, x15, EQ  //   a2 = zero, else a2 + a_offset
        CMP x16, x12            // if a3 == zero
        ADD x16, x16, x11       // a3 += a_offset
        CSEL x16, x12, x16, EQ  //   a3 = zero, else a3 + a_offset

        # Are there at least 4 floats (16 bytes) for prologue + epilogue?
        SUBS x0, x2, 16  // k = kc - 16

        // PRFM does not modify the flags, so B.LO below still tests the SUBS.
        PRFM PLDL1KEEP, [x13,  0]  // Prefetch A
        PRFM PLDL1KEEP, [x13, 64]
        PRFM PLDL1KEEP, [x14,  0]
        PRFM PLDL1KEEP, [x14, 64]
        PRFM PLDL1KEEP, [x15,  0]
        PRFM PLDL1KEEP, [x15, 64]
        PRFM PLDL1KEEP, [x16,  0]
        PRFM PLDL1KEEP, [x16, 64]
        B.LO 5f

        // Flags from this SUBS are consumed by B.LO 3f after the prologue
        // loads, none of which modify the flags.
        SUBS x0, x0, 16  // 4 floats for main loop

        # Prologue - loads for first group of 24 FMA

        # Read first block of 4 A.
        # v0 = a0 (low half), a1 (high half); v1 = a2 (low half), a3 (high half)
        LDR d0, [x13], 8          // a0
        LDR d1, [x15], 8          // a2
        LD1 {v0.d}[1], [x14], 8   // a1
        LD1 {v1.d}[1], [x16], 8   // a3

        # Read 96 bytes of B into v6-v11; the high half of v11 is parked in x8
        # and inserted by the first INS of the loop or epilogue.
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LD1 {v9.16b, v10.16b}, [x5], 32
        LDR d11, [x5], 8
        LDR x8, [x5], 8

        # Are there at least 4 floats (16 bytes) for main loop?
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
2:
        # First group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 2nd group into v2/v3
        # INS is 4 blocks (16 cycles) after load

        # BLOCK 0
        LDR d2, [x13], 8             // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x14], 8             // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]
        PRFM PLDL1KEEP, [x13, 128]   // Prefetch A0

        # BLOCK 1
        LDR d3, [x15], 8             // a2
        INS v2.d[1], x8              // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x16], 8             // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]
        PRFM PLDL1KEEP, [x14, 128]   // Prefetch A1

        # BLOCK 2
        LDR d14, [x5]                // vb0x0123
        INS v3.d[1], x8              // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]
        PRFM PLDL1KEEP, [x15, 128]   // Prefetch A2

        # BLOCK 3
        LDR d15, [x5, 16]            // vb0x4567
        INS v14.d[1], x8             // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]
        PRFM PLDL1KEEP, [x16, 128]   // Prefetch A3

        # BLOCK 4
        LDR d16, [x5, 32]            // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]
        PRFM PLDL1KEEP, [x5, 320]    // Prefetch B

        # BLOCK 5
        LDR d17, [x5, 48]            // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]
        PRFM PLDL1KEEP, [x5, 384]    // Prefetch B

        # BLOCK 6
        LDR d18, [x5, 64]            // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 448]    // Prefetch B

        # BLOCK 7
        LDR d19, [x5, 80]            // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  8 blocks of 4 cycles.  LDR + 3 FMA
        # A is loaded for 1st group into v0/v1
        # B is loaded for 1st group of the next iteration into v6-v11

        # BLOCK 0
        LDR d0, [x13], 8             // a0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        LDR x8, [x14], 8             // a1
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        LDR d1, [x15], 8             // a2
        INS v0.d[1], x8              // a1
        FMLA v29.4s, v14.4s, v3.s[2]
        LDR x8, [x16], 8             // a3
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        LDR d6, [x5, 96]             // vb0x0123
        INS v1.d[1], x8              // a3
        FMLA v27.4s, v15.4s, v3.s[0]
        LDR x8, [x5, 104]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        LDR d7, [x5, 112]            // vb0x4567
        INS v6.d[1], x8
        FMLA v25.4s, v16.4s, v2.s[2]
        LDR x8, [x5, 120]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        LDR d8, [x5, 128]            // vb0x89AB
        INS v7.d[1], x8
        FMLA v20.4s, v17.4s, v2.s[1]
        LDR x8, [x5, 136]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        LDR d9, [x5, 144]            // vb1x0123
        INS v8.d[1], x8
        FMLA v29.4s, v17.4s, v3.s[3]
        LDR x8, [x5, 152]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        LDR d10, [x5, 160]           // vb1x4567
        INS v9.d[1], x8
        FMLA v27.4s, v18.4s, v3.s[1]
        LDR x8, [x5, 168]
        FMLA v30.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 16              // k -= 4 floats; flags used by B.HS below
        FMLA v22.4s, v19.4s, v2.s[1]

        # BLOCK 7
        LDR d11, [x5, 176]           // vb1x89AB
        INS v10.d[1], x8
        FMLA v25.4s, v19.4s, v2.s[3]
        LDR x8, [x5, 184]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 192              // 192 bytes of B were read this iteration
        FMLA v31.4s, v19.4s, v3.s[3]
        B.HS 2b

        # Epilogue
        # First block same as main loop.  Second block has no loads.
3:
        # BLOCK 0
        LDR d2, [x13], 8             // a0
        INS v11.d[1], x8
        FMLA v20.4s, v6.4s, v0.s[0]
        LDR x8, [x14], 8             // a1
        FMLA v23.4s, v6.4s, v0.s[2]
        FMLA v26.4s, v6.4s, v1.s[0]

        # BLOCK 1
        LDR d3, [x15], 8             // a2
        INS v2.d[1], x8              // a1 was loaded in block 0
        FMLA v29.4s, v6.4s, v1.s[2]
        LDR x8, [x16], 8             // a3
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v0.s[2]

        # BLOCK 2
        LDR d14, [x5]                // vb0x0123
        INS v3.d[1], x8              // a3 was loaded in block 1
        FMLA v27.4s, v7.4s, v1.s[0]
        LDR x8, [x5, 8]
        FMLA v30.4s, v7.4s, v1.s[2]
        FMLA v22.4s, v8.4s, v0.s[0]

        # BLOCK 3
        LDR d15, [x5, 16]            // vb0x4567
        INS v14.d[1], x8             // v14 was loaded in block 2
        FMLA v25.4s, v8.4s, v0.s[2]
        LDR x8, [x5, 24]
        FMLA v28.4s, v8.4s, v1.s[0]
        FMLA v31.4s, v8.4s, v1.s[2]

        # BLOCK 4
        LDR d16, [x5, 32]            // vb0x89AB
        INS v15.d[1], x8
        FMLA v20.4s, v9.4s, v0.s[1]
        LDR x8, [x5, 40]
        FMLA v23.4s, v9.4s, v0.s[3]
        FMLA v26.4s, v9.4s, v1.s[1]

        # BLOCK 5
        LDR d17, [x5, 48]            // vb1x0123
        INS v16.d[1], x8
        FMLA v29.4s, v9.4s, v1.s[3]
        LDR x8, [x5, 56]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v0.s[3]

        # BLOCK 6
        LDR d18, [x5, 64]            // vb1x4567
        INS v17.d[1], x8
        FMLA v27.4s, v10.4s, v1.s[1]
        LDR x8, [x5, 72]
        FMLA v30.4s, v10.4s, v1.s[3]
        FMLA v22.4s, v11.4s, v0.s[1]

        # BLOCK 7
        LDR d19, [x5, 80]            // vb1x89AB
        INS v18.d[1], x8
        FMLA v25.4s, v11.4s, v0.s[3]
        LDR x8, [x5, 88]
        FMLA v28.4s, v11.4s, v1.s[1]
        FMLA v31.4s, v11.4s, v1.s[3]

        # Second group of 24 fma.  No loads here: A (v2/v3) and B (v14-v19)
        # were loaded during the first group; only the final INS remains.

        # BLOCK 0
        INS v19.d[1], x8
        FMLA v20.4s, v14.4s, v2.s[0]
        FMLA v23.4s, v14.4s, v2.s[2]
        FMLA v26.4s, v14.4s, v3.s[0]

        # BLOCK 1
        FMLA v29.4s, v14.4s, v3.s[2]
        FMLA v21.4s, v15.4s, v2.s[0]
        FMLA v24.4s, v15.4s, v2.s[2]

        # BLOCK 2
        FMLA v27.4s, v15.4s, v3.s[0]
        FMLA v30.4s, v15.4s, v3.s[2]
        FMLA v22.4s, v16.4s, v2.s[0]

        # BLOCK 3
        FMLA v25.4s, v16.4s, v2.s[2]
        FMLA v28.4s, v16.4s, v3.s[0]
        FMLA v31.4s, v16.4s, v3.s[2]

        # BLOCK 4
        FMLA v20.4s, v17.4s, v2.s[1]
        FMLA v23.4s, v17.4s, v2.s[3]
        FMLA v26.4s, v17.4s, v3.s[1]

        # BLOCK 5
        FMLA v29.4s, v17.4s, v3.s[3]
        FMLA v21.4s, v18.4s, v2.s[1]
        FMLA v24.4s, v18.4s, v2.s[3]

        # BLOCK 6
        FMLA v27.4s, v18.4s, v3.s[1]
        FMLA v30.4s, v18.4s, v3.s[3]
        FMLA v22.4s, v19.4s, v2.s[1]
        TST x0, 15                   // any of the low 4 bits of k left over?

        # BLOCK 7
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v28.4s, v19.4s, v3.s[1]
        ADD x5, x5, 96               // 96 bytes of B were read in the epilogue
        FMLA v31.4s, v19.4s, v3.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 32  // ks -= MR * sizeof(void*)
        B.NE 1b

        # Clamp
        FMIN v20.4s, v20.4s, v4.4s
        // Flags from this SUBS are consumed by B.LO 8f and B.HI 0b below;
        // nothing in between modifies them.
        SUBS x1, x1, 12
        FMIN v21.4s, v21.4s, v4.4s
        FMIN v22.4s, v22.4s, v4.4s
        FMIN v23.4s, v23.4s, v4.4s
        FMIN v24.4s, v24.4s, v4.4s
        FMIN v25.4s, v25.4s, v4.4s
        FMIN v26.4s, v26.4s, v4.4s
        FMIN v27.4s, v27.4s, v4.4s
        FMIN v28.4s, v28.4s, v4.4s
        FMIN v29.4s, v29.4s, v4.4s
        FMIN v30.4s, v30.4s, v4.4s
        FMIN v31.4s, v31.4s, v4.4s
        FMAX v20.4s, v20.4s, v5.4s
        FMAX v21.4s, v21.4s, v5.4s
        FMAX v22.4s, v22.4s, v5.4s
        FMAX v23.4s, v23.4s, v5.4s
        FMAX v24.4s, v24.4s, v5.4s
        FMAX v25.4s, v25.4s, v5.4s
        FMAX v26.4s, v26.4s, v5.4s
        FMAX v27.4s, v27.4s, v5.4s
        FMAX v28.4s, v28.4s, v5.4s
        FMAX v29.4s, v29.4s, v5.4s
        FMAX v30.4s, v30.4s, v5.4s
        FMAX v31.4s, v31.4s, v5.4s

        # Store full 4 x 12 (fewer than 12 columns left: partial store at 8)
        B.LO 8f

        ST1 {v29.16b, v30.16b, v31.16b},  [x7], x10
        ST1 {v26.16b, v27.16b, v28.16b}, [x18], x10
        ST1 {v23.16b, v24.16b, v25.16b}, [x17], x10
        ST1 {v20.16b, v21.16b, v22.16b},  [x6], x10
        SUB x4, x4, x3  // a -= ks

        # nc loop
        B.HI 0b

        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Here each row of A has its own register: v0 = a0 ... v3 = a3.
        LDR d0, [x13], 8  // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR d1, [x14], 8  // a1
        LDR d2, [x15], 8  // a2
        LDR d3, [x16], 8  // a3
        LD1 {v9.16b, v10.16b, v11.16b}, [x5], 48

        # First block of 3 B
        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]

        # Second block of 3 B
        FMLA v20.4s, v9.4s, v0.s[1]
        FMLA v23.4s, v9.4s, v1.s[1]
        FMLA v26.4s, v9.4s, v2.s[1]
        FMLA v29.4s, v9.4s, v3.s[1]
        FMLA v21.4s, v10.4s, v0.s[1]
        FMLA v24.4s, v10.4s, v1.s[1]
        FMLA v27.4s, v10.4s, v2.s[1]
        FMLA v30.4s, v10.4s, v3.s[1]
        FMLA v22.4s, v11.4s, v0.s[1]
        FMLA v25.4s, v11.4s, v1.s[1]
        FMLA v28.4s, v11.4s, v2.s[1]
        FMLA v31.4s, v11.4s, v3.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b
6:
        # Remainder- 1 float of A (4 bytes)
        LDR s0, [x13], 4  // a0
        LD1 {v6.16b, v7.16b, v8.16b}, [x5], 48
        LDR s1, [x14], 4  // a1
        LDR s2, [x15], 4  // a2
        LDR s3, [x16], 4  // a3

        FMLA v20.4s, v6.4s, v0.s[0]
        FMLA v23.4s, v6.4s, v1.s[0]
        FMLA v26.4s, v6.4s, v2.s[0]
        FMLA v29.4s, v6.4s, v3.s[0]
        FMLA v21.4s, v7.4s, v0.s[0]
        FMLA v24.4s, v7.4s, v1.s[0]
        FMLA v27.4s, v7.4s, v2.s[0]
        FMLA v30.4s, v7.4s, v3.s[0]
        FMLA v22.4s, v8.4s, v0.s[0]
        FMLA v25.4s, v8.4s, v1.s[0]
        FMLA v28.4s, v8.4s, v2.s[0]
        FMLA v31.4s, v8.4s, v3.s[0]
        B 4b

8:
        ADD x1, x1, 12               // undo the SUBS: x1 = remaining nc (< 12)
        # Store odd channels
        # After each partial store the next unstored columns of every row are
        # moved down into the first accumulator of that row (v29/v26/v23/v20).
        TBZ x1, 3, 9f
        STP q29, q30, [x7], 32
        MOV v29.16b, v31.16b
        STP q26, q27, [x18], 32
        MOV v26.16b, v28.16b
        STP q23, q24, [x17], 32
        MOV v23.16b, v25.16b
        STP q20, q21, [x6], 32
        MOV v20.16b, v22.16b

9:
        TBZ x1, 2, 10f
        STR q29, [x7], 16
        MOV v29.16b, v30.16b
        STR q26, [x18], 16
        MOV v26.16b, v27.16b
        STR q23, [x17], 16
        MOV v23.16b, v24.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

10:
        TBZ x1, 1, 11f
        STR d29, [x7], 8
        DUP d29, v29.d[1]
        STR d26, [x18], 8
        DUP d26, v26.d[1]
        STR d23, [x17], 8
        DUP d23, v23.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

11:
        TBZ x1, 0, 12f
        STR s29, [x7]
        STR s26, [x18]
        STR s23, [x17]
        STR s20, [x6]
12:
        # Restore d8-d11,d14,d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 48
        RET

END_FUNCTION xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif