// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

// Overview (derived from the code below):
//   For every group of 8 output columns the kernel loads 8 floats from w
//   into the accumulators of all 5 rows, accumulates A * B with FMLA over
//   kc bytes of each A row, clamps with FMIN/FMAX and stores up to 5 x 8
//   floats to c.
//   - kc is a byte count (it is compared against 32 bytes = 8 floats and
//     subtracted from the A pointers to rewind them).  It is presumably a
//     multiple of 4 bytes; bits 0-1 are never consumed.
//   - a_stride, cm_stride and cn_stride are added directly to byte pointers.
//   - w (x5) only ever advances: 32 bytes of initial accumulator values
//     followed by 32 bytes of B per float of kc, for each column group.
//   - Rows at index >= mr reuse the pointers of the previous row, so they
//     recompute and store the same values to the same addresses.

// Row-5 roles (a5 / c5), presumably inherited from a 6-row variant of this
// template; they do not apply to this 5-row kernel:
#  x4 a5
#  x7 c5
#  A5  v10 v11
#  C   v30 v31
// In this kernel x4 is only read as a_stride during pointer setup, x7 ends
// up holding c4, v30/v31 hold the clamp bounds and v10/v11 are never touched.

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.  x18 is reserved for OS.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x13 c3
# x7  c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57

        # Clamp A and C pointers / Save d8-d15 on stack
        // 48-byte frame: d8,d9 at [sp], d12,d13 at [sp+16], d14,d15 at [sp+32].
        // d10/d11 are not saved because v10/v11 are never written.
        STP d8, d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        # Load params pointer
        // Stack argument at entry [sp + 8]; +48 for the frame pushed above.
        LDR x8, [sp, 56]

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (x7 no longer holds cm_stride)
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values
        // v30 = first float at params (upper bound, applied with FMIN),
        // v31 = second float at params (lower bound, applied with FMAX).
        LD2R {v30.4s, v31.4s}, [x8]

        # Load cn_stride
        // Stack argument at entry [sp]; +48 for the frame pushed above.
        LDR x14, [sp, 48]

// Label 0: start of one group of 8 output columns.
0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # Per iteration: 80 FMA + 10 LDR A + 8 LDP B
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32  // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16  // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32  // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16  // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32  // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32          // flags consumed by B.HS below; FMLA leaves them intact
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # Totals: 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32  // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16  // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32  // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        // Only multiples of 32 were subtracted from x0, so its low 5 bits
        // still equal those of kc: non-zero means a sub-8-float remainder.
        TST x0, 31

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f

        # Clamp
3:
        FMIN v20.4s, v20.4s, v30.4s
        // nc -= 8; the flags are consumed by B.LO / B.HI further down.
        // None of the FMIN/FMAX/STP/ADD/SUB instructions in between set flags.
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMIN v23.4s, v23.4s, v30.4s
        FMIN v24.4s, v24.4s, v30.4s
        FMIN v25.4s, v25.4s, v30.4s
        FMIN v26.4s, v26.4s, v30.4s
        FMIN v27.4s, v27.4s, v30.4s
        FMIN v28.4s, v28.4s, v30.4s
        FMIN v29.4s, v29.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s
        FMAX v23.4s, v23.4s, v31.4s
        FMAX v24.4s, v24.4s, v31.4s
        FMAX v25.4s, v25.4s, v31.4s
        FMAX v26.4s, v26.4s, v31.4s
        FMAX v27.4s, v27.4s, v31.4s
        FMAX v28.4s, v28.4s, v31.4s
        FMAX v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        // Fewer than 8 columns were left: take the partial-width store path.
        B.LO 7f

        STP q20, q21, [x6]
        ADD x6, x6, x14
        SUB x3, x3, x2    // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB x9, x9, x2    // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x10, x10, x2  // a2 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14
        SUB x11, x11, x2  // a3 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14
        SUB x12, x12, x2  // a4 -= kc

        // Columns remain (nc was > 8 before the SUBS): next column group.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # Remainder of kc below 8 floats (bits 4, 3 and 2 of x0)
        // Reached when kc < 32 bytes or when kc is not a multiple of 32 bytes.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width
        // x1 now holds nc - 8 (wrapped); its low 3 bits equal those of the
        // remaining column count, so bits 2, 1 and 0 select 4, 2 and 1 columns.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b     // shift the upper 4 columns down for the next steps
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        DUP d20, v20.d[1]        // move the upper 2 floats into the low half
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d28, [x7], 8
        DUP d28, v28.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x13]
        STR s28, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif