// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# Summary (derived from the code below):
#   Each pass over label 0 produces one tile of 5 rows x 8 floats.
#   The tile accumulators v20-v29 are loaded from acc (x15, post-incremented
#   by 160 bytes per tile), updated with FMLA using A values from a0..a4 and
#   B values streamed from w (x5, 32 bytes per k step), clamped with
#   FMAX against v30 and FMIN against v31, then stored to c0..c4.
#   kc is consumed as a byte count (32 bytes = 8 floats per main-loop pass).
#   x0 is reused as the k counter once mr has been consumed by the pointer
#   clamping at function entry.

# unused compared to 5x8
#  x4 a5
#  x7 c5
# A5  v10 v11
# C   v30 v31
# NOTE(review): the table above names a5/c5 and looks carried over from a
# variant of the template with a sixth row. In this kernel x4 holds a_stride,
# x7 becomes c4, and v30/v31 hold the clamp bounds. Only v10/v11 are never
# referenced, which is why d10/d11 are not spilled.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75

        # Load cn_stride, acc
        LDP x14, x15, [sp]
        # Load params pointer
        LDR x8, [sp, 16]

        # Clamp A and C pointers / Save d8-d15 on stack
        # Only d8, d9, d12-d15 are spilled (48 bytes). The three STP are
        # interleaved with the pointer setup. Rows at index >= mr alias the
        # previous row so that loads and stores stay inside valid memory.
        STP d8, d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (x7 stops being cm_stride here)
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values
        # LD2R broadcasts the first float at params into v30 and the second
        # into v31. v30 is the FMAX operand (lower bound), v31 the FMIN
        # operand (upper bound).
        LD2R {v30.4s, v31.4s}, [x8]

0:
        # Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x3]     // Prefetch A
        PRFM PLDL1KEEP, [x9]
        PRFM PLDL1KEEP, [x10]
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f                  // kc < 32: remainder code only

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A (5 rows x 2 q registers) + 8 LDP B
        # Loads are interleaved with the FMLA stream. Each load targets a
        # register whose last use in the current group has already issued.
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16         // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16         // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32          // k -= 32; flags are consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16         // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        TST x0, 31               // any k remainder (1..7 floats)? consumed by B.NE below

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO and B.HI below
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        B.LO 7f                  // fewer than 8 columns left: partial store

        # Rows are stored from c4 down to c0. Each C pointer advances by
        # cn_stride and each A pointer is rewound by kc for the next tile.
        SUB x3, x3, x2           // a0 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14
        SUB x9, x9, x2           // a1 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14
        SUB x10, x10, x2         // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x11, x11, x2         // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB x12, x12, x2         // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x14

        B.HI 0b                  // columns remain: next tile

        # Restore d8, d9, d12-d15 from stack (d10/d11 were never modified)
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # Remainder of k. Reached when kc < 32 bytes (B.LO after label 0) or
        # when the low 5 bits of the k counter are nonzero (B.NE in the
        # epilogue). Bits 4, 3 and 2 of x0 select the 4, 2 and 1 float steps.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width
        # x1 holds nc - 8 here; its low three bits equal those of the
        # remaining column count. Bits 2, 1 and 0 select the 4, 2 and 1
        # float stores. After each partial store the not-yet-written lanes
        # are moved down into the low part of the row register.
7:
        TBZ x1, 2, 8f
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d28, [x7], 8
        DUP d28, v28.d[1]
        STR d26, [x13], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s28, [x7]
        STR s26, [x13]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore d8, d9, d12-d15 from stack (d10/d11 were never modified)
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif