// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# Summary (from the code below): for every group of 8 output columns the
# kernel seeds a 5x8 accumulator tile from the first 8 floats at w, adds
# A[row][k] * (8 floats of w) for each k over kc bytes of A, clamps with
# FMAX against v30 and FMIN against v31, and stores one row segment of
# 8 floats (fewer for the final partial group) through each C pointer.
#
# kc, a_stride, cm_stride and cn_stride are byte counts: they are added to or
# subtracted from pointers directly.
#
# Rows at index >= mr reuse the pointers of the previous row, so a short tile
# recomputes and rewrites the last valid row.
#
# x0 holds mr only during pointer setup; afterwards it is the k counter.

# Roles of a six-row (6x8) register layout that this five-row kernel does not
# need (presumably carried over from the 6x8 kernel - confirm in the template):
#  x4 a5        (x4 holds a_stride here)
#  x7 c5        (x7 holds cm_stride, then c4)
# A5  v10 v11   (never referenced, hence d10/d11 are not saved)
# C   v30 v31   (v30/v31 hold the clamp values here)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75

        # Load cn_stride, params pointer
        # (read before sp is moved by the pre-indexed STP below)
        LDP x14, x8, [sp]

        # Clamp A and C pointers / Save d8-d9, d12-d15 on stack
        # (48 bytes, so sp stays 16-byte aligned)
        STP d8, d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (x7 stops being cm_stride)
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values
        # v30 = first float at params in all lanes  (operand of FMAX below)
        # v31 = second float at params in all lanes (operand of FMIN below)
        LD2R {v30.4s, v31.4s}, [x8]

        # Outer loop: one pass per group of 8 output columns
0:
        # Load initial bias from w into accumulators
        # (v20/v21 are copied to the other four rows; prefetches interleaved)
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x3]        // Prefetch A
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x9]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x10]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes) per row
        # 80 FMA + 10 LDR A + 8 LDP B
        # Loads are interleaved with the FMAs; each register is reloaded only
        # after its last use in the current group, so keep the order as is.
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32  // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16  // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32  // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16  // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32  // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32          // k -= 32; flags are consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes) per row
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32  // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16  // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32  // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        TST x0, 31               // any k remainder? flags are consumed by B.NE below

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO and B.HI below
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8, or go to the partial store when nc was below 8
        B.LO 7f

        # Each C pointer advances by cn_stride; each A pointer is rewound by
        # kc so the next column group reads the same rows of A again.
        STP q20, q21, [x6]
        ADD x6, x6, x14
        SUB x3, x3, x2           // a0 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x14
        SUB x9, x9, x2           // a1 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x14
        SUB x10, x10, x2         // a2 -= kc
        STP q26, q27, [x13]
        ADD x13, x13, x14
        SUB x11, x11, x2         // a3 -= kc
        STP q28, q29, [x7]
        ADD x7, x7, x14
        SUB x12, x12, x2         // a4 -= kc

        B.HI 0b                  // more columns left (flags from SUBS x1 above)

        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # K remainder: fewer than 8 floats of A per row are left.
        # Reached when kc is below 32 bytes, or from the epilogue when the
        # low five bits of the k counter are not all zero.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width
        # x1 is nc - 8 here, so its low three bits equal those of the
        # remaining column count: bit 2 = 4 columns, bit 1 = 2, bit 0 = 1.
        # After each partial store the unstored lanes are moved down.
7:
        TBZ x1, 2, 8f
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
8:
        TBZ x1, 1, 9f
        STR d20, [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x13], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x7], 8
        DUP d28, v28.d[1]

9:
        TBZ x1, 0, 10f
        STR s20, [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x13]
        STR s28, [x7]
10:
        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm_minmax_ukernel_5x8__aarch64_neonfma_prfm_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif