// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8

# Produces a tile of up to 5 rows by 8 columns of floats per nc iteration:
#   - accumulators v20-v29 start from the 8 floats read from w (bias)
#   - every ks step reads 5 A row pointers from a; a pointer equal to zero is
#     used as is, any other pointer is advanced by a_offset
#   - each row is multiplied against B, read sequentially from w, over kc bytes
#   - the sums are clamped with the two floats stored at params and written
#     through c0-c4
# Rows at index mr and above reuse the C pointer of the row before them.

# 5x8 strips the following out of the 6x8 kernel
#  x23 a5
#  x7 c5 x13 unused
#  A5  v10 v11
#  C   v30 v31
# NOTE(review): the "x7 c5 x13 unused" entry does not match this file, where
# x13 holds c3 and x7 holds c4 (see "C pointers" below) - confirm against the
# 6x8 variant.

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.  x18 is reserved for OS.

# Stack frame (64 bytes, allocated by the first STP):
#   [sp,  0] d8,  d9
#   [sp, 16] d12, d13
#   [sp, 32] d14, d15
#   [sp, 48] x20, x21
# The stack arguments of the caller therefore start at [sp, 64].
# v10 and v11 are never referenced, so d10 and d11 are not saved.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
#  x8 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Clamp C pointers / Save d8-d9, d12-d15 on stack
        STP     d8, d9, [sp, -64]!
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        STP     d12, d13, [sp, 16]
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        STP     d14, d15, [sp, 32]
        CMP     x0, 4                   // if mr < 4
        ADD     x13, x17, x7            // c3 = c2 + cm_stride
        CSEL    x13, x17, x13, LO       //   c3 = c2

        # Load zero, clamping params pointer
        LDP     x12, x8, [sp, 80]
        ADD     x7, x13, x7             // c4 = c3 + cm_stride
                                        // if mr <= 4 (flags still from CMP x0, 4)
        CSEL    x7, x13, x7, LS         //   c4 = c3

        # Save x20,x21 on stack
        STP     x20, x21, [sp, 48]

        # Load clamp values: the first float at params is broadcast to v30
        # (upper bound, used by FMIN) and the second to v31 (lower bound, used
        # by FMAX).  x8 is free afterwards and is reused as the a4 pointer.
        LD2R    {v30.4s, v31.4s}, [x8]

        # Load cn_stride, a_offset
        LDP     x10, x11, [sp, 64]

0:
        # Load initial bias from w into accumulators
        LDP     q20, q21, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 0]    // Prefetch B
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 64]
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 192]

        MOV     x9, x3                  // p = ks

1:
        # Load next 5 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDR     x8, [x4], 8

        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x8, x12                 // if a4 == zero
        ADD     x8, x8, x11             // a4 += a_offset
        CSEL    x8, x12, x8, EQ         //   a4 = zero, else a4 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # mr has been consumed above, so x0 is reused as the k byte counter.
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    5f

        # Prologue - loads for main loop of 80 FMA
        LDR     q0, [x14], 16
        LDR     q2, [x15], 16
        LDR     q4, [x20], 16
        LDR     q6, [x21], 16
        LDR     q8, [x8], 16
        LDP     q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
2:
        # First group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]
        LDR     q1, [x14], 16           // Load next 5 A

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v2.s[1]
        FMLA    v24.4s, v14.4s, v4.s[1]
        LDR     q3, [x15], 16
        FMLA    v26.4s, v14.4s, v6.s[1]
        FMLA    v28.4s, v14.4s, v8.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        LDR     q5, [x20], 16
        FMLA    v23.4s, v15.4s, v2.s[1]
        FMLA    v25.4s, v15.4s, v4.s[1]
        FMLA    v27.4s, v15.4s, v6.s[1]
        LDR     q7, [x21], 16
        FMLA    v29.4s, v15.4s, v8.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v2.s[2]
        LDR     q9, [x8], 16
        FMLA    v24.4s, v16.4s, v4.s[2]
        FMLA    v26.4s, v16.4s, v6.s[2]
        FMLA    v28.4s, v16.4s, v8.s[2]
        LDP     q12, q13, [x5], 32      // Load 4 B
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v2.s[2]
        FMLA    v25.4s, v17.4s, v4.s[2]
        FMLA    v27.4s, v17.4s, v6.s[2]
        FMLA    v29.4s, v17.4s, v8.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v2.s[3]
        FMLA    v24.4s, v18.4s, v4.s[3]
        FMLA    v26.4s, v18.4s, v6.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v28.4s, v18.4s, v8.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v25.4s, v19.4s, v4.s[3]
        FMLA    v27.4s, v19.4s, v6.s[3]
        FMLA    v29.4s, v19.4s, v8.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s, v1.s[0]
        FMLA    v22.4s, v12.4s, v3.s[0]
        FMLA    v24.4s, v12.4s, v5.s[0]
        LDR     q0, [x14], 16           // Load next 5 A
        FMLA    v26.4s, v12.4s, v7.s[0]
        FMLA    v28.4s, v12.4s, v9.s[0]
        FMLA    v21.4s, v13.4s, v1.s[0]
        LDR     q2, [x15], 16
        FMLA    v23.4s, v13.4s, v3.s[0]
        FMLA    v25.4s, v13.4s, v5.s[0]
        FMLA    v27.4s, v13.4s, v7.s[0]
        LDR     q4, [x20], 16
        FMLA    v29.4s, v13.4s, v9.s[0]

        FMLA    v20.4s, v14.4s, v1.s[1]
        FMLA    v22.4s, v14.4s, v3.s[1]
        LDR     q6, [x21], 16
        FMLA    v24.4s, v14.4s, v5.s[1]
        FMLA    v26.4s, v14.4s, v7.s[1]
        FMLA    v28.4s, v14.4s, v9.s[1]
        LDR     q8, [x8], 16
        FMLA    v21.4s, v15.4s, v1.s[1]
        FMLA    v23.4s, v15.4s, v3.s[1]
        FMLA    v25.4s, v15.4s, v5.s[1]
        LDP     q12, q13, [x5], 32      // Load next 3 B (not last)
        FMLA    v27.4s, v15.4s, v7.s[1]
        FMLA    v29.4s, v15.4s, v9.s[1]

        FMLA    v20.4s, v16.4s, v1.s[2]
        FMLA    v22.4s, v16.4s, v3.s[2]
        FMLA    v24.4s, v16.4s, v5.s[2]
        FMLA    v26.4s, v16.4s, v7.s[2]
        FMLA    v28.4s, v16.4s, v9.s[2]
        FMLA    v21.4s, v17.4s, v1.s[2]
        FMLA    v23.4s, v17.4s, v3.s[2]
        LDP     q14, q15, [x5], 32
        FMLA    v25.4s, v17.4s, v5.s[2]
        FMLA    v27.4s, v17.4s, v7.s[2]
        FMLA    v29.4s, v17.4s, v9.s[2]
        LDP     q16, q17, [x5], 32

        FMLA    v20.4s, v18.4s, v1.s[3]
        FMLA    v22.4s, v18.4s, v3.s[3]
        SUBS    x0, x0, 32
        FMLA    v24.4s, v18.4s, v5.s[3]
        FMLA    v26.4s, v18.4s, v7.s[3]
        FMLA    v28.4s, v18.4s, v9.s[3]
        FMLA    v21.4s, v19.4s, v1.s[3]
        FMLA    v23.4s, v19.4s, v3.s[3]
        FMLA    v25.4s, v19.4s, v5.s[3]
        FMLA    v27.4s, v19.4s, v7.s[3]
        FMLA    v29.4s, v19.4s, v9.s[3]
        B.HS    2b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s, v0.s[0]
        LDP     q18, q19, [x5], 32      // Load last B
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        $if PREFETCH:
          PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]
        LDR     q1, [x14], 16           // Load next 5 A

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v2.s[1]
        FMLA    v24.4s, v14.4s, v4.s[1]
        LDR     q3, [x15], 16
        FMLA    v26.4s, v14.4s, v6.s[1]
        FMLA    v28.4s, v14.4s, v8.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        LDR     q5, [x20], 16
        FMLA    v23.4s, v15.4s, v2.s[1]
        FMLA    v25.4s, v15.4s, v4.s[1]
        FMLA    v27.4s, v15.4s, v6.s[1]
        LDR     q7, [x21], 16
        FMLA    v29.4s, v15.4s, v8.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v2.s[2]
        LDR     q9, [x8], 16
        FMLA    v24.4s, v16.4s, v4.s[2]
        FMLA    v26.4s, v16.4s, v6.s[2]
        FMLA    v28.4s, v16.4s, v8.s[2]
        LDP     q12, q13, [x5], 32      // Load 4 B
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v2.s[2]
        FMLA    v25.4s, v17.4s, v4.s[2]
        FMLA    v27.4s, v17.4s, v6.s[2]
        FMLA    v29.4s, v17.4s, v8.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v2.s[3]
        FMLA    v24.4s, v18.4s, v4.s[3]
        FMLA    v26.4s, v18.4s, v6.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v28.4s, v18.4s, v8.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v2.s[3]
        LDP     q16, q17, [x5], 32
        FMLA    v25.4s, v19.4s, v4.s[3]
        FMLA    v27.4s, v19.4s, v6.s[3]
        FMLA    v29.4s, v19.4s, v8.s[3]
        LDP     q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA    v20.4s, v12.4s, v1.s[0]
        FMLA    v22.4s, v12.4s, v3.s[0]
        FMLA    v24.4s, v12.4s, v5.s[0]
        FMLA    v26.4s, v12.4s, v7.s[0]
        FMLA    v28.4s, v12.4s, v9.s[0]
        FMLA    v21.4s, v13.4s, v1.s[0]
        FMLA    v23.4s, v13.4s, v3.s[0]
        FMLA    v25.4s, v13.4s, v5.s[0]
        FMLA    v27.4s, v13.4s, v7.s[0]
        FMLA    v29.4s, v13.4s, v9.s[0]

        FMLA    v20.4s, v14.4s, v1.s[1]
        FMLA    v22.4s, v14.4s, v3.s[1]
        FMLA    v24.4s, v14.4s, v5.s[1]
        FMLA    v26.4s, v14.4s, v7.s[1]
        FMLA    v28.4s, v14.4s, v9.s[1]
        FMLA    v21.4s, v15.4s, v1.s[1]
        FMLA    v23.4s, v15.4s, v3.s[1]
        FMLA    v25.4s, v15.4s, v5.s[1]
        FMLA    v27.4s, v15.4s, v7.s[1]
        FMLA    v29.4s, v15.4s, v9.s[1]

        FMLA    v20.4s, v16.4s, v1.s[2]
        FMLA    v22.4s, v16.4s, v3.s[2]
        FMLA    v24.4s, v16.4s, v5.s[2]
        FMLA    v26.4s, v16.4s, v7.s[2]
        FMLA    v28.4s, v16.4s, v9.s[2]
        FMLA    v21.4s, v17.4s, v1.s[2]
        FMLA    v23.4s, v17.4s, v3.s[2]
        FMLA    v25.4s, v17.4s, v5.s[2]
        FMLA    v27.4s, v17.4s, v7.s[2]
        FMLA    v29.4s, v17.4s, v9.s[2]

        FMLA    v20.4s, v18.4s, v1.s[3]
        FMLA    v22.4s, v18.4s, v3.s[3]
        FMLA    v24.4s, v18.4s, v5.s[3]
        FMLA    v26.4s, v18.4s, v7.s[3]
        FMLA    v28.4s, v18.4s, v9.s[3]
        FMLA    v21.4s, v19.4s, v1.s[3]
        FMLA    v23.4s, v19.4s, v3.s[3]
        FMLA    v25.4s, v19.4s, v5.s[3]
        FMLA    v27.4s, v19.4s, v7.s[3]
        FMLA    v29.4s, v19.4s, v9.s[3]

        # Is there a remainder?- 4 floats of A (16 bytes) or less
        # x0 is kc minus a multiple of 32 here, so its low 5 bits equal kc & 31.
        TST     x0, 31
        B.NE    5f

4:
        # ks loop
        SUBS    x9, x9, 40              // ks -= MR * sizeof(void*)
        // HI instead of NE: identical when ks is a non-zero multiple of 40,
        // and the loop also exits when the subtraction reaches or crosses zero.
        B.HI    1b

        # Clamp
        FMIN    v20.4s, v20.4s, v30.4s
        FMIN    v21.4s, v21.4s, v30.4s
        FMIN    v22.4s, v22.4s, v30.4s
        FMIN    v23.4s, v23.4s, v30.4s
        FMIN    v24.4s, v24.4s, v30.4s
        FMIN    v25.4s, v25.4s, v30.4s
        FMIN    v26.4s, v26.4s, v30.4s
        FMIN    v27.4s, v27.4s, v30.4s
        FMIN    v28.4s, v28.4s, v30.4s
        FMIN    v29.4s, v29.4s, v30.4s
        FMAX    v20.4s, v20.4s, v31.4s
        FMAX    v21.4s, v21.4s, v31.4s
        FMAX    v22.4s, v22.4s, v31.4s
        FMAX    v23.4s, v23.4s, v31.4s
        FMAX    v24.4s, v24.4s, v31.4s
        FMAX    v25.4s, v25.4s, v31.4s
        FMAX    v26.4s, v26.4s, v31.4s
        FMAX    v27.4s, v27.4s, v31.4s
        FMAX    v28.4s, v28.4s, v31.4s
        FMAX    v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        SUBS    x1, x1, 8
        B.LO    8f

        STP     q28, q29, [x7]
        ADD     x7, x7, x10
        STP     q26, q27, [x13]
        ADD     x13, x13, x10
        STP     q24, q25, [x17]
        ADD     x17, x17, x10
        STP     q22, q23, [x16]
        ADD     x16, x16, x10
        STP     q20, q21, [x6]
        ADD     x6, x6, x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        # Flags are still those of SUBS x1 above; STP, ADD and SUB leave them
        # untouched.
        B.HI    0b

        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 48]

        # Restore d8-d9, d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0, [x14], 16
        LDR     q2, [x15], 16
        LDR     q4, [x20], 16
        LDR     q6, [x21], 16
        LDR     q8, [x8], 16
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32
        LDP     q18, q19, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v2.s[1]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v6.s[1]
        FMLA    v28.4s, v14.4s, v8.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v2.s[1]
        FMLA    v25.4s, v15.4s, v4.s[1]
        FMLA    v27.4s, v15.4s, v6.s[1]
        FMLA    v29.4s, v15.4s, v8.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v2.s[2]
        FMLA    v24.4s, v16.4s, v4.s[2]
        FMLA    v26.4s, v16.4s, v6.s[2]
        FMLA    v28.4s, v16.4s, v8.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v2.s[2]
        FMLA    v25.4s, v17.4s, v4.s[2]
        FMLA    v27.4s, v17.4s, v6.s[2]
        FMLA    v29.4s, v17.4s, v8.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v2.s[3]
        FMLA    v24.4s, v18.4s, v4.s[3]
        FMLA    v26.4s, v18.4s, v6.s[3]
        FMLA    v28.4s, v18.4s, v8.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v2.s[3]
        FMLA    v25.4s, v19.4s, v4.s[3]
        FMLA    v27.4s, v19.4s, v6.s[3]
        FMLA    v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ     x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0, [x14], 8
        LDR     d2, [x15], 8
        LDR     d4, [x20], 8
        LDR     d6, [x21], 8
        LDR     d8, [x8], 8
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v2.s[1]
        FMLA    v24.4s, v14.4s, v4.s[1]
        FMLA    v26.4s, v14.4s, v6.s[1]
        FMLA    v28.4s, v14.4s, v8.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v2.s[1]
        FMLA    v25.4s, v15.4s, v4.s[1]
        FMLA    v27.4s, v15.4s, v6.s[1]
        FMLA    v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ     x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0, [x14], 4
        LDR     s2, [x15], 4
        LDR     s4, [x20], 4
        LDR     s6, [x21], 4
        LDR     s8, [x8], 4
        # Load B
        LDP     q12, q13, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v2.s[0]
        FMLA    v24.4s, v12.4s, v4.s[0]
        FMLA    v26.4s, v12.4s, v6.s[0]
        FMLA    v28.4s, v12.4s, v8.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v2.s[0]
        FMLA    v25.4s, v13.4s, v4.s[0]
        FMLA    v27.4s, v13.4s, v6.s[0]
        FMLA    v29.4s, v13.4s, v8.s[0]
        B       4b

        # Store odd width
        # x1 holds the remaining column count minus 8 (negative here), so its
        # low 3 bits equal the number of columns left to store.
8:
        TBZ     x1, 2, 9f
        STR     q28, [x7], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x13], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d28, [x7], 8
        DUP     d28, v28.d[1]
        STR     d26, [x13], 8
        DUP     d26, v26.d[1]
        STR     d24, [x17], 8
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        DUP     d22, v22.d[1]
        STR     d20, [x6], 8
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s28, [x7]
        STR     s26, [x13]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
11:
        # Restore x20,x21 from stack
        LDP     x20, x21, [sp, 48]

        # Restore d8-d9, d12-d15 from stack
        LDP     d14, d15, [sp, 32]
        LDP     d12, d13, [sp, 16]
        LDP     d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif