// Auto-generated file. Do not edit!
//   Template: src/f32-igemm/5x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> (x8)

// Overview (derived from the code below):
//   - v20-v29 accumulate a tile of 5 rows x 8 float32 columns. Every row
//     starts from the same 32 bytes read at w (the bias).
//   - Each pass of the ks loop reads 5 A row pointers from a, then consumes
//     kc bytes from every row together with the B data that follows in w.
//   - The tile is clamped with FMAX against v30 and FMIN against v31, then
//     stored through c0..c4. A full 8-column store loops back for the next
//     column block; a partial store (nc & 7 columns) returns.
//   - The stack offsets in the signature above are relative to sp on entry.
//     The kernel first pushes a 64-byte frame, so the same arguments are
//     read at [sp, 64] .. [sp, 88].
//   - x0 carries mr only until the C pointers are set up; afterwards it is
//     the remaining-k byte counter. x8 carries the params pointer only until
//     the clamp values are loaded; afterwards it is the a4 row pointer.

# 5x8 strips the following out of 5x8
#  x23 a5
#  x7 c5 x13 unused
#  A5  v10 v11
#  C   v30 v31
// NOTE(review): the list above names sixth-row resources (a5, c5, A5), so
// "out of 5x8" presumably should read "out of 6x8" - confirm against the
// template. In this kernel x13 is in use as c3 and x7 as c4 (see the C
// pointer table below), and v30/v31 hold the clamp values.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
#  x8 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75

        # Clamp C pointers / Save d8-d9 and d12-d15 on stack
        // d10/d11 are not saved: v10/v11 are never written by this kernel.
        // The pre-indexed store allocates the whole 64-byte frame.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x13, x17, x13, LO   //   c3 = c2

        # Load zero, params pointer
        // Entry offsets [sp + 16] and [sp + 24], shifted by the 64-byte frame.
        LDP x12, x8, [sp, 80]
        ADD x7, x13, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Save x20,x21 on stack
        STP x20, x21, [sp, 48]

        # Load clamp values
        // Two consecutive 32-bit values at params, each replicated to all
        // four lanes: first into v30 (FMAX operand), second into v31 (FMIN
        // operand).
        LD2R {v30.4s, v31.4s}, [x8]

        # Load cn_stride, a_offset
        // Entry offsets [sp] and [sp + 8], shifted by the 64-byte frame.
        LDP x10, x11, [sp, 64]

        // nc loop entry: one pass per block of 8 output columns.
0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b

        MOV x9, x3               // p = ks

        // ks loop entry: one pass per group of 5 A row pointers.
1:
        # Load next 5 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDR x8, [x4], 8

        // A row pointer equal to zero is used as-is; any other pointer gets
        // a_offset added. ADD does not set flags, so each CSEL still sees
        // the result of the CMP two instructions earlier.
        CMP x14, x12             // if a0 == zero
        ADD x14, x14, x11        // a0 += a_offset
        CSEL x14, x12, x14, EQ   //   a0 = zero, else += a0 + a_offset
        CMP x15, x12             // if a1 == zero
        ADD x15, x15, x11        // a1 += a_offset
        CSEL x15, x12, x15, EQ   //   a1 = zero, else += a1 + a_offset
        CMP x20, x12             // if a2 == zero
        ADD x20, x20, x11        // a2 += a_offset
        CSEL x20, x12, x20, EQ   //   a2 = zero, else += a2 + a_offset
        CMP x21, x12             // if a3 == zero
        ADD x21, x21, x11        // a3 += a_offset
        CSEL x21, x12, x21, EQ   //   a3 = zero, else += a3 + a_offset
        CMP x8, x12              // if a4 == zero
        ADD x8, x8, x11          // a4 += a_offset
        CSEL x8, x12, x8, EQ     //   a4 = zero, else += a4 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 5f                  // kc < 32 bytes: remainder code only

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x14], 16
        LDR q2, [x15], 16
        LDR q4, [x20], 16
        LDR q6, [x21], 16
        LDR q8, [x8], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f                  // only one 32-byte block left: epilogue

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
        // Each FMLA multiplies one B vector by one lane of an A register.
        // Loads for upcoming steps are placed between the FMLAs, always after
        // the last read of the register they overwrite.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x14], 16        // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x15], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x20], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x21], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x8], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x14], 16        // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x15], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x20], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x21], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x8], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        LDP q14, q15, [x5], 32
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32          // k -= 32; flags are consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 2b                  // repeat while the subtraction did not borrow

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x14], 16        // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x15], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x20], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x21], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x8], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        # Is there a remainder? - up to 7 floats of A (4 + 2 + 1)
        // x0 only ever had multiples of 32 subtracted from kc, so its low
        // five bits still equal those of kc.
        TST x0, 31
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 40          // ks -= MR * sizeof(void*)
        B.HI 1b

        # Clamp
        // FMAX against v30 first, then FMIN against v31.
        FMAX v20.4s, v20.4s, v30.4s
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        SUBS x1, x1, 8           // nc -= 8
        B.LO 8f                  // fewer than 8 columns left: partial store

        // Rows are written from c4 down to c0; every C pointer then advances
        // by cn_stride.
        STP q28, q29, [x7]
        ADD x7, x7, x10
        STP q26, q27, [x13]
        ADD x13, x13, x10
        STP q24, q25, [x17]
        ADD x17, x17, x10
        STP q22, q23, [x16]
        ADD x16, x16, x10
        STP q20, q21, [x6]
        ADD x6, x6, x10

        SUB x4, x4, x3           // a -= ks

        # nc loop
        // STP/ADD/SUB above leave the flags alone, so this still tests the
        // result of SUBS x1, x1, 8.
        B.HI 0b

        # Restore x20,x21 from stack
        LDP x20, x21, [sp, 48]

        # Restore d8-d9 and d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 64     // post-index releases the 64-byte frame
        RET

        // Remainder handling for the last kc & 31 bytes of each A row.
        // Bits 4, 3 and 2 of x0 select tail blocks of 4, 2 and 1 floats.
5:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x14], 16
        LDR q2, [x15], 16
        LDR q4, [x20], 16
        LDR q6, [x21], 16
        LDR q8, [x8], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x14], 8
        LDR d2, [x15], 8
        LDR d4, [x20], 8
        LDR d6, [x21], 8
        LDR d8, [x8], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b            // no 1-float tail: back to the ks loop

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x14], 4
        LDR s2, [x15], 4
        LDR s4, [x20], 4
        LDR s6, [x21], 4
        LDR s8, [x8], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 4b

        # Store odd width
        // x1 is nc - 8 here; subtracting 8 leaves bits 2..0 unchanged, so the
        // bit tests below select 4, 2 and 1 remaining columns. After each
        // partial store the not-yet-stored lanes are moved down into the
        // low part of the even-numbered accumulator for the next step.
8:
        TBZ x1, 2, 9f
        STR q28, [x7], 16
        MOV v28.16b, v29.16b
        STR q26, [x13], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d28, [x7], 8
        STR d26, [x13], 8
        DUP d28, v28.d[1]
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        STR d22, [x16], 8
        DUP d24, v24.d[1]
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s28, [x7]
        STR s26, [x13]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
11:
        # Restore x20,x21 from stack
        LDP x20, x21, [sp, 48]

        # Restore d8-d9 and d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 64     // post-index releases the 64-byte frame
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_5x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif