// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> (x0)
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_minmax_params params [sp + 24] -> x8
#
# Indirect GEMM micro-kernel: a 6 row x 8 column tile of C is accumulated in
# v20-v31 from rows of A reached through the pointer array a, and from w,
# which is read sequentially (8 bias floats, then B data).  The result is
# clamped and stored.
#
# x0 is reused: it holds mr on entry (only read while clamping the C
# pointers), then the byte counter k inside the ks loop, then cn_stride
# while storing.  x8 is used as a pointer: two consecutive floats are loaded
# from it and replicated into v6 (lower clamp, FMAX) and v7 (upper clamp, FMIN).

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x10 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
#
# v6/v7 are shared: they hold the second A0/A1 vectors during the main loop
# and epilogue, and are overwritten with the clamp values only after their
# last use as A operands.

BEGIN_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Clamp C pointers / Save d8-d15 on stack
        #
        # Frame layout after the pre-decrement by 96 bytes:
        #   [sp,  0..63]  d8-d15
        #   [sp, 64..95]  x20-x23
        #   [sp, 96]      cn_stride   (was [sp] on entry)
        #   [sp, 104]     a_offset    (was [sp + 8])
        #   [sp, 112]     zero        (was [sp + 16])
        #   [sp, 120]     params      (was [sp + 24])
        #
        # Rows at index >= mr alias the previous row pointer, so their stores
        # land on a valid row instead of past the output.
        STP d8, d9, [sp, -96]!
        CMP x0, 2                // if mr < 2
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x10, x17, x7         // c3 = c2 + cm_stride
        CSEL x10, x17, x10, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x13, x10, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x13, x10, x13, LS   //   c4 = c3

        # Save x20,x21,x22,x23 on stack
        STP x20, x21, [sp, 64]
        STP x22, x23, [sp, 80]

        CMP x0, 6                // if mr < 6
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7 no longer holds cm_stride)
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load a_offset
        LDR x11, [sp, 104]

        # Load zero, params pointer
        LDP x12, x8, [sp, 112]

        # nc loop: one pass per 8-column tile of C
0:
        # Load initial bias from w into accumulators
        # The same 8 bias floats (v20, v21) seed all 6 rows.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 64]
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 192]
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        MOV x9, x3               // p = ks

        # ks loop: one pass per group of 6 A pointers
1:
        # Load next 6 A pointers
        LDP x14, x15, [x4], 16
        LDP x20, x21, [x4], 16
        LDP x22, x23, [x4], 16

        # A pointer equal to zero is used as is; any other pointer gets
        # a_offset added.  The compare reads the pointer before the ADD.
        CMP x14, x12             // if a0 == zero
        ADD x14, x14, x11        // a0 += a_offset
        CSEL x14, x12, x14, EQ   //   a0 = zero, else a0 + a_offset
        CMP x15, x12             // if a1 == zero
        ADD x15, x15, x11        // a1 += a_offset
        CSEL x15, x12, x15, EQ   //   a1 = zero, else a1 + a_offset
        CMP x20, x12             // if a2 == zero
        ADD x20, x20, x11        // a2 += a_offset
        CSEL x20, x12, x20, EQ   //   a2 = zero, else a2 + a_offset
        CMP x21, x12             // if a3 == zero
        ADD x21, x21, x11        // a3 += a_offset
        CSEL x21, x12, x21, EQ   //   a3 = zero, else a3 + a_offset
        CMP x22, x12             // if a4 == zero
        ADD x22, x22, x11        // a4 += a_offset
        CSEL x22, x12, x22, EQ   //   a4 = zero, else a4 + a_offset
        CMP x23, x12             // if a5 == zero
        ADD x23, x23, x11        // a5 += a_offset
        CSEL x23, x12, x23, EQ   //   a5 = zero, else a5 + a_offset

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        # On the branch x0 = kc - 32 has wrapped, but its low 5 bits still
        # equal those of kc, which is all the remainder code at 5: tests.
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 5f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x14], 16
        LDR q1, [x15], 16
        LDR q2, [x20], 16
        LDR q3, [x21], 16
        LDR q4, [x22], 16
        LDR q5, [x23], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 3f

        # Main loop - 8 floats of A (32 bytes)
        # Per iteration: 96 FMA + 12 LDR A + 8 LDP B
        # Loads are interleaved with the FMA chain; each register is reloaded
        # only after the last FMA that reads its previous contents.
2:
        # First group: 4 floats of each A row (v0-v5).  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B (4th pair, deferred)
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR q6, [x14], 16        // Load next 6 A (into v6-v11)
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR q7, [x15], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR q8, [x20], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR q9, [x21], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR q10, [x22], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR q11, [x23], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B (for the second group)
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group: 4 floats of each A row (v6-v11).  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        LDR q0, [x14], 16        // Load next 6 A (into v0-v5, for the next pass)
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x15], 16
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        LDR q2, [x20], 16
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x21], 16

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        LDR q4, [x22], 16
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x23], 16
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        SUBS x0, x0, 32          // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]
        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 2b

        # Epilogue - 8 floats of A (32 bytes)
        # In total: 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group: 4 floats of each A row (v0-v5).  48 FMA.
        FMLA v20.4s, v12.4s,  v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B (4th pair, deferred)
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]

        FMLA v31.4s, v13.4s,  v5.s[0]
        FMLA v20.4s, v14.4s,  v0.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        LDR q6, [x14], 16        // Load next 6 A (into v6-v11)
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]
        LDR q7, [x15], 16

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        LDR q8, [x20], 16
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        LDR q9, [x21], 16
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        LDR q10, [x22], 16
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]
        LDR q11, [x23], 16

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B (for the second group)
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group: 4 floats of each A row (v6-v11).  48 FMA.
        FMLA v20.4s, v12.4s,  v6.s[0]
        FMLA v22.4s, v12.4s,  v7.s[0]
        FMLA v24.4s, v12.4s,  v8.s[0]
        FMLA v26.4s, v12.4s,  v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s,  v6.s[0]
        FMLA v23.4s, v13.4s,  v7.s[0]
        FMLA v25.4s, v13.4s,  v8.s[0]
        FMLA v27.4s, v13.4s,  v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s,  v6.s[1]
        FMLA v22.4s, v14.4s,  v7.s[1]
        FMLA v24.4s, v14.4s,  v8.s[1]
        FMLA v26.4s, v14.4s,  v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s,  v6.s[1]
        FMLA v23.4s, v15.4s,  v7.s[1]
        FMLA v25.4s, v15.4s,  v8.s[1]
        FMLA v27.4s, v15.4s,  v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s,  v6.s[2]
        FMLA v22.4s, v16.4s,  v7.s[2]
        FMLA v24.4s, v16.4s,  v8.s[2]
        FMLA v26.4s, v16.4s,  v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s,  v6.s[2]
        FMLA v23.4s, v17.4s,  v7.s[2]
        FMLA v25.4s, v17.4s,  v8.s[2]
        FMLA v27.4s, v17.4s,  v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s,  v6.s[3]
        FMLA v22.4s, v18.4s,  v7.s[3]
        FMLA v24.4s, v18.4s,  v8.s[3]
        FMLA v26.4s, v18.4s,  v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s,  v6.s[3]
        FMLA v23.4s, v19.4s,  v7.s[3]

        # Load min/max values
        # Safe here: the two FMLA above were the last reads of v6/v7 as A data.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s,  v8.s[3]
        FMLA v27.4s, v19.4s,  v9.s[3]
        # Is there a remainder? - any of the low 5 bits of k set
        # (1 to 7 floats of A, assuming kc is a multiple of sizeof(float))
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 5f

4:
        # ks loop
        SUBS x9, x9, 48          // p -= MR * sizeof(void*)  (6 pointers * 8 bytes)
        B.HI 1b

        # Clamp
        # v6 is the lower bound (FMAX), v7 the upper bound (FMIN).
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 96]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags feed B.LO 8f and B.HI 0b below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns were left: take the partial store path.
        B.LO 8f

        # Rows are written from c5 down to c0; each C pointer then advances
        # by cn_stride.
        STP q30, q31,  [x7]
        ADD x7, x7, x0
        STP q28, q29, [x13]
        ADD x13, x13, x0
        STP q26, q27, [x10]
        ADD x10, x10, x0
        STP q24, q25, [x17]
        ADD x17, x17, x0
        STP q22, q23, [x16]
        ADD x16, x16, x0
        STP q20, q21,  [x6]
        ADD x6, x6, x0

        # Rewind the A pointer array by ks bytes (the ks loop advanced x4 by
        # 48 per pass) so the next column tile reads the same pointers.
        SUB x4, x4, x3           // a -= ks

        # nc loop
        # Flags are still those of SUBS x1, x1, 8: the ADD, SUB and STP
        # instructions above do not write them.
        B.HI 0b

        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 96
        RET

        # Remainder handling: bits 4, 3 and 2 of x0 select 4, 2 and 1 extra
        # floats of A.  Reached with kc < 32 (from the first check) or from
        # the epilogue when k has low bits set.
5:
        # Load min/max values
        # Redundant when arriving from the epilogue, needed when arriving
        # from the kc < 32 check, which skips the epilogue.
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x14], 16
        LDR q1, [x15], 16
        LDR q2, [x20], 16
        LDR q3, [x21], 16
        LDR q4, [x22], 16
        LDR q5, [x23], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x14], 8
        LDR d1, [x15], 8
        LDR d2, [x20], 8
        LDR d3, [x21], 8
        LDR d4, [x22], 8
        LDR d5, [x23], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]

        FMLA v20.4s, v14.4s,  v0.s[1]
        FMLA v22.4s, v14.4s,  v1.s[1]
        FMLA v24.4s, v14.4s,  v2.s[1]
        FMLA v26.4s, v14.4s,  v3.s[1]
        FMLA v28.4s, v14.4s,  v4.s[1]
        FMLA v30.4s, v14.4s,  v5.s[1]
        FMLA v21.4s, v15.4s,  v0.s[1]
        FMLA v23.4s, v15.4s,  v1.s[1]
        FMLA v25.4s, v15.4s,  v2.s[1]
        FMLA v27.4s, v15.4s,  v3.s[1]
        FMLA v29.4s, v15.4s,  v4.s[1]
        FMLA v31.4s, v15.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x14], 4
        LDR s1, [x15], 4
        LDR s2, [x20], 4
        LDR s3, [x21], 4
        LDR s4, [x22], 4
        LDR s5, [x23], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s,  v0.s[0]
        FMLA v22.4s, v12.4s,  v1.s[0]
        FMLA v24.4s, v12.4s,  v2.s[0]
        FMLA v26.4s, v12.4s,  v3.s[0]
        FMLA v28.4s, v12.4s,  v4.s[0]
        FMLA v30.4s, v12.4s,  v5.s[0]
        FMLA v21.4s, v13.4s,  v0.s[0]
        FMLA v23.4s, v13.4s,  v1.s[0]
        FMLA v25.4s, v13.4s,  v2.s[0]
        FMLA v27.4s, v13.4s,  v3.s[0]
        FMLA v29.4s, v13.4s,  v4.s[0]
        FMLA v31.4s, v13.4s,  v5.s[0]
        B 4b

        # Store odd width
        # x1 = nc - 8 has wrapped here, but its low 3 bits still equal those
        # of the remaining column count.  Bits 2, 1 and 0 select a 4, 2 and
        # 1 column store; after each store the not yet written columns are
        # moved down to the low lanes.
8:
        TBZ x1, 2, 9f
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x10], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
9:
        TBZ x1, 1, 10f
        STR d30,  [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x10], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20,  [x6], 8
        DUP d20, v20.d[1]

10:
        TBZ x1, 0, 11f
        STR s30,  [x7]
        STR s28, [x13]
        STR s26, [x10]
        STR s24, [x17]
        STR s22, [x16]
        STR s20,  [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP x22, x23, [sp, 80]
        LDP x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif