// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Overview
#   Each pass of outer loop 0 produces one tile of up to 5 rows by 8 floats of C.
#   - Accumulators v20-v29 start from acc (INC variant) or from the first
#     8 floats read through w (bias), replicated to all 5 rows.
#   - kc is consumed in bytes: 32 bytes (8 floats) of every A row per main loop
#     pass, then 16 / 8 / 4 byte remainders selected by bits 4 / 3 / 2 of kc.
#   - Rows at index mr and above alias the previous row pointer (CSEL below),
#     so they recompute and rewrite that row.
#   - Results are clamped with FMAX against v30 and FMIN against v31, which
#     LD2R fills from the first and second float at params.
#   - nc is decremented by 8 per tile. A final tile narrower than 8 floats is
#     stored in 4 / 2 / 1 float pieces (labels 7, 8, 9).

# Registers that are not needed here compared to a 6 row kernel
# NOTE(review): the original comment said "unused compared to 5x8", which is
# this kernel - presumably 6x8 was meant, confirm against the 6x8 source.
# In this kernel x4 remains a_stride, x7 is reused as c4 and v30 v31 hold the
# clamp values.
#  x4 a5
#  x7 c5
# A5  v10 v11
# C   v30 v31

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        $if INC:
          # Load cn_stride, acc
          LDP x14, x15, [sp]
          # Load params pointer
          LDR x8, [sp, 16]
        $else:
          # Load cn_stride, params pointer
          LDP x14, x8, [sp]

        # Clamp A and C pointers / Save d8-d9, d12-d15 on stack
        # (v10 v11 are never written, so d10 d11 need no save slot)
        STP d8, d9, [sp, -48]!
        CMP x0, 2                   // if mr < 2
        ADD x9, x3, x4              // a1 = a0 + a_stride
        ADD x16, x6, x7             // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO         //   a1 = a0
        CSEL x16, x6, x16, LO       //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4             // a2 = a1 + a_stride
        ADD x17, x16, x7            // c2 = c1 + cm_stride
                                    // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS       //   a2 = a1
        CSEL x17, x16, x17, LS      //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                   // if mr < 4
        ADD x11, x10, x4            // a3 = a2 + a_stride
        ADD x13, x17, x7            // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO      //   a3 = a2
        CSEL x13, x17, x13, LO      //   c3 = c2

        ADD x12, x11, x4            // a4 = a3 + a_stride
        ADD x7, x13, x7             // c4 = c3 + cm_stride (x7 stops being cm_stride here)
                                    // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS      //   a4 = a3
        CSEL x7, x13, x7, LS        //   c4 = c3

        # Load clamp values (once, they stay live in v30 v31 for every tile)
        LD2R {v30.4s, v31.4s}, [x8]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
            PRFM PLDL1KEEP, [x5, 64]
            PRFM PLDL1KEEP, [x5, 128]
            PRFM PLDL1KEEP, [x5, 192]
            PRFM PLDL1KEEP, [x3]        // Prefetch A
            PRFM PLDL1KEEP, [x9]
            PRFM PLDL1KEEP, [x10]
            PRFM PLDL1KEEP, [x11]
            PRFM PLDL1KEEP, [x12]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]     // Prefetch B
          MOV v23.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x3]        // Prefetch A
          MOV v27.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x11]
            PRFM PLDL1KEEP, [x12]

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32             // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 80 FMA
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
1:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]   // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32      // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16            // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32      // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32             // sets the flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]   // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16            // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32      // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        TST x0, 31                  // low 5 bits of k equal those of kc: any remainder bytes?

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8              // nc -= 8, flags consumed by B.LO and B.HI below
        FMAX v21.4s, v21.4s, v30.4s
        FMAX v22.4s, v22.4s, v30.4s
        FMAX v23.4s, v23.4s, v30.4s
        FMAX v24.4s, v24.4s, v30.4s
        FMAX v25.4s, v25.4s, v30.4s
        FMAX v26.4s, v26.4s, v30.4s
        FMAX v27.4s, v27.4s, v30.4s
        FMAX v28.4s, v28.4s, v30.4s
        FMAX v29.4s, v29.4s, v30.4s
        FMIN v20.4s, v20.4s, v31.4s
        FMIN v21.4s, v21.4s, v31.4s
        FMIN v22.4s, v22.4s, v31.4s
        FMIN v23.4s, v23.4s, v31.4s
        FMIN v24.4s, v24.4s, v31.4s
        FMIN v25.4s, v25.4s, v31.4s
        FMIN v26.4s, v26.4s, v31.4s
        FMIN v27.4s, v27.4s, v31.4s
        FMIN v28.4s, v28.4s, v31.4s
        FMIN v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        B.LO 7f

        $if INC:
          SUB x3, x3, x2              // a0 -= kc
          STP q28, q29, [x7]
          ADD x7, x7, x14
          SUB x9, x9, x2              // a1 -= kc
          STP q26, q27, [x13]
          ADD x13, x13, x14
          SUB x10, x10, x2            // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x11, x11, x2            // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x12, x12, x2            // a4 -= kc
          STP q20, q21, [x6]
          ADD x6, x6, x14
        $else:
          STP q20, q21, [x6]
          ADD x6, x6, x14
          SUB x3, x3, x2              // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x9, x9, x2              // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x10, x10, x2            // a2 -= kc
          STP q26, q27, [x13]
          ADD x13, x13, x14
          SUB x11, x11, x2            // a3 -= kc
          STP q28, q29, [x7]
          ADD x7, x7, x14
          SUB x12, x12, x2            // a4 -= kc

        B.HI 0b

        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # Remainder of k: reached when kc < 32 bytes or kc is not a multiple
        # of 32 bytes. Bits 4, 3, 2 of x0 match those of kc because only
        # multiples of 32 were subtracted. Clamp values are already in v30 v31.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width
        # x1 holds nc - 8 here, its low 3 bits equal those of nc.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q28, [x7], 16
          MOV v28.16b, v29.16b
          STR q26, [x13], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x13], 16
          MOV v26.16b, v27.16b
          STR q28, [x7], 16
          MOV v28.16b, v29.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d28, [x7], 8
          DUP d28, v28.d[1]
          STR d26, [x13], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x13], 8
          DUP d26, v26.d[1]
          STR d28, [x7], 8
          DUP d28, v28.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s28, [x7]
          STR s26, [x13]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x13]
          STR s28, [x7]
10:
        # Restore d8-d9, d12-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif