// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Summary (derived from the code below):
#   Each pass starting at label 0 produces one tile of up to 5 rows by 8 floats.
#   The ten accumulators v20-v29 start from the 8 floats read at w (replicated
#   to all five rows) or, in the INC variant, from 40 floats read at acc.
#   kc is consumed in bytes: 32 bytes of every A row per main loop iteration,
#   then 16 / 8 / 4 byte remainders selected by bits 4 / 3 / 2 of the counter.
#   Results are limited with FMIN against v30 and FMAX against v31, then stored
#   to c0-c4.  The A pointers are rewound by kc and the C pointers advanced by
#   cn_stride before the next 8 columns are processed.
#   Rows beyond mr alias the previous row pointer (see the CSEL chain), so the
#   same five-row code runs for every mr.

# unused compared to 6x8
# NOTE(review): this heading originally said 5x8; the entries below (a5, c5,
# A5) look like roles of a 6-row kernel, so 6x8 is presumably meant - confirm.
#  x4 a5
#  x7 c5
# A5  v10 v11
# C   v30 v31

# d8-d15 need to be preserved if used.
# x19-x30 need to be preserved if used.  x18 is reserved for OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x13 c3
#  x7 c4

# Vector register usage
# A0   v0  v1
# A1   v2  v3
# A2   v4  v5
# A3   v6  v7
# A4   v8  v9
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# Clamp v30 v31

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Clamp A and C pointers / Save d8-d15 on stack
        # Only d8, d9 and d12-d15 are written by this kernel, so 48 bytes
        # (three register pairs) are reserved.  The stack arguments therefore
        # sit 48 bytes higher than in the signature above.
        STP d8, d9, [sp, -48]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d12, d13, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d14, d15, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x13, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x13, x17, x13, LO   //   c3 = c2

        $if INC:
          # Load acc, params pointer
          // [sp, 56] and [sp, 64] are the caller [sp + 8] and [sp + 16]
          LDP x15, x8, [sp, 56]
        $else:
          # Load params pointer
          // [sp, 56] is the caller [sp + 8]
          LDR x8, [sp, 56]

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x7, x13, x7          // c4 = c3 + cm_stride (x7 no longer holds cm_stride)
                                 // if mr <= 4  (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x7, x13, x7, LS     //   c4 = c3

        # Load clamp values
        # LD2R replicates the first float at params into v30 and the second
        # into v31.  v30 is applied with FMIN (upper bound) and v31 with FMAX
        # (lower bound) at label 3.
        LD2R {v30.4s, v31.4s}, [x8]

        # Load cn_stride
        // [sp, 48] is the caller [sp]
        LDR x14, [sp, 48]

# Start of one 8-column tile.  Re-entered from the full store path (B.HI 0b)
# while more columns remain.
0:
        $if INC:
          # Load initial accumulators
          // 10 vectors (40 floats) are read and x15 advances by 160 bytes per tile
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
            PRFM PLDL1KEEP, [x5, 64]
            PRFM PLDL1KEEP, [x5, 128]
            PRFM PLDL1KEEP, [x5, 192]
            PRFM PLDL1KEEP, [x3]    // Prefetch A
            PRFM PLDL1KEEP, [x9]
            PRFM PLDL1KEEP, [x10]
            PRFM PLDL1KEEP, [x11]
            PRFM PLDL1KEEP, [x12]
        $else:
          # Load initial bias from w into accumulators
          // The 8 floats loaded into v20/v21 are copied to the other four rows
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x11]
            PRFM PLDL1KEEP, [x12]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32  // k = kc - 32
        B.LO 4f          // kc < 32: only the remainder path runs

        # Prologue - loads for main loop of 80 FMA
        # Loads the first 4 floats of every A row and 3 of the 4 B vector
        # pairs that the first FMA group consumes.
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        LDP q12, q13, [x5], 32  // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f          // kc < 64: go straight to the epilogue

        # Main loop - 8 floats of A (32 bytes)
        # 80 FMA + 10 LDR A + 8 LDP B
        # Every load below targets a register whose last reader has already
        # been issued, so the instruction order here is significant.
1:
        # First group of 4 A. 40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32  // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16  // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32  // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A. 40 FMA.
        # Consumes v1/v3/v5/v7/v9 while reloading v0/v2/v4/v6/v8 and the first
        # three B pairs for the next pass (or for the epilogue).
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        LDR q0, [x3], 16  // Load next 5 A
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        LDR q2, [x9], 16
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        LDR q4, [x10], 16
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        LDR q6, [x11], 16
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        LDR q8, [x12], 16
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        LDP q12, q13, [x5], 32  // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        SUBS x0, x0, 32         // k -= 32; flags are consumed by B.HS below
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 80 FMA + 5 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A. 40 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32  // Load last B
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        LDR q1, [x3], 16  // Load next 5 A

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        LDR q3, [x9], 16
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        LDR q5, [x10], 16
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        LDR q7, [x11], 16
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        LDR q9, [x12], 16
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        LDP q12, q13, [x5], 32  // Load 4 B
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        LDP q14, q15, [x5], 32
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A. 40 FMA.
        FMLA v20.4s, v12.4s, v1.s[0]
        FMLA v22.4s, v12.4s, v3.s[0]
        FMLA v24.4s, v12.4s, v5.s[0]
        FMLA v26.4s, v12.4s, v7.s[0]
        FMLA v28.4s, v12.4s, v9.s[0]
        FMLA v21.4s, v13.4s, v1.s[0]
        FMLA v23.4s, v13.4s, v3.s[0]
        FMLA v25.4s, v13.4s, v5.s[0]
        FMLA v27.4s, v13.4s, v7.s[0]
        FMLA v29.4s, v13.4s, v9.s[0]

        FMLA v20.4s, v14.4s, v1.s[1]
        FMLA v22.4s, v14.4s, v3.s[1]
        FMLA v24.4s, v14.4s, v5.s[1]
        FMLA v26.4s, v14.4s, v7.s[1]
        FMLA v28.4s, v14.4s, v9.s[1]
        FMLA v21.4s, v15.4s, v1.s[1]
        FMLA v23.4s, v15.4s, v3.s[1]
        FMLA v25.4s, v15.4s, v5.s[1]
        FMLA v27.4s, v15.4s, v7.s[1]
        FMLA v29.4s, v15.4s, v9.s[1]

        FMLA v20.4s, v16.4s, v1.s[2]
        FMLA v22.4s, v16.4s, v3.s[2]
        FMLA v24.4s, v16.4s, v5.s[2]
        FMLA v26.4s, v16.4s, v7.s[2]
        FMLA v28.4s, v16.4s, v9.s[2]
        FMLA v21.4s, v17.4s, v1.s[2]
        FMLA v23.4s, v17.4s, v3.s[2]
        FMLA v25.4s, v17.4s, v5.s[2]
        FMLA v27.4s, v17.4s, v7.s[2]
        FMLA v29.4s, v17.4s, v9.s[2]
        TST x0, 31              // any k bytes below a multiple of 32 left? flags used by B.NE below

        FMLA v20.4s, v18.4s, v1.s[3]
        FMLA v22.4s, v18.4s, v3.s[3]
        FMLA v24.4s, v18.4s, v5.s[3]
        FMLA v26.4s, v18.4s, v7.s[3]
        FMLA v28.4s, v18.4s, v9.s[3]
        FMLA v21.4s, v19.4s, v1.s[3]
        FMLA v23.4s, v19.4s, v3.s[3]
        FMLA v25.4s, v19.4s, v5.s[3]
        FMLA v27.4s, v19.4s, v7.s[3]
        FMLA v29.4s, v19.4s, v9.s[3]
        B.NE 4f

        # Clamp
        # FMIN against v30 caps every accumulator from above, FMAX against v31
        # from below.  The remainder path at labels 4-6 branches back here.
3:
        FMIN v20.4s, v20.4s, v30.4s
        SUBS x1, x1, 8          // nc -= 8; flags are consumed by B.LO and B.HI below
        FMIN v21.4s, v21.4s, v30.4s
        FMIN v22.4s, v22.4s, v30.4s
        FMIN v23.4s, v23.4s, v30.4s
        FMIN v24.4s, v24.4s, v30.4s
        FMIN v25.4s, v25.4s, v30.4s
        FMIN v26.4s, v26.4s, v30.4s
        FMIN v27.4s, v27.4s, v30.4s
        FMIN v28.4s, v28.4s, v30.4s
        FMIN v29.4s, v29.4s, v30.4s
        FMAX v20.4s, v20.4s, v31.4s
        FMAX v21.4s, v21.4s, v31.4s
        FMAX v22.4s, v22.4s, v31.4s
        FMAX v23.4s, v23.4s, v31.4s
        FMAX v24.4s, v24.4s, v31.4s
        FMAX v25.4s, v25.4s, v31.4s
        FMAX v26.4s, v26.4s, v31.4s
        FMAX v27.4s, v27.4s, v31.4s
        FMAX v28.4s, v28.4s, v31.4s
        FMAX v29.4s, v29.4s, v31.4s

        # Store full 5 x 8
        B.LO 7f                 // fewer than 8 columns were left: partial store

        # Each row is stored as 8 floats, its C pointer advances by cn_stride
        # and its A pointer is rewound by kc for the next tile.  The INC
        # variant visits the rows from c4 down to c0, the other from c0 up.
        $if INC:
          SUB x3, x3, x2  // a0 -= kc
          STP q28, q29, [x7]
          ADD x7, x7, x14
          SUB x9, x9, x2  // a1 -= kc
          STP q26, q27, [x13]
          ADD x13, x13, x14
          SUB x10, x10, x2  // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x11, x11, x2  // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x12, x12, x2  // a4 -= kc
          STP q20, q21, [x6]
          ADD x6, x6, x14
        $else:
          STP q20, q21, [x6]
          ADD x6, x6, x14
          SUB x3, x3, x2  // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x14
          SUB x9, x9, x2  // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x14
          SUB x10, x10, x2  // a2 -= kc
          STP q26, q27, [x13]
          ADD x13, x13, x14
          SUB x11, x11, x2  // a3 -= kc
          STP q28, q29, [x7]
          ADD x7, x7, x14
          SUB x12, x12, x2  // a4 -= kc

        B.HI 0b                 // columns remain (flags still from SUBS x1, x1, 8)

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

        # Remainder of k
        # x0 differs from kc only by multiples of 32 at this point, so bits
        # 4, 3 and 2 of x0 are those of kc and select 16, 8 and 4 byte steps.
        # NOTE(review): bits 0-1 are never tested, so kc is presumably always
        # a multiple of 4 bytes - confirm against the callers.
4:
        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q2, [x9], 16
        LDR q4, [x10], 16
        LDR q6, [x11], 16
        LDR q8, [x12], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v2.s[2]
        FMLA v24.4s, v16.4s, v4.s[2]
        FMLA v26.4s, v16.4s, v6.s[2]
        FMLA v28.4s, v16.4s, v8.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v2.s[2]
        FMLA v25.4s, v17.4s, v4.s[2]
        FMLA v27.4s, v17.4s, v6.s[2]
        FMLA v29.4s, v17.4s, v8.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v2.s[3]
        FMLA v24.4s, v18.4s, v4.s[3]
        FMLA v26.4s, v18.4s, v6.s[3]
        FMLA v28.4s, v18.4s, v8.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v2.s[3]
        FMLA v25.4s, v19.4s, v4.s[3]
        FMLA v27.4s, v19.4s, v6.s[3]
        FMLA v29.4s, v19.4s, v8.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d2, [x9], 8
        LDR d4, [x10], 8
        LDR d6, [x11], 8
        LDR d8, [x12], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v2.s[1]
        FMLA v24.4s, v14.4s, v4.s[1]
        FMLA v26.4s, v14.4s, v6.s[1]
        FMLA v28.4s, v14.4s, v8.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v2.s[1]
        FMLA v25.4s, v15.4s, v4.s[1]
        FMLA v27.4s, v15.4s, v6.s[1]
        FMLA v29.4s, v15.4s, v8.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b           // nothing left: go clamp and store

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s2, [x9], 4
        LDR s4, [x10], 4
        LDR s6, [x11], 4
        LDR s8, [x12], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v2.s[0]
        FMLA v24.4s, v12.4s, v4.s[0]
        FMLA v26.4s, v12.4s, v6.s[0]
        FMLA v28.4s, v12.4s, v8.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v2.s[0]
        FMLA v25.4s, v13.4s, v4.s[0]
        FMLA v27.4s, v13.4s, v6.s[0]
        FMLA v29.4s, v13.4s, v8.s[0]
        B 3b

        # Store odd width
        # Reached when fewer than 8 columns were left.  Subtracting 8 did not
        # change bits 0-2 of x1, so they still give the number of columns to
        # write: bit 2 stores 4 floats per row, bit 1 stores 2, bit 0 stores 1.
        # After each partial store the unwritten lanes are moved down into the
        # low part of the same register for the next step.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q28, [x7], 16
          MOV v28.16b, v29.16b
          STR q26, [x13], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x13], 16
          MOV v26.16b, v27.16b
          STR q28, [x7], 16
          MOV v28.16b, v29.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d28, [x7], 8
          DUP d28, v28.d[1]
          STR d26, [x13], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x13], 8
          DUP d26, v26.d[1]
          STR d28, [x7], 8
          DUP d28, v28.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s28, [x7]
          STR s26, [x13]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x13]
          STR s28, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 32]
        LDP d12, d13, [sp, 16]
        LDP d8, d9, [sp], 48
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_5x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif