// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// F32 indirect GEMM (IGEMM) microkernel, 6 rows x 8 columns, AArch64 NEON FMA.
//
// For every tile of 8 output columns the accumulators are seeded from the
// first 32 bytes at w (the bias), then for each group of 6 row pointers taken
// from the indirection buffer a, kc bytes of each row are multiplied against
// the packed weights that follow in w.  Results are clamped with the two
// floats read from params and stored to the 6 row pointers derived from c.
//
// Rows at index >= mr alias the previous row (the C pointer is clamped), so
// the kernel always computes 6 rows but only writes mr distinct ones.

# void xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                         x0
#     size_t nc,                         x1
#     size_t kc,                         x2 / x0
#     size_t ks,                         x3 / x9
#     const float**restrict a,           x4
#     const void*restrict w,             x5
#     uint8_t*restrict c,                x6
#     size_t cm_stride,                  x7
#     size_t cn_stride,                  [sp] -> x10
#     size_t a_offset,                   [sp + 8] -> x11
#     const float* zero,                 [sp + 16] -> x12
#     const xnn_f32_output_params params [sp + 24] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.

# A pointers
# x14 a0
# x15 a1
# x20 a2
# x21 a3
# x22 a4
# x23 a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
# x7  c5
//
// NOTE(review): x18 is designated the platform register by AAPCS64.  Confirm
// that every platform this kernel is built for allows it to be clobbered.

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4  v10
# A5   v5  v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
//
// v6/v7 serve two purposes: inside the prologue/main loop/epilogue they hold
// the second 4 floats of A0/A1, everywhere else they hold the clamp bounds.
// The epilogue therefore reloads them from params before falling through.

BEGIN_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73

        // Stack frame after the push below (96 bytes):
        //   [sp,   0] d8,  d9        [sp,  48] d14, d15
        //   [sp,  16] d10, d11       [sp,  64] x20, x21
        //   [sp,  32] d12, d13       [sp,  80] x22, x23
        //   [sp,  96] cn_stride, a_offset   (caller stack arguments)
        //   [sp, 112] zero, params
        // The stack arguments are loaded once, after the push, using the
        // offsets above.

        # Clamp C pointers
        STP     d8, d9, [sp, -96]!
        CMP     x0, 2                   // if mr < 2
        ADD     x16, x6, x7             // c1 = c0 + cm_stride
        CSEL    x16, x6, x16, LO        //   c1 = c0

        STP     d10, d11, [sp, 16]
        ADD     x17, x16, x7            // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL    x17, x16, x17, LS       //   c2 = c1

        STP     d12, d13, [sp, 32]
        CMP     x0, 4                   // if mr < 4
        ADD     x18, x17, x7            // c3 = c2 + cm_stride
        CSEL    x18, x17, x18, LO       //   c3 = c2

        STP     d14, d15, [sp, 48]
        ADD     x13, x18, x7            // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL    x13, x18, x13, LS       //   c4 = c3

        # Save x20,x21,x22,x23 on stack
        STP     x20, x21, [sp, 64]
        STP     x22, x23, [sp, 80]

        CMP     x0, 6                   // if mr < 6
        ADD     x7, x13, x7             // c5 = c4 + cm_stride
        CSEL    x7, x13, x7, LO         //   c5 = c4

        # Load zero, clamping params pointer
        LDP     x12, x8, [sp, 112]

        # Load cn_stride, a_offset
        LDP     x10, x11, [sp, 96]

        # Load clamping_params values
        // v6 = first float at params (used as upper bound by FMIN),
        // v7 = second float at params (used as lower bound by FMAX).
        LD2R    {v6.4s, v7.4s}, [x8]

0:
        # Load initial bias from w into accumulators
        LD1     {v20.16b, v21.16b}, [x5], 32
        MOV     v22.16b, v20.16b
        MOV     v23.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 0]      // Prefetch B
        MOV     v24.16b, v20.16b
        MOV     v25.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 64]
        MOV     v26.16b, v20.16b
        MOV     v27.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 128]
        MOV     v28.16b, v20.16b
        MOV     v29.16b, v21.16b
        PRFM    PLDL1KEEP, [x5, 192]
        MOV     v30.16b, v20.16b
        MOV     v31.16b, v21.16b

        MOV     x9, x3                  // p = ks

1:
        # Load next 6 A pointers
        LDP     x14, x15, [x4], 16
        LDP     x20, x21, [x4], 16
        LDP     x22, x23, [x4], 16

        // A pointer equal to zero is used as is; any other pointer is
        // advanced by a_offset.
        CMP     x14, x12                // if a0 == zero
        ADD     x14, x14, x11           // a0 += a_offset
        CSEL    x14, x12, x14, EQ       //   a0 = zero, else a0 + a_offset
        CMP     x15, x12                // if a1 == zero
        ADD     x15, x15, x11           // a1 += a_offset
        CSEL    x15, x12, x15, EQ       //   a1 = zero, else a1 + a_offset
        CMP     x20, x12                // if a2 == zero
        ADD     x20, x20, x11           // a2 += a_offset
        CSEL    x20, x12, x20, EQ       //   a2 = zero, else a2 + a_offset
        CMP     x21, x12                // if a3 == zero
        ADD     x21, x21, x11           // a3 += a_offset
        CSEL    x21, x12, x21, EQ       //   a3 = zero, else a3 + a_offset
        CMP     x22, x12                // if a4 == zero
        ADD     x22, x22, x11           // a4 += a_offset
        CSEL    x22, x12, x22, EQ       //   a4 = zero, else a4 + a_offset
        CMP     x23, x12                // if a5 == zero
        ADD     x23, x23, x11           // a5 += a_offset
        CSEL    x23, x12, x23, EQ       //   a5 = zero, else a5 + a_offset

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        // x0 (mr) is no longer needed and becomes the k byte counter.  Only
        // multiples of 32 are ever subtracted from it, so its low 5 bits keep
        // matching those of kc; the remainder code tests those bits.
        SUBS    x0, x2, 32              // k = kc - 32
        B.LO    5f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP     q0, q6, [x14], 32
        LDP     q1, q7, [x15], 32
        LDP     q2, q8, [x20], 32
        LDP     q3, q9, [x21], 32
        LDP     q4, q10, [x22], 32
        # load first set of B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS    x0, x0, 32
        B.LO    3f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
2:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP     q5, q11, [x23], 32
        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        LDP     q16, q17, [x5], 32
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        LDP     q12, q13, [x5], 32
        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        LDP     q14, q15, [x5], 32
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        PRFM    PLDL1KEEP, [x5, 128]    // Prefetch B
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        PRFM    PLDL1KEEP, [x5, 256]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  Loads A0 - A4

        LDP     q16, q17, [x5], 32
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v24.4s, v12.4s, v8.s[0]
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        // From here on each row finishes both of its remaining lanes before
        // that row's A registers are reloaded for the next iteration.
        LDP     q12, q13, [x5], 32
        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v20.4s, v18.4s, v6.s[3]
        LDP     q14, q15, [x5], 32
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v21.4s, v19.4s, v6.s[3]
        LDP     q0, q6, [x14], 32
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v22.4s, v18.4s, v7.s[3]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v23.4s, v19.4s, v7.s[3]
        LDP     q1, q7, [x15], 32
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v25.4s, v19.4s, v8.s[3]
        LDP     q2, q8, [x20], 32
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v27.4s, v19.4s, v9.s[3]
        LDP     q3, q9, [x21], 32
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v29.4s, v19.4s, v10.s[3]
        LDP     q4, q10, [x22], 32
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v30.4s, v18.4s, v11.s[3]
        SUBS    x0, x0, 32
        FMLA    v31.4s, v17.4s, v11.s[2]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.HS    2b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        # First block same as main loop.  Second block has no preloads.
3:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP     q5, q11, [x23], 32
        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        LDP     q16, q17, [x5], 32
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        LDP     q12, q13, [x5], 32
        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        LDP     q14, q15, [x5], 32
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  No A Loads, No last B load

        LDP     q16, q17, [x5], 32
        FMLA    v20.4s, v12.4s, v6.s[0]
        FMLA    v22.4s, v12.4s, v7.s[0]
        LDP     q18, q19, [x5], 32
        FMLA    v24.4s, v12.4s, v8.s[0]
        FMLA    v26.4s, v12.4s, v9.s[0]
        FMLA    v28.4s, v12.4s, v10.s[0]
        FMLA    v30.4s, v12.4s, v11.s[0]
        FMLA    v21.4s, v13.4s, v6.s[0]
        FMLA    v23.4s, v13.4s, v7.s[0]
        FMLA    v25.4s, v13.4s, v8.s[0]
        FMLA    v27.4s, v13.4s, v9.s[0]
        FMLA    v29.4s, v13.4s, v10.s[0]
        FMLA    v31.4s, v13.4s, v11.s[0]

        FMLA    v20.4s, v14.4s, v6.s[1]
        FMLA    v22.4s, v14.4s, v7.s[1]
        FMLA    v24.4s, v14.4s, v8.s[1]
        FMLA    v26.4s, v14.4s, v9.s[1]
        FMLA    v28.4s, v14.4s, v10.s[1]
        FMLA    v30.4s, v14.4s, v11.s[1]
        FMLA    v21.4s, v15.4s, v6.s[1]
        FMLA    v23.4s, v15.4s, v7.s[1]
        FMLA    v25.4s, v15.4s, v8.s[1]
        FMLA    v27.4s, v15.4s, v9.s[1]
        FMLA    v29.4s, v15.4s, v10.s[1]
        FMLA    v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA    v20.4s, v16.4s, v6.s[2]
        FMLA    v22.4s, v16.4s, v7.s[2]
        FMLA    v24.4s, v16.4s, v8.s[2]
        FMLA    v26.4s, v16.4s, v9.s[2]
        FMLA    v28.4s, v16.4s, v10.s[2]
        FMLA    v30.4s, v16.4s, v11.s[2]
        FMLA    v21.4s, v17.4s, v6.s[2]
        FMLA    v23.4s, v17.4s, v7.s[2]
        FMLA    v25.4s, v17.4s, v8.s[2]
        FMLA    v27.4s, v17.4s, v9.s[2]
        FMLA    v29.4s, v17.4s, v10.s[2]
        FMLA    v31.4s, v17.4s, v11.s[2]

        FMLA    v20.4s, v18.4s, v6.s[3]
        FMLA    v22.4s, v18.4s, v7.s[3]
        FMLA    v24.4s, v18.4s, v8.s[3]
        FMLA    v26.4s, v18.4s, v9.s[3]
        FMLA    v28.4s, v18.4s, v10.s[3]
        FMLA    v30.4s, v18.4s, v11.s[3]
        FMLA    v21.4s, v19.4s, v6.s[3]
        FMLA    v23.4s, v19.4s, v7.s[3]

        # Load clamping_params values
        // The last reads of v6/v7 as A data are the two FMLAs above, so the
        // clamp bounds can be restored here.
        LD2R    {v6.4s, v7.4s}, [x8]

        FMLA    v25.4s, v19.4s, v8.s[3]
        FMLA    v27.4s, v19.4s, v9.s[3]
        TST     x0, 31                  // any k remainder (kc % 32 bytes)?
        FMLA    v29.4s, v19.4s, v10.s[3]
        FMLA    v31.4s, v19.4s, v11.s[3]
        B.NE    5f

        .p2align 3
4:
        # ks loop
        SUBS    x9, x9, 48              // ks -= MR * sizeof(void*)
        B.NE    1b

        # Clamp
        FMIN    v20.4s, v20.4s, v6.4s
        FMIN    v21.4s, v21.4s, v6.4s
        FMIN    v22.4s, v22.4s, v6.4s
        FMIN    v23.4s, v23.4s, v6.4s
        FMIN    v24.4s, v24.4s, v6.4s
        FMIN    v25.4s, v25.4s, v6.4s
        FMIN    v26.4s, v26.4s, v6.4s
        FMIN    v27.4s, v27.4s, v6.4s
        FMIN    v28.4s, v28.4s, v6.4s
        FMIN    v29.4s, v29.4s, v6.4s
        FMIN    v30.4s, v30.4s, v6.4s
        FMIN    v31.4s, v31.4s, v6.4s
        FMAX    v20.4s, v20.4s, v7.4s
        FMAX    v21.4s, v21.4s, v7.4s
        FMAX    v22.4s, v22.4s, v7.4s
        FMAX    v23.4s, v23.4s, v7.4s
        FMAX    v24.4s, v24.4s, v7.4s
        FMAX    v25.4s, v25.4s, v7.4s
        FMAX    v26.4s, v26.4s, v7.4s
        FMAX    v27.4s, v27.4s, v7.4s
        FMAX    v28.4s, v28.4s, v7.4s
        FMAX    v29.4s, v29.4s, v7.4s
        FMAX    v30.4s, v30.4s, v7.4s
        FMAX    v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        // The flags set by this SUBS are consumed twice: by B.LO right below
        // and by B.HI after the stores.  Nothing in between may set flags.
        SUBS    x1, x1, 8
        B.LO    8f

        STP     q30, q31, [x7]
        ADD     x7, x7, x10
        STP     q28, q29, [x13]
        ADD     x13, x13, x10
        STP     q26, q27, [x18]
        ADD     x18, x18, x10
        STP     q24, q25, [x17]
        ADD     x17, x17, x10
        STP     q22, q23, [x16]
        ADD     x16, x16, x10
        STP     q20, q21, [x6]
        ADD     x6, x6, x10

        SUB     x4, x4, x3              // a -= ks

        # nc loop
        B.HI    0b

        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 96
        RET

        .p2align 3
5:
        // k remainder.  Bits 4, 3 and 2 of x0 select 4, 2 and 1 leftover
        // floats of A.  Only v0-v5 and v12-v19 are written here, so the clamp
        // bounds in v6/v7 stay intact.

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ     x0, 4, 6f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR     q0, [x14], 16
        LDR     q1, [x15], 16
        LDR     q2, [x20], 16
        LDR     q3, [x21], 16
        LDR     q4, [x22], 16
        LDR     q5, [x23], 16
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32
        LDP     q16, q17, [x5], 32
        LDP     q18, q19, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        FMLA    v20.4s, v16.4s, v0.s[2]
        FMLA    v22.4s, v16.4s, v1.s[2]
        FMLA    v24.4s, v16.4s, v2.s[2]
        FMLA    v26.4s, v16.4s, v3.s[2]
        FMLA    v28.4s, v16.4s, v4.s[2]
        FMLA    v30.4s, v16.4s, v5.s[2]
        FMLA    v21.4s, v17.4s, v0.s[2]
        FMLA    v23.4s, v17.4s, v1.s[2]
        FMLA    v25.4s, v17.4s, v2.s[2]
        FMLA    v27.4s, v17.4s, v3.s[2]
        FMLA    v29.4s, v17.4s, v4.s[2]
        FMLA    v31.4s, v17.4s, v5.s[2]

        FMLA    v20.4s, v18.4s, v0.s[3]
        FMLA    v22.4s, v18.4s, v1.s[3]
        FMLA    v24.4s, v18.4s, v2.s[3]
        FMLA    v26.4s, v18.4s, v3.s[3]
        FMLA    v28.4s, v18.4s, v4.s[3]
        FMLA    v30.4s, v18.4s, v5.s[3]
        FMLA    v21.4s, v19.4s, v0.s[3]
        FMLA    v23.4s, v19.4s, v1.s[3]
        FMLA    v25.4s, v19.4s, v2.s[3]
        FMLA    v27.4s, v19.4s, v3.s[3]
        FMLA    v29.4s, v19.4s, v4.s[3]
        FMLA    v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
6:
        TBZ     x0, 3, 7f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR     d0, [x14], 8
        LDR     d1, [x15], 8
        LDR     d2, [x20], 8
        LDR     d3, [x21], 8
        LDR     d4, [x22], 8
        LDR     d5, [x23], 8
        # Load B
        LDP     q12, q13, [x5], 32
        LDP     q14, q15, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]

        FMLA    v20.4s, v14.4s, v0.s[1]
        FMLA    v22.4s, v14.4s, v1.s[1]
        FMLA    v24.4s, v14.4s, v2.s[1]
        FMLA    v26.4s, v14.4s, v3.s[1]
        FMLA    v28.4s, v14.4s, v4.s[1]
        FMLA    v30.4s, v14.4s, v5.s[1]
        FMLA    v21.4s, v15.4s, v0.s[1]
        FMLA    v23.4s, v15.4s, v1.s[1]
        FMLA    v25.4s, v15.4s, v2.s[1]
        FMLA    v27.4s, v15.4s, v3.s[1]
        FMLA    v29.4s, v15.4s, v4.s[1]
        FMLA    v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
7:
        TBZ     x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR     s0, [x14], 4
        LDR     s1, [x15], 4
        LDR     s2, [x20], 4
        LDR     s3, [x21], 4
        LDR     s4, [x22], 4
        LDR     s5, [x23], 4
        # Load B
        LDP     q12, q13, [x5], 32

        FMLA    v20.4s, v12.4s, v0.s[0]
        FMLA    v22.4s, v12.4s, v1.s[0]
        FMLA    v24.4s, v12.4s, v2.s[0]
        FMLA    v26.4s, v12.4s, v3.s[0]
        FMLA    v28.4s, v12.4s, v4.s[0]
        FMLA    v30.4s, v12.4s, v5.s[0]
        FMLA    v21.4s, v13.4s, v0.s[0]
        FMLA    v23.4s, v13.4s, v1.s[0]
        FMLA    v25.4s, v13.4s, v2.s[0]
        FMLA    v27.4s, v13.4s, v3.s[0]
        FMLA    v29.4s, v13.4s, v4.s[0]
        FMLA    v31.4s, v13.4s, v5.s[0]
        B       4b

        # Store odd width
        // Fewer than 8 columns remain.  x1 holds nc - 8, whose low 3 bits
        // equal those of the remaining column count, so bits 2, 1 and 0
        // select stores of 4, 2 and 1 floats per row.  After each partial
        // store the not yet written lanes are moved down to lane 0.
8:
        TBZ     x1, 2, 9f
        STR     q30, [x7], 16
        MOV     v30.16b, v31.16b
        STR     q28, [x13], 16
        MOV     v28.16b, v29.16b
        STR     q26, [x18], 16
        MOV     v26.16b, v27.16b
        STR     q24, [x17], 16
        MOV     v24.16b, v25.16b
        STR     q22, [x16], 16
        MOV     v22.16b, v23.16b
        STR     q20, [x6], 16
        MOV     v20.16b, v21.16b
9:
        TBZ     x1, 1, 10f
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]
        STR     d28, [x13], 8
        DUP     d28, v28.d[1]
        STR     d26, [x18], 8
        DUP     d26, v26.d[1]
        STR     d24, [x17], 8
        DUP     d24, v24.d[1]
        STR     d22, [x16], 8
        DUP     d22, v22.d[1]
        STR     d20, [x6], 8
        DUP     d20, v20.d[1]

10:
        TBZ     x1, 0, 11f
        STR     s30, [x7]
        STR     s28, [x13]
        STR     s26, [x18]
        STR     s24, [x17]
        STR     s22, [x16]
        STR     s20, [x6]
11:
        # Restore x20,x21,x22,x23 from stack
        LDP     x22, x23, [sp, 80]
        LDP     x20, x21, [sp, 64]

        # Restore d8-d15 from stack
        LDP     d14, d15, [sp, 48]
        LDP     d12, d13, [sp, 32]
        LDP     d10, d11, [sp, 16]
        LDP     d8, d9, [sp], 96
        RET

END_FUNCTION xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif