// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8

# Computes up to 6 rows x 8 columns of C per pass over kc, starting from the
# accumulator tile read from acc, clamps the result and stores it to C.
# kc is consumed as a byte count (32 bytes = 8 floats per main-loop iteration).

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register and is intentionally left untouched.
# Every register in x0-x17 is occupied, so cn_stride is not kept live: it is
# reloaded from the stack into x0 once the k counter in x0 is dead.
# NOTE(review): this file is generated; mirror the register change in the
# template as well.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Clamp A and C pointers / Save d8-d15 on stack
        # Rows beyond mr alias the previous row, so loads and stores stay in bounds.
        STP  d8,  d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # Load acc, params pointer
        # The 64 bytes pushed above shift the stack arguments: acc is at
        # [sp + 72] and params at [sp + 80].
        LDP x15, x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

0:
        # Load initial accumulators
        # x15 post-increments, so each pass over nc consumes the next 6x8 tile of acc.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16         // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32          // k -= 32; the FMLAs below leave the flags for B.HS
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load clamping_params values
        # v6 and v7 are free here: their last use as A operands is just above.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
        # v6 is applied with FMIN (upper bound), v7 with FMAX (lower bound).
3:
        FMIN v20.4s, v20.4s, v6.4s
        # Load cn_stride.  The k counter in x0 is dead from here on and is
        # recomputed from x2 at label 0.  LDR leaves the flags untouched.
        LDR x0, [sp, 64]
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Flags still come from SUBS x1 above: LO means fewer than 8 columns remain.
        B.LO 7f

        STP q30, q31, [x7]
        ADD x7, x7, x0           // c5 += cn_stride
        SUB x3, x3, x2           // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0         // c4 += cn_stride
        SUB x9, x9, x2           // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0         // c3 += cn_stride
        SUB x10, x10, x2         // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0         // c2 += cn_stride
        SUB x11, x11, x2         // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0         // c1 += cn_stride
        SUB x12, x12, x2         // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x0           // c0 += cn_stride
        SUB x4, x4, x2           // a5 -= kc

        // ADD/SUB above do not set flags; HI still refers to SUBS x1 (nc remaining > 0)
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

4:
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        # Bits 2, 1 and 0 of x1 select a 4-, 2- and 1-column partial store.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP  d8,  d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif