// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc, [sp + 8] -> x15
#     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# Summary of the code below: for each block of up to 8 output columns the
# kernel loads a 6x8 tile of initial accumulators from acc, accumulates the
# products of 6 rows of A with the B data read sequentially from w (FMLA by
# element), clamps the tile with FMAX/FMIN against the two floats at params,
# and stores it through c0..c5.  Rows at index mr and above alias the A and C
# pointers of the previous row.  kc is consumed as a byte count (32 bytes =
# 8 floats per main-loop iteration); bits 0-1 of kc are never tested.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Other registers
# x0  mr on entry, then k (remaining bytes of kc), then cn_stride
# x1  nc (remaining output columns)
# x2  kc
# x5  w (advances through B, never rewound)
# x8  params
# x15 acc (advances through the initial accumulators, never rewound)

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4  v10
# A5   v5  v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Load acc, params pointer
        // Stack arguments are read before sp is moved: acc at [sp + 8],
        // params at [sp + 16].
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers / Save d8-d15 on stack
        // The pre-indexed store reserves 64 bytes; from here on the cn_stride
        // stack argument is found at [sp, 64].
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride (a_stride is dead after this)
        ADD x7, x13, x7          // c5 = c4 + cm_stride (cm_stride is dead after this)
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        // Start of one block of up to 8 output columns.
0:
        # Load initial accumulators
        // 12 q registers = 6 rows x 8 floats, read sequentially from acc.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        // x0 (mr) is no longer needed and becomes the k counter.
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A.  48 FMA.
        // Consumes v0-v5 while v6-v11 are loaded for the second group.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        // Consumes v6-v11 while v0-v5 and v12-v17 are reloaded for the next
        // pass through label 1 (or for the epilogue at label 2).
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16         // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32          // k -= 32; flags are consumed by B.HS below
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b                  // loop while the subtraction did not borrow

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        // v6 and v7 were last read as A data by the two FMLA just above, so
        // they can now take the two floats at params, each replicated across
        // all four lanes (v6 is used with FMAX, v7 with FMIN).
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder? - 4 floats of A (16 bytes) or less
        // Only multiples of 32 were subtracted from x0, so its low 5 bits
        // still equal those of kc.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        // k is no longer needed; x0 now holds cn_stride, which sits 64 bytes
        // above sp because of the d8-d15 save area.
        LDR x0, [sp, 64]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags feed both B.LO 7f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                  // fewer than 8 columns were left: partial store

        // Rows are stored from c5 down to c0; each C pointer then advances by
        // cn_stride and each A pointer is rewound by kc for the next block.
        STP q30, q31, [x7]
        ADD x7, x7, x0
        SUB x3, x3, x2           // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB x9, x9, x2           // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x10, x10, x2         // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x11, x11, x2         // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x12, x12, x2         // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x0
        SUB x4, x4, x2           // a5 -= kc

        // Flags are still those of SUBS x1, x1, 8 (STP, ADD and SUB above do
        // not set flags): continue while columns remain.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        // Remainder handling.  Reached from the epilogue when kc has any of
        // its low 5 bits set, or directly from label 0 when kc < 32.
4:
        # Load min/max values
        // Needed here for the kc < 32 path, which skips the epilogue load.
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder? - 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder - 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder? - 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder - 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder? - 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder - 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        // x1 is nc - 8 here; subtracting 8 leaves bits 0-2 unchanged, so they
        // select 4, 2 and 1 remaining columns.  After each partial store the
        // not-yet-stored elements are moved down to the low end of the
        // even-numbered C registers.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif