// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

// Syntax: GNU assembler, AArch64, preprocessed by the C preprocessor.
// ABI:    AAPCS64.  Leaf function; 64 bytes of stack are used to save d8-d15.
//
// x0 holds mr only while the A/C row pointers are being clamped.  From
// label 0 onwards it is reused as the remaining-k byte counter (kc - 32 - ...).

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
// x18 is the platform register (reserved on several OS ABIs) and is
// deliberately not used; c3 lives in the caller-saved temporary x15.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57

        # Clamp A and C pointers / Save d8-d15 on stack
        // Rows beyond mr alias the previous row, so all six rows can always
        // be loaded and stored without branching on mr.
        STP  d8,  d9, [sp, -64]!
        CMP  x0, 2                      // if mr < 2
        ADD  x9,  x3,  x4               // a1 = a0 + a_stride
        ADD  x16, x6,  x7               // c1 = c0 + cm_stride
        CSEL x9,  x3,  x9,  LO          //   a1 = a0
        CSEL x16, x6,  x16, LO          //   c1 = c0

        STP  d10, d11, [sp, 16]
        ADD  x10, x9,  x4               // a2 = a1 + a_stride
        ADD  x17, x16, x7               // c2 = c1 + cm_stride
                                        // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x10, x9,  x10, LS          //   a2 = a1
        CSEL x17, x16, x17, LS          //   c2 = c1

        STP  d12, d13, [sp, 32]
        CMP  x0, 4                      // if mr < 4
        ADD  x11, x10, x4               // a3 = a2 + a_stride
        ADD  x15, x17, x7               // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO          //   a3 = a2
        CSEL x15, x17, x15, LO          //   c3 = c2

        STP  d14, d15, [sp, 48]
        ADD  x12, x11, x4               // a4 = a3 + a_stride
        ADD  x13, x15, x7               // c4 = c3 + cm_stride
                                        // if mr <= 4  (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS          //   a4 = a3
        CSEL x13, x15, x13, LS          //   c4 = c3

        # Load params pointer
        // Stack arguments sit above the 64-byte save area: [sp + 64 + 8].
        LDR  x8, [sp, 72]

        CMP  x0, 6                      // if mr < 6
        ADD  x4,  x12, x4               // a5 = a4 + a_stride
        ADD  x7,  x13, x7               // c5 = c4 + cm_stride
        CSEL x4,  x12, x4,  LO          //   a5 = a4
        CSEL x7,  x13, x7,  LO          //   c5 = c4

        # Load cn_stride
        LDR  x14, [sp, 64]

0:
        # Load initial bias from w into accumulators
        LDP  q20, q21, [x5], 32
        MOV  v22.16b, v20.16b
        MOV  v23.16b, v21.16b
        MOV  v24.16b, v20.16b
        MOV  v25.16b, v21.16b
        MOV  v26.16b, v20.16b
        MOV  v27.16b, v21.16b
        MOV  v28.16b, v20.16b
        MOV  v29.16b, v21.16b
        MOV  v30.16b, v20.16b
        MOV  v31.16b, v21.16b

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32                 // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR  q0, [x3],  16
        LDR  q1, [x9],  16
        LDR  q2, [x10], 16
        LDR  q3, [x11], 16
        LDR  q4, [x12], 16
        LDR  q5, [x4],  16
        LDP  q12, q13, [x5], 32         // Fetch 3 B (4th deferred)
        LDP  q14, q15, [x5], 32
        LDP  q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
        // Loads are interleaved with the FMAs; each load targets a register
        // whose previous value has already been consumed, so the instruction
        // order below must be kept.
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP  q18, q19, [x5], 32         // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR  q6, [x3], 16               // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR  q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR  q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR  q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR  q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR  q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP  q12, q13, [x5], 32         // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP  q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP  q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP  q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR  q0, [x3], 16               // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR  q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR  q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR  q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR  q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR  q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP  q12, q13, [x5], 32         // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP  q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP  q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32                 // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP  q18, q19, [x5], 32         // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR  q6, [x3], 16               // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR  q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR  q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR  q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR  q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR  q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP  q12, q13, [x5], 32         // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP  q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP  q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP  q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load clamping_params values
        // v6/v7 are free from here on: their last use as A1/A0 copies is the
        // two FMLAs directly above.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        // x0 differs from kc by a multiple of 32, so its low 5 bits are
        // the leftover byte count.
        TST  x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
        // FMIN against v6 then FMAX against v7 (the two consecutive floats
        // replicated from params by LD2R).
3:
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8                  // nc -= 8; flags consumed by B.LO / B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                         // fewer than 8 columns left

        // Nothing between SUBS above and B.HI below may set flags: only
        // STP and non-flag-setting ADD/SUB are used here.
        STP  q20, q21, [x6]
        ADD  x6,  x6,  x14
        SUB  x3,  x3,  x2               // a0 -= kc
        STP  q22, q23, [x16]
        ADD  x16, x16, x14
        SUB  x9,  x9,  x2               // a1 -= kc
        STP  q24, q25, [x17]
        ADD  x17, x17, x14
        SUB  x10, x10, x2               // a2 -= kc
        STP  q26, q27, [x15]
        ADD  x15, x15, x14
        SUB  x11, x11, x2               // a3 -= kc
        STP  q28, q29, [x13]
        ADD  x13, x13, x14
        SUB  x12, x12, x2               // a4 -= kc
        STP  q30, q31, [x7]
        ADD  x7,  x7,  x14
        SUB  x4,  x4,  x2               // a5 -= kc

        B.HI 0b                         // more columns remain

        # Restore d8-d15 from stack
        LDP  d14, d15, [sp, 48]
        LDP  d12, d13, [sp, 32]
        LDP  d10, d11, [sp, 16]
        LDP  d8,  d9,  [sp], 64
        RET

4:
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ  x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR  q0, [x3],  16
        LDR  q1, [x9],  16
        LDR  q2, [x10], 16
        LDR  q3, [x11], 16
        LDR  q4, [x12], 16
        LDR  q5, [x4],  16
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32
        LDP  q16, q17, [x5], 32
        LDP  q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ  x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR  d0, [x3],  8
        LDR  d1, [x9],  8
        LDR  d2, [x10], 8
        LDR  d3, [x11], 8
        LDR  d4, [x12], 8
        LDR  d5, [x4],  8
        # Load B
        LDP  q12, q13, [x5], 32
        LDP  q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ  x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR  s0, [x3],  4
        LDR  s1, [x9],  4
        LDR  s2, [x10], 4
        LDR  s3, [x11], 4
        LDR  s4, [x12], 4
        LDR  s5, [x4],  4
        # Load B
        LDP  q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B    3b

        # Store odd width
        // x1 is (columns left - 8) here; subtracting 8 leaves bits 0-2
        // unchanged, so they still say how many columns (4, 2, 1) to store.
7:
        TBZ  x1, 2, 8f
        STR  q20, [x6],  16
        MOV  v20.16b, v21.16b
        STR  q22, [x16], 16
        MOV  v22.16b, v23.16b
        STR  q24, [x17], 16
        MOV  v24.16b, v25.16b
        STR  q26, [x15], 16
        MOV  v26.16b, v27.16b
        STR  q28, [x13], 16
        MOV  v28.16b, v29.16b
        STR  q30, [x7],  16
        MOV  v30.16b, v31.16b
8:
        TBZ  x1, 1, 9f
        STR  d20, [x6],  8
        DUP  d20, v20.d[1]
        STR  d22, [x16], 8
        DUP  d22, v22.d[1]
        STR  d24, [x17], 8
        DUP  d24, v24.d[1]
        STR  d26, [x15], 8
        DUP  d26, v26.d[1]
        STR  d28, [x13], 8
        DUP  d28, v28.d[1]
        STR  d30, [x7],  8
        DUP  d30, v30.d[1]

9:
        TBZ  x1, 0, 10f
        STR  s20, [x6]
        STR  s22, [x16]
        STR  s24, [x17]
        STR  s26, [x15]
        STR  s28, [x13]
        STR  s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP  d14, d15, [sp, 48]
        LDP  d12, d13, [sp, 32]
        LDP  d10, d11, [sp, 16]
        LDP  d8,  d9,  [sp], 64
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif