// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): this file is generated; the register-allocation change
// described below (x18 no longer used) should also be applied to the template
// named above, otherwise regeneration will undo it.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8

// Computes a tile of up to 6 rows x 8 columns of C = clamp(acc + A * B).
// Each pass over label 0 loads 6x8 initial accumulators from acc, walks kc
// bytes of every A row against packed B at w, clamps, and stores 8 columns
// (or the 4/2/1-column tail selected by the low bits of nc).
//
// x0 does double duty: it carries mr on entry, the remaining-k byte counter
// inside the k loop, and cn_stride (reloaded from the stack) while a full
// tile is stored. Only multiples of 32 are ever subtracted from it, so its low
// five bits always equal kc mod 32; the remainder tests rely on that.
// NOTE(review): only bits 4, 3 and 2 of kc are tested for the tail, so kc is
// presumably a multiple of 4 bytes - confirm against the callers.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
// x18 is deliberately not used: AAPCS64 designates it the platform register
// and some platforms reserve it.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75

        # Clamp A and C pointers / Save d8-d15 on stack
        // Rows at index >= mr alias the previous row, so every load and store
        // below stays inside the first mr rows.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # Load acc, params pointer
        // The 64-byte save area sits below the caller's stack arguments, so
        // cn_stride is at sp+64, acc at sp+72 and params at sp+80.
        LDP x15, x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

0:
        # Load initial accumulators
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
        PRFM PLDL1KEEP, [x5, 64]
        PRFM PLDL1KEEP, [x5, 128]
        PRFM PLDL1KEEP, [x5, 192]
        PRFM PLDL1KEEP, [x3]       // Prefetch A
        PRFM PLDL1KEEP, [x9]
        PRFM PLDL1KEEP, [x10]
        PRFM PLDL1KEEP, [x11]
        PRFM PLDL1KEEP, [x12]
        PRFM PLDL1KEEP, [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16         // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32   // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32   // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]

        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16         // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32   // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load clamping_params values
        // v6/v7 were the A0/A1 operands; their last reads are the two FMLAs
        // just above, so they can be reused for the clamp bounds here.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        // x0 (k) is dead from here until label 0 rewrites it, so it is reused
        // for cn_stride. LDR leaves the flags from the SUBS below untouched.
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8
        LDR x0, [sp, 64]         // Load cn_stride
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        // Fewer than 8 columns left: take the odd-width path instead.
        B.LO 7f

        // Rows are stored last-to-first. Each C pointer advances by cn_stride
        // and each A pointer is rewound by kc for the next column block. None
        // of these instructions set flags, so B.HI below still sees the SUBS.
        STP q30, q31, [x7]
        ADD x7, x7, x0
        SUB x3, x3, x2           // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB x9, x9, x2           // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x10, x10, x2         // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x11, x11, x2         // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x12, x12, x2         // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x0
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        // Bits 2, 1 and 0 of nc select a 4-, 2- and 1-column store. After each
        // partial store the not-yet-written columns are moved down into the
        // low lanes of the even-numbered accumulator for the next step.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif