// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t* a,         x3
#     size_t a_stride,          x4
#     const void* w,            x5
#     uint8_t* c,               x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const float* acc,         [sp + 8] -> x15
#     const xnn_f32_minmax_params params  [sp + 16] -> x8

// Stack offsets above are relative to sp at function entry.  The kernel pushes
// a 64-byte save area for d8-d15 first, so inside the function the same three
// arguments are read from [sp, 64], [sp, 72] and [sp, 80].
//
// kc is a byte count: the A pointers advance by 16/8/4 bytes per step and are
// rewound with "aN -= kc" after each full 6x8 tile.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

// Local labels:
//   0  start of one 8-column tile (reload accumulators from acc)
//   1  main loop, 8 floats of K per iteration
//   2  epilogue, last full 8 floats of K without preloading the next block
//   3  clamp and store
//   4/5/6  K remainder of 4 / 2 / 1 floats
//   7/8/9  partial-width store of 4 / 2 / 1 columns
//   10 restore callee-saved registers and return

BEGIN_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

        # Clamp A and C pointers / Save d8-d15 on stack
        // Rows >= mr alias the previous row, so all 6 rows can always be
        // computed and stored without reading or writing out of bounds.
        CMP x0, 2                   // if mr < 2
        STP d8, d9, [sp, -64]!
        ADD x9, x3, x4              // a1 = a0 + a_stride
        ADD x16, x6, x7             // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO         //   a1 = a0
        CSEL x16, x6, x16, LO       //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4             // a2 = a1 + a_stride
        ADD x17, x16, x7            // c2 = c1 + cm_stride
                                    // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS       //   a2 = a1
        CSEL x17, x16, x17, LS      //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                   // if mr < 4
        ADD x11, x10, x4            // a3 = a2 + a_stride
        ADD x14, x17, x7            // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO      //   a3 = a2
        CSEL x14, x17, x14, LO      //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4            // a4 = a3 + a_stride
        ADD x13, x14, x7            // c4 = c3 + cm_stride
                                    // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS      //   a4 = a3
        CSEL x13, x14, x13, LS      //   c4 = c3

        # Load acc, params pointer
        LDP x15, x8, [sp, 72]

        CMP x0, 6                   // if mr < 6
        ADD x4, x12, x4             // a5 = a4 + a_stride
        ADD x7, x13, x7             // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO        //   a5 = a4
        CSEL x7, x13, x7, LO        //   c5 = c4

0:
        # Load initial accumulators
        // 12 q registers = 6 rows x 8 floats, read sequentially from acc.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32
        SUBS x0, x2, 32             // k = kc - 32
        B.LO 4f                     // fewer than 8 floats of K: remainder only

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDP q12, q13, [x5], 32      // Fetch 3 B (4th deferred)
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Are there at least 8 more floats (32 bytes) for the main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        // 96 FMA + 12 LDR A (6 rows x 2 groups) + 8 LDP B
        # 64 float weights = 256 bytes.  4 cache lines.
1:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32      // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16            // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32      // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32             // flags consumed by B.HS below (FMLA leaves them intact)
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        // 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32      // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16            // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32      // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 A.  48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]

        # Is there a remainder?- 4 floats of A (16 bytes) or less
        // Only multiples of 32 were subtracted from x0, so its low 5 bits
        // still equal those of kc.  Flags are consumed by B.NE below.
        TST x0, 31

        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        LD2R {v6.4s, v7.4s}, [x8]   // Load min/max values (v6/v7 are free once their last FMLA issued)
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMAX v20.4s, v20.4s, v6.4s
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        LDR x0, [sp, 64]            // Load cn_stride
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8              // nc -= 8; flags drive both B.LO 7f and B.HI 0b
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                     // fewer than 8 columns left: partial store

        STP q30, q31, [x7]
        ADD x7, x7, x0
        SUB x3, x3, x2              // a0 -= kc
        STP q28, q29, [x13]
        ADD x13, x13, x0
        SUB x9, x9, x2              // a1 -= kc
        STP q26, q27, [x14]
        ADD x14, x14, x0
        SUB x10, x10, x2            // a2 -= kc
        STP q24, q25, [x17]
        ADD x17, x17, x0
        SUB x11, x11, x2            // a3 -= kc
        STP q22, q23, [x16]
        ADD x16, x16, x0
        SUB x12, x12, x2            // a4 -= kc
        STP q20, q21, [x6]
        ADD x6, x6, x0
        SUB x4, x4, x2              // a5 -= kc

        B.HI 0b                     // nc > 0 after the subtract: next 8 columns

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        # Load min/max values
        // Needed here because the kc < 32 path branches to 4 before any
        // LD2R has executed; on the path from the epilogue it is a reload.
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        // x1 is nc - 8 here (negative), but its low 3 bits still equal the
        // number of remaining columns, so bits 2/1/0 select 4/2/1 columns.
7:
        TBZ x1, 2, 8f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b
8:
        TBZ x1, 1, 9f
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

9:
        TBZ x1, 0, 10f
        STR s30, [x7]
        STR s28, [x13]
        STR s26, [x14]
        STR s24, [x17]
        STR s22, [x16]
        STR s20, [x6]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemminc_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif