// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# Overview (as implemented below):
#   Each pass of the outer loop (label 0) produces one tile of up to 6 rows by
#   8 columns of C.  Accumulators v20-v31 (two 4-float vectors per row) are
#   seeded, A * B products are accumulated with FMLA, the result is clamped
#   with FMAX against v6 and FMIN against v7 (two consecutive floats read from
#   params), and the tile is stored through the six C row pointers.
#   Seeding: the INC variant reads 48 floats from acc (x15); the other variant
#   reads 8 bias floats from w (x5) and copies them to every row.
#   kc is consumed as a byte count: 32 bytes are 8 floats of each A row.
#   x0 holds mr only during pointer setup; afterwards it is reused as the
#   remaining-k counter and, in the clamp block, as cn_stride.
#   When mr < 6 the pointers of the missing rows alias the previous row, so all
#   six rows are always computed and stored.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73

        # Stack arguments are read here, before sp is moved, so the offsets
        # are relative to sp on entry.
        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers / Save d8-d15 on stack
        # The pre-indexed store lowers sp by 64; the d8-d15 saves are
        # interleaved with the pointer setup below.
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        # From here on x4 and x7 hold a5 and c5 instead of the strides.
        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        .p2align 3
        # Outer loop over nc: one iteration per block of 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x3]       // Prefetch A
          PRFM PLDL1KEEP, [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into accumulators
          # The same 8 bias floats (v20, v21) are copied to every row.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x3]       // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x4]

        # Are there at least 8 floats (32 bytes) for prologue + epilogue?
        # If not, x0 = kc - 32 still has the low 5 bits of kc, which the
        # remainder code at label 4 tests.
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP q0, q6, [x3], 32
        LDP q1, q7, [x9], 32
        LDP q2, q8, [x10], 32
        LDP q3, q9, [x11], 32
        LDP q4, q10, [x12], 32
        # load first set of B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        # Are there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        .p2align 3
1:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP q5, q11, [x4], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  Loads A0 - A4
        # Each row's A registers are reloaded only after the last FMLA
        # that reads that row's second-half register (v6 - v10).

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v20.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v21.4s, v19.4s, v6.s[3]
        LDP q0, q6, [x3], 32
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v23.4s, v19.4s, v7.s[3]
        LDP q1, q7, [x9], 32
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v25.4s, v19.4s, v8.s[3]
        LDP q2, q8, [x10], 32
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v27.4s, v19.4s, v9.s[3]
        LDP q3, q9, [x11], 32
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v29.4s, v19.4s, v10.s[3]
        LDP q4, q10, [x12], 32
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32
        FMLA v31.4s, v17.4s, v11.s[2]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 1 LDP A + 6 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP q5, q11, [x4], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  No A Loads, No last B load

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load min/max values
        # v6 and v7 are free here: the two FMLAs above were the last to read
        # them as A1/A0 data.  Both params floats are replicated to all lanes.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder? - fewer than 8 floats of A (under 32 bytes)
        # x0 has only been reduced by multiples of 32, so its low 5 bits are
        # still those of kc.
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        .p2align 3

        # Clamp
        # FMAX against v6 applies the lower bound, FMIN against v7 the upper.
3:
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        # cn_stride was at [sp] on entry; sp is 64 bytes lower since the
        # d8-d15 save area was pushed.
        LDR x0, [sp, 64]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags feed B.LO and B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Fewer than 8 columns were left: take the partial-width store path.
        B.LO 7f

        # Each row: store 8 floats, advance the C pointer by cn_stride, and
        # rewind the A pointer by kc for the next block of columns.
        $if INC:
          STP q30, q31, [x7]
          ADD x7, x7, x0
          SUB x3, x3, x2           // a0 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0
          SUB x9, x9, x2           // a1 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0
          SUB x10, x10, x2         // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0
          SUB x11, x11, x2         // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0
          SUB x12, x12, x2         // a4 -= kc
          STP q20, q21, [x6]
          ADD x6, x6, x0
          SUB x4, x4, x2           // a5 -= kc
        $else:
          STP q20, q21, [x6]
          ADD x6, x6, x0
          SUB x3, x3, x2           // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0
          SUB x9, x9, x2           // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0
          SUB x10, x10, x2         // a2 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0
          SUB x11, x11, x2         // a3 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0
          SUB x12, x12, x2         // a4 -= kc
          STP q30, q31, [x7]
          ADD x7, x7, x0
          SUB x4, x4, x2           // a5 -= kc

        NOP
        # Flags are still those of SUBS x1, x1, 8: loop while columns remain.
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        .p2align 3
        # k remainder: entered when kc < 32 bytes or when kc is not a
        # multiple of 32.  Bits 4, 3 and 2 of x0 select the 4-, 2- and 1-float
        # steps below.
4:
        # Load min/max values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        .p2align 3

        # Store odd width
        # Reached when fewer than 8 columns remained.  x1 is now negative, but
        # its low 3 bits still equal the remaining column count; bits 2, 1 and
        # 0 select the 4-, 2- and 1-column stores.  After each partial store
        # the not-yet-stored lanes are moved down to the low end of the
        # register.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30, [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif