// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a73(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x0
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Summary
# For each group of 8 output columns the kernel accumulates up to 6 rows of
# A times the packed weights in w into v20-v31, clamps the accumulators with
# FMIN / FMAX against the two values read from params, and stores them to
# the 6 C row pointers.  Rows at index mr and above alias the previous row
# pointers (see the CSEL sequence below), so they recompute and rewrite the
# previous row instead of touching memory past the caller's rows.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the AAPCS64 platform register and is deliberately not used.

# Register lifetime notes
# x0 holds mr on entry, then the remaining k byte count inside the K loop,
# and finally cn_stride from the clamp section until the next N iteration.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7   (v6 = upper bound used by FMIN, v7 = lower bound used by FMAX)

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a73

        # Clamp A and C pointers / Save d8-d15 on stack
        # The pre-indexed STP reserves 64 bytes, so the stack arguments that
        # were at [sp] and [sp + 8] on entry are now at [sp, 64] and [sp, 72].
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2  (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4  (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 72]
        $else:
          # Load params pointer
          LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Top of the loop over nc, one iteration per 8 output columns
        .p2align 3
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x3]     // Prefetch A
          PRFM PLDL1KEEP, [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x3]     // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        # load A0 to A4 but not A5
        LDP q0, q6, [x3], 32
        LDP q1, q7, [x9], 32
        LDP q2, q8, [x10], 32
        LDP q3, q9, [x11], 32
        LDP q4, q10, [x12], 32
        # load first set of B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDP A + 8 LDP B
        .p2align 3
1:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP q5, q11, [x4], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        PRFM PLDL1KEEP, [x5, 128]  // Prefetch B
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        PRFM PLDL1KEEP, [x5, 256]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  Loads A0 - A4

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Each A register pair is reloaded only after the last FMLA that
        # reads its second half (v6 - v10), so the loads for the next
        # iteration overlap the tail of this one.
        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v20.4s, v18.4s, v6.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v21.4s, v19.4s, v6.s[3]
        LDP q0, q6, [x3], 32
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v23.4s, v19.4s, v7.s[3]
        LDP q1, q7, [x9], 32
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v25.4s, v19.4s, v8.s[3]
        LDP q2, q8, [x10], 32
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v27.4s, v19.4s, v9.s[3]
        LDP q3, q9, [x11], 32
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v29.4s, v19.4s, v10.s[3]
        LDP q4, q10, [x12], 32
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v30.4s, v18.4s, v11.s[3]
        SUBS x0, x0, 32
        FMLA v31.4s, v17.4s, v11.s[2]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 1 LDP A + 6 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 A.  48 FMA.  Loads A5

        LDP q5, q11, [x4], 32
        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        LDP q16, q17, [x5], 32
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        LDP q12, q13, [x5], 32
        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        LDP q14, q15, [x5], 32
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Second group of 4 A.  48 FMA.  No A Loads, No last B load

        LDP q16, q17, [x5], 32
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        LDP q18, q19, [x5], 32
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        # Last part of epilogue has loads removed.

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load clamping_params values
        # v6 and v7 were last read as A data by the two FMLAs just above.
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder?- 4 floats of A (16 bytes) or less
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        .p2align 3

        # Clamp
3:
        SUBS x1, x1, 8
        # Load cn_stride.  x0 (k) is dead here: every remainder test on it
        # happens before reaching this label, and it is recomputed at 0:.
        # This load takes the place of a NOP that used to precede B.LO 7f.
        LDR x0, [sp, 64]
        FMIN v20.4s, v20.4s, v6.4s
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Flags are still those of SUBS x1, x1, 8: LO means fewer than 8
        # columns were left, so take the partial-width store path.
        B.LO 7f

        $if INC:
          STP q30, q31, [x7]
          ADD x7, x7, x0
          SUB x3, x3, x2           // a0 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0
          SUB x9, x9, x2           // a1 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0
          SUB x10, x10, x2         // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0
          SUB x11, x11, x2         // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0
          SUB x12, x12, x2         // a4 -= kc
          STP q20, q21, [x6]
          ADD x6, x6, x0
          SUB x4, x4, x2           // a5 -= kc
        $else:
          STP q20, q21, [x6]
          ADD x6, x6, x0
          SUB x3, x3, x2           // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0
          SUB x9, x9, x2           // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0
          SUB x10, x10, x2         // a2 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0
          SUB x11, x11, x2         // a3 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0
          SUB x12, x12, x2         // a4 -= kc
          STP q30, q31, [x7]
          ADD x7, x7, x0
          SUB x4, x4, x2           // a5 -= kc

        # None of the stores or ADD / SUB above write the flags, so B.HI
        # still tests the result of SUBS x1, x1, 8 (more columns remain).
        NOP
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

        .p2align 3
4:
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        .p2align 3

        # Store odd width
        # x1 is nc - 8 here (negative); its low three bits equal those of
        # the remaining column count, so bits 2, 1 and 0 select a 4, 2 and
        # 1 column store in turn.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30, [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_a73

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif