// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x0 (reloaded from the stack before each store)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes a tile of up to 6 rows x 8 columns of C per pass over kc bytes of A:
#   C[m][0..7] = clamp(init[m][0..7] + sum over k of A[m][k] * B[k][0..7])
# where init is the 8-float bias read from w (GEMM) or the 6x8 block read from
# acc (GEMMINC), and B is read from w as 8 consecutive floats per k step.
# Rows beyond mr alias the previous row's A and C pointers, so they recompute
# and rewrite that row instead of touching out-of-range memory.
# kc is counted in bytes (4 per float).

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register (reserved on several OS ABIs) and is
# deliberately not used by this kernel.

# x0 holds mr during setup, then the remaining-k byte counter inside the
# GEMM loops, and finally cn_stride while the tile is being stored.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0  v6
# A1   v1  v7
# A2   v2  v8
# A3   v3  v9
# A4   v4 v10
# A5   v5 v11
# B   v12 v13 v14 v15
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7  (v6 = first float of params, applied with FMIN;
#               v7 = second float of params, applied with FMAX)

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

        # Clamp A and C pointers / Save d8-d15 on stack
        # The 64-byte frame pushed here shifts the stack arguments:
        # cn_stride is at [sp, 64] and the next argument at [sp, 72].
        STP d8, d9, [sp, -64]!
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        STP d10, d11, [sp, 16]
        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        STP d12, d13, [sp, 32]
        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        STP d14, d15, [sp, 48]
        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 72]
        $else:
          # Load params pointer
          LDR x8, [sp, 72]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Outer loop: one iteration per group of 8 output columns.
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
            PRFM PLDL1KEEP, [x5, 64]
            PRFM PLDL1KEEP, [x5, 128]
            PRFM PLDL1KEEP, [x5, 192]
            PRFM PLDL1KEEP, [x3]       // Prefetch A
            PRFM PLDL1KEEP, [x9]
            PRFM PLDL1KEEP, [x10]
            PRFM PLDL1KEEP, [x11]
            PRFM PLDL1KEEP, [x12]
            PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          MOV v23.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x3]       // Prefetch A
          MOV v27.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          $if PREFETCH:
            PRFM PLDL1KEEP, [x4]

        # Is there at least 8 floats (32 bytes) for prologue + epilogue?
        SUBS x0, x2, 32          // k = kc - 32
        B.LO 4f

        # Prologue - loads for main loop of 96 FMA
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        LDP q12, q13, [x5], 32   // Fetch 3 B (4th deferred)
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32

        # Is there at least 8 floats (32 bytes) for main loop?
        SUBS x0, x0, 32
        B.LO 2f

        # Main loop - 8 floats of A (32 bytes)
        # 96 FMA + 12 LDR A + 8 LDP B
1:
        # First group of 4 floats of A. 48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32         // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16               // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32         // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 floats of A. 48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        LDR q0, [x3], 16               // Load next 6 A
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        LDR q1, [x9], 16
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        LDR q2, [x10], 16
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]
        LDR q3, [x11], 16

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        LDR q4, [x12], 16
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        LDR q5, [x4], 16
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        LDP q12, q13, [x5], 32         // Load next 3 B (not last)
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]
        LDP q16, q17, [x5], 32

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        SUBS x0, x0, 32                // k -= 32; flags consumed by B.HS below
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]
        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.HS 1b

        # Epilogue - 8 floats of A (32 bytes)
        # 96 FMA + 6 LDR A + 5 LDP B
        # First block same as main loop.  Second block has no preloads.
2:
        # First group of 4 floats of A. 48 FMA.
        FMLA v20.4s, v12.4s, v0.s[0]
        LDP q18, q19, [x5], 32         // Load last B
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 128]      // Prefetch B
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        $if PREFETCH:
          PRFM PLDL1KEEP, [x5, 256]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        LDR q6, [x3], 16               // Load next 6 A
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]
        LDR q7, [x9], 16

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        LDR q8, [x10], 16
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        LDR q9, [x11], 16
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        LDR q10, [x12], 16
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]
        LDR q11, [x4], 16

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        LDP q12, q13, [x5], 32         // Load 4 B
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        LDP q14, q15, [x5], 32
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        LDP q16, q17, [x5], 32
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        LDP q18, q19, [x5], 32

        # Second group of 4 floats of A. 48 FMA.
        FMLA v20.4s, v12.4s, v6.s[0]
        FMLA v22.4s, v12.4s, v7.s[0]
        FMLA v24.4s, v12.4s, v8.s[0]
        FMLA v26.4s, v12.4s, v9.s[0]
        FMLA v28.4s, v12.4s, v10.s[0]
        FMLA v30.4s, v12.4s, v11.s[0]
        FMLA v21.4s, v13.4s, v6.s[0]
        FMLA v23.4s, v13.4s, v7.s[0]
        FMLA v25.4s, v13.4s, v8.s[0]
        FMLA v27.4s, v13.4s, v9.s[0]
        FMLA v29.4s, v13.4s, v10.s[0]
        FMLA v31.4s, v13.4s, v11.s[0]

        FMLA v20.4s, v14.4s, v6.s[1]
        FMLA v22.4s, v14.4s, v7.s[1]
        FMLA v24.4s, v14.4s, v8.s[1]
        FMLA v26.4s, v14.4s, v9.s[1]
        FMLA v28.4s, v14.4s, v10.s[1]
        FMLA v30.4s, v14.4s, v11.s[1]
        FMLA v21.4s, v15.4s, v6.s[1]
        FMLA v23.4s, v15.4s, v7.s[1]
        FMLA v25.4s, v15.4s, v8.s[1]
        FMLA v27.4s, v15.4s, v9.s[1]
        FMLA v29.4s, v15.4s, v10.s[1]
        FMLA v31.4s, v15.4s, v11.s[1]

        FMLA v20.4s, v16.4s, v6.s[2]
        FMLA v22.4s, v16.4s, v7.s[2]
        FMLA v24.4s, v16.4s, v8.s[2]
        FMLA v26.4s, v16.4s, v9.s[2]
        FMLA v28.4s, v16.4s, v10.s[2]
        FMLA v30.4s, v16.4s, v11.s[2]
        FMLA v21.4s, v17.4s, v6.s[2]
        FMLA v23.4s, v17.4s, v7.s[2]
        FMLA v25.4s, v17.4s, v8.s[2]
        FMLA v27.4s, v17.4s, v9.s[2]
        FMLA v29.4s, v17.4s, v10.s[2]
        FMLA v31.4s, v17.4s, v11.s[2]

        FMLA v20.4s, v18.4s, v6.s[3]
        FMLA v22.4s, v18.4s, v7.s[3]
        FMLA v24.4s, v18.4s, v8.s[3]
        FMLA v26.4s, v18.4s, v9.s[3]
        FMLA v28.4s, v18.4s, v10.s[3]
        FMLA v30.4s, v18.4s, v11.s[3]
        FMLA v21.4s, v19.4s, v6.s[3]
        FMLA v23.4s, v19.4s, v7.s[3]

        # Load clamping_params values
        # (v6/v7 are free: the two FMLAs above were their last use as A data)
        LD2R {v6.4s, v7.4s}, [x8]

        FMLA v25.4s, v19.4s, v8.s[3]
        FMLA v27.4s, v19.4s, v9.s[3]
        # Is there a remainder of k (low 5 bits of the byte counter set)?
        TST x0, 31
        FMLA v29.4s, v19.4s, v10.s[3]
        FMLA v31.4s, v19.4s, v11.s[3]
        B.NE 4f

        # Clamp
3:
        FMIN v20.4s, v20.4s, v6.4s
        # Load cn_stride.  The k counter in x0 is dead from here until it is
        # recomputed from kc (x2) at label 0.  LDR leaves the flags untouched.
        LDR x0, [sp, 64]
        SUBS x1, x1, 8                 // nc -= 8; flags consumed by B.LO / B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f

        $if INC:
          STP q30, q31, [x7]
          ADD x7, x7, x0               // c5 += cn_stride
          SUB x3, x3, x2               // a0 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0             // c4 += cn_stride
          SUB x9, x9, x2               // a1 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0             // c3 += cn_stride
          SUB x10, x10, x2             // a2 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0             // c2 += cn_stride
          SUB x11, x11, x2             // a3 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0             // c1 += cn_stride
          SUB x12, x12, x2             // a4 -= kc
          STP q20, q21, [x6]
          ADD x6, x6, x0               // c0 += cn_stride
          SUB x4, x4, x2               // a5 -= kc
        $else:
          STP q20, q21, [x6]
          ADD x6, x6, x0               // c0 += cn_stride
          SUB x3, x3, x2               // a0 -= kc
          STP q22, q23, [x16]
          ADD x16, x16, x0             // c1 += cn_stride
          SUB x9, x9, x2               // a1 -= kc
          STP q24, q25, [x17]
          ADD x17, x17, x0             // c2 += cn_stride
          SUB x10, x10, x2             // a2 -= kc
          STP q26, q27, [x14]
          ADD x14, x14, x0             // c3 += cn_stride
          SUB x11, x11, x2             // a3 -= kc
          STP q28, q29, [x13]
          ADD x13, x13, x0             // c4 += cn_stride
          SUB x12, x12, x2             // a4 -= kc
          STP q30, q31, [x7]
          ADD x7, x7, x0               // c5 += cn_stride
          SUB x4, x4, x2               // a5 -= kc

        # More columns left (nc > 0 after the subtraction)?
        B.HI 0b

        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

4:
        # Load clamping_params values
        LD2R {v6.4s, v7.4s}, [x8]

        # Is there a remainder?- 4 floats of A (16 bytes)
        TBZ x0, 4, 5f

        # Remainder- 4 floats of A (16 bytes)
        # Load A
        LDR q0, [x3], 16
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32
        LDP q16, q17, [x5], 32
        LDP q18, q19, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]

        # Is there a remainder?- 2 floats of A (8 bytes)
5:
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        # Load A
        LDR d0, [x3], 8
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # Load B
        LDP q12, q13, [x5], 32
        LDP q14, q15, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]

        FMLA v20.4s, v14.4s, v0.s[1]
        FMLA v22.4s, v14.4s, v1.s[1]
        FMLA v24.4s, v14.4s, v2.s[1]
        FMLA v26.4s, v14.4s, v3.s[1]
        FMLA v28.4s, v14.4s, v4.s[1]
        FMLA v30.4s, v14.4s, v5.s[1]
        FMLA v21.4s, v15.4s, v0.s[1]
        FMLA v23.4s, v15.4s, v1.s[1]
        FMLA v25.4s, v15.4s, v2.s[1]
        FMLA v27.4s, v15.4s, v3.s[1]
        FMLA v29.4s, v15.4s, v4.s[1]
        FMLA v31.4s, v15.4s, v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
6:
        TBZ x0, 2, 3b

        # Remainder- 1 float of A (4 bytes)
        # Load A
        LDR s0, [x3], 4
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        # Load B
        LDP q12, q13, [x5], 32

        FMLA v20.4s, v12.4s, v0.s[0]
        FMLA v22.4s, v12.4s, v1.s[0]
        FMLA v24.4s, v12.4s, v2.s[0]
        FMLA v26.4s, v12.4s, v3.s[0]
        FMLA v28.4s, v12.4s, v4.s[0]
        FMLA v30.4s, v12.4s, v5.s[0]
        FMLA v21.4s, v13.4s, v0.s[0]
        FMLA v23.4s, v13.4s, v1.s[0]
        FMLA v25.4s, v13.4s, v2.s[0]
        FMLA v27.4s, v13.4s, v3.s[0]
        FMLA v29.4s, v13.4s, v4.s[0]
        FMLA v31.4s, v13.4s, v5.s[0]
        B 3b

        # Store odd width
        # Fewer than 8 columns remain; bits 2, 1 and 0 of x1 select
        # 4-, 2- and 1-float stores in turn.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30, [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30, [x7]
10:
        # Restore d8-d15 from stack
        LDP d14, d15, [sp, 48]
        LDP d12, d13, [sp, 32]
        LDP d10, d11, [sp, 16]
        LDP d8, d9, [sp], 64
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_cortex_${"a75" if PREFETCH else "a57"}

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif