// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# Computes a tile of up to 6 rows by 8 columns of C per pass over kc bytes of A,
# looping over nc in steps of 8 columns. Rows beyond mr alias the previous row,
# so every load and store stays inside memory owned by a valid row.

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register (reserved on several AArch64 platforms) and is
# deliberately never written by this kernel.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x8  c3 (x8 holds the params pointer only until the clamp values are loaded)
# x13 c4
# x7  c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld128

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Load clamping_params values
        # This has to happen before x8 is recycled as the c3 pointer below.
        LD2R {v6.4s, v7.4s}, [x8]

        # Clamp A and C pointers
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x8, x17, x7          // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x8, x17, x8, LO     //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x8, x7          // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x8, x13, LS    //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load cn_stride
        LDR x14, [sp]

0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          PRFM PLDL1KEEP,  [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP,  [x4]
        $else:
          # Load initial bias from w into accumulators
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP,  [x4]

        # Are there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 5f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
1:
        LDR   q0,  [x3], 16
        LDP  q16,  q17, [x5], 32
        LDR   q1,  [x9], 16
        LDR   q2, [x10], 16
        LDR   q3, [x11], 16
        LDR   q4, [x12], 16
        LDR   q5,  [x4], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        LDP  q16,  q17, [x5], 32
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        LDP  q18,  q19, [x5], 32
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        SUBS x0, x0, 16
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        B.HS 1b

        # Is there a remainder?- 2 floats of A (8 bytes) or less
        # x0 is negative here; its low 4 bits still equal kc modulo 16.
        TST x0, 15
        B.NE 5f

4:
        # Clamp
        # v6 and v7 hold the two values broadcast from params by LD2R.
        FMIN v20.4s, v20.4s, v6.4s
        # The flags set by this SUBS are consumed by B.LO 7f and B.HI 0b below;
        # none of the instructions in between modify the flags.
        SUBS x1, x1, 8
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f

        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b},  [x8], x14
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x14
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x14
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b},  [x8], x14
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x14
          SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR   d0,  [x3], 8
        LDP  q16,  q17, [x5], 32
        LDR   d1,  [x9], 8
        LDR   d2, [x10], 8
        LDR   d3, [x11], 8
        LDR   d4, [x12], 8
        LDR   d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP  q18,  q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        # Is there a remainder?- 1 float of A (4 bytes)
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
6:
        LDR   s0,  [x3], 4
        LDP  q16,  q17, [x5], 32
        LDR   s1,  [x9], 4
        LDR   s2, [x10], 4
        LDR   s3, [x11], 4
        LDR   s4, [x12], 4
        LDR   s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 4b

        # Store odd width
        # x1 holds nc - 8 here; its low 3 bits equal those of the remaining nc,
        # so bits 2, 1 and 0 select the 4, 2 and 1 column stores.
7:
        TBZ x1, 2, 8f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26,  [x8], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26,  [x8], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26,  [x8], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26,  [x8], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26,  [x8]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26,  [x8]
          STR s28, [x13]
          STR s30,  [x7]
10:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif