// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# F32 GEMM microkernel template for a tile of up to 6 rows by 8 columns.
# Each pass over the k dimension multiplies 6 rows of A by 8-column slices of
# the packed weights w, accumulates into v20-v31, clamps the result between
# the two values loaded from params, and stores up to 8 columns per row of C.
# The template flag INC selects where the accumulators start from: the acc
# buffer when INC is set, otherwise the first 8 floats read from w.
# NOTE(review): kc is consumed as a byte count and is presumably a nonzero
# multiple of 4 - the remainder path always processes at least one float -
# confirm against callers.

# void xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# This kernel touches none of them, so it has no prologue or epilogue.

# Register x0 is reused three times: it holds mr while the row pointers are
# set up, then the k byte counter inside each pass, then cn_stride for the
# stores.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5  (overwrites a_stride once a5 has been computed)

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5  (overwrites cm_stride once c5 has been computed)

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7  (v6 is the FMAX operand, v7 is the FMIN operand)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_ld128

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        # Clamp A and C pointers
        # Every row pointer is the previous row plus its stride. When the row
        # index is not below mr, the pointer aliases the previous row instead,
        # so all six rows can always be loaded and stored.
        # One CMP serves two rows: LO covers the odd bound, LS the next one.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load min/max values
        # LD2R reads two consecutive floats at params and replicates the first
        # into every lane of v6 and the second into every lane of v7.
        LD2R {v6.4s, v7.4s}, [x8]

        # Start of one pass over k for the current group of up to 8 columns.
0:
        $if INC:
          # Load initial accumulators
          # Twelve vectors (6 rows x 8 floats) are read from acc, which
          # advances by 192 bytes per pass.
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          PRFM PLDL1KEEP,  [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP,  [x4]
        $else:
          # Load initial bias from w into accumulators
          # The 8 floats read into v20/v21 are copied to the other five rows;
          # the prefetches are interleaved with the copies.
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP,  [x3]    // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP,  [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP,  [x4]

        # Are there at least 4 floats (16 bytes)?
        # From here on x0 holds the remaining k bytes minus 16, so it goes
        # below zero (borrow, LO) once fewer than 16 bytes are left.
        SUBS x0, x2, 16  // k = kc - 16
        B.LO 3f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
        # Each iteration reads one q register per A row and 4 x 8 floats of B.
        # B alternates between v16/v17 and v18/v19 so the next pair is loaded
        # while the current pair is being consumed.
1:
        LDR q0,  [x3], 16
        LDP q16, q17, [x5], 32
        LDR q1,  [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5,  [x4], 16
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        FMLA v20.4s, v16.4s,  v0.s[2]
        LDP q18, q19, [x5], 32
        FMLA v22.4s, v16.4s,  v1.s[2]
        FMLA v24.4s, v16.4s,  v2.s[2]
        FMLA v26.4s, v16.4s,  v3.s[2]
        FMLA v28.4s, v16.4s,  v4.s[2]
        FMLA v30.4s, v16.4s,  v5.s[2]
        FMLA v21.4s, v17.4s,  v0.s[2]
        FMLA v23.4s, v17.4s,  v1.s[2]
        FMLA v25.4s, v17.4s,  v2.s[2]
        FMLA v27.4s, v17.4s,  v3.s[2]
        FMLA v29.4s, v17.4s,  v4.s[2]
        FMLA v31.4s, v17.4s,  v5.s[2]

        FMLA v20.4s, v18.4s,  v0.s[3]
        FMLA v22.4s, v18.4s,  v1.s[3]
        FMLA v24.4s, v18.4s,  v2.s[3]
        FMLA v26.4s, v18.4s,  v3.s[3]
        FMLA v28.4s, v18.4s,  v4.s[3]
        FMLA v30.4s, v18.4s,  v5.s[3]
        FMLA v21.4s, v19.4s,  v0.s[3]
        FMLA v23.4s, v19.4s,  v1.s[3]
        FMLA v25.4s, v19.4s,  v2.s[3]
        FMLA v27.4s, v19.4s,  v3.s[3]
        SUBS x0, x0, 16          // k -= 16; the FMLAs below leave the flags intact
        FMLA v29.4s, v19.4s,  v4.s[3]
        FMLA v31.4s, v19.4s,  v5.s[3]
        B.HS 1b                  // loop while at least 16 more bytes remain

        # Is there a remainder? - 2 floats of A (8 bytes) or less
        # Only multiples of 16 were subtracted from kc, so the low 4 bits of
        # x0 still equal the low 4 bits of kc.
        TST x0, 15
        B.NE 3f

        # All of k has been accumulated; the remainder paths rejoin here.
2:
        # Clamp
        # FMAX against v6 applies the lower bound, FMIN against v7 the upper.
        FMAX v20.4s, v20.4s, v6.4s
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v21.4s, v21.4s, v6.4s
        FMAX v22.4s, v22.4s, v6.4s
        FMAX v23.4s, v23.4s, v6.4s
        FMAX v24.4s, v24.4s, v6.4s
        FMAX v25.4s, v25.4s, v6.4s
        FMAX v26.4s, v26.4s, v6.4s
        FMAX v27.4s, v27.4s, v6.4s
        FMAX v28.4s, v28.4s, v6.4s
        FMAX v29.4s, v29.4s, v6.4s
        FMAX v30.4s, v30.4s, v6.4s
        FMAX v31.4s, v31.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags are consumed by B.LO and B.HI below
        FMIN v20.4s, v20.4s, v7.4s
        FMIN v21.4s, v21.4s, v7.4s
        FMIN v22.4s, v22.4s, v7.4s
        FMIN v23.4s, v23.4s, v7.4s
        FMIN v24.4s, v24.4s, v7.4s
        FMIN v25.4s, v25.4s, v7.4s
        FMIN v26.4s, v26.4s, v7.4s
        FMIN v27.4s, v27.4s, v7.4s
        FMIN v28.4s, v28.4s, v7.4s
        FMIN v29.4s, v29.4s, v7.4s
        FMIN v30.4s, v30.4s, v7.4s
        FMIN v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        # Taken to the odd-width path when fewer than 8 columns were left.
        B.LO 5f

        # Each ST1 post-increments its C pointer by cn_stride (x0). The A
        # pointers are rewound by kc so the next pass rereads the same rows.
        $if INC:
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x4,  x4, x2 // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b},  [x6], x0
          SUB  x3,  x3, x2 // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x0
          SUB  x9,  x9, x2 // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x0
          SUB x10, x10, x2 // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x14], x0
          SUB x11, x11, x2 // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x0
          SUB x12, x12, x2 // a4 -= kc
          ST1 {v30.16b, v31.16b},  [x7], x0
          SUB  x4,  x4, x2 // a5 -= kc

        # Flags still come from SUBS x1 above, since ST1 and SUB do not set
        # them. HI means more columns remain, so run another pass.
        B.HI 0b
        RET

3:
        # Is there a remainder? - 2 floats of A (8 bytes)
        # Bit 3 of x0 is the 8-byte bit of the leftover k byte count.
        TBZ x0, 3, 4f

        # Remainder - 2 floats of A (8 bytes)
        LDR d0,  [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1,  [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5,  [x4], 8
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]

        FMLA v20.4s, v18.4s,  v0.s[1]
        FMLA v22.4s, v18.4s,  v1.s[1]
        FMLA v24.4s, v18.4s,  v2.s[1]
        FMLA v26.4s, v18.4s,  v3.s[1]
        FMLA v28.4s, v18.4s,  v4.s[1]
        FMLA v30.4s, v18.4s,  v5.s[1]
        FMLA v21.4s, v19.4s,  v0.s[1]
        FMLA v23.4s, v19.4s,  v1.s[1]
        FMLA v25.4s, v19.4s,  v2.s[1]
        FMLA v27.4s, v19.4s,  v3.s[1]
        FMLA v29.4s, v19.4s,  v4.s[1]
        FMLA v31.4s, v19.4s,  v5.s[1]

        # Is there a remainder? - 1 float of A (4 bytes)
        # Bit 2 of x0 is the 4-byte bit; when clear, go straight to the clamp.
        TBZ x0, 2, 2b

        # Remainder - 1 float of A (4 bytes)
4:
        LDR s0,  [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1,  [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5,  [x4], 4
        FMLA v20.4s, v16.4s,  v0.s[0]
        FMLA v22.4s, v16.4s,  v1.s[0]
        FMLA v24.4s, v16.4s,  v2.s[0]
        FMLA v26.4s, v16.4s,  v3.s[0]
        FMLA v28.4s, v16.4s,  v4.s[0]
        FMLA v30.4s, v16.4s,  v5.s[0]
        FMLA v21.4s, v17.4s,  v0.s[0]
        FMLA v23.4s, v17.4s,  v1.s[0]
        FMLA v25.4s, v17.4s,  v2.s[0]
        FMLA v27.4s, v17.4s,  v3.s[0]
        FMLA v29.4s, v17.4s,  v4.s[0]
        FMLA v31.4s, v17.4s,  v5.s[0]
        B 2b

        # Store odd width
        # Reached with x1 = nc - 8 below zero; its low three bits equal those
        # of the leftover column count. Bits 2, 1 and 0 select stores of 4, 2
        # and 1 columns. After each partial store the unstored columns are
        # moved down to the low lanes for the next step.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20,  [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x14], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30,  [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        $if INC:
          STR d30,  [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20,  [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x14], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30,  [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR s30,  [x7]
          STR s28, [x13]
          STR s26, [x14]
          STR s24, [x17]
          STR s22, [x16]
          STR s20,  [x6]
        $else:
          STR s20,  [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x14]
          STR s28, [x13]
          STR s30,  [x7]
8:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_minmax_ukernel_6x8__aarch64_neonfma_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif