// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# F32 GEMM microkernel: computes a 6-row x 8-column tile of C = A * B
# (plus bias, or plus an external accumulator in the INC variant), with
# output clamping.  A is read 2 floats (64 bits) at a time per row in the
# main loop, hence "ld64".
#
# void xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
$if INC:
  #     const float*restrict acc,  [sp + 8] -> x15
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 16] -> x8
$else:
  #     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# This kernel uses neither, so no stack frame is set up.
# NOTE(review): x18 is used as c3 below; x18 is the platform register on some
# AAPCS64 targets (Apple, Windows) — confirm this kernel is only built where
# clobbering x18 is permitted.

# A pointers (one per row of the 6-row tile)
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers (one per row of the 6-row tile)
# x6  c0
# x16 c1
# x17 c2
# x18 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B    v16 v17 v18 v19
# C    v20 v21
# C    v22 v23
# C    v24 v25
# C    v26 v27
# C    v28 v29
# C    v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64

        # Clamp A and C pointers: row i is only active when mr > i; inactive
        # rows alias the previous row so loads/stores stay in bounds.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x18, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x18, x17, x18, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x18, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x18, x13, LS   //   c4 = c3

        $if INC:
          # Load acc, params pointer
          LDP x15, x8, [sp, 8]
        $else:
          # Load params pointer
          LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values: v6 = upper bound (used with FMIN),
        # v7 = lower bound (used with FMAX), each splatted across 4 lanes.
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

# Outer loop: one iteration per 8-column slice of the output (nc counted down
# in bytes at label 3).
0:
        $if INC:
          # Load initial accumulators
          LDP q20, q21, [x15], 32
          LDP q22, q23, [x15], 32
          LDP q24, q25, [x15], 32
          LDP q26, q27, [x15], 32
          LDP q28, q29, [x15], 32
          LDP q30, q31, [x15], 32
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          PRFM PLDL1KEEP, [x5, 64]
          PRFM PLDL1KEEP, [x5, 128]
          PRFM PLDL1KEEP, [x5, 192]
          PRFM PLDL1KEEP, [x3]       // Prefetch A
          PRFM PLDL1KEEP, [x9]
          PRFM PLDL1KEEP, [x10]
          PRFM PLDL1KEEP, [x11]
          PRFM PLDL1KEEP, [x12]
          PRFM PLDL1KEEP, [x4]
        $else:
          # Load initial bias from w into accumulators
          # (prefetches interleaved with the MOVs to hide latency)
          LDP q20, q21, [x5], 32
          MOV v22.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 0]    // Prefetch B
          MOV v23.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 64]
          MOV v24.16b, v20.16b
          PRFM PLDL1KEEP, [x5, 128]
          MOV v25.16b, v21.16b
          PRFM PLDL1KEEP, [x5, 192]
          MOV v26.16b, v20.16b
          PRFM PLDL1KEEP, [x3]       // Prefetch A
          MOV v27.16b, v21.16b
          PRFM PLDL1KEEP, [x9]
          MOV v28.16b, v20.16b
          PRFM PLDL1KEEP, [x10]
          MOV v29.16b, v21.16b
          PRFM PLDL1KEEP, [x11]
          MOV v30.16b, v20.16b
          PRFM PLDL1KEEP, [x12]
          MOV v31.16b, v21.16b
          PRFM PLDL1KEEP, [x4]

        # Is there at least 2 floats (8 bytes) for main loop?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 4f

        # Main loop - 2 floats of A (8 bytes) per row per iteration
        # 24 FMA + 6 LD64 A + 2 LDP B
1:
        LDR d0, [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        # First A column (lane 0) against B columns v16/v17
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]

        # Second A column (lane 1) against B columns v18/v19
        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v1.s[1]
        FMLA v24.4s, v18.4s, v2.s[1]
        FMLA v26.4s, v18.4s, v3.s[1]
        FMLA v28.4s, v18.4s, v4.s[1]
        FMLA v30.4s, v18.4s, v5.s[1]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v1.s[1]
        FMLA v25.4s, v19.4s, v2.s[1]
        FMLA v27.4s, v19.4s, v3.s[1]
        SUBS x0, x0, 8           // k -= 8; interleaved to hide flag latency
        FMLA v29.4s, v19.4s, v4.s[1]
        FMLA v31.4s, v19.4s, v5.s[1]
        B.HS 1b

        # Is there a remainder?- 1 float of A (4 bytes)
        # After the loop k is negative (-8 or -4); bit 2 set means 4 bytes left.
        TBNZ x0, 2, 4f
3:
        # Clamp accumulators to [v7, v6]
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; interleaved with the clamps
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8 (fewer than 8 columns remain -> partial store at 5f)
        B.LO 5f

        # A pointers are rewound by kc so the next 8-column slice re-reads the
        # same rows of A.
        $if INC:
          ST1 {v30.16b, v31.16b}, [x7], x14
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB x9, x9, x2           // a1 -= kc
          ST1 {v26.16b, v27.16b}, [x18], x14
          SUB x10, x10, x2         // a2 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x11, x11, x2         // a3 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x12, x12, x2         // a4 -= kc
          ST1 {v20.16b, v21.16b}, [x6], x14
          SUB x4, x4, x2           // a5 -= kc
        $else:
          ST1 {v20.16b, v21.16b}, [x6], x14
          SUB x3, x3, x2           // a0 -= kc
          ST1 {v22.16b, v23.16b}, [x16], x14
          SUB x9, x9, x2           // a1 -= kc
          ST1 {v24.16b, v25.16b}, [x17], x14
          SUB x10, x10, x2         // a2 -= kc
          ST1 {v26.16b, v27.16b}, [x18], x14
          SUB x11, x11, x2         // a3 -= kc
          ST1 {v28.16b, v29.16b}, [x13], x14
          SUB x12, x12, x2         // a4 -= kc
          ST1 {v30.16b, v31.16b}, [x7], x14
          SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // more columns remain (nc > 0)
        RET

4:
        # Remainder- 1 float of A (4 bytes) per row
        LDR s0, [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]
        B 3b

        # Store odd width: remaining nc in {1..7} columns, decomposed by bits
        # of x1 (4, then 2, then 1 columns); surviving halves are shifted down
        # into the low lanes for the next narrower store.
5:
        TBZ x1, 2, 6f
        $if INC:
          STR q30, [x7], 16
          MOV v30.16b, v31.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
        $else:
          STR q20, [x6], 16
          MOV v20.16b, v21.16b
          STR q22, [x16], 16
          MOV v22.16b, v23.16b
          STR q24, [x17], 16
          MOV v24.16b, v25.16b
          STR q26, [x18], 16
          MOV v26.16b, v27.16b
          STR q28, [x13], 16
          MOV v28.16b, v29.16b
          STR q30, [x7], 16
          MOV v30.16b, v31.16b

6:
        TBZ x1, 1, 7f
        $if INC:
          STR d30, [x7], 8
          DUP d30, v30.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d20, [x6], 8
          DUP d20, v20.d[1]
        $else:
          STR d20, [x6], 8
          DUP d20, v20.d[1]
          STR d22, [x16], 8
          DUP d22, v22.d[1]
          STR d24, [x17], 8
          DUP d24, v24.d[1]
          STR d26, [x18], 8
          DUP d26, v26.d[1]
          STR d28, [x13], 8
          DUP d28, v28.d[1]
          STR d30, [x7], 8
          DUP d30, v30.d[1]

7:
        TBZ x1, 0, 8f
        $if INC:
          STR s30, [x7]
          STR s28, [x13]
          STR s26, [x18]
          STR s24, [x17]
          STR s22, [x16]
          STR s20, [x6]
        $else:
          STR s20, [x6]
          STR s22, [x16]
          STR s24, [x17]
          STR s26, [x18]
          STR s28, [x13]
          STR s30, [x7]
8:
        RET

END_FUNCTION xnn_f32_gemm${"inc" if INC else ""}_ukernel_6x8__aarch64_neonfma_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif