// Auto-generated file. Do not edit!
//   Template: src/f32-gemm/6x8-aarch64-neonfma-ld128.S.in
//   Generator: tools/xngen
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Syntax: GNU as (AArch64), preprocessed by the C preprocessor.
// ABI:    AAPCS64. Leaf function, no stack frame; the two stack-passed
//         arguments are read directly from [sp] and [sp + 8].
//
// NOTE(review): this file is generated. The c3 register change below
// (x18 -> x15) should also be applied to the template named above,
// otherwise it will be lost on the next regeneration.

# void xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x14
#     const union xnn_f32_output_params params[restrict static 1])  [sp + 8] -> x8

# d8-d15 need to be preserved if used.
# x19-30 need to be preserved if used.
# x18 is the platform register and must not be used.
// None of d8-d15, x18 or x19-x30 are touched by this kernel, so nothing
// is saved or restored.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x15 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6 v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128

        # Clamp A and C pointers
        // Rows at index >= mr alias the previous row, so the kernel can
        // always process 6 rows without reading or writing past row mr-1.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x15, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x15, x17, x15, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x15, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (flags still from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x15, x13, LS   //   c4 = c3

        # Load params pointer
        LDR x8, [sp, 8]

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load clamping_params values
        // v6 = first float at params (upper bound, applied with FMIN),
        // v7 = second float at params (lower bound, applied with FMAX).
        LD2R {v6.4s, v7.4s}, [x8]

        # Load cn_stride
        LDR x14, [sp]

        // Outer loop over nc: one iteration produces a 6 x 8 tile of C.
0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 0]  // Prefetch B
        MOV v23.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 64]
        MOV v24.16b, v20.16b
        PRFM PLDL1KEEP, [x5, 128]
        MOV v25.16b, v21.16b
        PRFM PLDL1KEEP, [x5, 192]
        MOV v26.16b, v20.16b
        PRFM PLDL1KEEP, [x3]     // Prefetch A
        MOV v27.16b, v21.16b
        PRFM PLDL1KEEP, [x9]
        MOV v28.16b, v20.16b
        PRFM PLDL1KEEP, [x10]
        MOV v29.16b, v21.16b
        PRFM PLDL1KEEP, [x11]
        MOV v30.16b, v20.16b
        PRFM PLDL1KEEP, [x12]
        MOV v31.16b, v21.16b
        PRFM PLDL1KEEP, [x4]

        # Is there at least 4 floats (16 bytes)?
        SUBS x0, x2, 16          // k = kc - 16
        B.LO 5f

        # Main loop - 4 floats of A (16 bytes)
        # 48 FMA + 6 ld128 A + 4 LDP B
1:
        LDR q0, [x3], 16
        LDP q16, q17, [x5], 32
        LDR q1, [x9], 16
        LDR q2, [x10], 16
        LDR q3, [x11], 16
        LDR q4, [x12], 16
        LDR q5, [x4], 16
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]

        FMLA v20.4s, v18.4s, v0.s[1]
        LDP q16, q17, [x5], 32
        FMLA v22.4s, v18.4s, v1.s[1]
        FMLA v24.4s, v18.4s, v2.s[1]
        FMLA v26.4s, v18.4s, v3.s[1]
        FMLA v28.4s, v18.4s, v4.s[1]
        FMLA v30.4s, v18.4s, v5.s[1]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v1.s[1]
        FMLA v25.4s, v19.4s, v2.s[1]
        FMLA v27.4s, v19.4s, v3.s[1]
        FMLA v29.4s, v19.4s, v4.s[1]
        FMLA v31.4s, v19.4s, v5.s[1]

        FMLA v20.4s, v16.4s, v0.s[2]
        LDP q18, q19, [x5], 32
        FMLA v22.4s, v16.4s, v1.s[2]
        FMLA v24.4s, v16.4s, v2.s[2]
        FMLA v26.4s, v16.4s, v3.s[2]
        FMLA v28.4s, v16.4s, v4.s[2]
        FMLA v30.4s, v16.4s, v5.s[2]
        FMLA v21.4s, v17.4s, v0.s[2]
        FMLA v23.4s, v17.4s, v1.s[2]
        FMLA v25.4s, v17.4s, v2.s[2]
        FMLA v27.4s, v17.4s, v3.s[2]
        FMLA v29.4s, v17.4s, v4.s[2]
        FMLA v31.4s, v17.4s, v5.s[2]

        FMLA v20.4s, v18.4s, v0.s[3]
        FMLA v22.4s, v18.4s, v1.s[3]
        FMLA v24.4s, v18.4s, v2.s[3]
        FMLA v26.4s, v18.4s, v3.s[3]
        FMLA v28.4s, v18.4s, v4.s[3]
        FMLA v30.4s, v18.4s, v5.s[3]
        FMLA v21.4s, v19.4s, v0.s[3]
        FMLA v23.4s, v19.4s, v1.s[3]
        FMLA v25.4s, v19.4s, v2.s[3]
        FMLA v27.4s, v19.4s, v3.s[3]
        SUBS x0, x0, 16          // k -= 16; loop while no borrow
        FMLA v29.4s, v19.4s, v4.s[3]
        FMLA v31.4s, v19.4s, v5.s[3]
        B.HS 1b

        # Is there a remainder? - up to 3 floats of A (bits 3 and 2 of k)
        // The low 4 bits of x0 still equal kc mod 16 after the
        // subtractions of 16 above.
        TST x0, 15
        B.NE 5f

4:
        # Clamp
        FMIN v20.4s, v20.4s, v6.4s
        SUBS x1, x1, 8           // nc -= 8; flags consumed by B.LO / B.HI below
        FMIN v21.4s, v21.4s, v6.4s
        FMIN v22.4s, v22.4s, v6.4s
        FMIN v23.4s, v23.4s, v6.4s
        FMIN v24.4s, v24.4s, v6.4s
        FMIN v25.4s, v25.4s, v6.4s
        FMIN v26.4s, v26.4s, v6.4s
        FMIN v27.4s, v27.4s, v6.4s
        FMIN v28.4s, v28.4s, v6.4s
        FMIN v29.4s, v29.4s, v6.4s
        FMIN v30.4s, v30.4s, v6.4s
        FMIN v31.4s, v31.4s, v6.4s
        FMAX v20.4s, v20.4s, v7.4s
        FMAX v21.4s, v21.4s, v7.4s
        FMAX v22.4s, v22.4s, v7.4s
        FMAX v23.4s, v23.4s, v7.4s
        FMAX v24.4s, v24.4s, v7.4s
        FMAX v25.4s, v25.4s, v7.4s
        FMAX v26.4s, v26.4s, v7.4s
        FMAX v27.4s, v27.4s, v7.4s
        FMAX v28.4s, v28.4s, v7.4s
        FMAX v29.4s, v29.4s, v7.4s
        FMAX v30.4s, v30.4s, v7.4s
        FMAX v31.4s, v31.4s, v7.4s

        # Store full 6 x 8
        B.LO 7f                  // fewer than 8 columns left: partial store

        // Each C pointer advances by cn_stride; each A pointer is rewound
        // by kc so the next tile re-reads the same rows of A.
        ST1 {v20.16b, v21.16b},  [x6], x14
        SUB  x3,  x3, x2         // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x14
        SUB  x9,  x9, x2         // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x14
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x15], x14
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x14
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v30.16b, v31.16b},  [x7], x14
        SUB  x4,  x4, x2         // a5 -= kc

        B.HI 0b                  // more columns remain (flags from SUBS x1)
        RET

5:
        # Is there a remainder?- 2 floats of A (8 bytes)
        TBZ x0, 3, 6f

        # Remainder- 2 floats of A (8 bytes)
        LDR d0, [x3], 8
        LDP q16, q17, [x5], 32
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        LDP q18, q19, [x5], 32
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]

        FMLA v20.4s, v18.4s, v0.s[1]
        FMLA v22.4s, v18.4s, v1.s[1]
        FMLA v24.4s, v18.4s, v2.s[1]
        FMLA v26.4s, v18.4s, v3.s[1]
        FMLA v28.4s, v18.4s, v4.s[1]
        FMLA v30.4s, v18.4s, v5.s[1]
        FMLA v21.4s, v19.4s, v0.s[1]
        FMLA v23.4s, v19.4s, v1.s[1]
        FMLA v25.4s, v19.4s, v2.s[1]
        FMLA v27.4s, v19.4s, v3.s[1]
        FMLA v29.4s, v19.4s, v4.s[1]
        FMLA v31.4s, v19.4s, v5.s[1]

        # Is there a remainder?- 1 floats of A (4 bytes)
        TBZ x0, 2, 4b

        # Remainder- 1 float of A (4 bytes)
6:
        LDR s0, [x3], 4
        LDP q16, q17, [x5], 32
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        FMLA v20.4s, v16.4s, v0.s[0]
        FMLA v22.4s, v16.4s, v1.s[0]
        FMLA v24.4s, v16.4s, v2.s[0]
        FMLA v26.4s, v16.4s, v3.s[0]
        FMLA v28.4s, v16.4s, v4.s[0]
        FMLA v30.4s, v16.4s, v5.s[0]
        FMLA v21.4s, v17.4s, v0.s[0]
        FMLA v23.4s, v17.4s, v1.s[0]
        FMLA v25.4s, v17.4s, v2.s[0]
        FMLA v27.4s, v17.4s, v3.s[0]
        FMLA v29.4s, v17.4s, v4.s[0]
        FMLA v31.4s, v17.4s, v5.s[0]
        B 4b

        # Store odd width
        // x1 is negative here, but its low 3 bits still hold the number
        // of remaining columns (nc mod 8): store 4, then 2, then 1.
7:
        TBZ x1, 2, 8f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x15], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b

8:
        TBZ x1, 1, 9f
        STR d20,  [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x15], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30,  [x7], 8
        DUP d30, v30.d[1]

9:
        TBZ x1, 0, 10f
        STR s20,  [x6]
        STR s22, [x16]
        STR s24, [x17]
        STR s26, [x15]
        STR s28, [x13]
        STR s30,  [x7]
10:
        RET

END_FUNCTION xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif