// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/8x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# Half-precision (fp16) 8x8 GEMM microkernel with min/max clamping.
# Processes 8 rows (mr <= 8) by 8 columns of C per pass, consuming A in
# chunks of 4 halffloats (8 bytes) with 1- and 2-element remainder paths.
#
# void xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x8)

#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used.  x18 is reserved by the OS.

# A pointers (one per row of A; clamped to the previous row when mr is small)
# x3   a0
# x9   a1
# x10  a2
# x11  a3
# x12  a4
# x19  a5
# x20  a6
# x4   a7

# C pointers (one per row of C; clamped like the A pointers)
# x6   c0
# x16  c1
# x17  c2
# x14  c3
# x13  c4
# x21  c5
# x22  c6
# x7   c7

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# A6   v6
# A7   v7
# B    v16 v17 v18 v19
# C    v24
# C    v25
# C    v26
# C    v27
# C    v28
# C    v29
# C    v30
# C    v31

# Clamp v20 v21 v22  (scale, min, max — loaded from params via LD3R)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64

        # Load params pointer (at [sp + 8] before any stack adjustment)
        LDR x8, [sp, 8]

        # Save x19,x20,x21,x22 on stack (callee-saved per AAPCS64)
        STP x19, x20, [sp, -32]!
        STP x21, x22, [sp, 16]

        # Clamp A and C pointers: rows beyond mr alias the last valid row,
        # so out-of-range rows recompute row (mr-1) harmlessly.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load params: replicate scale/min/max across all lanes of v20/v21/v22
        LD3R {v20.8h, v21.8h, v22.8h}, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (flags still live from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x19, x12, x4         // a5 = a4 + a_stride
        ADD x21, x13, x7         // c5 = c4 + cm_stride
        CSEL x19, x12, x19, LO   //   a5 = a4
        CSEL x21, x13, x21, LO   //   c5 = c4

        ADD x20, x19, x4         // a6 = a5 + a_stride
        ADD x22, x21, x7         // c6 = c5 + cm_stride
                                 // if mr <= 6
        CSEL x20, x19, x20, LS   //   a6 = a5
        CSEL x22, x21, x22, LS   //   c6 = c5

        CMP x0, 8                // if mr < 8
        ADD x4, x20, x4          // a7 = a6 + a_stride
        ADD x7, x22, x7          // c7 = c6 + cm_stride
        CSEL x4, x20, x4, LO     //   a7 = a6
        CSEL x7, x22, x7, LO     //   c7 = c6

        LDR x8, [sp, 32]         // load cn_stride ([sp] before the 32-byte push)

0:
        # Load initial bias from w into accumulators (same bias for all 8 rows)
        LDR q24, [x5], 16
        MOV v25.16b, v24.16b
        MOV v26.16b, v24.16b
        MOV v27.16b, v24.16b
        MOV v28.16b, v24.16b
        MOV v29.16b, v24.16b
        MOV v30.16b, v24.16b
        MOV v31.16b, v24.16b

        # Is there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 3f                  // no: go straight to the remainder paths

        # Main loop - 4 halffloats of A (8 bytes) per iteration
        # 32 FMA + 8 ld64 A + 4 LDR B
1:
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x19], 8
        LDR d6, [x20], 8
        LDR d7, [x4], 8
        SUBS x0, x0, 8
        FMLA v24.8h, v16.8h, v0.h[0]
        FMLA v25.8h, v16.8h, v1.h[0]
        FMLA v26.8h, v16.8h, v2.h[0]
        FMLA v27.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v16.8h, v5.h[0]
        FMLA v30.8h, v16.8h, v6.h[0]
        FMLA v31.8h, v16.8h, v7.h[0]
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v24.8h, v17.8h, v0.h[1]
        FMLA v25.8h, v17.8h, v1.h[1]
        FMLA v26.8h, v17.8h, v2.h[1]
        FMLA v27.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v29.8h, v17.8h, v5.h[1]
        FMLA v30.8h, v17.8h, v6.h[1]
        FMLA v31.8h, v17.8h, v7.h[1]

        FMLA v24.8h, v18.8h, v0.h[2]
        FMLA v25.8h, v18.8h, v1.h[2]
        FMLA v26.8h, v18.8h, v2.h[2]
        FMLA v27.8h, v18.8h, v3.h[2]
        FMLA v28.8h, v18.8h, v4.h[2]
        FMLA v29.8h, v18.8h, v5.h[2]
        FMLA v30.8h, v18.8h, v6.h[2]
        FMLA v31.8h, v18.8h, v7.h[2]

        FMLA v24.8h, v19.8h, v0.h[3]
        FMLA v25.8h, v19.8h, v1.h[3]
        FMLA v26.8h, v19.8h, v2.h[3]
        FMLA v27.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v19.8h, v4.h[3]
        FMLA v29.8h, v19.8h, v5.h[3]
        FMLA v30.8h, v19.8h, v6.h[3]
        FMLA v31.8h, v19.8h, v7.h[3]
        B.HS 1b

        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Scale and Clamp: C = min(max(C * scale, min), max)
        FMUL v24.8h, v24.8h, v20.8h
        FMUL v25.8h, v25.8h, v20.8h
        FMUL v26.8h, v26.8h, v20.8h
        FMUL v27.8h, v27.8h, v20.8h
        FMUL v28.8h, v28.8h, v20.8h
        FMUL v29.8h, v29.8h, v20.8h
        FMUL v30.8h, v30.8h, v20.8h
        FMUL v31.8h, v31.8h, v20.8h
        FMAX v24.8h, v24.8h, v21.8h
        FMAX v25.8h, v25.8h, v21.8h
        FMAX v26.8h, v26.8h, v21.8h
        FMAX v27.8h, v27.8h, v21.8h
        FMAX v28.8h, v28.8h, v21.8h
        FMAX v29.8h, v29.8h, v21.8h
        FMAX v30.8h, v30.8h, v21.8h
        FMAX v31.8h, v31.8h, v21.8h
        SUBS x1, x1, 8           // nc -= 8 (flags used below for odd-width path)
        FMIN v24.8h, v24.8h, v22.8h
        FMIN v25.8h, v25.8h, v22.8h
        FMIN v26.8h, v26.8h, v22.8h
        FMIN v27.8h, v27.8h, v22.8h
        FMIN v28.8h, v28.8h, v22.8h
        FMIN v29.8h, v29.8h, v22.8h
        FMIN v30.8h, v30.8h, v22.8h
        FMIN v31.8h, v31.8h, v22.8h

        # Store full 8 x 8
        B.LO 6f                  // nc < 8: store the odd-width tail instead

        ST1 {v24.16b},  [x6], x8
        SUB x3, x3, x2           // a0 -= kc (rewind A for the next column tile)
        ST1 {v25.16b}, [x16], x8
        SUB x9, x9, x2           // a1 -= kc
        ST1 {v26.16b}, [x17], x8
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v27.16b}, [x14], x8
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v28.16b}, [x13], x8
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v29.16b}, [x21], x8
        SUB x19, x19, x2         // a5 -= kc
        ST1 {v30.16b}, [x22], x8
        SUB x20, x20, x2         // a6 -= kc
        ST1 {v31.16b},  [x7], x8
        SUB x4, x4, x2           // a7 -= kc

        B.HI 0b                  // more full 8-column tiles remain

        # Restore x19,x20,x21,x22 from stack
        LDP x21, x22, [sp, 16]
        LDP x19, x20, [sp], 32
        RET

3:
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x19], 4
        LDR s6, [x20], 4
        LDR s7, [x4], 4

        FMLA v24.8h, v16.8h, v0.h[0]
        FMLA v25.8h, v16.8h, v1.h[0]
        FMLA v26.8h, v16.8h, v2.h[0]
        FMLA v27.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v16.8h, v5.h[0]
        FMLA v30.8h, v16.8h, v6.h[0]
        FMLA v31.8h, v16.8h, v7.h[0]

        FMLA v24.8h, v17.8h, v0.h[1]
        FMLA v25.8h, v17.8h, v1.h[1]
        FMLA v26.8h, v17.8h, v2.h[1]
        FMLA v27.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v29.8h, v17.8h, v5.h[1]
        FMLA v30.8h, v17.8h, v6.h[1]
        FMLA v31.8h, v17.8h, v7.h[1]

        TBZ x0, 1, 2b

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x19], 2
        LDR h6, [x20], 2
        LDR h7, [x4], 2

        FMLA v24.8h, v16.8h, v0.h[0]
        FMLA v25.8h, v16.8h, v1.h[0]
        FMLA v26.8h, v16.8h, v2.h[0]
        FMLA v27.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v16.8h, v5.h[0]
        FMLA v30.8h, v16.8h, v6.h[0]
        FMLA v31.8h, v16.8h, v7.h[0]
        B 2b

        # Store odd width (nc & 4, then nc & 2, then nc & 1 halffloats);
        # DUP shifts the remaining lanes down after each partial store.
6:
        TBZ x1, 2, 7f
        STR d24,  [x6], 8
        STR d25, [x16], 8
        DUP d24, v24.d[1]
        DUP d25, v25.d[1]
        STR d26, [x17], 8
        STR d27, [x14], 8
        DUP d26, v26.d[1]
        DUP d27, v27.d[1]
        STR d28, [x13], 8
        STR d29, [x21], 8
        DUP d28, v28.d[1]
        DUP d29, v29.d[1]
        STR d30, [x22], 8
        STR d31,  [x7], 8
        DUP d30, v30.d[1]
        DUP d31, v31.d[1]
7:
        TBZ x1, 1, 8f
        STR s24,  [x6], 4
        STR s25, [x16], 4
        DUP s24, v24.s[1]
        DUP s25, v25.s[1]
        STR s26, [x17], 4
        STR s27, [x14], 4
        DUP s26, v26.s[1]
        DUP s27, v27.s[1]
        STR s28, [x13], 4
        STR s29, [x21], 4
        DUP s28, v28.s[1]
        DUP s29, v29.s[1]
        STR s30, [x22], 4
        STR s31,  [x7], 4
        DUP s30, v30.s[1]
        DUP s31, v31.s[1]

8:
        TBZ x1, 0, 9f
        STR h24,  [x6]
        STR h25, [x16]
        STR h26, [x17]
        STR h27, [x14]
        STR h28, [x13]
        STR h29, [x21]
        STR h30, [x22]
        STR h31,  [x7]
9:
        # Restore x19,x20,x21,x22 from stack
        LDP x21, x22, [sp, 16]
        LDP x19, x20, [sp], 32
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif