// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

// NOTE(review): the header above says this file is generated; durable changes
// presumably belong in the template it names - confirm before editing here.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x8)

#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

// Overview (derived from the instructions below):
//   * Each pass of the outer loop (label 0) produces one tile of 6 rows by
//     16 halffloat columns.  The 12 accumulators v20-v31 are seeded from the
//     32 bytes at the current w position, then updated with
//     acc[row] += B * A[row][k] for every halffloat k of the row.
//   * kc is treated as a byte count: A pointers advance 4 bytes per pair of
//     halffloats and are rewound by x2 after a full tile is stored.
//   * w is consumed sequentially: 32 bytes of initial accumulator values, then
//     32 bytes of B (two q registers) per halffloat of k.
//   * After accumulation every lane is multiplied by lane 0 of d6, bounded
//     below by lane 1 (FMAX) and bounded above by lane 2 (FMIN).  d6 is the
//     first 8 bytes behind the params pointer.
//   * nc is decremented by 16 per tile.  A final partial tile is written by
//     the 8/4/2/1 column stores at labels 5-8, selected by the low 4 bits of
//     the (then negative) x1.
//   * x0 holds mr only during pointer setup; from label 0 on it is the
//     remaining-k counter.  x8 holds the params pointer only until d6 has
//     been loaded and is then reloaded with cn_stride.
//   * Rows at index >= mr reuse the pointers of the previous row, so their
//     loads and stores alias the last valid row.
// NOTE(review): with kc == 0 the code still branches to the remainder path
// and consumes one halffloat of A plus 32 bytes of B; presumably callers
// guarantee kc is a non-zero multiple of 2 bytes - confirm against callers.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

        # Load params pointer
        LDR x8, [sp, 8]

        # Clamp A and C pointers
        // Each CMP feeds two CSEL groups: LO for the odd row and LS for the
        // following even row.  The ADDs in between do not modify the flags.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load params
        // d6 = 4 halffloat lanes; lanes 0, 1 and 2 are used at label 3.
        LDR d6, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride (x4 no longer holds a_stride)
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7 no longer holds cm_stride)
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        LDR x8, [sp]             // load cn_stride (params pointer in x8 is dead)

0:
        # Load initial bias from w into accumulators
        // The same 16 values seed all six rows.
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        # Is there at least 2 halffloats (4 bytes)?
        SUBS x0, x2, 4  // k = kc - 4
        B.LO 4f

        # Prologue - load 4 A and 2 B
        // A4 and A5 are loaded at the top of the main loop / epilogue instead.

        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4

        # Is there at least 2 halffloats for main loop?
        SUBS x0, x0, 4
        B.LO 2f

        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 4 LDR B
        // Loads for the next iteration are interleaved with the FMLAs; the
        // instruction order is deliberate and must be kept as is.
1:
        FMLA v20.8h, v16.8h, v0.h[0]
        LDR s4, [x12], 4
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR s5, [x4], 4
        FMLA v22.8h, v16.8h, v1.h[0]
        LDR q18, [x5], 16
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        LDR q19, [x5], 16
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        SUBS x0, x0, 4           // flags consumed by B.HS at the loop end

        FMLA v20.8h, v18.8h, v0.h[1]
        LDR q16, [x5], 16
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        LDR q17, [x5], 16
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        LDR s0, [x3], 4
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        LDR s1, [x9], 4
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        LDR s2, [x10], 4
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        LDR s3, [x11], 4
        FMLA v31.8h, v19.8h, v5.h[1]
        B.HS 1b

        # Epilogue - same as main loop but no loads for next loop
2:
        FMLA v20.8h, v16.8h, v0.h[0]
        LDR s4, [x12], 4
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR s5, [x4], 4
        FMLA v22.8h, v16.8h, v1.h[0]
        LDR q18, [x5], 16
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        LDR q19, [x5], 16
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        // x0 is negative here; bit 1 is set exactly when 2 bytes of k remain.
        TBNZ x0, 1, 4f
3:
        # Scale and Clamp
        // v6.h[0] = scale, v4 = broadcast of v6.h[1] (lower bound),
        // v5 = broadcast of v6.h[2] (upper bound).  v4/v5 are free to reuse
        // because all A values have been consumed.
        FMUL v20.8h, v20.8h, v6.h[0]
        DUP v4.8h, v6.h[1]
        FMUL v21.8h, v21.8h, v6.h[0]
        DUP v5.8h, v6.h[2]
        FMUL v22.8h, v22.8h, v6.h[0]
        FMUL v23.8h, v23.8h, v6.h[0]
        FMUL v24.8h, v24.8h, v6.h[0]
        FMUL v25.8h, v25.8h, v6.h[0]
        FMUL v26.8h, v26.8h, v6.h[0]
        FMUL v27.8h, v27.8h, v6.h[0]
        FMUL v28.8h, v28.8h, v6.h[0]
        FMUL v29.8h, v29.8h, v6.h[0]
        FMUL v30.8h, v30.8h, v6.h[0]
        FMUL v31.8h, v31.8h, v6.h[0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16          // nc -= 16; flags used by B.LO and B.HI below
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 5f

        // C pointers advance by cn_stride (x8); A pointers are rewound by kc
        // so the next tile rereads the same rows.  ST1 and SUB leave the
        // flags from the SUBS above intact for B.HI.
        ST1 {v20.16b, v21.16b},  [x6], x8
        SUB  x3,  x3, x2 // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x8
        SUB  x9,  x9, x2 // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x8
        SUB x10, x10, x2 // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x8
        SUB x11, x11, x2 // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x8
        SUB x12, x12, x2 // a4 -= kc
        ST1 {v30.16b, v31.16b},  [x7], x8
        SUB  x4,  x4, x2 // a5 -= kc

        B.HI 0b                  // more columns remain
        RET

4:
        # Remainder- 1 halffloat of A (2 bytes)
        // Also the entry point when kc is below 4 bytes.
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 3b

        # Store odd width
        // Bits 3..0 of x1 select 8, 4, 2 and 1 halffloat columns.  After each
        // partial store the not yet written lanes are moved down to lane 0.
5:
        TBZ x1, 3, 6f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b

6:
        TBZ x1, 2, 7f
        STR d20,  [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x14], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        STR d30,  [x7], 8
        DUP d28, v28.d[1]
        DUP d30, v30.d[1]

7:
        TBZ x1, 1, 8f
        STR s20,  [x6], 4
        STR s22, [x16], 4
        DUP s20, v20.s[1]
        DUP s22, v22.s[1]
        STR s24, [x17], 4
        STR s26, [x14], 4
        DUP s24, v24.s[1]
        DUP s26, v26.s[1]
        STR s28, [x13], 4
        STR s30,  [x7], 4
        DUP s28, v28.s[1]
        DUP s30, v30.s[1]

8:
        TBZ x1, 0, 9f
        STR h20,  [x6]
        STR h22, [x16]
        STR h24, [x17]
        STR h26, [x14]
        STR h28, [x13]
        STR h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif