// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a55.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Half-precision GEMM micro-kernel with scale and min/max clamping
// (AArch64, GNU syntax, AAPCS64 argument passing).
//
// Each pass through label 0 produces one tile of up to 6 rows by 16 columns:
//   acc[m][n] = bias[n] + sum over k of a[m][k] * b[k][n]
//   c[m][n]   = min(max(acc[m][n] * params[0], params[1]), params[2])
// where params[0..2] are the three halfwords read from the params pointer.
//
// kc is counted in bytes (2 bytes per half-precision element). The K loop
// consumes 2 elements per step and a 1-element remainder is handled at label 4.
// NOTE(review): kc is not validated here; the code as written expects a
// nonzero multiple of 2 bytes - confirm against the caller.
//
// w is read sequentially and never rewound: 32 bytes of bias followed by
// 32 bytes of B for every element of K, for each 16-column tile.
//
// Rows at index >= mr alias the previous row (their A and C pointers are
// clamped during setup), so those rows recompute and re-store the same data.
//
// The kernel touches only x0-x14, x16, x17, v0-v6 and v16-v31 and does not
// write to the stack, so no callee-saved register needs to be saved.

# void xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x8)

#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Other general purpose registers
#  x0 mr during setup, then k (remaining bytes of kc) inside the K loop
#  x8 params pointer during setup, then cn_stride

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

        # Load params pointer
        LDR x8, [sp, 8]

        # Clamp A and C pointers
        # Each CMP feeds two CSEL pairs: LO for the row just computed and LS
        # for the next one. ADD and LDR leave the flags untouched in between.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load params
        # v6.h[0] = multiplier, v6.h[1] = lower bound, v6.h[2] = upper bound
        LDR d6, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        LDR x8, [sp]             // load cn_stride (x8 no longer holds params)

        # Start of one 16-column tile. x0 is reused as the K counter from here.
0:
        # Load initial bias from w into accumulators
        LDP q20, q21, [x5], 32
        MOV v22.16b, v20.16b
        MOV v23.16b, v21.16b
        MOV v24.16b, v20.16b
        MOV v25.16b, v21.16b
        MOV v26.16b, v20.16b
        MOV v27.16b, v21.16b
        MOV v28.16b, v20.16b
        MOV v29.16b, v21.16b
        MOV v30.16b, v20.16b
        MOV v31.16b, v21.16b

        # Is there at least 2 halffloats (4 bytes)?
        SUBS x0, x2, 4  // k = kc - 4
        B.LO 4f

        # Prologue - load 4 A and 2 B
        # A4 and A5 are loaded at the top of the main loop / epilogue instead.

        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4

        # Is there at least 2 halffloats for main loop?
        SUBS x0, x0, 4
        B.LO 2f

        # Main loop - 2 halffloats of A (4 bytes)
        # Per iteration: 24 FMA + 6 ld32 A + 6 loads of B
        # (2 LDR d + 2 LD1 lane for v18/v19, 2 LDR q for the next v16/v17).
        # Loads are interleaved with the FMLAs; keep the instruction order.
        # NOTE(review): the 64-bit split of the v18/v19 loads is presumably a
        # Cortex-A55 scheduling choice - confirm before changing it.
1:
        FMLA v20.8h, v16.8h, v0.h[0]
        LDR s4, [x12], 4              // a4
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR s5, [x4], 4               // a5
        FMLA v22.8h, v16.8h, v1.h[0]
        LDR d18, [x5], 8              // b0
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        LD1 {v18.d}[1], [x5], 8       // b0
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        LDR d19, [x5], 8              // b1
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        LD1 {v19.d}[1], [x5], 8       // b1
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        SUBS x0, x0, 4                // flags consumed by B.HS at loop end

        FMLA v20.8h, v18.8h, v0.h[1]
        LDR q16, [x5], 16
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        LDR q17, [x5], 16
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        LDR s0, [x3], 4
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        LDR s1, [x9], 4
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        LDR s2, [x10], 4
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        LDR s3, [x11], 4
        FMLA v31.8h, v19.8h, v5.h[1]
        B.HS 1b

        # Epilogue - same as main loop but no loads for next loop
2:
        FMLA v20.8h, v16.8h, v0.h[0]
        LDR s4, [x12], 4
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR s5, [x4], 4
        FMLA v22.8h, v16.8h, v1.h[0]
        LDR q18, [x5], 16
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        LDR q19, [x5], 16
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        # x0 is -4 (no remainder) or -2 (one element left) here; bit 1 tells
        # the two apart.
        TBNZ x0, 1, 4f
3:
        # Scale and Clamp
        # v4 and v5 are free now (A4/A5 fully consumed) and are reused to
        # hold the broadcast lower and upper bounds.
        FMUL v20.8h, v20.8h, v6.h[0]
        DUP v4.8h, v6.h[1]
        FMUL v21.8h, v21.8h, v6.h[0]
        DUP v5.8h, v6.h[2]
        FMUL v22.8h, v22.8h, v6.h[0]
        FMUL v23.8h, v23.8h, v6.h[0]
        FMUL v24.8h, v24.8h, v6.h[0]
        FMUL v25.8h, v25.8h, v6.h[0]
        FMUL v26.8h, v26.8h, v6.h[0]
        FMUL v27.8h, v27.8h, v6.h[0]
        FMUL v28.8h, v28.8h, v6.h[0]
        FMUL v29.8h, v29.8h, v6.h[0]
        FMUL v30.8h, v30.8h, v6.h[0]
        FMUL v31.8h, v31.8h, v6.h[0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16               // nc -= 16; flags used by B.LO and B.HI below
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 5f

        # Each C pointer advances by cn_stride; each A pointer is rewound by kc
        # for the next tile. None of these instructions modify the flags.
        ST1 {v20.16b, v21.16b},  [x6], x8
        SUB  x3,  x3, x2              // a0 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x8
        SUB  x9,  x9, x2              // a1 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x8
        SUB x10, x10, x2              // a2 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x8
        SUB x11, x11, x2              // a3 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x8
        SUB x12, x12, x2              // a4 -= kc
        ST1 {v30.16b, v31.16b},  [x7], x8
        SUB  x4,  x4, x2              // a5 -= kc

        B.HI 0b                       // more columns left (nc was > 16)
        RET

4:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 3b

        # Store odd width
        # x1 holds nc - 16 (negative), so its low 4 bits give the number of
        # columns left. Bits 3..0 select stores of 8, 4, 2 and 1 halffloats,
        # shifting the not yet stored lanes down after each step.
5:
        TBZ x1, 3, 6f
        STR q20,  [x6], 16
        MOV v20.16b, v21.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q30,  [x7], 16
        MOV v30.16b, v31.16b

6:
        TBZ x1, 2, 7f
        STR d20,  [x6], 8
        STR d22, [x16], 8
        DUP d20, v20.d[1]
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        STR d26, [x14], 8
        DUP d24, v24.d[1]
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        STR d30,  [x7], 8
        DUP d28, v28.d[1]
        DUP d30, v30.d[1]

7:
        TBZ x1, 1, 8f
        STR s20,  [x6], 4
        STR s22, [x16], 4
        DUP s20, v20.s[1]
        DUP s22, v22.s[1]
        STR s24, [x17], 4
        STR s26, [x14], 4
        DUP s24, v24.s[1]
        DUP s26, v26.s[1]
        STR s28, [x13], 4
        STR s30,  [x7], 4
        DUP s28, v28.s[1]
        DUP s30, v30.s[1]

8:
        TBZ x1, 0, 9f
        STR h20,  [x6]
        STR h22, [x16]
        STR h24, [x17]
        STR h26, [x14]
        STR h28, [x13]
        STR h30,  [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif