// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# Half-precision GEMM micro-kernel with externally supplied initial
# accumulators: computes up to 6 rows by 16 columns of C per pass, scales the
# result by params halfword 0 and clamps it to [params halfword 1,
# params halfword 2] before storing.
#
# void xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x8)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
#
# NOTE(review): acc is read as 8h (halffloat) vectors below although the
# prototype comment says float - confirm against the C declaration.
#
# kc is a byte count: the A pointers advance 4 bytes per 2 halffloats and are
# rewound by kc after each column tile. x0 holds mr only during pointer setup
# and is reused as the k byte counter afterwards. x8 first holds the params
# pointer and is then overwritten with cn_stride.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# None of them are written by this kernel, and the stack is only read.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15


BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

        # Load acc, params pointer
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        # Rows at or beyond mr alias the previous row, so they reread the same
        # A data and rewrite the same C addresses instead of going out of bounds.
        # Each CMP feeds two pairs of CSEL (LO, then LS); the ADD and LDR
        # instructions in between do not modify the flags.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load params
        # 8 bytes: h[0] is the scale, h[1] the lower bound, h[2] the upper
        # bound, as used by the FMUL/FMAX/FMIN sequence at label 3.
        LDR d6, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        LDR x8, [sp]             // load cn_stride (params pointer in x8 is no longer needed)

        # Outer loop: one iteration per tile of 16 output columns.
0:
        # Load initial accumulators
        # 12 q registers (192 bytes) are consumed from acc per tile; x15 is
        # post-incremented so the next tile reads the following 192 bytes.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Are there at least 2 halffloats (4 bytes)?
        SUBS x0, x2, 4           // k = kc - 4
        B.LO 4f                  // fewer than 4 bytes: only the 1 halffloat remainder runs

        # Prologue - load 4 A and 2 B
        # A4 and A5 are loaded inside the loop body instead.

        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4

        # Are there at least 2 halffloats for main loop?
        SUBS x0, x0, 4
        B.LO 2f                  // no: the preloaded data is consumed by the epilogue

        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 4 LDR B
        # The first half multiplies by lane h[0] of each A register with
        # v16/v17; the second half multiplies by lane h[1] with v18/v19 while
        # loading A0-A3 and v16/v17 for the next iteration.
1:
        FMLA v20.8h, v16.8h, v0.h[0]
        LDR s4, [x12], 4
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR s5, [x4], 4
        FMLA v22.8h, v16.8h, v1.h[0]
        LDR q18, [x5], 16
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        LDR q19, [x5], 16
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        SUBS x0, x0, 4           // sets the flags consumed by B.HS at the loop end

        FMLA v20.8h, v18.8h, v0.h[1]
        LDR q16, [x5], 16
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        LDR q17, [x5], 16
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        LDR s0, [x3], 4
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        LDR s1, [x9], 4
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        LDR s2, [x10], 4
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        LDR s3, [x11], 4
        FMLA v31.8h, v19.8h, v5.h[1]
        B.HS 1b

        # Epilogue - same as main loop but no loads for next loop
2:
        FMLA v20.8h, v16.8h, v0.h[0]
        LDR s4, [x12], 4
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR s5, [x4], 4
        FMLA v22.8h, v16.8h, v1.h[0]
        LDR q18, [x5], 16
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        LDR q19, [x5], 16
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

        # Is there a remainder? - 1 halffloat of A (2 bytes)
        # Only multiples of 4 were subtracted from kc, so bit 1 of x0 still
        # equals bit 1 of kc.
        TBNZ x0, 1, 4f
3:
        # Scale and Clamp
        # v4 and v5 are free to reuse here: all A data has been consumed.
        FMUL v20.8h, v20.8h, v6.h[0]
        DUP v4.8h, v6.h[1]       // v4 = lower bound, broadcast
        FMUL v21.8h, v21.8h, v6.h[0]
        DUP v5.8h, v6.h[2]       // v5 = upper bound, broadcast
        FMUL v22.8h, v22.8h, v6.h[0]
        FMUL v23.8h, v23.8h, v6.h[0]
        FMUL v24.8h, v24.8h, v6.h[0]
        FMUL v25.8h, v25.8h, v6.h[0]
        FMUL v26.8h, v26.8h, v6.h[0]
        FMUL v27.8h, v27.8h, v6.h[0]
        FMUL v28.8h, v28.8h, v6.h[0]
        FMUL v29.8h, v29.8h, v6.h[0]
        FMUL v30.8h, v30.8h, v6.h[0]
        FMUL v31.8h, v31.8h, v6.h[0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16          // nc -= 16; flags feed both B.LO 5f and B.HI 0b below
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 5f                  // fewer than 16 columns were left: partial store

        # Rows are stored c5 first, c0 last; every C pointer advances by
        # cn_stride and every A pointer is rewound by kc for the next tile.
        # ST1 and SUB (without S) leave the flags from SUBS x1 intact.
        ST1 {v30.16b, v31.16b},  [x7], x8
        SUB x3,  x3, x2          // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x8
        SUB x9,  x9, x2          // a1 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x8
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x8
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x8
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v20.16b, v21.16b},  [x6], x8
        SUB x4,  x4, x2          // a5 -= kc

        B.HI 0b                  // more columns remain: process the next tile
        RET

4:
        # Remainder - 1 halffloat of A (2 bytes)
        # One halffloat per row and one 16-column row of B (v16, v17).
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 3b

        # Store odd width
        # x1 now holds nc - 16, whose low 4 bits equal those of the remaining
        # column count. Bits 3, 2, 1, 0 select stores of 8, 4, 2 and 1
        # halffloats; after each partial store the not yet written lanes are
        # moved down to the bottom of the even-numbered row register.
5:
        TBZ x1, 3, 6f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

6:
        TBZ x1, 2, 7f
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

7:
        TBZ x1, 1, 8f
        STR s30,  [x7], 4
        STR s28, [x13], 4
        DUP s30, v30.s[1]
        DUP s28, v28.s[1]
        STR s26, [x14], 4
        STR s24, [x17], 4
        DUP s26, v26.s[1]
        DUP s24, v24.s[1]
        STR s22, [x16], 4
        STR s20,  [x6], 4
        DUP s22, v22.s[1]
        DUP s20, v20.s[1]

8:
        TBZ x1, 0, 9f
        STR h30,  [x7]
        STR h28, [x13]
        STR h26, [x14]
        STR h24, [x17]
        STR h22, [x16]
        STR h20,  [x6]
9:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif