// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-ld32.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# f16 GEMM-with-initial-accumulators microkernel: computes a 6x16 tile of
# C (half-precision), starting from accumulators loaded from `acc`, then
# scales the result and clamps it to [min, max].
#
# void xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x8)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
# (This kernel stays within caller-saved registers, so no spills are needed.)

# A pointers (one row of A per output row; clamped to the previous row when mr is small)
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers (one row of C per output row; clamped like the A pointers)
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# A4  v4
# A5  v5
# B   v16 v17 v18 v19
# C   v20 v21
# C   v22 v23
# C   v24 v25
# C   v26 v27
# C   v28 v29
# C   v30 v31
# Clamp v6, (v4), (v5)
# unused A v8 v9 v10 v11
# unused B v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32

        # Load acc, params pointer
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers.  When mr < 6, rows past mr-1 alias the last
        # valid row, so the extra FMAs are harmless and the extra stores
        # overwrite a row with its own (identical) result.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load params: d6 = {scale, min, max, _} as 4 half floats
        # (v6.h[0] used by FMUL, v6.h[1] broadcast for FMAX, v6.h[2] for FMIN)
        LDR d6, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (reuses flags from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (reuses flags from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride (x4/a_stride dead after this)
        ADD x7, x13, x7          // c5 = c4 + cm_stride (x7/cm_stride dead after this)
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        LDR x8, [sp]             // load cn_stride

0:
        # Load initial accumulators (12 x 8 halffloats = 6 rows x 16 columns)
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Is there at least 2 halffloats (4 bytes)?
        SUBS x0, x2, 4           // k = kc - 4
        B.LO 3f

        # Main loop - 2 halffloats of A (4 bytes)
        # 24 FMA + 6 ld32 A + 4 LDR B
1:
        LDR s0, [x3], 4          // two f16 lanes of a0 into v0.h[0..1]
        LDR q16, [x5], 16        // B columns 0-7  for k
        LDR q17, [x5], 16        // B columns 8-15 for k
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        SUBS x0, x0, 4           // flags consumed by B.HS below
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        LDR q18, [x5], 16        // B columns 0-7  for k+1 (interleaved with FMAs)
        LDR q19, [x5], 16        // B columns 8-15 for k+1
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]
        B.HS 1b

        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 3f
2:
        # Scale and Clamp: multiply by params.scale, then clamp to [min, max].
        # v4/v5 (A registers, dead here) are reused as broadcast min/max.
        FMUL v20.8h, v20.8h, v6.h[0]
        DUP v4.8h, v6.h[1]       // v4 = broadcast min
        FMUL v21.8h, v21.8h, v6.h[0]
        DUP v5.8h, v6.h[2]       // v5 = broadcast max
        FMUL v22.8h, v22.8h, v6.h[0]
        FMUL v23.8h, v23.8h, v6.h[0]
        FMUL v24.8h, v24.8h, v6.h[0]
        FMUL v25.8h, v25.8h, v6.h[0]
        FMUL v26.8h, v26.8h, v6.h[0]
        FMUL v27.8h, v27.8h, v6.h[0]
        FMUL v28.8h, v28.8h, v6.h[0]
        FMUL v29.8h, v29.8h, v6.h[0]
        FMUL v30.8h, v30.8h, v6.h[0]
        FMUL v31.8h, v31.8h, v6.h[0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        SUBS x1, x1, 16          // nc -= 16; flags consumed by B.LO/B.HI below
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        B.LO 4f                  // nc < 16: partial-width store tail

        # Full-width stores, interleaved with rewinding the A pointers for the
        # next 16-column tile (each a-pointer advanced by exactly kc bytes).
        ST1 {v30.16b, v31.16b}, [x7], x8
        SUB x3, x3, x2           // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x8
        SUB x9, x9, x2           // a1 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x8
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x8
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x8
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v20.16b, v21.16b}, [x6], x8
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // more columns remain
        RET

3:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 2b

        # Store odd width: emit 8 / 4 / 2 / 1 columns according to the bits of
        # the remaining nc, shifting the surviving lanes down after each store.
4:
        TBZ x1, 3, 5f            // nc & 8?
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

5:
        TBZ x1, 2, 6f            // nc & 4?
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

6:
        TBZ x1, 1, 7f            // nc & 2?
        STR s30, [x7], 4
        STR s28, [x13], 4
        DUP s30, v30.s[1]
        DUP s28, v28.s[1]
        STR s26, [x14], 4
        STR s24, [x17], 4
        DUP s26, v26.s[1]
        DUP s24, v24.s[1]
        STR s22, [x16], 4
        STR s20, [x6], 4
        DUP s22, v22.s[1]
        DUP s20, v20.s[1]

7:
        TBZ x1, 0, 8f            // nc & 1?
        STR h30, [x7]
        STR h28, [x13]
        STR h26, [x14]
        STR h24, [x17]
        STR h22, [x16]
        STR h20, [x6]
8:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif