// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t* restrict a,  x3
#     size_t a_stride,          x4
#     const void* restrict w,   x5
#     uint8_t* restrict c,      x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x0)
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# A4  v4
# A5  v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Clamp v6, (v4), (v5)
# unused A  v8 v9 v10 v11
# unused B  v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        # Load params pointer
        LDR x8, [sp, 8]

        # Clamp A and C pointers so rows beyond mr alias the last valid row
        CMP x0, 2                       // if mr < 2
        ADD x9, x3, x4                  // a1 = a0 + a_stride
        ADD x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO             //   a1 = a0
        CSEL x16, x6, x16, LO           //   c1 = c0

        ADD x10, x9, x4                 // a2 = a1 + a_stride
        ADD x17, x16, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL x10, x9, x10, LS           //   a2 = a1
        CSEL x17, x16, x17, LS          //   c2 = c1

        CMP x0, 4                       // if mr < 4
        ADD x11, x10, x4                // a3 = a2 + a_stride
        ADD x14, x17, x7                // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO          //   a3 = a2
        CSEL x14, x17, x14, LO          //   c3 = c2

        ADD x12, x11, x4                // a4 = a3 + a_stride
        ADD x13, x14, x7                // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL x12, x11, x12, LS          //   a4 = a3
        CSEL x13, x14, x13, LS          //   c4 = c3

        CMP x0, 6                       // if mr < 6
        ADD x4, x12, x4                 // a5 = a4 + a_stride
        ADD x7, x13, x7                 // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO            //   a5 = a4
        CSEL x7, x13, x7, LO            //   c5 = c4

        # Load params scale value (broadcast to all 8 half lanes)
        LD1R {v6.8h}, [x8]
        ADD x8, x8, 2                   // advance past scale to min/max pair

0:
        # Load initial bias from w into accumulators
        LDR q20, [x5], 16
        MOV v22.16b, v20.16b
        MOV v24.16b, v20.16b
        MOV v26.16b, v20.16b
        MOV v28.16b, v20.16b
        MOV v30.16b, v20.16b

        # Is there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8                  // k = kc - 8
        B.LO 3f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 4 LDR B
1:
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        SUBS x0, x0, 8
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        FMLA v20.8h, v18.8h, v0.h[2]
        FMLA v22.8h, v18.8h, v1.h[2]
        FMLA v24.8h, v18.8h, v2.h[2]
        FMLA v26.8h, v18.8h, v3.h[2]
        FMLA v28.8h, v18.8h, v4.h[2]
        FMLA v30.8h, v18.8h, v5.h[2]

        FMLA v20.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v19.8h, v5.h[3]
        B.HS 1b

        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Scale and Clamp
        FMUL v20.8h, v20.8h, v6.8h
        # Load params values (min -> v4, max -> v5)
        LD2R {v4.8h, v5.8h}, [x8]
        FMUL v22.8h, v22.8h, v6.8h
        FMUL v24.8h, v24.8h, v6.8h
        FMUL v26.8h, v26.8h, v6.8h
        FMUL v28.8h, v28.8h, v6.8h
        FMUL v30.8h, v30.8h, v6.8h
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        SUBS x1, x1, 8
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO 6f

        ST1 {v20.16b}, [x6], x0
        SUB x3, x3, x2                  // a0 -= kc
        ST1 {v22.16b}, [x16], x0
        SUB x9, x9, x2                  // a1 -= kc
        ST1 {v24.16b}, [x17], x0
        SUB x10, x10, x2                // a2 -= kc
        ST1 {v26.16b}, [x14], x0
        SUB x11, x11, x2                // a3 -= kc
        ST1 {v28.16b}, [x13], x0
        SUB x12, x12, x2                // a4 -= kc
        ST1 {v30.16b}, [x7], x0
        SUB x4, x4, x2                  // a5 -= kc

        B.HI 0b
        RET

3:
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4

        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        TBZ x0, 1, 2b

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        B 2b

        # Store odd width
6:
        TBZ x1, 2, 7f
        STR d20, [x6], 8
        DUP d20, v20.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d30, [x7], 8
        DUP d30, v30.d[1]

7:
        TBZ x1, 1, 8f
        STR s20, [x6], 4
        DUP s20, v20.s[1]
        STR s22, [x16], 4
        DUP s22, v22.s[1]
        STR s24, [x17], 4
        DUP s24, v24.s[1]
        STR s26, [x14], 4
        DUP s26, v26.s[1]
        STR s28, [x13], 4
        DUP s28, v28.s[1]
        STR s30, [x7], 4
        DUP s30, v30.s[1]

8:
        TBZ x1, 0, 9f
        STR h20, [x6]
        STR h22, [x16]
        STR h24, [x17]
        STR h26, [x14]
        STR h28, [x13]
        STR h30, [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif