// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

# void xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                 x0
#     size_t nc,                 x1
#     size_t kc,                 x2 / x0
#     const uint8_t*restrict a,  x3
#     size_t a_stride,           x4
#     const void*restrict w,     x5
#     uint8_t*restrict c,        x6
#     size_t cm_stride,          x7
#     size_t cn_stride,          [sp] -> (x0)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> x8

# GEMM-with-initial-accumulators microkernel: computes a 6-row x 8-column
# fp16 tile of C, starting from accumulators read from `acc`, then applies
# scale and min/max clamping from `params`.
# Only volatile registers are used below, so no save/restore is needed.
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# A pointers
# x3  a0
# x9  a1
# x10 a2
# x11 a3
# x12 a4
# x4  a5

# C pointers
# x6  c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
# x7  c5

# Vector register usage
# A0  v0
# A1  v1
# A2  v2
# A3  v3
# A4  v4
# A5  v5
# B   v16 v17 v18 v19
# C0  v20
# C1  v22
# C2  v24
# C3  v26
# C4  v28
# C5  v30
# Scale v6; clamp min/max (v4), (v5) — v4/v5 are reused for clamping after
# the last A loads, which is why the clamp values are loaded late (label 2).
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        # Load acc, params pointer from the stack (7th/8th C arguments)
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers: rows beyond mr alias the previous row,
        # so out-of-range rows read/write duplicate (harmless) locations.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2 (reuses flags from CMP x0, 2)
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4 (reuses flags from CMP x0, 4)
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        # Load params scale value (first fp16 in params), broadcast to v6;
        # advance x8 past it so label 2's LD2R reads the min/max pair.
        LD1R {v6.8h}, [x8]
        ADD x8, x8, 2

0:
        # Load initial accumulators for this column tile from acc (x15);
        # x15 advances, so the next 8-column tile continues where this ends.
        LDP q20, q22, [x15], 32
        LDP q24, q26, [x15], 32
        LDP q28, q30, [x15], 32

        # Is there at least 4 halffloats (8 bytes)?
        SUBS x0, x2, 8  // k = kc - 8
        B.LO 3f

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 4 LDR B
1:
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        SUBS x0, x0, 8           // k -= 8; B.HS below loops while k >= 0
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        FMLA v20.8h, v18.8h, v0.h[2]
        FMLA v22.8h, v18.8h, v1.h[2]
        FMLA v24.8h, v18.8h, v2.h[2]
        FMLA v26.8h, v18.8h, v3.h[2]
        FMLA v28.8h, v18.8h, v4.h[2]
        FMLA v30.8h, v18.8h, v5.h[2]

        FMLA v20.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v19.8h, v5.h[3]
        B.HS 1b

        # x0 now holds k - 8 (negative); its low bits 1..2 still equal
        # kc's, so they select the remainder paths below.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloats of A (2 bytes)
        TBNZ x0, 1, 5f
2:
        # Scale and Clamp
        FMUL v20.8h, v20.8h, v6.8h
        # Load params values: broadcast min into v4, max into v5
        # (v4/v5 no longer hold A data here).
        LD2R {v4.8h, v5.8h}, [x8]
        FMUL v22.8h, v22.8h, v6.8h
        FMUL v24.8h, v24.8h, v6.8h
        FMUL v26.8h, v26.8h, v6.8h
        FMUL v28.8h, v28.8h, v6.8h
        FMUL v30.8h, v30.8h, v6.8h
        # Load cn_stride
        LDR x0, [sp, 0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        SUBS x1, x1, 8           // nc -= 8; LO -> partial tile, HI -> more columns
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO 6f

        # Store rows high-to-low, interleaved with rewinding the A pointers
        # by kc so the next column tile re-reads the same A rows.
        ST1 {v30.16b}, [x7], x0
        SUB x3, x3, x2  // a0 -= kc
        ST1 {v28.16b}, [x13], x0
        SUB x9, x9, x2  // a1 -= kc
        ST1 {v26.16b}, [x14], x0
        SUB x10, x10, x2  // a2 -= kc
        ST1 {v24.16b}, [x17], x0
        SUB x11, x11, x2  // a3 -= kc
        ST1 {v22.16b}, [x16], x0
        SUB x12, x12, x2  // a4 -= kc
        ST1 {v20.16b}, [x6], x0
        SUB x4, x4, x2  // a5 -= kc

        B.HI 0b          // more full 8-column tiles remain
        RET

3:
        # kc < 8 bytes: no main-loop iterations; dispatch straight to the
        # remainder paths based on kc's bits.
        TBZ x0, 2, 5f
4:
        # Remainder- 2 halffloats of A (4 bytes)
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4

        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        TBZ x0, 1, 2b    // no 1-halffloat tail -> go scale/clamp/store

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        B 2b

        # Store odd width (nc < 8): emit 4, then 2, then 1 halffloats per
        # row, shifting the remaining lanes down after each partial store.
6:
        TBZ x1, 2, 7f
        STR d30, [x7], 8
        DUP d30, v30.d[1]
        STR d28, [x13], 8
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        DUP d26, v26.d[1]
        STR d24, [x17], 8
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        DUP d22, v22.d[1]
        STR d20, [x6], 8
        DUP d20, v20.d[1]

7:
        TBZ x1, 1, 8f
        STR s30, [x7], 4
        DUP s30, v30.s[1]
        STR s28, [x13], 4
        DUP s28, v28.s[1]
        STR s26, [x14], 4
        DUP s26, v26.s[1]
        STR s24, [x17], 4
        DUP s24, v24.s[1]
        STR s22, [x16], 4
        DUP s22, v22.s[1]
        STR s20, [x6], 4
        DUP s20, v20.s[1]

8:
        TBZ x1, 0, 9f
        STR h30, [x7]
        STR h28, [x13]
        STR h26, [x14]
        STR h24, [x17]
        STR h22, [x16]
        STR h20, [x6]
9:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif