// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Purpose
// -------
// AArch64 microkernel working on 8-lane halffloat vectors (.8h). Each pass of
// the outer loop (label 0) produces one tile of 6 rows by 8 halffloat columns:
//   1. loads 6 accumulator vectors (96 bytes) from acc,
//   2. for every halffloat of A in a row, multiply-accumulates that scalar
//      against one 8-halffloat vector read from w,
//   3. multiplies the result by params halffloat 0, then applies FMAX against
//      params halffloat 1 (lower bound) and FMIN against params halffloat 2
//      (upper bound),
//   4. either stores the full tile and advances every C pointer by cn_stride,
//      or stores a partial tile of 4/2/1 columns selected by the low three
//      bits of nc and returns.
//
// Units, as used by the code below:
//   - kc is a byte count: it is decremented by 8 per 4 halffloats of A and is
//     subtracted from the A pointers to rewind them.
//   - nc is a halffloat (column) count: it is decremented by 8 per full tile.
//   - a_stride, cm_stride and cn_stride are added to byte pointers unchanged.
//
// Rows at index >= mr alias the previous row: their A and C pointers are set
// equal to the preceding row pointers, so they recompute and rewrite the same
// data instead of touching memory past the last valid row.
//
// NOTE(review): only bits 2 and 1 of kc are tested for the remainder, and a
// kc below 8 always executes at least one remainder step. This presumably
// relies on kc being a non-zero multiple of 2 bytes - confirm against callers.
// NOTE(review): mr is presumably in the range 1..6 - confirm against callers.

# void xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const uint8_t*restrict a, x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     uint8_t*restrict c,       x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> (x8)
#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)
//
// Notes on the register assignments above:
//   - x0 holds mr only during pointer setup; afterwards it is the k counter.
//   - x8 first holds the params pointer, then is reloaded with cn_stride.
//   - x4 and x7 are overwritten with a5 and c5 once the strides are consumed.
//   - acc is read through q-register loads and accumulated with .8h FMLA, so
//     the data behind it is consumed as halffloats regardless of the pointer
//     type written in the prototype comment.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// None of those registers are written by this function, so there is no
// prologue or epilogue and the stack pointer is never adjusted.

# A pointers
#  x3 a0
#  x9 a1
# x10 a2
# x11 a3
# x12 a4
#  x4 a5

# C pointers
#  x6 c0
# x16 c1
# x17 c2
# x14 c3
# x13 c4
#  x7 c5

# Vector register usage
# A0   v0
# A1   v1
# A2   v2
# A3   v3
# A4   v4
# A5   v5
# B   v16 v17 v18 v19
# C   v20
# C   v22
# C   v24
# C   v26
# C   v28
# C   v30
# Clamp v6, (v4), (v5)
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15
//
// v6 keeps the raw params halffloats for the whole call. v4 and v5 are shared:
// they carry A4/A5 during accumulation and are overwritten with the broadcast
// clamp bounds at label 2. That is safe because every accumulation step
// reloads its A registers before using them.


BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

        # Load acc, params pointer
        // One LDP fetches both stack arguments: x15 = [sp + 8] (acc) and
        // x8 = [sp + 16] (params).
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        // Each CMP feeds two pairs of CSELs: the LO pair directly after it and
        // the LS pair further down. The ADD and LDR instructions in between do
        // not write the condition flags, so the flags stay valid.
        CMP x0, 2                // if mr < 2
        ADD x9, x3, x4           // a1 = a0 + a_stride
        ADD x16, x6, x7          // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO      //   a1 = a0
        CSEL x16, x6, x16, LO    //   c1 = c0

        # Load params
        // 8 bytes = 4 halffloats; lanes h[0], h[1] and h[2] are used at label 2.
        LDR d6, [x8]

        ADD x10, x9, x4          // a2 = a1 + a_stride
        ADD x17, x16, x7         // c2 = c1 + cm_stride
                                 // if mr <= 2
        CSEL x10, x9, x10, LS    //   a2 = a1
        CSEL x17, x16, x17, LS   //   c2 = c1

        CMP x0, 4                // if mr < 4
        ADD x11, x10, x4         // a3 = a2 + a_stride
        ADD x14, x17, x7         // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO   //   a3 = a2
        CSEL x14, x17, x14, LO   //   c3 = c2

        ADD x12, x11, x4         // a4 = a3 + a_stride
        ADD x13, x14, x7         // c4 = c3 + cm_stride
                                 // if mr <= 4
        CSEL x12, x11, x12, LS   //   a4 = a3
        CSEL x13, x14, x13, LS   //   c4 = c3

        CMP x0, 6                // if mr < 6
        // From here on x4 and x7 stop being strides and become a5 and c5.
        ADD x4, x12, x4          // a5 = a4 + a_stride
        ADD x7, x13, x7          // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO     //   a5 = a4
        CSEL x7, x13, x7, LO     //   c5 = c4

        // The params pointer in x8 is no longer needed (v6 holds the values),
        // so x8 is reused for cn_stride.
        LDR x8, [sp]             // load cn_stride

// Outer loop: one iteration per tile of 8 output columns.
0:
        # Load initial accumulators
        // 3 x 32 bytes = 6 vectors of 8 halffloats, one per row. x15 is
        // post-incremented and never rewound, so acc is consumed sequentially
        // across tiles.
        LDP q20, q22, [x15], 32
        LDP q24, q26, [x15], 32
        LDP q28, q30, [x15], 32

        # Are there at least 4 halffloats (8 bytes)?
        // x0 is kept biased by -8 so that the SUBS inside the loop doubles as
        // the loop test. Its low three bits always equal those of kc.
        SUBS x0, x2, 8           // k = kc - 8
        B.LO 3f                  // kc < 8: go straight to the remainder code

        # Main loop - 4 halffloats of A (8 bytes)
        # 24 FMA + 6 ld64 A + 4 LDR B
        // Lane j of each A register is multiplied with B vector 16 + j. The w
        // pointer x5 advances 64 bytes per iteration.
1:
        LDR d0, [x3], 8
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR d1, [x9], 8
        LDR d2, [x10], 8
        LDR d3, [x11], 8
        LDR d4, [x12], 8
        LDR d5, [x4], 8
        // Flags set here are consumed by B.HS at the bottom of the loop; the
        // FMLA and LDR instructions in between leave the flags untouched.
        SUBS x0, x0, 8
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        LDR q18, [x5], 16
        LDR q19, [x5], 16

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        FMLA v20.8h, v18.8h, v0.h[2]
        FMLA v22.8h, v18.8h, v1.h[2]
        FMLA v24.8h, v18.8h, v2.h[2]
        FMLA v26.8h, v18.8h, v3.h[2]
        FMLA v28.8h, v18.8h, v4.h[2]
        FMLA v30.8h, v18.8h, v5.h[2]

        FMLA v20.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v19.8h, v5.h[3]
        B.HS 1b                  // repeat while at least 8 more bytes remain

        // x0 is now in the range -8..-1. Bits 2 and 1 are the 4-byte and
        // 2-byte remainder flags of kc.
        # Is there a remainder?- 2 halffloats of A (4 bytes)
        TBNZ x0, 2, 4f
        # Is there a remainder?- 1 halffloat of A (2 bytes)
        TBNZ x0, 1, 5f

// Accumulation finished for this tile; the remainder paths branch back here.
2:
        # Scale and Clamp
        // v4 and v5 stop being A4/A5 here and become the broadcast bounds:
        // v4 = params h[1], used with FMAX (lower bound);
        // v5 = params h[2], used with FMIN (upper bound).
        FMUL v20.8h, v20.8h, v6.h[0]
        DUP v4.8h, v6.h[1]
        FMUL v22.8h, v22.8h, v6.h[0]
        DUP v5.8h, v6.h[2]
        FMUL v24.8h, v24.8h, v6.h[0]
        FMUL v26.8h, v26.8h, v6.h[0]
        FMUL v28.8h, v28.8h, v6.h[0]
        FMUL v30.8h, v30.8h, v6.h[0]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        // nc -= 8. These flags drive both B.LO (partial tile) and the later
        // B.HI (more tiles); nothing in between writes the flags.
        SUBS x1, x1, 8
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h

        # Store full 6 x 8
        B.LO 6f                  // fewer than 8 columns left: partial store

        // Rows are written from c5 down to c0; each C pointer then moves on by
        // cn_stride. The stores are interleaved with rewinding the A pointers
        // so the next tile reads the same A rows again.
        ST1 {v30.16b}, [x7], x8
        SUB x3, x3, x2           // a0 -= kc
        ST1 {v28.16b}, [x13], x8
        SUB x9, x9, x2           // a1 -= kc
        ST1 {v26.16b}, [x14], x8
        SUB x10, x10, x2         // a2 -= kc
        ST1 {v24.16b}, [x17], x8
        SUB x11, x11, x2         // a3 -= kc
        ST1 {v22.16b}, [x16], x8
        SUB x12, x12, x2         // a4 -= kc
        ST1 {v20.16b}, [x6], x8
        SUB x4, x4, x2           // a5 -= kc

        B.HI 0b                  // columns remain: process the next tile
        RET

// Entered only when kc < 8, skipping the main loop entirely.
3:
        TBZ x0, 2, 5f            // no 4-byte part: do the 2-byte step
4:
        # Remainder- 2 halffloats of A (4 bytes)
        // Two B vectors are consumed, one per halffloat of A.
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4

        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]

        FMLA v20.8h, v17.8h, v0.h[1]
        FMLA v22.8h, v17.8h, v1.h[1]
        FMLA v24.8h, v17.8h, v2.h[1]
        FMLA v26.8h, v17.8h, v3.h[1]
        FMLA v28.8h, v17.8h, v4.h[1]
        FMLA v30.8h, v17.8h, v5.h[1]

        TBZ x0, 1, 2b            // no 2-byte part left: scale, clamp, store

5:
        # Remainder- 1 halffloat of A (2 bytes)
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        B 2b

        # Store odd width
        // x1 holds nc - 8 here. Subtracting 8 leaves bits 2..0 unchanged, so
        // they still describe the 1 to 7 columns left to write. After each
        // partial store the unwritten upper lanes are moved down to lane 0
        // ready for the next narrower store.
6:
        TBZ x1, 2, 7f            // bit 2: 4 columns (8 bytes) per row
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

7:
        TBZ x1, 1, 8f            // bit 1: 2 columns (4 bytes) per row
        STR s30, [x7], 4
        STR s28, [x13], 4
        DUP s30, v30.s[1]
        DUP s28, v28.s[1]
        STR s26, [x14], 4
        STR s24, [x17], 4
        DUP s26, v26.s[1]
        DUP s24, v24.s[1]
        STR s22, [x16], 4
        STR s20, [x6], 4
        DUP s22, v22.s[1]
        DUP s20, v20.s[1]

8:
        TBZ x1, 0, 9f            // bit 0: 1 column (2 bytes) per row
        // Last store of the call, so the C pointers are not advanced.
        STR h30, [x7]
        STR h28, [x13]
        STR h26, [x14]
        STR h24, [x17]
        STR h22, [x16]
        STR h20, [x6]
9:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif