// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/6x16-aarch64-neonfp16arith-cortex-a75.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// Half-precision GEMM micro-kernel with externally supplied initial
// accumulators ("gemminc") and min/max clamping of the result.
//
// Each pass of the outer loop (label 0) produces one tile of up to 6 rows by
// 16 halffloat columns:
//   - the 12 accumulator registers v20-v31 are initialised from acc (x15),
//     which advances by 6 * 32 = 192 bytes per pass;
//   - kc bytes of each A row are multiplied against packed B (x5), which is
//     read strictly sequentially, 32 bytes per halffloat of A;
//   - results are clamped against the two halffloats read from params and
//     written through the six C row pointers.
// kc and the strides are byte counts (the A pointers are rewound with
// "a -= kc" and the K loop counts in steps of 8 bytes = 4 halffloats).
// The K remainder path only tests bits 1 and 2 of the counter, so kc is
// expected to be a multiple of 2 bytes.

# void xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75(
#     size_t mr,                x0
#     size_t nc,                x1
#     size_t kc,                x2 / x0
#     const void*restrict a,    x3
#     size_t a_stride,          x4
#     const void*restrict w,    x5
#     void*restrict c,          x6
#     size_t cm_stride,         x7
#     size_t cn_stride,         [sp] -> x8

#     const float*restrict acc,  [sp + 8] -> x15
#     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 16] -> (x8)

// NOTE(review): the prototype comment above types acc as "const float*", but
// the code loads it as 128-bit vectors that are accumulated with .8h FMLA,
// i.e. it is consumed as packed halffloats - confirm against the C declaration.

# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.

# Register usage
# A0  x3  v0
# A1  x9  v1
# A2 x10  v2
# A3 x11  v3
# A4 x12  v4
# A5  x4  v5

# B   x5 v16 v17 v18 v19

# C0  x6 v20 v21
# C1 x16 v22 v23
# C2 x17 v24 v25
# C3 x14 v26 v27
# C4 x13 v28 v29
# C5  x7 v30 v31

# Clamp v6, (v4), (v5)
# unused     v7
# unused A   v8 v9 v10 v11
# unused B   v12 v13 v14 v15

BEGIN_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

        # Load acc, params pointer
        // One LDP fetches both stack arguments: [sp + 8] -> x15 (acc) and
        // [sp + 16] -> x8 (params). x8 is only a temporary here; it is
        // overwritten with cn_stride once the params have been read.
        LDP x15, x8, [sp, 8]

        # Clamp A and C pointers
        // Rows that do not exist (row index >= mr) get their A/C pointers
        // aliased to the previous row, so every load and store below stays
        // inside valid rows. Each CMP is shared by two row pairs: LO tests
        // "mr < N" and LS tests "mr <= N" on the same flags.
        CMP x0, 2                       // if mr < 2
        ADD x9, x3, x4                  // a1 = a0 + a_stride
        ADD x16, x6, x7                 // c1 = c0 + cm_stride
        CSEL x9, x3, x9, LO             //   a1 = a0
        CSEL x16, x6, x16, LO           //   c1 = c0

        # Load params
        // 32-bit load = two halffloats: v6.h[0] is used as the FMAX operand
        // (lower bound) and v6.h[1] as the FMIN operand (upper bound).
        LDR s6, [x8]

        ADD x10, x9, x4                 // a2 = a1 + a_stride
        ADD x17, x16, x7                // c2 = c1 + cm_stride
                                        // if mr <= 2
        CSEL x10, x9, x10, LS           //   a2 = a1
        CSEL x17, x16, x17, LS          //   c2 = c1

        CMP x0, 4                       // if mr < 4
        ADD x11, x10, x4                // a3 = a2 + a_stride
        ADD x14, x17, x7                // c3 = c2 + cm_stride
        CSEL x11, x10, x11, LO          //   a3 = a2
        CSEL x14, x17, x14, LO          //   c3 = c2

        ADD x12, x11, x4                // a4 = a3 + a_stride
        ADD x13, x14, x7                // c4 = c3 + cm_stride
                                        // if mr <= 4
        CSEL x12, x11, x12, LS          //   a4 = a3
        CSEL x13, x14, x13, LS          //   c4 = c3

        // From here on x4 and x7 no longer hold the strides: they become the
        // row-5 A and C pointers.
        CMP x0, 6                       // if mr < 6
        ADD x4, x12, x4                 // a5 = a4 + a_stride
        ADD x7, x13, x7                 // c5 = c4 + cm_stride
        CSEL x4, x12, x4, LO            //   a5 = a4
        CSEL x7, x13, x7, LO            //   c5 = c4

        LDR x8, [sp]                    // load cn_stride

        // Outer loop over nc: one 16-column tile per pass.
0:
        # Load initial accumulators
        // Two q registers (16 halffloats) per row, rows 0..5 in order.
        LDP q20, q21, [x15], 32
        LDP q22, q23, [x15], 32
        LDP q24, q25, [x15], 32
        LDP q26, q27, [x15], 32
        LDP q28, q29, [x15], 32
        LDP q30, q31, [x15], 32

        # Is there at least 4 halffloats (8 bytes)?
        // mr is no longer needed, so x0 is reused as the K byte counter.
        SUBS x0, x2, 8                  // k = kc - 8
        B.LO 4f

        # Prologue - load 4 A and 2 B
        // A4/A5 are deliberately not preloaded: v4/v5 double as the clamp
        // vectors, and the loop/epilogue bodies load them just before use.

        LDR d0, [x3], 8                 // A0
        LDR q16, [x5], 16               // B0
        LDR q17, [x5], 16               // B1
        LDR d1, [x9], 8                 // A1
        LDR d2, [x10], 8                // A2
        LDR d3, [x11], 8                // A3

        # Is there at least 4 halffloats for main loop?
        SUBS x0, x0, 8
        B.LO 2f

        .p2align 3
        # Main loop - 4 halffloats of A (8 bytes)
        # 48 FMA + 6 ld64 A + 8 LDR B
        // Software pipelined: A0-A3 and B0/B1 for the next iteration are
        // loaded in the last quarter of this one. The load/FMLA interleaving
        // is presumably tuned for Cortex-A75 (per the kernel name); do not
        // reorder without re-checking register reuse.
1:
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR d4, [x12], 8                // A4
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        LDR d5, [x4], 8                 // A5
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        LDR q18, [x5], 16               // B2
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        LDR q19, [x5], 16               // B3
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        // Sets the flags consumed by B.HS at the bottom of the loop; nothing
        // in between modifies NZCV.
        SUBS x0, x0, 8

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        LDR q16, [x5], 16               // B4
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        LDR q17, [x5], 16               // B5
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

        FMLA v20.8h, v16.8h, v0.h[2]
        FMLA v21.8h, v17.8h, v0.h[2]
        LDR q18, [x5], 16               // B6
        FMLA v22.8h, v16.8h, v1.h[2]
        FMLA v23.8h, v17.8h, v1.h[2]
        LDR q19, [x5], 16               // B7
        FMLA v24.8h, v16.8h, v2.h[2]
        FMLA v25.8h, v17.8h, v2.h[2]
        FMLA v26.8h, v16.8h, v3.h[2]
        FMLA v27.8h, v17.8h, v3.h[2]
        FMLA v28.8h, v16.8h, v4.h[2]
        FMLA v29.8h, v17.8h, v4.h[2]
        FMLA v30.8h, v16.8h, v5.h[2]
        FMLA v31.8h, v17.8h, v5.h[2]

        // Last quarter: v16/v17 are free again, so the next iteration's
        // B0/B1 are fetched here, and each A register is reloaded only after
        // the final FMLA pair that reads its lane 3.
        LDR q16, [x5], 16               // B0
        FMLA v20.8h, v18.8h, v0.h[3]
        FMLA v21.8h, v19.8h, v0.h[3]
        LDR q17, [x5], 16               // B1
        FMLA v22.8h, v18.8h, v1.h[3]
        FMLA v23.8h, v19.8h, v1.h[3]
        LDR d0, [x3], 8                 // A0
        FMLA v24.8h, v18.8h, v2.h[3]
        FMLA v25.8h, v19.8h, v2.h[3]
        LDR d1, [x9], 8                 // A1
        FMLA v26.8h, v18.8h, v3.h[3]
        FMLA v27.8h, v19.8h, v3.h[3]
        LDR d2, [x10], 8                // A2
        FMLA v28.8h, v18.8h, v4.h[3]
        FMLA v29.8h, v19.8h, v4.h[3]
        LDR d3, [x11], 8                // A3
        FMLA v30.8h, v18.8h, v5.h[3]
        FMLA v31.8h, v19.8h, v5.h[3]
        B.HS 1b

        # Epilogue - same as main loop but no loads for next loop
        // Consumes the A0-A3 / B0-B1 values already loaded by the prologue
        // or by the final main-loop iteration.
2:
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        LDR d4, [x12], 8                // A4
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        LDR d5, [x4], 8                 // A5
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        LDR q18, [x5], 16               // B2
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        LDR q19, [x5], 16               // B3
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        // Undo the last over-subtraction: x0 becomes the number of leftover
        // bytes (0..7). The resulting flags are tested by B.NE after the
        // remaining FMLAs, none of which touch NZCV.
        ADDS x0, x0, 8

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        LDR q16, [x5], 16               // B4
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        LDR q17, [x5], 16               // B5
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]

        FMLA v20.8h, v16.8h, v0.h[2]
        FMLA v21.8h, v17.8h, v0.h[2]
        LDR q18, [x5], 16               // B6
        FMLA v22.8h, v16.8h, v1.h[2]
        FMLA v23.8h, v17.8h, v1.h[2]
        LDR q19, [x5], 16               // B7
        FMLA v24.8h, v16.8h, v2.h[2]
        FMLA v25.8h, v17.8h, v2.h[2]
        FMLA v26.8h, v16.8h, v3.h[2]
        FMLA v27.8h, v17.8h, v3.h[2]
        FMLA v28.8h, v16.8h, v4.h[2]
        FMLA v29.8h, v17.8h, v4.h[2]
        FMLA v30.8h, v16.8h, v5.h[2]
        FMLA v31.8h, v17.8h, v5.h[2]

        FMLA v20.8h, v18.8h, v0.h[3]
        FMLA v21.8h, v19.8h, v0.h[3]
        FMLA v22.8h, v18.8h, v1.h[3]
        FMLA v23.8h, v19.8h, v1.h[3]
        FMLA v24.8h, v18.8h, v2.h[3]
        FMLA v25.8h, v19.8h, v2.h[3]
        FMLA v26.8h, v18.8h, v3.h[3]
        FMLA v27.8h, v19.8h, v3.h[3]
        FMLA v28.8h, v18.8h, v4.h[3]
        FMLA v29.8h, v19.8h, v4.h[3]
        FMLA v30.8h, v18.8h, v5.h[3]
        FMLA v31.8h, v19.8h, v5.h[3]

        # Is there a remainder?- 1-3 halffloats of A (2-6 bytes)
        B.NE 4f

3:
        # Clamp
        // v4/v5 are free now (all A data has been consumed), so they hold the
        // broadcast lower and upper bounds.
        DUP v4.8h, v6.h[0]
        DUP v5.8h, v6.h[1]
        FMAX v20.8h, v20.8h, v4.8h
        FMAX v21.8h, v21.8h, v4.8h
        FMAX v22.8h, v22.8h, v4.8h
        FMAX v23.8h, v23.8h, v4.8h
        FMAX v24.8h, v24.8h, v4.8h
        FMAX v25.8h, v25.8h, v4.8h
        FMAX v26.8h, v26.8h, v4.8h
        FMAX v27.8h, v27.8h, v4.8h
        FMAX v28.8h, v28.8h, v4.8h
        FMAX v29.8h, v29.8h, v4.8h
        FMAX v30.8h, v30.8h, v4.8h
        FMAX v31.8h, v31.8h, v4.8h
        // nc -= 16. These flags drive both B.LO (partial tile) and the
        // B.HI loop-back below; FMIN/ST1/SUB leave NZCV untouched.
        SUBS x1, x1, 16
        FMIN v20.8h, v20.8h, v5.8h
        FMIN v21.8h, v21.8h, v5.8h
        FMIN v22.8h, v22.8h, v5.8h
        FMIN v23.8h, v23.8h, v5.8h
        FMIN v24.8h, v24.8h, v5.8h
        FMIN v25.8h, v25.8h, v5.8h
        FMIN v26.8h, v26.8h, v5.8h
        FMIN v27.8h, v27.8h, v5.8h
        FMIN v28.8h, v28.8h, v5.8h
        FMIN v29.8h, v29.8h, v5.8h
        FMIN v30.8h, v30.8h, v5.8h
        FMIN v31.8h, v31.8h, v5.8h

        # Store full 6 x 16
        // Fewer than 16 columns left: take the partial-width path instead.
        B.LO 6f

        // Rows are written from C5 down to C0, so when row pointers alias
        // (mr < 6) the store through the lowest-numbered pointer lands last.
        // Each C pointer advances by cn_stride, and each A pointer is rewound
        // to the start of its row for the next column tile.
        ST1 {v30.16b, v31.16b}, [x7], x8
        SUB x3, x3, x2                  // a0 -= kc
        ST1 {v28.16b, v29.16b}, [x13], x8
        SUB x9, x9, x2                  // a1 -= kc
        ST1 {v26.16b, v27.16b}, [x14], x8
        SUB x10, x10, x2                // a2 -= kc
        ST1 {v24.16b, v25.16b}, [x17], x8
        SUB x11, x11, x2                // a3 -= kc
        ST1 {v22.16b, v23.16b}, [x16], x8
        SUB x12, x12, x2                // a4 -= kc
        ST1 {v20.16b, v21.16b}, [x6], x8
        SUB x4, x4, x2                  // a5 -= kc

        // Loop while columns remain (nc was > 16 before the SUBS).
        B.HI 0b
        RET

        # Remainder- 1-3 halffloats of A (2-6 bytes)
        // The low bits of x0 equal the low bits of the leftover byte count on
        // both entry paths (the counter only ever changes in multiples of 8):
        // bit 2 selects a 2-halffloat step, bit 1 a 1-halffloat step.
4:
        TBZ x0, 2, 5f
        LDR s0, [x3], 4
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR s1, [x9], 4
        LDR s2, [x10], 4
        LDR s3, [x11], 4
        LDR s4, [x12], 4
        LDR s5, [x4], 4
        LDR q18, [x5], 16
        LDR q19, [x5], 16
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]

        FMLA v20.8h, v18.8h, v0.h[1]
        FMLA v22.8h, v18.8h, v1.h[1]
        FMLA v24.8h, v18.8h, v2.h[1]
        FMLA v26.8h, v18.8h, v3.h[1]
        FMLA v28.8h, v18.8h, v4.h[1]
        FMLA v30.8h, v18.8h, v5.h[1]
        FMLA v21.8h, v19.8h, v0.h[1]
        FMLA v23.8h, v19.8h, v1.h[1]
        FMLA v25.8h, v19.8h, v2.h[1]
        FMLA v27.8h, v19.8h, v3.h[1]
        FMLA v29.8h, v19.8h, v4.h[1]
        FMLA v31.8h, v19.8h, v5.h[1]
        // No single trailing halffloat: go straight to the clamp.
        TBZ x0, 1, 3b

        // Final single halffloat of A per row.
5:
        LDR h0, [x3], 2
        LDR q16, [x5], 16
        LDR q17, [x5], 16
        LDR h1, [x9], 2
        LDR h2, [x10], 2
        LDR h3, [x11], 2
        LDR h4, [x12], 2
        LDR h5, [x4], 2
        FMLA v20.8h, v16.8h, v0.h[0]
        FMLA v22.8h, v16.8h, v1.h[0]
        FMLA v24.8h, v16.8h, v2.h[0]
        FMLA v26.8h, v16.8h, v3.h[0]
        FMLA v28.8h, v16.8h, v4.h[0]
        FMLA v30.8h, v16.8h, v5.h[0]
        FMLA v21.8h, v17.8h, v0.h[0]
        FMLA v23.8h, v17.8h, v1.h[0]
        FMLA v25.8h, v17.8h, v2.h[0]
        FMLA v27.8h, v17.8h, v3.h[0]
        FMLA v29.8h, v17.8h, v4.h[0]
        FMLA v31.8h, v17.8h, v5.h[0]
        B 3b

        # Store odd width
        // x1 holds nc - 16 here (negative), whose low four bits equal those
        // of the remaining column count. Bits 3..0 select stores of 8, 4, 2
        // and 1 halffloats; after each partial store the not-yet-written
        // columns are shifted down into the low lanes of the even-numbered
        // accumulator of each row. This is the last tile, so the A pointers
        // are not rewound and the function returns afterwards.
6:
        TBZ x1, 3, 7f
        STR q30, [x7], 16
        MOV v30.16b, v31.16b
        STR q28, [x13], 16
        MOV v28.16b, v29.16b
        STR q26, [x14], 16
        MOV v26.16b, v27.16b
        STR q24, [x17], 16
        MOV v24.16b, v25.16b
        STR q22, [x16], 16
        MOV v22.16b, v23.16b
        STR q20, [x6], 16
        MOV v20.16b, v21.16b

7:
        TBZ x1, 2, 8f
        STR d30, [x7], 8
        STR d28, [x13], 8
        DUP d30, v30.d[1]
        DUP d28, v28.d[1]
        STR d26, [x14], 8
        STR d24, [x17], 8
        DUP d26, v26.d[1]
        DUP d24, v24.d[1]
        STR d22, [x16], 8
        STR d20, [x6], 8
        DUP d22, v22.d[1]
        DUP d20, v20.d[1]

8:
        TBZ x1, 1, 9f
        STR s30, [x7], 4
        STR s28, [x13], 4
        DUP s30, v30.s[1]
        DUP s28, v28.s[1]
        STR s26, [x14], 4
        STR s24, [x17], 4
        DUP s26, v26.s[1]
        DUP s24, v24.s[1]
        STR s22, [x16], 4
        STR s20, [x6], 4
        DUP s22, v22.s[1]
        DUP s20, v20.s[1]

9:
        TBZ x1, 0, 10f
        STR h30, [x7]
        STR h28, [x13]
        STR h26, [x14]
        STR h24, [x17]
        STR h22, [x16]
        STR h20, [x6]
10:
        RET

END_FUNCTION xnn_f16_gemminc_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif