// Auto-generated file. Do not edit!
//   Template: src/f16-gemm/4x8-aarch64-neonfp16arith-ld64.S.in
//   Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <xnnpack/assembly.h>

// void xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64(
//     size_t mr,                 x0
//     size_t nc,                 x1
//     size_t kc,                 x2 / x0
//     const uint8_t*restrict a,  x3
//     size_t a_stride,           x4
//     const void*restrict w,     x5
//     uint8_t*restrict c,        x6
//     size_t cm_stride,          x7
//     size_t cn_stride,          [sp] -> x14
//     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)])  [sp + 8] -> x8
//
// Each pass produces up to 4 rows x 8 columns of half-precision output.
// The four accumulators start from 8 halffloats read from w, are updated
// with FMLA over kc bytes of every A row, multiplied by v4, and finally
// bounded with FMAX against v5 and FMIN against v6 before being stored.
//
// x0 carries mr only while the row pointers are being set up; from label 0
// onwards it is the running k counter (kc minus the bytes already consumed).

// d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
// None of those registers is touched below.

// A pointers
//   x3   a0
//   x11  a1
//   x12  a2
//   x4   a3 / a_stride

// C pointers
//   x6   c0
//   x9   c1
//   x10  c2
//   x7   c3 / cm_stride

// Vector register usage
//   A0        v0
//   A1        v1
//   A2        v2
//   A3        v3
//   B         v20 v21 v22 v23
//   C         v16
//   C         v18
//   C         v28
//   C         v30
//   Clamp     v4 (multiplier), v5 (FMAX operand), v6 (FMIN operand)
//   unused A  v7 v8 v9 v10 v11
//   unused B  v19

BEGIN_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

        // Fetch the two stack arguments: cn_stride and the params pointer.
        LDP     x14, x8, [sp]

        // Read three consecutive halfwords from params and replicate each
        // one across all 8 lanes of v4, v5 and v6 respectively.
        LD3R    {v4.8h, v5.8h, v6.8h}, [x8]

        // Set up the row pointers. A row index that is >= mr aliases the
        // previous row, so every load and store below stays inside rows
        // that exist.
        CMP     x0, 2                       // flags <- mr ? 2
        ADD     x11, x3, x4                 // a1 = a0 + a_stride
        ADD     x9, x6, x7                  // c1 = c0 + cm_stride
        CSEL    x11, x3, x11, LO            // mr < 2:  a1 = a0
        CSEL    x9, x6, x9, LO              // mr < 2:  c1 = c0

        ADD     x12, x11, x4                // a2 = a1 + a_stride
        ADD     x10, x9, x7                 // c2 = c1 + cm_stride
        // Still using the flags of the CMP against 2: LS means mr <= 2.
        CSEL    x12, x11, x12, LS           // mr <= 2: a2 = a1
        CSEL    x10, x9, x10, LS            // mr <= 2: c2 = c1

        CMP     x0, 4                       // flags <- mr ? 4
        ADD     x4, x12, x4                 // a3 = a2 + a_stride (x4 no longer holds a_stride)
        ADD     x7, x10, x7                 // c3 = c2 + cm_stride (x7 no longer holds cm_stride)
        CSEL    x4, x12, x4, LO             // mr < 4:  a3 = a2
        CSEL    x7, x10, x7, LO             // mr < 4:  c3 = c2

0:
        // Start of one 8-column pass: seed all four accumulators with the
        // same 8 halffloats taken from w.
        LDR     q16, [x5], 16
        MOV     v18.16b, v16.16b
        MOV     v28.16b, v16.16b
        MOV     v30.16b, v16.16b

        // Fewer than 4 halffloats (8 bytes) of A per row? Then skip the
        // main loop and go straight to the remainder handling.
        SUBS    x0, x2, 8                   // k = kc - 8
        B.LO    3f

        // Main loop - consumes 4 halffloats (8 bytes) of every A row and
        // four 8-halffloat vectors of w per iteration.
1:
        LDR     d0, [x3], 8
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     d1, [x11], 8
        LDR     d2, [x12], 8
        LDR     d3, [x4], 8
        SUBS    x0, x0, 8                   // k -= 8; flags are consumed by B.HS below
        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v18.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        LDR     q22, [x5], 16
        LDR     q23, [x5], 16

        FMLA    v16.8h, v21.8h, v0.h[1]
        FMLA    v18.8h, v21.8h, v1.h[1]
        FMLA    v28.8h, v21.8h, v2.h[1]
        FMLA    v30.8h, v21.8h, v3.h[1]

        FMLA    v16.8h, v22.8h, v0.h[2]
        FMLA    v18.8h, v22.8h, v1.h[2]
        FMLA    v28.8h, v22.8h, v2.h[2]
        FMLA    v30.8h, v22.8h, v3.h[2]

        FMLA    v16.8h, v23.8h, v0.h[3]
        FMLA    v18.8h, v23.8h, v1.h[3]
        FMLA    v28.8h, v23.8h, v2.h[3]
        FMLA    v30.8h, v23.8h, v3.h[3]
        B.HS    1b                          // loop while the SUBS above did not borrow

        // x0 is now negative, but only multiples of 8 were subtracted from
        // kc, so bits 2 and 1 of x0 are still bits 2 and 1 of kc.
        // Bit 2 set: 2 halffloats (4 bytes) of A are left over.
        TBNZ    x0, 2, 4f
        // Bit 1 set: 1 halffloat (2 bytes) of A is left over.
        TBNZ    x0, 1, 5f
2:
        // Scale and clamp. The SUBS on nc is interleaved here; its flags
        // are used by both B.LO 6f and B.HI 0b further down.
        FMUL    v16.8h, v16.8h, v4.8h
        SUBS    x1, x1, 8                   // nc -= 8
        FMUL    v18.8h, v18.8h, v4.8h
        FMUL    v28.8h, v28.8h, v4.8h
        FMUL    v30.8h, v30.8h, v4.8h
        FMAX    v16.8h, v16.8h, v5.8h
        FMAX    v18.8h, v18.8h, v5.8h
        FMAX    v28.8h, v28.8h, v5.8h
        FMAX    v30.8h, v30.8h, v5.8h
        FMIN    v16.8h, v16.8h, v6.8h
        FMIN    v18.8h, v18.8h, v6.8h
        FMIN    v28.8h, v28.8h, v6.8h
        FMIN    v30.8h, v30.8h, v6.8h

        // Fewer than 8 columns were left: take the partial store path.
        B.LO    6f

        // Store the full 4 x 8 tile, step every C pointer by cn_stride and
        // rewind every A pointer to the start of its row.
        ST1     {v16.16b}, [x6], x14
        SUB     x3, x3, x2                  // a0 -= kc
        ST1     {v18.16b}, [x9], x14
        SUB     x11, x11, x2                // a1 -= kc
        ST1     {v28.16b}, [x10], x14
        SUB     x12, x12, x2                // a2 -= kc
        ST1     {v30.16b}, [x7], x14
        SUB     x4, x4, x2                  // a3 -= kc

        B.HI    0b                          // columns remain: run another pass
        RET

3:
        // Reached when kc < 8. If bit 2 of k is clear there is no
        // 2-halffloat step, so continue with the 1-halffloat step.
        TBZ     x0, 2, 5f
4:
        // Remainder - 2 halffloats of A (4 bytes) and two vectors of w.
        LDR     s0, [x3], 4
        LDR     q20, [x5], 16
        LDR     q21, [x5], 16
        LDR     s1, [x11], 4
        LDR     s2, [x12], 4
        LDR     s3, [x4], 4

        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v18.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]

        FMLA    v16.8h, v21.8h, v0.h[1]
        FMLA    v18.8h, v21.8h, v1.h[1]
        FMLA    v28.8h, v21.8h, v2.h[1]
        FMLA    v30.8h, v21.8h, v3.h[1]

        // No single halffloat left over: accumulation is complete.
        TBZ     x0, 1, 2b

5:
        // Remainder - 1 halffloat of A (2 bytes) and one vector of w.
        LDR     h0, [x3], 2
        LDR     q20, [x5], 16
        LDR     h1, [x11], 2
        LDR     h2, [x12], 2
        LDR     h3, [x4], 2
        FMLA    v16.8h, v20.8h, v0.h[0]
        FMLA    v18.8h, v20.8h, v1.h[0]
        FMLA    v28.8h, v20.8h, v2.h[0]
        FMLA    v30.8h, v20.8h, v3.h[0]
        B       2b

        // Store odd width. x1 went negative in the SUBS above, but its low
        // three bits still equal those of the remaining column count, so
        // they select a 4-, 2- and 1-halffloat store in turn. After each
        // partial store the not-yet-written lanes are moved down to lane 0.
6:
        TBZ     x1, 2, 7f
        STR     d16, [x6], 8
        DUP     d16, v16.d[1]
        STR     d18, [x9], 8
        DUP     d18, v18.d[1]
        STR     d28, [x10], 8
        DUP     d28, v28.d[1]
        STR     d30, [x7], 8
        DUP     d30, v30.d[1]

7:
        TBZ     x1, 1, 8f
        STR     s16, [x6], 4
        DUP     s16, v16.s[1]
        STR     s18, [x9], 4
        DUP     s18, v18.s[1]
        STR     s28, [x10], 4
        DUP     s28, v28.s[1]
        STR     s30, [x7], 4
        DUP     s30, v30.s[1]

8:
        TBZ     x1, 0, 9f
        STR     h16, [x6]
        STR     h18, [x9]
        STR     h28, [x10]
        STR     h30, [x7]
9:
        RET

END_FUNCTION xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64

#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
#endif