// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert CHANNEL_TILE % 8 == 0
$assert CHANNEL_TILE >= 8
$assert ROW_TILE >= 2
$assert ROW_SUBTILE >= 2
$assert ROW_SUBTILE <= ROW_TILE
$assert ACCUMULATORS >= 1
$assert ROW_TILE >= ACCUMULATORS * 2
$assert ROW_SUBTILE >= ACCUMULATORS * 2
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>

#include <wasm_simd128.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>


// Multi-pass QS8 (signed 8-bit quantized) global average pooling micro-kernel
// for WASM SIMD, emitted by the xngen template processor.
//
// Strategy: the first pass sums ${ROW_TILE} input rows into a 32-bit `buffer`
// (with the bias pre-added); each intermediate pass adds ${ROW_SUBTILE} more
// rows into `buffer`; the last pass adds the final rows, requantizes, and
// writes int8 output. ${CHANNEL_TILE} channels are processed per main-loop
// iteration using ${ACCUMULATORS} parallel 16-bit accumulator(s).
// NOTE(review): row sums are kept in 16 bits before widening; this assumes the
// generator's ROW_TILE/ROW_SUBTILE choices keep |sum| < 2**15 — confirm at the
// template instantiation sites.
void xnn_qs8_gavgpool_minmax_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__wasmsimd_c${CHANNEL_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
    size_t rows,
    size_t channels,
    const int8_t* input,
    size_t input_stride,
    const int8_t* zero,
    int32_t* buffer,
    int8_t* output,
    const union xnn_qs8_avgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
{
  // Multi-pass kernel: caller guarantees more than ROW_TILE rows (otherwise
  // the single-pass variant is used).
  assert(rows > ${ROW_TILE});
  assert(channels != 0);

  const int8_t* i0 = input;
  $for M in range(1, ROW_TILE):
    const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
  // input_increment advances the row pointers from the end of one pass to the
  // start of the next; the subtracted term undoes the per-iteration pointer
  // bumps (channels rounded up to the load granularity of the channel loop).
  $if CHANNEL_TILE <= 16:
    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
  $else:
    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);

  // First pass: sum the first ${ROW_TILE} rows into the 32-bit buffer, with
  // the bias folded in so later passes are pure additions.
  const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
  int32_t* b = buffer;
  size_t c = channels;
  for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
    $for M in range(ROW_TILE):
      const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
      $for C in range(8, CHANNEL_TILE, 8):
        const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C});
      i${M} += ${CHANNEL_TILE};

    $for A in range(ACCUMULATORS):
      $for C in range(0, CHANNEL_TILE, 8):
        v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});

    $for M in range(ACCUMULATORS * 2, ROW_TILE):
      $for C in range(0, CHANNEL_TILE, 8):
        vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${M}x${ABC[C:C+8]});

    $if ACCUMULATORS > 1:
      // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
      $ACC_SLICE = 1
      $while ACC_SLICE < ACCUMULATORS:
        $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
          $if A + ACC_SLICE < ACCUMULATORS:
            $for C in range(0, CHANNEL_TILE, 8):
              vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
        $ACC_SLICE *= 2

    // Widen the 16-bit row sums to 32 bits, add the bias, and spill to buffer.
    $for C in range(0, CHANNEL_TILE, 8):
      const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+8]}));
      const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C:C+8]}));

    wasm_v128_store(b, vacc${ABC[0:4]});
    $for C in range(4, CHANNEL_TILE, 4):
      wasm_v128_store(b + ${C}, vacc${ABC[C:C+4]});
    b += ${CHANNEL_TILE};
  }
  $if CHANNEL_TILE > 16:
    // First-pass remainder: process leftover channels 8 at a time.
    if XNN_UNLIKELY(c != 0) {
      do {
        $for M in range(ROW_TILE):
          const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
          i${M} += 8;

        $for A in range(ACCUMULATORS):
          v128_t vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vxi${A*2}x${ABC[0:8]}, vxi${A*2+1}x${ABC[0:8]});

        $for M in range(ACCUMULATORS * 2, ROW_TILE):
          vacc${M % ACCUMULATORS}x${ABC[0:8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vxi${M}x${ABC[0:8]});

        $if ACCUMULATORS > 1:
          // Add up all accumulators to vacc0x${ABC[0:8]}
          $ACC_SLICE = 1
          $while ACC_SLICE < ACCUMULATORS:
            $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
              $if A + ACC_SLICE < ACCUMULATORS:
                vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
            $ACC_SLICE *= 2

        const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[0:8]}));
        const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[0:8]}));

        wasm_v128_store(b, vacc${ABC[0:4]});
        wasm_v128_store(b + 4, vacc${ABC[4:8]});
        b += 8;

        c = doz(c, 8);
      } while (c != 0);
    }

  // Intermediate passes: while more than ROW_SUBTILE rows remain, add the
  // next ROW_SUBTILE rows into the 32-bit buffer.
  for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
    $for M in range(ROW_SUBTILE):
      i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);

    int32_t* b = buffer;
    size_t c = channels;
    for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
      $for M in range(ROW_SUBTILE):
        const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
        $for C in range(8, CHANNEL_TILE, 8):
          const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C});
        i${M} += ${CHANNEL_TILE};

      $for A in range(ACCUMULATORS):
        $for C in range(0, CHANNEL_TILE, 8):
          v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});

      $for M in range(ACCUMULATORS * 2, ROW_SUBTILE):
        $for C in range(0, CHANNEL_TILE, 8):
          vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${M}x${ABC[C:C+8]});

      $if ACCUMULATORS > 1:
        // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
        $ACC_SLICE = 1
        $while ACC_SLICE < ACCUMULATORS:
          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
            $if A + ACC_SLICE < ACCUMULATORS:
              $for C in range(0, CHANNEL_TILE, 8):
                vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
          $ACC_SLICE *= 2

      // Accumulate into the existing buffer contents (no bias this time).
      $for C in range(0, CHANNEL_TILE, 8):
        const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+8]}), wasm_v128_load(b + ${C}));
        const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C:C+8]}), wasm_v128_load(b + ${C+4}));

      wasm_v128_store(b, vacc${ABC[0:4]});
      $for C in range(4, CHANNEL_TILE, 4):
        wasm_v128_store(b + ${C}, vacc${ABC[C:C+4]});
      b += ${CHANNEL_TILE};
    }
    $if CHANNEL_TILE > 16:
      // Intermediate-pass remainder: leftover channels, 8 at a time.
      if XNN_UNLIKELY(c != 0) {
        do {
          $for M in range(ROW_SUBTILE):
            const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
            i${M} += 8;

          $for A in range(ACCUMULATORS):
            v128_t vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vxi${A*2}x${ABC[0:8]}, vxi${A*2+1}x${ABC[0:8]});

          $for M in range(ACCUMULATORS * 2, ROW_SUBTILE):
            vacc${M % ACCUMULATORS}x${ABC[0:8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vxi${M}x${ABC[0:8]});

          $if ACCUMULATORS > 1:
            // Add up all accumulators to vacc0x${ABC[0:8]}
            $ACC_SLICE = 1
            $while ACC_SLICE < ACCUMULATORS:
              $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
                $if A + ACC_SLICE < ACCUMULATORS:
                  vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
              $ACC_SLICE *= 2

          const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(wasm_i32x4_widen_low_i16x8(vacc0x${ABC[0:8]}), wasm_v128_load(b));
          const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(wasm_i32x4_widen_high_i16x8(vacc0x${ABC[0:8]}), wasm_v128_load(b + 4));

          wasm_v128_store(b, vacc${ABC[0:4]});
          wasm_v128_store(b + 4, vacc${ABC[4:8]});
          b += 8;

          c = doz(c, 8);
        } while (c != 0);
      }
  }

  // Set up row pointers for the last pass; rows past the end of the input are
  // redirected to the `zero` row so they contribute nothing to the sums.
  i0 = (const int8_t*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
  $for M in range(1, ROW_SUBTILE):
    i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
    $if M % 2 == 1:
      if XNN_UNPREDICTABLE(rows < ${M+1}) {
        i${M} = zero;
      }
    $else:
      if XNN_UNPREDICTABLE(rows <= ${M}) {
        i${M} = zero;
      }

  // Last pass: add the final rows to the buffered sums, then requantize.
  // Requantization is sign-magnitude: |acc| is multiplied by the 32-bit
  // multiplier in 64-bit lanes, rounded and shifted, then the sign is
  // restored via xor/subtract with the sign mask.
  const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier);
  const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
  const int32_t vshift = params->wasmsimd.shift;
  const v128_t vzero = wasm_f64x2_splat(0.0);
  while (channels >= ${CHANNEL_TILE}) {
    $for M in range(ROW_SUBTILE):
      const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
      $for C in range(8, CHANNEL_TILE, 8):
        const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C});
      i${M} += ${CHANNEL_TILE};

    $for A in range(ACCUMULATORS):
      $for C in range(0, CHANNEL_TILE, 8):
        v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});

    $for M in range(ACCUMULATORS * 2, ROW_SUBTILE):
      $for C in range(0, CHANNEL_TILE, 8):
        vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${M}x${ABC[C:C+8]});

    $if ACCUMULATORS > 1:
      // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
      $ACC_SLICE = 1
      $while ACC_SLICE < ACCUMULATORS:
        $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
          $if A + ACC_SLICE < ACCUMULATORS:
            $for C in range(0, CHANNEL_TILE, 8):
              vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
        $ACC_SLICE *= 2

    $for C in range(0, CHANNEL_TILE, 8):
      const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+8]}), wasm_v128_load(buffer + ${C}));
      const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C:C+8]}), wasm_v128_load(buffer + ${C+4}));
    buffer += ${CHANNEL_TILE};

    $for C in range(0, CHANNEL_TILE, 4):
      const v128_t vabsacc${ABC[C:C+4]} = wasm_i32x4_abs(vacc${ABC[C:C+4]});

    // Sign mask: all-ones in lanes where the accumulator was negative.
    $for C in range(0, CHANNEL_TILE, 4):
      const v128_t vsgnacc${ABC[C:C+4]} = wasm_i32x4_gt(vabsacc${ABC[C:C+4]}, vacc${ABC[C:C+4]});

    // Interleave with zero to place each 32-bit magnitude in a 64-bit lane.
    $for C in range(0, CHANNEL_TILE, 4):
      const v128_t vabsacc${ABC[C:C+2]} = wasm_v32x4_shuffle(vabsacc${ABC[C:C+4]}, vzero, 0, 4, 1, 5);
      const v128_t vabsacc${ABC[C+2:C+4]} = wasm_v32x4_shuffle(vabsacc${ABC[C:C+4]}, vzero, 2, 6, 3, 7);

    $for C in range(0, CHANNEL_TILE, 2):
      const v128_t vabsprod${ABC[C:C+2]} = wasm_i64x2_mul(vabsacc${ABC[C:C+2]}, vmultiplier);

    $for C in range(0, CHANNEL_TILE, 2):
      const v128_t vabsout${ABC[C:C+2]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[C:C+2]}, vrounding), vshift);

    $for C in range(0, CHANNEL_TILE, 4):
      const v128_t vabsout${ABC[C:C+4]} = wasm_v32x4_shuffle(vabsout${ABC[C:C+2]}, vabsout${ABC[C+2:C+4]}, 0, 2, 4, 6);

    $for C in range(0, CHANNEL_TILE, 4):
      const v128_t vout${ABC[C:C+4]} = wasm_i32x4_sub(wasm_v128_xor(vabsout${ABC[C:C+4]}, vsgnacc${ABC[C:C+4]}), vsgnacc${ABC[C:C+4]});

    const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
    $for C in range(0, CHANNEL_TILE, 8):
      v128_t vout${ABC[C:C+8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vout${ABC[C:C+4]}, vout${ABC[C+4:C+8]}), voutput_zero_point);

    const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
    $for C in range(0, CHANNEL_TILE, 16):
      $if C + 8 < CHANNEL_TILE:
        v128_t vout${ABC[C:C+16]} = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}), voutput_min), voutput_max);
      $else:
        v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}), voutput_min), voutput_max);

    $if CHANNEL_TILE > 8:
      wasm_v128_store(output, vout${ABC[0:16]});
    $else:
      *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
    $for C in range(16, CHANNEL_TILE, 16):
      $if C + 8 < CHANNEL_TILE:
        wasm_v128_store(output + ${C}, vout${ABC[C:C+16]});
      $else:
        *((double*) (output + ${C})) = wasm_f64x2_extract_lane(vout${ABC[C:C+8]}${ABC[C:C+8]}, 0);
    output += ${CHANNEL_TILE};

    channels -= ${CHANNEL_TILE};
  }
  // Last-pass remainder: handle up to CHANNEL_TILE-1 leftover channels in
  // groups of 8, then scatter the final 1-7 bytes by 4/2/1-byte stores.
  if XNN_UNLIKELY(channels != 0) {
    ${"do " if CHANNEL_TILE > 8 else ""}{
      $for M in range(ROW_SUBTILE):
        const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
        i${M} += 8;

      $for A in range(ACCUMULATORS):
        v128_t vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vxi${A*2}x${ABC[0:8]}, vxi${A*2+1}x${ABC[0:8]});

      $for M in range(ACCUMULATORS * 2, ROW_SUBTILE):
        vacc${M % ACCUMULATORS}x${ABC[0:8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vxi${M}x${ABC[0:8]});

      $if ACCUMULATORS > 1:
        // Add up all accumulators to vacc0x${ABC[0:8]}
        $ACC_SLICE = 1
        $while ACC_SLICE < ACCUMULATORS:
          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
            $if A + ACC_SLICE < ACCUMULATORS:
              vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
          $ACC_SLICE *= 2

      const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(wasm_i32x4_widen_low_i16x8(vacc0x${ABC[0:8]}), wasm_v128_load(buffer));
      const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(wasm_i32x4_widen_high_i16x8(vacc0x${ABC[0:8]}), wasm_v128_load(buffer + 4));
      buffer += 8;

      const v128_t vabsacc${ABC[0:4]} = wasm_i32x4_abs(vacc${ABC[0:4]});
      const v128_t vabsacc${ABC[4:8]} = wasm_i32x4_abs(vacc${ABC[4:8]});

      const v128_t vsgnacc${ABC[0:4]} = wasm_i32x4_gt(vabsacc${ABC[0:4]}, vacc${ABC[0:4]});
      const v128_t vsgnacc${ABC[4:8]} = wasm_i32x4_gt(vabsacc${ABC[4:8]}, vacc${ABC[4:8]});

      const v128_t vabsacc${ABC[0:2]} = wasm_v32x4_shuffle(vabsacc${ABC[0:4]}, vzero, 0, 4, 1, 5);
      const v128_t vabsacc${ABC[2:4]} = wasm_v32x4_shuffle(vabsacc${ABC[0:4]}, vzero, 2, 6, 3, 7);
      const v128_t vabsacc${ABC[4:6]} = wasm_v32x4_shuffle(vabsacc${ABC[4:8]}, vzero, 0, 4, 1, 5);
      const v128_t vabsacc${ABC[6:8]} = wasm_v32x4_shuffle(vabsacc${ABC[4:8]}, vzero, 2, 6, 3, 7);

      const v128_t vabsprod${ABC[0:2]} = wasm_i64x2_mul(vabsacc${ABC[0:2]}, vmultiplier);
      const v128_t vabsprod${ABC[2:4]} = wasm_i64x2_mul(vabsacc${ABC[2:4]}, vmultiplier);
      const v128_t vabsprod${ABC[4:6]} = wasm_i64x2_mul(vabsacc${ABC[4:6]}, vmultiplier);
      const v128_t vabsprod${ABC[6:8]} = wasm_i64x2_mul(vabsacc${ABC[6:8]}, vmultiplier);

      const v128_t vabsout${ABC[0:2]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[0:2]}, vrounding), vshift);
      const v128_t vabsout${ABC[2:4]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[2:4]}, vrounding), vshift);
      const v128_t vabsout${ABC[4:6]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[4:6]}, vrounding), vshift);
      const v128_t vabsout${ABC[6:8]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[6:8]}, vrounding), vshift);

      const v128_t vabsout${ABC[0:4]} = wasm_v32x4_shuffle(vabsout${ABC[0:2]}, vabsout${ABC[2:4]}, 0, 2, 4, 6);
      const v128_t vabsout${ABC[4:8]} = wasm_v32x4_shuffle(vabsout${ABC[4:6]}, vabsout${ABC[6:8]}, 0, 2, 4, 6);

      const v128_t vout${ABC[0:4]} = wasm_i32x4_sub(wasm_v128_xor(vabsout${ABC[0:4]}, vsgnacc${ABC[0:4]}), vsgnacc${ABC[0:4]});
      const v128_t vout${ABC[4:8]} = wasm_i32x4_sub(wasm_v128_xor(vabsout${ABC[4:8]}, vsgnacc${ABC[4:8]}), vsgnacc${ABC[4:8]});

      const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
      const v128_t vout${ABC[0:8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vout${ABC[0:4]}, vout${ABC[4:8]}), voutput_zero_point);

      const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
      const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
      v128_t vout${ABC[0:8]}${ABC[0:8]} = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout${ABC[0:8]}, vout${ABC[0:8]}), voutput_min), voutput_max);

      $if CHANNEL_TILE > 8:
        if XNN_LIKELY(channels >= 8) {
          *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
          output += 8;
          channels -= 8;
        } else {
          if (channels & 4) {
            *((float*) output) = wasm_f32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
            vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32);
            output += 4;
          }
          if (channels & 2) {
            *((uint16_t*) output) = (uint16_t) wasm_i16x8_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
            vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16);
            output += 2;
          }
          if (channels & 1) {
            *output = (int8_t) wasm_i8x16_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
            output += 1;
          }
          channels = 0;
        }
      $else:
        if (channels & 4) {
          *((float*) output) = wasm_f32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
          vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32);
          output += 4;
        }
        if (channels & 2) {
          *((uint16_t*) output) = (uint16_t) wasm_i16x8_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
          vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16);
          output += 2;
        }
        if (channels & 1) {
          *output = (int8_t) wasm_i8x16_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
        }
    }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
  }
}