1// Copyright 2020 Google LLC 2// 3// This source code is licensed under the BSD-style license found in the 4// LICENSE file in the root directory of this source tree. 5 6$assert DATATYPE in ["QS8", "QU8"] 7$assert CHANNEL_TILE % 8 == 0 8$assert CHANNEL_TILE >= 8 9$assert ROW_TILE >= 3 10$assert ROW_SUBTILE >= 3 11$assert ROW_SUBTILE <= ROW_TILE 12$assert REQUANTIZATION == "FP32" 13$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" 14#include <assert.h> 15 16#include <wasm_simd128.h> 17 18#include <xnnpack/gavgpool.h> 19#include <xnnpack/math.h> 20 21 22$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE] 23$WASM_X16X8_LOAD8X8 = {"QS8": "wasm_i16x8_load8x8", "QU8": "wasm_u16x8_load8x8"}[DATATYPE] 24$WASM_X32X4_EXTEND_LOW_X16X8 = {"QS8": "wasm_i32x4_extend_low_i16x8", "QU8": "wasm_u32x4_extend_low_u16x8"}[DATATYPE] 25$WASM_X32X4_EXTEND_HIGH_X16X8 = {"QS8": "wasm_i32x4_extend_high_i16x8", "QU8": "wasm_u32x4_extend_high_u16x8"}[DATATYPE] 26$WASM_X8X16_NARROW_I16X8 = {"QS8": "wasm_i8x16_narrow_i16x8", "QU8": "wasm_u8x16_narrow_i16x8"}[DATATYPE] 27$WASM_X8X16_MIN = {"QS8": "wasm_i8x16_min", "QU8": "wasm_u8x16_min"}[DATATYPE] 28void xnn_${DATATYPE.lower()}_gavgpool_minmax_fp32_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__wasmsimd_c${CHANNEL_TILE}( 29 size_t rows, 30 size_t channels, 31 const ${XINT8_T}* input, 32 size_t input_stride, 33 const ${XINT8_T}* zero, 34 int32_t* buffer, 35 ${XINT8_T}* output, 36 const union xnn_${DATATYPE.lower()}_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS 37{ 38 assert(rows > ${ROW_TILE}); 39 assert(channels != 0); 40 41 const ${XINT8_T}* i0 = input; 42 $for M in range(1, ROW_TILE): 43 const ${XINT8_T}* i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M-1} + input_stride); 44 $if CHANNEL_TILE <= 16: 45 const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE}) * sizeof(${XINT8_T}); 46 $else: 47 const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8) * sizeof(${XINT8_T}); 48 49 const v128_t vinit_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.init_bias); 50 int32_t* b = buffer; 51 size_t c = channels; 52 for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { 53 $for M in range(2): 54 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 55 $for C in range(8, CHANNEL_TILE, 8): 56 const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); 57 i${M} += ${CHANNEL_TILE}; 58 59 v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 60 const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); 61 $for C in range(8, CHANNEL_TILE, 8): 62 v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 63 const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C}); 64 i2 += ${CHANNEL_TILE}; 65 66 $for M in range(3, ROW_TILE): 67 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 68 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 69 $for C in range(8, CHANNEL_TILE, 8): 70 vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); 71 const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); 72 i${M} += ${CHANNEL_TILE}; 73 74 $for C in range(0, CHANNEL_TILE, 8): 75 vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_TILE-1}x${ABC[C:C+8]}); 76 77 $for C in range(0, CHANNEL_TILE, 8): 78 const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]})); 79 const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]})); 80 81 wasm_v128_store(b, vacc${ABC[0:4]}); 82 $for C in range(4, CHANNEL_TILE, 4): 83 wasm_v128_store(b + ${C}, vacc${ABC[C:C+4]}); 84 b += ${CHANNEL_TILE}; 85 } 86 $if CHANNEL_TILE > 16: 87 if XNN_UNLIKELY(c != 0) { 88 do { 89 $for M in range(2): 90 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 91 i${M} += 8; 92 93 v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 94 const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); 95 i2 += 8; 96 97 $for M in range(3, ROW_TILE): 98 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 99 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 100 i${M} += 8; 101 102 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); 103 104 const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]})); 105 const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vinit_bias, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]})); 106 107 wasm_v128_store(b, vacc${ABC[0:4]}); 108 wasm_v128_store(b + 4, vacc${ABC[4:8]}); 109 b += 8; 110 111 c = doz(c, 8); 112 } while (c != 0); 113 } 114 115 for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) { 116 $for M in range(ROW_SUBTILE): 117 i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); 118 119 int32_t* b = buffer; 120 size_t c = channels; 121 for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) { 122 $for M in range(2): 123 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 124 $for C in range(8, CHANNEL_TILE, 8): 125 const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); 126 i${M} += ${CHANNEL_TILE}; 127 128 v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 129 const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); 130 $for C in range(8, CHANNEL_TILE, 8): 131 v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 132 const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C}); 133 i2 += ${CHANNEL_TILE}; 134 135 $for M in range(3, ROW_SUBTILE): 136 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 137 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 138 $for C in range(8, CHANNEL_TILE, 8): 139 vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); 140 const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); 141 i${M} += ${CHANNEL_TILE}; 142 143 $for C in range(0, CHANNEL_TILE, 8): 144 vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]}); 145 146 v128_t vacc${ABC[0:4]} = wasm_v128_load(b); 147 $for C in range(4, CHANNEL_TILE, 4): 148 v128_t vacc${ABC[C:C+4]} = wasm_v128_load(b + ${C}); 149 150 $for C in range(0, CHANNEL_TILE, 8): 151 vacc${ABC[C:C+4]} = wasm_i32x4_add(vacc${ABC[C:C+4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]})); 152 vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vacc${ABC[C+4:C+8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]})); 153 154 wasm_v128_store(b, vacc${ABC[0:4]}); 155 $for C in range(4, CHANNEL_TILE, 4): 156 wasm_v128_store(b + ${C}, vacc${ABC[C:C+4]}); 157 b += ${CHANNEL_TILE}; 158 } 159 $if CHANNEL_TILE > 16: 160 if XNN_UNLIKELY(c != 0) { 161 do { 162 $for M in range(2): 163 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 164 i${M} += 8; 165 166 v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 167 const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); 168 i2 += 8; 169 170 $for M in range(3, ROW_SUBTILE): 171 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 172 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 173 i${M} += 8; 174 175 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); 176 177 v128_t vacc${ABC[0:4]} = wasm_v128_load(b); 178 v128_t vacc${ABC[4:8]} = wasm_v128_load(b + 4); 179 180 vacc${ABC[0:4]} = wasm_i32x4_add(vacc${ABC[0:4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]})); 181 vacc${ABC[4:8]} = wasm_i32x4_add(vacc${ABC[4:8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]})); 182 183 wasm_v128_store(b, vacc${ABC[0:4]}); 184 wasm_v128_store(b + 4, vacc${ABC[4:8]}); 185 b += 8; 186 187 c = doz(c, 8); 188 } while (c != 0); 189 } 190 } 191 192 i0 = (const ${XINT8_T}*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment); 193 $for M in range(1, ROW_SUBTILE): 194 i${M} = (const ${XINT8_T}*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment); 195 $if M % 2 == 1: 196 if XNN_UNPREDICTABLE(rows < ${M+1}) { 197 i${M} = zero; 198 } 199 $else: 200 if XNN_UNPREDICTABLE(rows <= ${M}) { 201 i${M} = zero; 202 } 203 204 const v128_t vscale = wasm_v128_load64_splat(params->fp32_wasmsimd.scale); 205 const v128_t vmagic_bias = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias); 206 const v128_t vmagic_min = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_min); 207 const v128_t vmagic_bias_less_output_zero_point = wasm_v128_load64_splat(params->fp32_wasmsimd.magic_bias_less_output_zero_point); 208 const v128_t voutput_max = wasm_v128_load64_splat(params->fp32_wasmsimd.output_max); 209 for (; channels >= ${CHANNEL_TILE}; channels -= ${CHANNEL_TILE}) { 210 $for M in range(2): 211 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 212 $for C in range(8, CHANNEL_TILE, 8): 213 const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); 214 i${M} += ${CHANNEL_TILE}; 215 216 v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 217 const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); 218 $for C in range(8, CHANNEL_TILE, 8): 219 v128_t vacc${ABC[C:C+8]} = wasm_i16x8_add(vxi0x${ABC[C:C+8]}, vxi1x${ABC[C:C+8]}); 220 const v128_t vxi2x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i2 + ${C}); 221 i2 += ${CHANNEL_TILE}; 222 223 $for M in range(3, ROW_SUBTILE): 224 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 225 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 226 $for C in range(8, CHANNEL_TILE, 8): 227 vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${M-1}x${ABC[C:C+8]}); 228 const v128_t vxi${M}x${ABC[C:C+8]} = ${WASM_X16X8_LOAD8X8}(i${M} + ${C}); 229 i${M} += ${CHANNEL_TILE}; 230 231 $for C in range(0, CHANNEL_TILE, 8): 232 vacc${ABC[C:C+8]} = wasm_i16x8_add(vacc${ABC[C:C+8]}, vxi${ROW_SUBTILE-1}x${ABC[C:C+8]}); 233 234 v128_t vacc${ABC[0:4]} = wasm_v128_load(buffer); 235 $for C in range(4, CHANNEL_TILE, 4): 236 v128_t vacc${ABC[C:C+4]} = wasm_v128_load(buffer + ${C}); 237 buffer += ${CHANNEL_TILE}; 238 239 $for C in range(0, CHANNEL_TILE, 8): 240 vacc${ABC[C:C+4]} = wasm_i32x4_add(vacc${ABC[C:C+4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[C:C+8]})); 241 vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vacc${ABC[C+4:C+8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[C:C+8]})); 242 243 $for C in range(0, CHANNEL_TILE, 4): 244 vacc${ABC[C:C+4]} = wasm_f32x4_convert_i32x4(vacc${ABC[C:C+4]}); 245 246 $for C in range(0, CHANNEL_TILE, 4): 247 vacc${ABC[C:C+4]} = wasm_f32x4_mul(vacc${ABC[C:C+4]}, vscale); 248 249 $for C in range(0, CHANNEL_TILE, 4): 250 vacc${ABC[C:C+4]} = wasm_f32x4_add(vacc${ABC[C:C+4]}, vmagic_bias); 251 252 $for C in range(0, CHANNEL_TILE, 4): 253 vacc${ABC[C:C+4]} = wasm_i32x4_max(vacc${ABC[C:C+4]}, vmagic_min); 254 255 $for C in range(0, CHANNEL_TILE, 4): 256 vacc${ABC[C:C+4]} = wasm_i32x4_sub(vacc${ABC[C:C+4]}, vmagic_bias_less_output_zero_point); 257 258 $for C in range(0, CHANNEL_TILE, 8): 259 v128_t vout${ABC[C:C+8]} = wasm_i16x8_narrow_i32x4(vacc${ABC[C:C+4]}, vacc${ABC[C+4:C+8]}); 260 261 $for C in range(0, CHANNEL_TILE, 16): 262 $if C + 8 < CHANNEL_TILE: 263 v128_t vout${ABC[C:C+16]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}); 264 $else: 265 v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}); 266 267 $for C in range(0, CHANNEL_TILE, 16): 268 $if C + 8 < CHANNEL_TILE: 269 vout${ABC[C:C+16]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+16]}, voutput_max); 270 $else: 271 vout${ABC[C:C+8]}${ABC[C:C+8]} = ${WASM_X8X16_MIN}(vout${ABC[C:C+8]}${ABC[C:C+8]}, voutput_max); 272 273 $if CHANNEL_TILE > 8: 274 wasm_v128_store(output, vout${ABC[0:16]}); 275 $else: 276 *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0); 277 $for C in range(16, CHANNEL_TILE, 16): 278 $if C + 8 < CHANNEL_TILE: 279 wasm_v128_store(output + ${C}, vout${ABC[C:C+16]}); 280 $else: 281 *((double*) (output + ${C})) = wasm_f64x2_extract_lane(vout${ABC[C:C+8]}${ABC[C:C+8]}, 0); 282 output += ${CHANNEL_TILE}; 283 } 284 if XNN_UNLIKELY(channels != 0) { 285 ${"do " if CHANNEL_TILE > 8 else ""}{ 286 $for M in range(2): 287 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 288 i${M} += 8; 289 290 v128_t vacc${ABC[0:8]} = wasm_i16x8_add(vxi0x${ABC[0:8]}, vxi1x${ABC[0:8]}); 291 const v128_t vxi2x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i2); 292 i2 += 8; 293 294 $for M in range(3, ROW_SUBTILE): 295 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${M-1}x${ABC[0:8]}); 296 const v128_t vxi${M}x${ABC[0:8]} = ${WASM_X16X8_LOAD8X8}(i${M}); 297 i${M} += 8; 298 299 vacc${ABC[0:8]} = wasm_i16x8_add(vacc${ABC[0:8]}, vxi${ROW_SUBTILE-1}x${ABC[0:8]}); 300 301 v128_t vacc${ABC[0:4]} = wasm_v128_load(buffer); 302 v128_t vacc${ABC[4:8]} = wasm_v128_load(buffer + 4); 303 buffer += 8; 304 305 vacc${ABC[0:4]} = wasm_i32x4_add(vacc${ABC[0:4]}, ${WASM_X32X4_EXTEND_LOW_X16X8}(vacc${ABC[0:8]})); 306 vacc${ABC[4:8]} = wasm_i32x4_add(vacc${ABC[4:8]}, ${WASM_X32X4_EXTEND_HIGH_X16X8}(vacc${ABC[0:8]})); 307 308 vacc${ABC[0:4]} = wasm_f32x4_convert_i32x4(vacc${ABC[0:4]}); 309 vacc${ABC[4:8]} = wasm_f32x4_convert_i32x4(vacc${ABC[4:8]}); 310 311 vacc${ABC[0:4]} = wasm_f32x4_mul(vacc${ABC[0:4]}, vscale); 312 vacc${ABC[4:8]} = wasm_f32x4_mul(vacc${ABC[4:8]}, vscale); 313 314 vacc${ABC[0:4]} = wasm_f32x4_add(vacc${ABC[0:4]}, vmagic_bias); 315 vacc${ABC[4:8]} = wasm_f32x4_add(vacc${ABC[4:8]}, vmagic_bias); 316 317 vacc${ABC[0:4]} = wasm_i32x4_max(vacc${ABC[0:4]}, vmagic_min); 318 vacc${ABC[4:8]} = wasm_i32x4_max(vacc${ABC[4:8]}, vmagic_min); 319 320 vacc${ABC[0:4]} = wasm_i32x4_sub(vacc${ABC[0:4]}, vmagic_bias_less_output_zero_point); 321 vacc${ABC[4:8]} = wasm_i32x4_sub(vacc${ABC[4:8]}, vmagic_bias_less_output_zero_point); 322 323 const v128_t vout${ABC[0:8]} = wasm_i16x8_narrow_i32x4(vacc${ABC[0:4]}, vacc${ABC[4:8]}); 324 v128_t vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_NARROW_I16X8}(vout${ABC[0:8]}, vout${ABC[0:8]}); 325 vout${ABC[0:8]}${ABC[0:8]} = ${WASM_X8X16_MIN}(vout${ABC[0:8]}${ABC[0:8]}, voutput_max); 326 327 $if CHANNEL_TILE > 8: 328 if XNN_LIKELY(channels >= 8) { 329 *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0); 330 output += 8; 331 channels -= 8; 332 } else { 333 if (channels & 4) { 334 *((float*) output) = wasm_f32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0); 335 vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32); 336 output += 4; 337 } 338 uint32_t vout${ABC[0:4]} = wasm_i32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0); 339 if (channels & 2) { 340 *((uint16_t*) output) = (uint16_t) vout${ABC[0:4]}; 341 vout${ABC[0:4]} >>= 16; 342 output += 2; 343 } 344 if (channels & 1) { 345 *output = (${XINT8_T}) vout${ABC[0:4]}; 346 output += 1; 347 } 348 channels = 0; 349 } 350 $else: 351 if (channels & 4) { 352 *((float*) output) = wasm_f32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0); 353 vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32); 354 output += 4; 355 } 356 uint32_t vout${ABC[0:4]} = wasm_i32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0); 357 if (channels & 2) { 358 *((uint16_t*) output) = (uint16_t) vout${ABC[0:4]}; 359 vout${ABC[0:4]} >>= 16; 360 output += 2; 361 } 362 if (channels & 1) { 363 *output = (${XINT8_T}) vout${ABC[0:4]}; 364 } 365 }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""} 366 } 367} 368