• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert CHANNEL_TILE % 8 == 0
7$assert CHANNEL_TILE >= 8
8$assert ROW_TILE >= 2
9$assert ROW_SUBTILE >= 2
10$assert ROW_SUBTILE <= ROW_TILE
11$assert ACCUMULATORS >= 1
12$assert ROW_TILE >= ACCUMULATORS * 2
13$assert ROW_SUBTILE >= ACCUMULATORS * 2
14$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
15#include <assert.h>
16
17#include <wasm_simd128.h>
18
19#include <xnnpack/gavgpool.h>
20#include <xnnpack/math.h>
21
22
// Template for a multipass QS8 "gavgpool" min/max microkernel using WAsm SIMD.
//
// The generated function sums `rows` rows of int8 input per channel and writes
// one requantized int8 value per channel to `output`. It runs in three stages:
//   1. First pass: sum ROW_TILE rows, add the bias, store 32-bit sums to `buffer`.
//   2. Intermediate passes: while more than ROW_SUBTILE rows remain, sum
//      ROW_SUBTILE rows and add them to the sums held in `buffer`.
//   3. Final pass: sum the remaining rows (at most ROW_SUBTILE), add the sums
//      from `buffer`, scale by multiplier/rounding/shift, add the output zero
//      point, clamp to [output_min, output_max], and store int8 results.
//
// Parameters:
//   rows         - number of input rows; must be greater than ROW_TILE (asserted).
//   channels     - number of channels; must be non-zero (asserted).
//   input        - pointer to the first row.
//   input_stride - distance in bytes between consecutive rows.
//   zero         - pointer used in place of rows that do not exist in the final
//                  pass (presumably a zero-filled vector; confirm against caller).
//   buffer       - int32 scratch holding per-channel running sums between passes.
//                  Sums are stored in whole groups of lanes, so it is written past
//                  `channels` up to the rounded-up channel count.
//   output       - destination for the int8 results.
//   params       - reads wasmsimd.bias, multiplier, rounding, shift,
//                  output_zero_point, output_min and output_max.
//
// Row sums are accumulated in 16-bit lanes before being widened to 32 bits.
// Input is always loaded in groups of 8 channels, so loads may touch bytes past
// `channels` in each row; only the final stores are limited to `channels`.
//
// NOTE(review): the row-pointer rewind (input_increment combined with the
// i[M + ROW_TILE - ROW_SUBTILE] indexing) appears to land on the correct rows
// only when ROW_TILE == ROW_SUBTILE; confirm before generating other shapes.
23void xnn_qs8_gavgpool_minmax_ukernel_${ROW_TILE}p${ROW_SUBTILE}x__wasmsimd_c${CHANNEL_TILE}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
24    size_t rows,
25    size_t channels,
26    const int8_t* input,
27    size_t input_stride,
28    const int8_t* zero,
29    int32_t* buffer,
30    int8_t* output,
31    const union xnn_qs8_avgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
32{
33  assert(rows > ${ROW_TILE});
34  assert(channels != 0);
35
  // One pointer per row of the first pass; consecutive rows are input_stride bytes apart.
36  const int8_t* i0 = input;
37  $for M in range(1, ROW_TILE):
38    const int8_t* i${M} = (const int8_t*) ((uintptr_t) i${M-1} + input_stride);
  // Each pass advances every row pointer by the channel count rounded up to the
  // loop granularity; input_increment undoes that advance and steps ROW_TILE rows ahead.
39  $if CHANNEL_TILE <= 16:
40    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, ${CHANNEL_TILE});
41  $else:
42    const size_t input_increment = ${ROW_TILE} * input_stride - round_up_po2(channels, 8);
43
  // First pass: sum ROW_TILE rows, add the bias, and initialize buffer with the result.
44  const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
45  int32_t* b = buffer;
46  size_t c = channels;
47  for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
48    $for M in range(ROW_TILE):
49      const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
50      $for C in range(8, CHANNEL_TILE, 8):
51        const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C});
52      i${M} += ${CHANNEL_TILE};
53
54    $for A in range(ACCUMULATORS):
55      $for C in range(0, CHANNEL_TILE, 8):
56        v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});
57
58    $for M in range(ACCUMULATORS * 2, ROW_TILE):
59      $for C in range(0, CHANNEL_TILE, 8):
60        vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${M}x${ABC[C:C+8]});
61
62    $if ACCUMULATORS > 1:
63      // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
64      $ACC_SLICE = 1
65      $while ACC_SLICE < ACCUMULATORS:
66        $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
67          $if A + ACC_SLICE < ACCUMULATORS:
68            $for C in range(0, CHANNEL_TILE, 8):
69              vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
70        $ACC_SLICE *= 2
71
72    $for C in range(0, CHANNEL_TILE, 8):
73      const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+8]}));
74      const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C:C+8]}));
75
76    wasm_v128_store(b, vacc${ABC[0:4]});
77    $for C in range(4, CHANNEL_TILE, 4):
78      wasm_v128_store(b + ${C}, vacc${ABC[C:C+4]});
79    b += ${CHANNEL_TILE};
80  }
81  $if CHANNEL_TILE > 16:
    // Channels left over from the full-tile loop of the first pass, 8 at a time.
82    if XNN_UNLIKELY(c != 0) {
83      do {
84        $for M in range(ROW_TILE):
85          const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
86          i${M} += 8;
87
88        $for A in range(ACCUMULATORS):
89          v128_t vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vxi${A*2}x${ABC[0:8]}, vxi${A*2+1}x${ABC[0:8]});
90
91        $for M in range(ACCUMULATORS * 2, ROW_TILE):
92          vacc${M % ACCUMULATORS}x${ABC[0:8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vxi${M}x${ABC[0:8]});
93
94        $if ACCUMULATORS > 1:
95          // Add up all accumulators to vacc0x${ABC[0:8]}
96          $ACC_SLICE = 1
97          $while ACC_SLICE < ACCUMULATORS:
98            $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
99              $if A + ACC_SLICE < ACCUMULATORS:
100                vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
101            $ACC_SLICE *= 2
102
103        const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_low_i16x8(vacc0x${ABC[0:8]}));
104        const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(vbias, wasm_i32x4_widen_high_i16x8(vacc0x${ABC[0:8]}));
105
106        wasm_v128_store(b, vacc${ABC[0:4]});
107        wasm_v128_store(b + 4, vacc${ABC[4:8]});
108        b += 8;
109
110        c = doz(c, 8);
111      } while (c != 0);
112    }
113
  // Intermediate passes: while more than ROW_SUBTILE rows remain, sum the next
  // ROW_SUBTILE rows and add them to the running sums already stored in buffer.
114  for (rows -= ${ROW_TILE}; rows > ${ROW_SUBTILE}; rows -= ${ROW_SUBTILE}) {
115    $for M in range(ROW_SUBTILE):
116      i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
117
118    int32_t* b = buffer;
119    size_t c = channels;
120    for (; ${"c >= %d" % CHANNEL_TILE if CHANNEL_TILE > 16 else "c != 0"}; ${("c -= %d" if CHANNEL_TILE > 16 else "c = doz(c, %d)") % CHANNEL_TILE}) {
121      $for M in range(ROW_SUBTILE):
122        const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
123        $for C in range(8, CHANNEL_TILE, 8):
124          const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C});
125        i${M} += ${CHANNEL_TILE};
126
127      $for A in range(ACCUMULATORS):
128        $for C in range(0, CHANNEL_TILE, 8):
129          v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});
130
131      $for M in range(ACCUMULATORS * 2, ROW_SUBTILE):
132        $for C in range(0, CHANNEL_TILE, 8):
133          vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${M}x${ABC[C:C+8]});
134
135      $if ACCUMULATORS > 1:
136        // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
137        $ACC_SLICE = 1
138        $while ACC_SLICE < ACCUMULATORS:
139          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
140            $if A + ACC_SLICE < ACCUMULATORS:
141              $for C in range(0, CHANNEL_TILE, 8):
142                vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
143          $ACC_SLICE *= 2
144
145      $for C in range(0, CHANNEL_TILE, 8):
146        const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+8]}), wasm_v128_load(b + ${C}));
147        const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C:C+8]}), wasm_v128_load(b + ${C+4}));
148
149      wasm_v128_store(b, vacc${ABC[0:4]});
150      $for C in range(4, CHANNEL_TILE, 4):
151        wasm_v128_store(b + ${C}, vacc${ABC[C:C+4]});
152      b += ${CHANNEL_TILE};
153    }
154    $if CHANNEL_TILE > 16:
      // Channels left over from the full-tile loop of this pass, 8 at a time.
155      if XNN_UNLIKELY(c != 0) {
156        do {
157          $for M in range(ROW_SUBTILE):
158            const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
159            i${M} += 8;
160
161          $for A in range(ACCUMULATORS):
162            v128_t vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vxi${A*2}x${ABC[0:8]}, vxi${A*2+1}x${ABC[0:8]});
163
164          $for M in range(ACCUMULATORS * 2, ROW_SUBTILE):
165            vacc${M % ACCUMULATORS}x${ABC[0:8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vxi${M}x${ABC[0:8]});
166
167          $if ACCUMULATORS > 1:
168            // Add up all accumulators to vacc0x${ABC[0:8]}
169            $ACC_SLICE = 1
170            $while ACC_SLICE < ACCUMULATORS:
171              $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
172                $if A + ACC_SLICE < ACCUMULATORS:
173                  vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
174              $ACC_SLICE *= 2
175
176          const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(wasm_i32x4_widen_low_i16x8(vacc0x${ABC[0:8]}), wasm_v128_load(b));
177          const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(wasm_i32x4_widen_high_i16x8(vacc0x${ABC[0:8]}), wasm_v128_load(b + 4));
178
179          wasm_v128_store(b, vacc${ABC[0:4]});
180          wasm_v128_store(b + 4, vacc${ABC[4:8]});
181          b += 8;
182
183          c = doz(c, 8);
184        } while (c != 0);
185      }
186  }
187
  // Final pass: between 1 and ROW_SUBTILE rows remain. Row pointer M is redirected
  // to `zero` whenever rows <= M, i.e. when that row does not exist.
188  i0 = (const int8_t*) ((uintptr_t) i${ROW_TILE - ROW_SUBTILE} + input_increment);
189  $for M in range(1, ROW_SUBTILE):
190    i${M} = (const int8_t*) ((uintptr_t) i${M + ROW_TILE - ROW_SUBTILE} + input_increment);
191    $if M % 2 == 1:
192      if XNN_UNPREDICTABLE(rows < ${M+1}) {
193        i${M} = zero;
194      }
195    $else:
196      if XNN_UNPREDICTABLE(rows <= ${M}) {
197        i${M} = zero;
198      }
199
  // Requantization parameters; vzero supplies the upper halves when magnitudes
  // are spread into 64-bit lanes below.
200  const v128_t vmultiplier = wasm_v128_load(params->wasmsimd.multiplier);
201  const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
202  const int32_t vshift = params->wasmsimd.shift;
203  const v128_t vzero = wasm_f64x2_splat(0.0);
  // Full tiles of CHANNEL_TILE channels.
204  while (channels >= ${CHANNEL_TILE}) {
205    $for M in range(ROW_SUBTILE):
206      const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
207      $for C in range(8, CHANNEL_TILE, 8):
208        const v128_t vxi${M}x${ABC[C:C+8]} = wasm_i16x8_load_8x8(i${M} + ${C});
209      i${M} += ${CHANNEL_TILE};
210
211    $for A in range(ACCUMULATORS):
212      $for C in range(0, CHANNEL_TILE, 8):
213        v128_t vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vxi${A*2}x${ABC[C:C+8]}, vxi${A*2+1}x${ABC[C:C+8]});
214
215    $for M in range(ACCUMULATORS * 2, ROW_SUBTILE):
216      $for C in range(0, CHANNEL_TILE, 8):
217        vacc${M % ACCUMULATORS}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[C:C+8]}, vxi${M}x${ABC[C:C+8]});
218
219    $if ACCUMULATORS > 1:
220      // Add up all accumulators to vacc0x${ABC[0:CHANNEL_TILE]}
221      $ACC_SLICE = 1
222      $while ACC_SLICE < ACCUMULATORS:
223        $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
224          $if A + ACC_SLICE < ACCUMULATORS:
225            $for C in range(0, CHANNEL_TILE, 8):
226              vacc${A}x${ABC[C:C+8]} = wasm_i16x8_add(vacc${A}x${ABC[C:C+8]}, vacc${A + ACC_SLICE}x${ABC[C:C+8]});
227        $ACC_SLICE *= 2
228
    // Add the sums of the last rows to the totals accumulated in buffer.
229    $for C in range(0, CHANNEL_TILE, 8):
230      const v128_t vacc${ABC[C:C+4]} = wasm_i32x4_add(wasm_i32x4_widen_low_i16x8(vacc0x${ABC[C:C+8]}), wasm_v128_load(buffer + ${C}));
231      const v128_t vacc${ABC[C+4:C+8]} = wasm_i32x4_add(wasm_i32x4_widen_high_i16x8(vacc0x${ABC[C:C+8]}), wasm_v128_load(buffer + ${C+4}));
232    buffer += ${CHANNEL_TILE};
233
    // Split each total into its magnitude and a sign mask (all-ones lanes where
    // the magnitude exceeds the total, i.e. the total is negative).
234    $for C in range(0, CHANNEL_TILE, 4):
235      const v128_t vabsacc${ABC[C:C+4]} = wasm_i32x4_abs(vacc${ABC[C:C+4]});
236
237    $for C in range(0, CHANNEL_TILE, 4):
238      const v128_t vsgnacc${ABC[C:C+4]} = wasm_i32x4_gt(vabsacc${ABC[C:C+4]}, vacc${ABC[C:C+4]});
239
    // Interleave magnitudes with zero so each one fills the low half of a 64-bit lane.
240    $for C in range(0, CHANNEL_TILE, 4):
241      const v128_t vabsacc${ABC[C:C+2]} = wasm_v32x4_shuffle(vabsacc${ABC[C:C+4]}, vzero, 0, 4, 1, 5);
242      const v128_t vabsacc${ABC[C+2:C+4]} = wasm_v32x4_shuffle(vabsacc${ABC[C:C+4]}, vzero, 2, 6, 3, 7);
243
    // Scale in 64 bits: multiply by the multiplier, add the rounding term, then
    // shift right (unsigned) by vshift.
244    $for C in range(0, CHANNEL_TILE, 2):
245      const v128_t vabsprod${ABC[C:C+2]} = wasm_i64x2_mul(vabsacc${ABC[C:C+2]}, vmultiplier);
246
247    $for C in range(0, CHANNEL_TILE, 2):
248      const v128_t vabsout${ABC[C:C+2]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[C:C+2]}, vrounding), vshift);
249
    // Gather the low 32 bits of each 64-bit result back into 32-bit lanes.
250    $for C in range(0, CHANNEL_TILE, 4):
251      const v128_t vabsout${ABC[C:C+4]} = wasm_v32x4_shuffle(vabsout${ABC[C:C+2]}, vabsout${ABC[C+2:C+4]}, 0, 2, 4, 6);
252
    // Re-apply the sign: (x ^ mask) - mask negates the lanes whose mask is all-ones.
253    $for C in range(0, CHANNEL_TILE, 4):
254      const v128_t vout${ABC[C:C+4]} = wasm_i32x4_sub(wasm_v128_xor(vabsout${ABC[C:C+4]}, vsgnacc${ABC[C:C+4]}), vsgnacc${ABC[C:C+4]});
255
    // Narrow to 16 bits and add the output zero point with saturation, then
    // narrow to 8 bits and clamp to [output_min, output_max].
256    const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
257    $for C in range(0, CHANNEL_TILE, 8):
258      v128_t vout${ABC[C:C+8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vout${ABC[C:C+4]}, vout${ABC[C+4:C+8]}), voutput_zero_point);
259
260    const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
261    const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
262    $for C in range(0, CHANNEL_TILE, 16):
263      $if C + 8 < CHANNEL_TILE:
264        v128_t vout${ABC[C:C+16]} = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C+8:C+16]}), voutput_min), voutput_max);
265      $else:
266        v128_t vout${ABC[C:C+8]}${ABC[C:C+8]} = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout${ABC[C:C+8]}, vout${ABC[C:C+8]}), voutput_min), voutput_max);
267
    // Store 16 results per full vector; a trailing group of 8 is written as one
    // 64-bit lane.
268    $if CHANNEL_TILE > 8:
269      wasm_v128_store(output, vout${ABC[0:16]});
270    $else:
271      *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
272    $for C in range(16, CHANNEL_TILE, 16):
273      $if C + 8 < CHANNEL_TILE:
274        wasm_v128_store(output + ${C}, vout${ABC[C:C+16]});
275      $else:
276        *((double*) (output + ${C})) = wasm_f64x2_extract_lane(vout${ABC[C:C+8]}${ABC[C:C+8]}, 0);
277    output += ${CHANNEL_TILE};
278
279    channels -= ${CHANNEL_TILE};
280  }
  // Remaining channels (fewer than CHANNEL_TILE): same computation as above on
  // one group of 8 lanes at a time, with a partial store for the last group.
281  if XNN_UNLIKELY(channels != 0) {
282    ${"do " if CHANNEL_TILE > 8 else ""}{
283      $for M in range(ROW_SUBTILE):
284        const v128_t vxi${M}x${ABC[0:8]} = wasm_i16x8_load_8x8(i${M});
285        i${M} += 8;
286
287      $for A in range(ACCUMULATORS):
288        v128_t vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vxi${A*2}x${ABC[0:8]}, vxi${A*2+1}x${ABC[0:8]});
289
290      $for M in range(ACCUMULATORS * 2, ROW_SUBTILE):
291        vacc${M % ACCUMULATORS}x${ABC[0:8]} = wasm_i16x8_add(vacc${M % ACCUMULATORS}x${ABC[0:8]}, vxi${M}x${ABC[0:8]});
292
293      $if ACCUMULATORS > 1:
294        // Add up all accumulators to vacc0x${ABC[0:8]}
295        $ACC_SLICE = 1
296        $while ACC_SLICE < ACCUMULATORS:
297          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
298            $if A + ACC_SLICE < ACCUMULATORS:
299              vacc${A}x${ABC[0:8]} = wasm_i16x8_add(vacc${A}x${ABC[0:8]}, vacc${A + ACC_SLICE}x${ABC[0:8]});
300          $ACC_SLICE *= 2
301
302      const v128_t vacc${ABC[0:4]} = wasm_i32x4_add(wasm_i32x4_widen_low_i16x8(vacc0x${ABC[0:8]}), wasm_v128_load(buffer));
303      const v128_t vacc${ABC[4:8]} = wasm_i32x4_add(wasm_i32x4_widen_high_i16x8(vacc0x${ABC[0:8]}), wasm_v128_load(buffer + 4));
304      buffer += 8;
305
      // Magnitude and sign mask of each total.
306      const v128_t vabsacc${ABC[0:4]} = wasm_i32x4_abs(vacc${ABC[0:4]});
307      const v128_t vabsacc${ABC[4:8]} = wasm_i32x4_abs(vacc${ABC[4:8]});
308
309      const v128_t vsgnacc${ABC[0:4]} = wasm_i32x4_gt(vabsacc${ABC[0:4]}, vacc${ABC[0:4]});
310      const v128_t vsgnacc${ABC[4:8]} = wasm_i32x4_gt(vabsacc${ABC[4:8]}, vacc${ABC[4:8]});
311
      // Spread magnitudes into 64-bit lanes, then multiply, round and shift.
312      const v128_t vabsacc${ABC[0:2]} = wasm_v32x4_shuffle(vabsacc${ABC[0:4]}, vzero, 0, 4, 1, 5);
313      const v128_t vabsacc${ABC[2:4]} = wasm_v32x4_shuffle(vabsacc${ABC[0:4]}, vzero, 2, 6, 3, 7);
314      const v128_t vabsacc${ABC[4:6]} = wasm_v32x4_shuffle(vabsacc${ABC[4:8]}, vzero, 0, 4, 1, 5);
315      const v128_t vabsacc${ABC[6:8]} = wasm_v32x4_shuffle(vabsacc${ABC[4:8]}, vzero, 2, 6, 3, 7);
316
317      const v128_t vabsprod${ABC[0:2]} = wasm_i64x2_mul(vabsacc${ABC[0:2]}, vmultiplier);
318      const v128_t vabsprod${ABC[2:4]} = wasm_i64x2_mul(vabsacc${ABC[2:4]}, vmultiplier);
319      const v128_t vabsprod${ABC[4:6]} = wasm_i64x2_mul(vabsacc${ABC[4:6]}, vmultiplier);
320      const v128_t vabsprod${ABC[6:8]} = wasm_i64x2_mul(vabsacc${ABC[6:8]}, vmultiplier);
321
322      const v128_t vabsout${ABC[0:2]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[0:2]}, vrounding), vshift);
323      const v128_t vabsout${ABC[2:4]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[2:4]}, vrounding), vshift);
324      const v128_t vabsout${ABC[4:6]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[4:6]}, vrounding), vshift);
325      const v128_t vabsout${ABC[6:8]} = wasm_u64x2_shr(wasm_i64x2_add(vabsprod${ABC[6:8]}, vrounding), vshift);
326
      // Collect the low 32 bits of each result and re-apply the sign.
327      const v128_t vabsout${ABC[0:4]} = wasm_v32x4_shuffle(vabsout${ABC[0:2]}, vabsout${ABC[2:4]}, 0, 2, 4, 6);
328      const v128_t vabsout${ABC[4:8]} = wasm_v32x4_shuffle(vabsout${ABC[4:6]}, vabsout${ABC[6:8]}, 0, 2, 4, 6);
329
330      const v128_t vout${ABC[0:4]} = wasm_i32x4_sub(wasm_v128_xor(vabsout${ABC[0:4]}, vsgnacc${ABC[0:4]}), vsgnacc${ABC[0:4]});
331      const v128_t vout${ABC[4:8]} = wasm_i32x4_sub(wasm_v128_xor(vabsout${ABC[4:8]}, vsgnacc${ABC[4:8]}), vsgnacc${ABC[4:8]});
332
333      const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
334      const v128_t vout${ABC[0:8]} = wasm_i16x8_add_saturate(wasm_i16x8_narrow_i32x4(vout${ABC[0:4]}, vout${ABC[4:8]}), voutput_zero_point);
335
336      const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
337      const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
338      v128_t vout${ABC[0:8]}${ABC[0:8]} = wasm_i8x16_min(wasm_i8x16_max(wasm_i8x16_narrow_i16x8(vout${ABC[0:8]}, vout${ABC[0:8]}), voutput_min), voutput_max);
339
      // Partial store: write 4, 2 and 1 bytes according to the bits of `channels`,
      // shifting the bytes already written out of the vector after each step.
340      $if CHANNEL_TILE > 8:
341        if XNN_LIKELY(channels >= 8) {
342          *((double*) output) = wasm_f64x2_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
343          output += 8;
344          channels -= 8;
345        } else {
346          if (channels & 4) {
347            *((float*) output) = wasm_f32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
348            vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32);
349            output += 4;
350          }
351          if (channels & 2) {
352            *((uint16_t*) output) = (uint16_t) wasm_i16x8_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
353            vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16);
354            output += 2;
355          }
356          if (channels & 1) {
357            *output = (int8_t) wasm_i8x16_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
358            output += 1;
359          }
360          channels = 0;
361        }
362      $else:
363        if (channels & 4) {
364          *((float*) output) = wasm_f32x4_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
365          vout${ABC[0:8]}${ABC[0:8]} = wasm_u64x2_shr(vout${ABC[0:8]}${ABC[0:8]}, 32);
366          output += 4;
367        }
368        if (channels & 2) {
369          *((uint16_t*) output) = (uint16_t) wasm_i16x8_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
370          vout${ABC[0:8]}${ABC[0:8]} = wasm_u32x4_shr(vout${ABC[0:8]}${ABC[0:8]}, 16);
371          output += 2;
372        }
373        if (channels & 1) {
374          *output = (int8_t) wasm_i8x16_extract_lane(vout${ABC[0:8]}${ABC[0:8]}, 0);
375        }
376    }${" while (channels != 0);" if CHANNEL_TILE > 8 else ""}
377  }
378}
379