// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <wasm_simd128.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

void xnn_x8_transposec_ukernel__16x16_reuse_switch_wasmsimd(
    const uint8_t* input,
    uint8_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint8_t));
  assert(input_stride >= block_width * sizeof(uint8_t));

  const size_t tile_height = 16;
  const size_t tile_width = 16;
  const size_t tile_hbytes = tile_height * sizeof(uint8_t);
  const size_t tile_wbytes = tile_width * sizeof(uint8_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t);

  const uint8_t* i0 = input;
  uint8_t* o = (uint8_t*) output;
  const size_t minus_output_stride = -output_stride;

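  // Process the matrix in 16x16-byte tiles: the outer do-loop walks block_width
  // in groups of 16 columns, the inner loop walks block_height in groups of 16 rows.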
  do {
    const size_t rem = min(block_width - 1, 15);
    const size_t oN_stride = rem * output_stride;
    size_t bh = block_height;
    for (; bh >= 16; bh -= 16) {
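      // Load 16 consecutive input rows, 16 bytes from each.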
      const v128_t v4_0 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_1 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_2 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_3 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_4 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_5 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_6 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_7 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_8 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_9 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_10 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_11 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_12 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_13 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_14 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_15 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);

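      // Transpose the 16x16 byte tile with four rounds of pairwise byte interleaves.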
      const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_3 = wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_11 = wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

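      // Each v0_k now holds one transposed row. The fall-through switch starts at
      // the last valid output row (rem) and walks backwards toward o.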
      uint8_t *oN = (uint8_t*) ((uintptr_t) o + oN_stride);
      switch (rem) {
        case 15:
          wasm_v128_store(oN, v0_15);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 14:
          wasm_v128_store(oN, v0_14);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 13:
          wasm_v128_store(oN, v0_13);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 12:
          wasm_v128_store(oN, v0_12);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 11:
          wasm_v128_store(oN, v0_11);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 10:
          wasm_v128_store(oN, v0_10);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 9:
          wasm_v128_store(oN, v0_9);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 8:
          wasm_v128_store(oN, v0_8);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 7:
          wasm_v128_store(oN, v0_7);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 6:
          wasm_v128_store(oN, v0_6);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 5:
          wasm_v128_store(oN, v0_5);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 4:
          wasm_v128_store(oN, v0_4);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 3:
          wasm_v128_store(oN, v0_3);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 2:
          wasm_v128_store(oN, v0_2);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 1:
          wasm_v128_store(oN, v0_1);
        case 0:
          wasm_v128_store(o, v0_0);
          o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
          break;
        default:
          XNN_UNREACHABLE;
      }
    }

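    // Handle the remaining 1-15 rows. Row pointers past block_height are clamped to
    // the previous row, and the 16th row, which is never stored, is an all-zero vector.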
    if (bh != 0) {
      const v128_t v4_0 = wasm_v128_load(i0);
      const uint8_t *i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const v128_t v4_1 = wasm_v128_load(i1);
      const uint8_t *i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const v128_t v4_2 = wasm_v128_load(i2);
      const uint8_t *i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const v128_t v4_3 = wasm_v128_load(i3);
      const uint8_t *i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const v128_t v4_4 = wasm_v128_load(i4);
      const uint8_t *i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const v128_t v4_5 = wasm_v128_load(i5);
      const uint8_t *i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const v128_t v4_6 = wasm_v128_load(i6);
      const uint8_t *i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
      if XNN_UNPREDICTABLE(bh < 8) {
        i7 = i6;
      }
      const v128_t v4_7 = wasm_v128_load(i7);
      const uint8_t *i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 8) {
        i8 = i7;
      }
      const v128_t v4_8 = wasm_v128_load(i8);
      const uint8_t *i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
      if XNN_UNPREDICTABLE(bh < 10) {
        i9 = i8;
      }
      const v128_t v4_9 = wasm_v128_load(i9);
      const uint8_t *i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 10) {
        i10 = i9;
      }
      const v128_t v4_10 = wasm_v128_load(i10);
      const uint8_t *i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
      if XNN_UNPREDICTABLE(bh < 12) {
        i11 = i10;
      }
      const v128_t v4_11 = wasm_v128_load(i11);
      const uint8_t *i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 12) {
        i12 = i11;
      }
      const v128_t v4_12 = wasm_v128_load(i12);
      const uint8_t *i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
      if XNN_UNPREDICTABLE(bh < 14) {
        i13 = i12;
      }
      const v128_t v4_13 = wasm_v128_load(i13);
      const uint8_t *i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 14) {
        i14 = i13;
      }
      const v128_t v4_14 = wasm_v128_load(i14);
      const v128_t v4_15 = wasm_v128_xor(v4_0, v4_0);

      const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_3 = wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_11 = wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

      v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

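      // Store 8 bytes per output row if at least 8 rows remain, then move the upper
      // 8 bytes of each vector into the low half for the following steps.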
      if (bh & 8) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_15, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_14, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_13, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_12, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_11, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_10, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_9, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_8, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_7, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_6, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_5, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_4, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_3, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_2, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_1, 0);
          case 0:
            *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
            o += 8;
            break;
          default:
            XNN_UNREACHABLE;
        }
        v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1);
        v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1);
        v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1);
        v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1);
        v0_4 = wasm_v64x2_shuffle(v0_4, v0_4, 1, 1);
        v0_5 = wasm_v64x2_shuffle(v0_5, v0_5, 1, 1);
        v0_6 = wasm_v64x2_shuffle(v0_6, v0_6, 1, 1);
        v0_7 = wasm_v64x2_shuffle(v0_7, v0_7, 1, 1);
        v0_8 = wasm_v64x2_shuffle(v0_8, v0_8, 1, 1);
        v0_9 = wasm_v64x2_shuffle(v0_9, v0_9, 1, 1);
        v0_10 = wasm_v64x2_shuffle(v0_10, v0_10, 1, 1);
        v0_11 = wasm_v64x2_shuffle(v0_11, v0_11, 1, 1);
        v0_12 = wasm_v64x2_shuffle(v0_12, v0_12, 1, 1);
        v0_13 = wasm_v64x2_shuffle(v0_13, v0_13, 1, 1);
        v0_14 = wasm_v64x2_shuffle(v0_14, v0_14, 1, 1);
        v0_15 = wasm_v64x2_shuffle(v0_15, v0_15, 1, 1);
      }

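      // Store 4 bytes per output row if 4 more rows remain, then shift each 64-bit
      // lane right by 32 bits to expose the next 4 bytes.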
      if (bh & 4) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_15, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_14, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_13, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_12, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_11, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_10, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_9, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_8, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_7, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_6, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_5, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_4, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_3, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_2, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_1, 0);
          case 0:
            *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
            o += 4;
            break;
          default:
            XNN_UNREACHABLE;
        }
        v0_0 = wasm_u64x2_shr(v0_0, 32);
        v0_1 = wasm_u64x2_shr(v0_1, 32);
        v0_2 = wasm_u64x2_shr(v0_2, 32);
        v0_3 = wasm_u64x2_shr(v0_3, 32);
        v0_4 = wasm_u64x2_shr(v0_4, 32);
        v0_5 = wasm_u64x2_shr(v0_5, 32);
        v0_6 = wasm_u64x2_shr(v0_6, 32);
        v0_7 = wasm_u64x2_shr(v0_7, 32);
        v0_8 = wasm_u64x2_shr(v0_8, 32);
        v0_9 = wasm_u64x2_shr(v0_9, 32);
        v0_10 = wasm_u64x2_shr(v0_10, 32);
        v0_11 = wasm_u64x2_shr(v0_11, 32);
        v0_12 = wasm_u64x2_shr(v0_12, 32);
        v0_13 = wasm_u64x2_shr(v0_13, 32);
        v0_14 = wasm_u64x2_shr(v0_14, 32);
        v0_15 = wasm_u64x2_shr(v0_15, 32);
      }
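      // Store 2 bytes per output row if 2 more rows remain, then shift each 32-bit
      // lane right by 16 bits to expose the next 2 bytes.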
      if (bh & 2) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_15, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_14, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_13, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_12, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_11, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_10, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_9, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_8, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_7, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_6, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_5, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_4, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_3, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_2, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_1, 0);
          case 0:
            *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0);
            o += 2;
            break;
          default:
            XNN_UNREACHABLE;
        }
        v0_0 = wasm_u32x4_shr(v0_0, 16);
        v0_1 = wasm_u32x4_shr(v0_1, 16);
        v0_2 = wasm_u32x4_shr(v0_2, 16);
        v0_3 = wasm_u32x4_shr(v0_3, 16);
        v0_4 = wasm_u32x4_shr(v0_4, 16);
        v0_5 = wasm_u32x4_shr(v0_5, 16);
        v0_6 = wasm_u32x4_shr(v0_6, 16);
        v0_7 = wasm_u32x4_shr(v0_7, 16);
        v0_8 = wasm_u32x4_shr(v0_8, 16);
        v0_9 = wasm_u32x4_shr(v0_9, 16);
        v0_10 = wasm_u32x4_shr(v0_10, 16);
        v0_11 = wasm_u32x4_shr(v0_11, 16);
        v0_12 = wasm_u32x4_shr(v0_12, 16);
        v0_13 = wasm_u32x4_shr(v0_13, 16);
        v0_14 = wasm_u32x4_shr(v0_14, 16);
        v0_15 = wasm_u32x4_shr(v0_15, 16);
      }
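      // Store the final byte per output row if an odd row remains.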
      if (bh & 1) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            *oN = wasm_i8x16_extract_lane(v0_15, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            *oN = wasm_i8x16_extract_lane(v0_14, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            *oN = wasm_i8x16_extract_lane(v0_13, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            *oN = wasm_i8x16_extract_lane(v0_12, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            *oN = wasm_i8x16_extract_lane(v0_11, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            *oN = wasm_i8x16_extract_lane(v0_10, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            *oN = wasm_i8x16_extract_lane(v0_9, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            *oN = wasm_i8x16_extract_lane(v0_8, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            *oN = wasm_i8x16_extract_lane(v0_7, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            *oN = wasm_i8x16_extract_lane(v0_6, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            *oN = wasm_i8x16_extract_lane(v0_5, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            *oN = wasm_i8x16_extract_lane(v0_4, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            *oN = wasm_i8x16_extract_lane(v0_3, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            *oN = wasm_i8x16_extract_lane(v0_2, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            *oN = wasm_i8x16_extract_lane(v0_1, 0);
          case 0:
            *o = wasm_i8x16_extract_lane(v0_0, 0);
            break;
          default:
            XNN_UNREACHABLE;
        }
      }
    }

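    // Rewind the input to the top of the block and advance it to the next 16 columns;
    // advance the output pointer to the next group of 16 output rows.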
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
    o = (uint8_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}