// Auto-generated file. Do not edit!
//   Template: src/x32-transposec/wasmsimd.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <wasm_simd128.h>

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/transpose.h>

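// Transposes a block_width x block_height matrix of 8-bit elements in
// 16x16-byte tiles: each tile is transposed in registers with four rounds
// of byte interleaves, then written out one output row at a time.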
void xnn_x8_transposec_ukernel__16x16_reuse_switch_wasmsimd(
    const uint8_t* input,
    uint8_t* output,
    size_t input_stride,
    size_t output_stride,
    size_t block_width,
    size_t block_height) XNN_OOB_READS
{
  assert(output_stride >= block_height * sizeof(uint8_t));
  assert(input_stride >= block_width * sizeof(uint8_t));

  const size_t tile_height = 16;
  const size_t tile_width = 16;
  const size_t tile_hbytes = tile_height * sizeof(uint8_t);
  const size_t tile_wbytes = tile_width * sizeof(uint8_t);
  const size_t input_reset = tile_wbytes - round_down_po2(block_height, tile_height) * input_stride;
  const size_t output_reset = tile_width * output_stride - round_down_po2(block_height, 2) * sizeof(uint8_t);

  const uint8_t* i0 = input;
  uint8_t* o = (uint8_t*) output;
  const size_t minus_output_stride = -output_stride;

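  // Outer loop: one iteration per stripe of 16 input columns (i.e. 16 output
  // rows). rem is the index of the last valid output row in the stripe, and
  // oN_stride offsets from o to that row.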
  do {
    const size_t rem = min(block_width - 1, 15);
    const size_t oN_stride = rem * output_stride;
    size_t bh = block_height;
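    // Main loop: load and transpose full 16x16 tiles down the block height.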
    for (; bh >= 16; bh -= 16) {
      const v128_t v4_0 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_1 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_2 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_3 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_4 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_5 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_6 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_7 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_8 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_9 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_10 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_11 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_12 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_13 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_14 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);
      const v128_t v4_15 = wasm_v128_load(i0);
      i0 = (uint8_t*) ((uintptr_t) i0 + input_stride);

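      // Transpose the 16x16 tile with four rounds of 8-bit interleaves,
      // pairing vector r with vector r+8 at every round; e.g. round one gives
      //   v3_0 = { v4_0[0], v4_8[0], v4_0[1], v4_8[1], ..., v4_0[7], v4_8[7] }
      // After the fourth round, v0_j holds input column j, i.e. output row j.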
      const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_3 = wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_11 = wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

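      // Store the transposed rows, highest index first. Each case falls
      // through to the next lower row, so a block narrower than 16 simply
      // enters the switch at its last valid row.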
      uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
      switch (rem) {
        case 15:
          wasm_v128_store(oN, v0_15);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 14:
          wasm_v128_store(oN, v0_14);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 13:
          wasm_v128_store(oN, v0_13);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 12:
          wasm_v128_store(oN, v0_12);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 11:
          wasm_v128_store(oN, v0_11);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 10:
          wasm_v128_store(oN, v0_10);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 9:
          wasm_v128_store(oN, v0_9);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 8:
          wasm_v128_store(oN, v0_8);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 7:
          wasm_v128_store(oN, v0_7);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 6:
          wasm_v128_store(oN, v0_6);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 5:
          wasm_v128_store(oN, v0_5);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 4:
          wasm_v128_store(oN, v0_4);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 3:
          wasm_v128_store(oN, v0_3);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 2:
          wasm_v128_store(oN, v0_2);
          oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
        case 1:
          wasm_v128_store(oN, v0_1);
        case 0:
          wasm_v128_store(o, v0_0);
          o = (uint8_t*) ((uintptr_t) o + tile_hbytes);
          break;
        default:
          XNN_UNREACHABLE;
      }
    }

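    // Tail: fewer than 16 rows remain. Clamp each row pointer to the
    // previous row, so rows at or beyond bh re-read valid data instead of
    // reading past the end of the input.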
    if (bh != 0) {
      const v128_t v4_0 = wasm_v128_load(i0);
      const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
      if XNN_UNPREDICTABLE(bh < 2) {
        i1 = i0;
      }
      const v128_t v4_1 = wasm_v128_load(i1);
      const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 2) {
        i2 = i1;
      }
      const v128_t v4_2 = wasm_v128_load(i2);
      const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
      if XNN_UNPREDICTABLE(bh < 4) {
        i3 = i2;
      }
      const v128_t v4_3 = wasm_v128_load(i3);
      const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 4) {
        i4 = i3;
      }
      const v128_t v4_4 = wasm_v128_load(i4);
      const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
      if XNN_UNPREDICTABLE(bh < 6) {
        i5 = i4;
      }
      const v128_t v4_5 = wasm_v128_load(i5);
      const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 6) {
        i6 = i5;
      }
      const v128_t v4_6 = wasm_v128_load(i6);
      const uint8_t* i7 = (const uint8_t*) ((uintptr_t) i6 + input_stride);
      if XNN_UNPREDICTABLE(bh < 8) {
        i7 = i6;
      }
      const v128_t v4_7 = wasm_v128_load(i7);
      const uint8_t* i8 = (const uint8_t*) ((uintptr_t) i7 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 8) {
        i8 = i7;
      }
      const v128_t v4_8 = wasm_v128_load(i8);
      const uint8_t* i9 = (const uint8_t*) ((uintptr_t) i8 + input_stride);
      if XNN_UNPREDICTABLE(bh < 10) {
        i9 = i8;
      }
      const v128_t v4_9 = wasm_v128_load(i9);
      const uint8_t* i10 = (const uint8_t*) ((uintptr_t) i9 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 10) {
        i10 = i9;
      }
      const v128_t v4_10 = wasm_v128_load(i10);
      const uint8_t* i11 = (const uint8_t*) ((uintptr_t) i10 + input_stride);
      if XNN_UNPREDICTABLE(bh < 12) {
        i11 = i10;
      }
      const v128_t v4_11 = wasm_v128_load(i11);
      const uint8_t* i12 = (const uint8_t*) ((uintptr_t) i11 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 12) {
        i12 = i11;
      }
      const v128_t v4_12 = wasm_v128_load(i12);
      const uint8_t* i13 = (const uint8_t*) ((uintptr_t) i12 + input_stride);
      if XNN_UNPREDICTABLE(bh < 14) {
        i13 = i12;
      }
      const v128_t v4_13 = wasm_v128_load(i13);
      const uint8_t* i14 = (const uint8_t*) ((uintptr_t) i13 + input_stride);
      if XNN_UNPREDICTABLE(bh <= 14) {
        i14 = i13;
      }
      const v128_t v4_14 = wasm_v128_load(i14);
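      // Row 15 can never be valid here (bh < 16), so use an all-zero vector.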
      const v128_t v4_15 = wasm_v128_xor(v4_0, v4_0);

      const v128_t v3_0 = wasm_v8x16_shuffle(v4_0, v4_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_1 = wasm_v8x16_shuffle(v4_0, v4_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_2 = wasm_v8x16_shuffle(v4_1, v4_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_3 = wasm_v8x16_shuffle(v4_1, v4_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_4 = wasm_v8x16_shuffle(v4_2, v4_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_5 = wasm_v8x16_shuffle(v4_2, v4_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_6 = wasm_v8x16_shuffle(v4_3, v4_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_7 = wasm_v8x16_shuffle(v4_3, v4_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_8 = wasm_v8x16_shuffle(v4_4, v4_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_9 = wasm_v8x16_shuffle(v4_4, v4_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_10 = wasm_v8x16_shuffle(v4_5, v4_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_11 = wasm_v8x16_shuffle(v4_5, v4_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_12 = wasm_v8x16_shuffle(v4_6, v4_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_13 = wasm_v8x16_shuffle(v4_6, v4_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v3_14 = wasm_v8x16_shuffle(v4_7, v4_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v3_15 = wasm_v8x16_shuffle(v4_7, v4_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_0 = wasm_v8x16_shuffle(v3_0, v3_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_1 = wasm_v8x16_shuffle(v3_0, v3_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_2 = wasm_v8x16_shuffle(v3_1, v3_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_3 = wasm_v8x16_shuffle(v3_1, v3_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_4 = wasm_v8x16_shuffle(v3_2, v3_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_5 = wasm_v8x16_shuffle(v3_2, v3_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_6 = wasm_v8x16_shuffle(v3_3, v3_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_7 = wasm_v8x16_shuffle(v3_3, v3_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_8 = wasm_v8x16_shuffle(v3_4, v3_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_9 = wasm_v8x16_shuffle(v3_4, v3_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_10 = wasm_v8x16_shuffle(v3_5, v3_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_11 = wasm_v8x16_shuffle(v3_5, v3_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_12 = wasm_v8x16_shuffle(v3_6, v3_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_13 = wasm_v8x16_shuffle(v3_6, v3_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v2_14 = wasm_v8x16_shuffle(v3_7, v3_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v2_15 = wasm_v8x16_shuffle(v3_7, v3_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_0 = wasm_v8x16_shuffle(v2_0, v2_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_1 = wasm_v8x16_shuffle(v2_0, v2_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_2 = wasm_v8x16_shuffle(v2_1, v2_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_3 = wasm_v8x16_shuffle(v2_1, v2_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_4 = wasm_v8x16_shuffle(v2_2, v2_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_5 = wasm_v8x16_shuffle(v2_2, v2_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_6 = wasm_v8x16_shuffle(v2_3, v2_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_7 = wasm_v8x16_shuffle(v2_3, v2_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_8 = wasm_v8x16_shuffle(v2_4, v2_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_9 = wasm_v8x16_shuffle(v2_4, v2_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_10 = wasm_v8x16_shuffle(v2_5, v2_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_11 = wasm_v8x16_shuffle(v2_5, v2_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_12 = wasm_v8x16_shuffle(v2_6, v2_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_13 = wasm_v8x16_shuffle(v2_6, v2_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      const v128_t v1_14 = wasm_v8x16_shuffle(v2_7, v2_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      const v128_t v1_15 = wasm_v8x16_shuffle(v2_7, v2_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

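      // Unlike the main loop, the final-round outputs stay mutable: each
      // partial store below shifts them so the next narrower store reads
      // the following bytes.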
      v128_t v0_0 = wasm_v8x16_shuffle(v1_0, v1_8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_1 = wasm_v8x16_shuffle(v1_0, v1_8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_2 = wasm_v8x16_shuffle(v1_1, v1_9, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_3 = wasm_v8x16_shuffle(v1_1, v1_9, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_4 = wasm_v8x16_shuffle(v1_2, v1_10, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_5 = wasm_v8x16_shuffle(v1_2, v1_10, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_6 = wasm_v8x16_shuffle(v1_3, v1_11, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_7 = wasm_v8x16_shuffle(v1_3, v1_11, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_8 = wasm_v8x16_shuffle(v1_4, v1_12, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_9 = wasm_v8x16_shuffle(v1_4, v1_12, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_10 = wasm_v8x16_shuffle(v1_5, v1_13, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_11 = wasm_v8x16_shuffle(v1_5, v1_13, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_12 = wasm_v8x16_shuffle(v1_6, v1_14, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_13 = wasm_v8x16_shuffle(v1_6, v1_14, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
      v128_t v0_14 = wasm_v8x16_shuffle(v1_7, v1_15, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
      v128_t v0_15 = wasm_v8x16_shuffle(v1_7, v1_15, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

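      // Write the remaining bh rows in power-of-two chunks (8, 4, 2, then 1
      // byte per output row), decomposing bh in binary.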
      if (bh & 8) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_15, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_14, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_13, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_12, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_11, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_10, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_9, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_8, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_7, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_6, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_5, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_4, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_3, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_2, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            *((double*) oN) = wasm_f64x2_extract_lane(v0_1, 0);
          case 0:
            *((double*) o) = wasm_f64x2_extract_lane(v0_0, 0);
            o += 8;
            break;
          default:
            XNN_UNREACHABLE;
        }
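        // Move the upper 8 bytes of every vector into the low half for the
        // following narrower stores.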
        v0_0 = wasm_v64x2_shuffle(v0_0, v0_0, 1, 1);
        v0_1 = wasm_v64x2_shuffle(v0_1, v0_1, 1, 1);
        v0_2 = wasm_v64x2_shuffle(v0_2, v0_2, 1, 1);
        v0_3 = wasm_v64x2_shuffle(v0_3, v0_3, 1, 1);
        v0_4 = wasm_v64x2_shuffle(v0_4, v0_4, 1, 1);
        v0_5 = wasm_v64x2_shuffle(v0_5, v0_5, 1, 1);
        v0_6 = wasm_v64x2_shuffle(v0_6, v0_6, 1, 1);
        v0_7 = wasm_v64x2_shuffle(v0_7, v0_7, 1, 1);
        v0_8 = wasm_v64x2_shuffle(v0_8, v0_8, 1, 1);
        v0_9 = wasm_v64x2_shuffle(v0_9, v0_9, 1, 1);
        v0_10 = wasm_v64x2_shuffle(v0_10, v0_10, 1, 1);
        v0_11 = wasm_v64x2_shuffle(v0_11, v0_11, 1, 1);
        v0_12 = wasm_v64x2_shuffle(v0_12, v0_12, 1, 1);
        v0_13 = wasm_v64x2_shuffle(v0_13, v0_13, 1, 1);
        v0_14 = wasm_v64x2_shuffle(v0_14, v0_14, 1, 1);
        v0_15 = wasm_v64x2_shuffle(v0_15, v0_15, 1, 1);
      }

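      // Store 4 bytes per output row, then shift the next 4 bytes down.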
      if (bh & 4) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_15, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_14, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_13, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_12, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_11, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_10, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_9, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_8, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_7, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_6, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_5, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_4, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_3, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_2, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            *((float*) oN) = wasm_f32x4_extract_lane(v0_1, 0);
          case 0:
            *((float*) o) = wasm_f32x4_extract_lane(v0_0, 0);
            o += 4;
            break;
          default:
            XNN_UNREACHABLE;
        }
        v0_0 = wasm_u64x2_shr(v0_0, 32);
        v0_1 = wasm_u64x2_shr(v0_1, 32);
        v0_2 = wasm_u64x2_shr(v0_2, 32);
        v0_3 = wasm_u64x2_shr(v0_3, 32);
        v0_4 = wasm_u64x2_shr(v0_4, 32);
        v0_5 = wasm_u64x2_shr(v0_5, 32);
        v0_6 = wasm_u64x2_shr(v0_6, 32);
        v0_7 = wasm_u64x2_shr(v0_7, 32);
        v0_8 = wasm_u64x2_shr(v0_8, 32);
        v0_9 = wasm_u64x2_shr(v0_9, 32);
        v0_10 = wasm_u64x2_shr(v0_10, 32);
        v0_11 = wasm_u64x2_shr(v0_11, 32);
        v0_12 = wasm_u64x2_shr(v0_12, 32);
        v0_13 = wasm_u64x2_shr(v0_13, 32);
        v0_14 = wasm_u64x2_shr(v0_14, 32);
        v0_15 = wasm_u64x2_shr(v0_15, 32);
      }
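      // Store 2 bytes per output row, then shift the next 2 bytes down.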
      if (bh & 2) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_15, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_14, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_13, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_12, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_11, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_10, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_9, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_8, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_7, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_6, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_5, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_4, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_3, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_2, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            *((uint16_t*) oN) = wasm_i16x8_extract_lane(v0_1, 0);
          case 0:
            *((uint16_t*) o) = wasm_i16x8_extract_lane(v0_0, 0);
            o += 2;
            break;
          default:
            XNN_UNREACHABLE;
        }
        v0_0 = wasm_u32x4_shr(v0_0, 16);
        v0_1 = wasm_u32x4_shr(v0_1, 16);
        v0_2 = wasm_u32x4_shr(v0_2, 16);
        v0_3 = wasm_u32x4_shr(v0_3, 16);
        v0_4 = wasm_u32x4_shr(v0_4, 16);
        v0_5 = wasm_u32x4_shr(v0_5, 16);
        v0_6 = wasm_u32x4_shr(v0_6, 16);
        v0_7 = wasm_u32x4_shr(v0_7, 16);
        v0_8 = wasm_u32x4_shr(v0_8, 16);
        v0_9 = wasm_u32x4_shr(v0_9, 16);
        v0_10 = wasm_u32x4_shr(v0_10, 16);
        v0_11 = wasm_u32x4_shr(v0_11, 16);
        v0_12 = wasm_u32x4_shr(v0_12, 16);
        v0_13 = wasm_u32x4_shr(v0_13, 16);
        v0_14 = wasm_u32x4_shr(v0_14, 16);
        v0_15 = wasm_u32x4_shr(v0_15, 16);
      }
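      // Store the final byte of each output row if bh is odd.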
      if (bh & 1) {
        uint8_t* oN = (uint8_t*) ((uintptr_t) o + oN_stride);
        switch (rem) {
          case 15:
            *oN = wasm_i8x16_extract_lane(v0_15, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 14:
            *oN = wasm_i8x16_extract_lane(v0_14, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 13:
            *oN = wasm_i8x16_extract_lane(v0_13, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 12:
            *oN = wasm_i8x16_extract_lane(v0_12, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 11:
            *oN = wasm_i8x16_extract_lane(v0_11, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 10:
            *oN = wasm_i8x16_extract_lane(v0_10, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 9:
            *oN = wasm_i8x16_extract_lane(v0_9, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 8:
            *oN = wasm_i8x16_extract_lane(v0_8, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 7:
            *oN = wasm_i8x16_extract_lane(v0_7, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 6:
            *oN = wasm_i8x16_extract_lane(v0_6, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 5:
            *oN = wasm_i8x16_extract_lane(v0_5, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 4:
            *oN = wasm_i8x16_extract_lane(v0_4, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 3:
            *oN = wasm_i8x16_extract_lane(v0_3, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 2:
            *oN = wasm_i8x16_extract_lane(v0_2, 0);
            oN = (uint8_t*) ((uintptr_t) oN + minus_output_stride);
          case 1:
            *oN = wasm_i8x16_extract_lane(v0_1, 0);
          case 0:
            *o = wasm_i8x16_extract_lane(v0_0, 0);
            break;
          default:
            XNN_UNREACHABLE;
        }
      }
    }

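    // Advance the input pointer to the next stripe of 16 columns and advance
    // the output pointer to the start of the next 16 output rows.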
    i0 = (const uint8_t*) ((uintptr_t) i0 + input_reset);
    o = (uint8_t*) ((uintptr_t) o + output_reset);
    block_width = doz(block_width, tile_width);
  } while (block_width != 0);
}