• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1// Copyright 2020 Google LLC
2//
3// This source code is licensed under the BSD-style license found in the
4// LICENSE file in the root directory of this source tree.
5
6$assert ROW_TILE >= 1
7$assert ACCUMULATORS >= 1
8#include <assert.h>
9
10#include <wasm_simd128.h>
11
12#include <xnnpack/dwconv.h>
13#include <xnnpack/math.h>
14
15
16$ARCH_SUFFIX = "_x86" if X86 else "_arm"
17
18void xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd${ARCH_SUFFIX}_splat_${ROW_TILE}x4${"_acc%d" % ACCUMULATORS if ACCUMULATORS > 1 else ""}(
19    size_t input_height,
20    size_t input_width,
21    const float* input,
22    const float* weights,
23    const float* zero,
24    float* output,
25    uint32_t padding_top,
26    const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
27{
28  assert(input_height != 0);
29  assert(input_width != 0);
30  assert(input_width % sizeof(float) == 0);
31  assert(padding_top == 2);
32
33  const v128_t vmask = wasm_v128_load(params->scalar.mask);
34  const v128_t vmax = wasm_v32x4_load_splat(&params->scalar.max);
35  const v128_t vmin = wasm_v32x4_load_splat(&params->scalar.min);
36
37  const v128_t vw0123 = wasm_v128_load(weights);
38  const v128_t vw4567 = wasm_v128_load(weights + 4);
39  const v128_t vw89AB = wasm_v128_load(weights + 8);
40  const v128_t vwCDEF = wasm_v128_load(weights + 12);
41  const v128_t vwGHIJ = wasm_v128_load(weights + 16);
42  const v128_t vwKLMN = wasm_v128_load(weights + 20);
43  const v128_t vwOP = wasm_v64x2_load_splat(weights + 24);
44
45  const v128_t vzero = wasm_f32x4_splat(0.0f);
46
47  const size_t input_decrement = round_up_po2(input_width, 4 * sizeof(float));
48
49  const float* i0 = zero;
50  const float* i1 = zero;
51  const float* i2 = input;
52  $for M in range(3, 4 + ROW_TILE):
53    const float* i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);
54
55  float* o0 = output;
56  $for M in range(1, ROW_TILE):
57    float* o${M} = (float*) ((uintptr_t) o${M-1} + input_width);
58
59  size_t output_height = input_height;
60  do {
61    $for M in range(2, 3 + ROW_TILE):
62      if XNN_UNPREDICTABLE(output_height < ${M}) {
63        i${M+1} = zero;
64        $if M <= ROW_TILE:
65          o${M-1} = o${M-2};
66      }
67
68    $for M in range(4 + ROW_TILE):
69      v128_t vi${M}x0123 = vzero;
70
71    $for M in range(4 + ROW_TILE):
72      v128_t vi${M}x4567 = wasm_v128_load(i${M}); i${M} += 4;
73
74    size_t w = input_width;
75    for (; w > 8 * sizeof(float); w -= 4 * sizeof(float)) {
76      $for M in range(ROW_TILE):
77        v128_t vo${M}p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);
78
79      $for M in range(4 + ROW_TILE):
80        const v128_t vi${M}x89AB = wasm_v128_load(i${M}); i${M} += 4;
81
82      $for M in range(ROW_TILE):
83        $if ACCUMULATORS > 1:
84          v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3));
85        $else:
86          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)));
87
88      $for M in range(ROW_TILE):
89        $if ACCUMULATORS > 2:
90          v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0));
91        $else:
92          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)));
93
94      $for M in range(ROW_TILE):
95        $if ACCUMULATORS > 3:
96          v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1));
97        $else:
98          vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1)));
99
100      $for M in range(ROW_TILE):
101        $if ACCUMULATORS > 4:
102          v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2));
103        $else:
104          vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2)));
105
106      $for M in range(ROW_TILE):
107        $if ACCUMULATORS > 6:
108          v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3));
109        $else:
110          vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
111
112      $for M in range(4 + ROW_TILE):
113        const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6);
114
115      $for M in range(ROW_TILE):
116        vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2)));
117
118      $for M in range(ROW_TILE):
119        vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3)));
120
121      $for M in range(ROW_TILE):
122        vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0)));
123
124      $for M in range(ROW_TILE):
125        vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1)));
126
127      $for M in range(ROW_TILE):
128        vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2)));
129
130      $for M in range(4 + ROW_TILE):
131        const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5);
132        vi${M}x0123 = vi${M}x4567;
133
134      $for M in range(ROW_TILE):
135        vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1)));
136
137      $for M in range(ROW_TILE):
138        vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2)));
139
140      $for M in range(ROW_TILE):
141        vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3)));
142
143      $for M in range(ROW_TILE):
144        vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0)));
145
146      $for M in range(ROW_TILE):
147        vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1)));
148
149      $for M in range(4 + ROW_TILE):
150        const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 1, 2, 3, 4);
151
152      $for M in range(ROW_TILE):
153        vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0)));
154
155      $for M in range(ROW_TILE):
156        vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1)));
157
158      $for M in range(ROW_TILE):
159        vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2)));
160
161      $for M in range(ROW_TILE):
162        vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3)));
163
164      $for M in range(ROW_TILE):
165        vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0)));
166
167      $for M in range(4 + ROW_TILE):
168        const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 2, 3, 4, 5);
169        vi${M}x4567 = vi${M}x89AB;
170
171      $for M in range(ROW_TILE):
172        vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
173
174      $for M in range(ROW_TILE):
175        vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2)));
176
177      $for M in range(ROW_TILE):
178        vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3)));
179
180      $for M in range(ROW_TILE):
181        vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0)));
182
183      $for M in range(ROW_TILE):
184        vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1)));
185
186      $if ACCUMULATORS > 1:
187        $ACC_SLICE = 1
188        $while ACC_SLICE < ACCUMULATORS:
189          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
190            $if A + ACC_SLICE < ACCUMULATORS:
191              $for M in range(ROW_TILE):
192                vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
193          $ACC_SLICE *= 2
194
195      $if X86:
196        $for M in range(ROW_TILE):
197          v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin));
198        $for M in range(ROW_TILE):
199          vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax));
200      $else:
201        $for M in range(ROW_TILE):
202          v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin);
203        $for M in range(ROW_TILE):
204          vo${M} = wasm_f32x4_min(vo${M}, vmax);
205
206      $for M in reversed(range(ROW_TILE)):
207        wasm_v128_store(o${M}, vo${M}); o${M} += 4;
208    }
209    // Always process the last block of 5..8 pixels.
210    if XNN_LIKELY(w > 4 * sizeof(float)) {
211      $for M in range(ROW_TILE):
212        v128_t vo${M}p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);
213
214      $for M in range(4 + ROW_TILE):
215        v128_t vi${M}x89AB = wasm_v128_load(i${M}); i${M} += 4;
216
217      $for M in range(4 + ROW_TILE):
218        vi${M}x89AB = wasm_v128_and(vmask, vi${M}x89AB);
219
220      $for M in range(ROW_TILE):
221        $if ACCUMULATORS > 1:
222          v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3));
223        $else:
224          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)));
225
226      $for M in range(ROW_TILE):
227        $if ACCUMULATORS > 2:
228          v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0));
229        $else:
230          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)));
231
232      $for M in range(ROW_TILE):
233        $if ACCUMULATORS > 3:
234          v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1));
235        $else:
236          vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1)));
237
238      $for M in range(ROW_TILE):
239        $if ACCUMULATORS > 4:
240          v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2));
241        $else:
242          vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2)));
243
244      $for M in range(ROW_TILE):
245        $if ACCUMULATORS > 6:
246          v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3));
247        $else:
248          vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
249
250      $for M in range(4 + ROW_TILE):
251        const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6);
252
253      $for M in range(ROW_TILE):
254        vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2)));
255
256      $for M in range(ROW_TILE):
257        vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3)));
258
259      $for M in range(ROW_TILE):
260        vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0)));
261
262      $for M in range(ROW_TILE):
263        vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1)));
264
265      $for M in range(ROW_TILE):
266        vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2)));
267
268      $for M in range(4 + ROW_TILE):
269        const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5);
270        vi${M}x0123 = vi${M}x4567;
271
272      $for M in range(ROW_TILE):
273        vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1)));
274
275      $for M in range(ROW_TILE):
276        vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2)));
277
278      $for M in range(ROW_TILE):
279        vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3)));
280
281      $for M in range(ROW_TILE):
282        vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0)));
283
284      $for M in range(ROW_TILE):
285        vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1)));
286
287      $for M in range(4 + ROW_TILE):
288        const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 1, 2, 3, 4);
289
290      $for M in range(ROW_TILE):
291        vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0)));
292
293      $for M in range(ROW_TILE):
294        vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1)));
295
296      $for M in range(ROW_TILE):
297        vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2)));
298
299      $for M in range(ROW_TILE):
300        vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3)));
301
302      $for M in range(ROW_TILE):
303        vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0)));
304
305      $for M in range(4 + ROW_TILE):
306        const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x4567, vi${M}x89AB, 2, 3, 4, 5);
307        vi${M}x4567 = vi${M}x89AB;
308
309      $for M in range(ROW_TILE):
310        vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
311
312      $for M in range(ROW_TILE):
313        vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2)));
314
315      $for M in range(ROW_TILE):
316        vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3)));
317
318      $for M in range(ROW_TILE):
319        vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0)));
320
321      $for M in range(ROW_TILE):
322        vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1)));
323
324      $if ACCUMULATORS > 1:
325        $ACC_SLICE = 1
326        $while ACC_SLICE < ACCUMULATORS:
327          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
328            $if A + ACC_SLICE < ACCUMULATORS:
329              $for M in range(ROW_TILE):
330                vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
331          $ACC_SLICE *= 2
332
333      $if X86:
334        $for M in range(ROW_TILE):
335          v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin));
336        $for M in range(ROW_TILE):
337          vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax));
338      $else:
339        $for M in range(ROW_TILE):
340          v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin);
341        $for M in range(ROW_TILE):
342          vo${M} = wasm_f32x4_min(vo${M}, vmax);
343
344      $for M in reversed(range(ROW_TILE)):
345        wasm_v128_store(o${M}, vo${M}); o${M} += 4;
346
347      w -= 4 * sizeof(float);
348    }
349    assert(w >= 1 * sizeof(float));
350    assert(w <= 4 * sizeof(float));
351    {
352      $for M in range(ROW_TILE):
353        v128_t vo${M}p0 = wasm_v32x4_shuffle(vw0123, vw0123, 0, 0, 0, 0);
354
355      $for M in range(4 + ROW_TILE):
356        vi${M}x4567 = wasm_v128_and(vmask, vi${M}x4567);
357
358      $for M in range(ROW_TILE):
359        $if ACCUMULATORS > 1:
360          v128_t vo${M}p1 = wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3));
361        $else:
362          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M}x4567, wasm_v32x4_shuffle(vw0123, vw0123, 3, 3, 3, 3)));
363
364      $for M in range(ROW_TILE):
365        $if ACCUMULATORS > 2:
366          v128_t vo${M}p2 = wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0));
367        $else:
368          vo${M}p0 = wasm_f32x4_add(vo${M}p0, wasm_f32x4_mul(vi${M+1}x4567, wasm_v32x4_shuffle(vw89AB, vw89AB, 0, 0, 0, 0)));
369
370      $for M in range(ROW_TILE):
371        $if ACCUMULATORS > 3:
372          v128_t vo${M}p3 = wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1));
373        $else:
374          vo${M}p${4 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${4 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x4567, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 1, 1, 1, 1)));
375
376      $for M in range(ROW_TILE):
377        $if ACCUMULATORS > 4:
378          v128_t vo${M}p4 = wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2));
379        $else:
380          vo${M}p${5 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${5 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x4567, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 2, 2, 2, 2)));
381
382      $for M in range(ROW_TILE):
383        $if ACCUMULATORS > 6:
384          v128_t vo${M}p5 = wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3));
385        $else:
386          vo${M}p${6 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${6 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x4567, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 3, 3, 3, 3)));
387
388      $for M in range(4 + ROW_TILE):
389        const v128_t vi${M}x3456 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 3, 4, 5, 6);
390
391      $for M in range(ROW_TILE):
392        vo${M}p${7 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${7 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x3456, wasm_v32x4_shuffle(vw0123, vw0123, 2, 2, 2, 2)));
393
394      $for M in range(ROW_TILE):
395        vo${M}p${8 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${8 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x3456, wasm_v32x4_shuffle(vw4567, vw4567, 3, 3, 3, 3)));
396
397      $for M in range(ROW_TILE):
398        vo${M}p${9 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${9 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x3456, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 0, 0, 0, 0)));
399
400      $for M in range(ROW_TILE):
401        vo${M}p${10 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${10 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x3456, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 1, 1, 1, 1)));
402
403      $for M in range(ROW_TILE):
404        vo${M}p${11 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${11 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x3456, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 2, 2, 2, 2)));
405
406      $for M in range(4 + ROW_TILE):
407        const v128_t vi${M}x2345 = wasm_v32x4_shuffle(vi${M}x0123, vi${M}x4567, 2, 3, 4, 5);
408
409      $for M in range(ROW_TILE):
410        vo${M}p${12 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${12 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x2345, wasm_v32x4_shuffle(vw0123, vw0123, 1, 1, 1, 1)));
411
412      $for M in range(ROW_TILE):
413        vo${M}p${13 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${13 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x2345, wasm_v32x4_shuffle(vw4567, vw4567, 2, 2, 2, 2)));
414
415      $for M in range(ROW_TILE):
416        vo${M}p${14 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${14 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x2345, wasm_v32x4_shuffle(vw89AB, vw89AB, 3, 3, 3, 3)));
417
418      $for M in range(ROW_TILE):
419        vo${M}p${15 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${15 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x2345, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 0, 0, 0, 0)));
420
421      $for M in range(ROW_TILE):
422        vo${M}p${16 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${16 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x2345, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 1, 1, 1, 1)));
423
424      $for M in range(4 + ROW_TILE):
425        const v128_t vi${M}x5678 = wasm_v32x4_shuffle(vi${M}x4567, vzero, 1, 2, 3, 4);
426
427      $for M in range(ROW_TILE):
428        vo${M}p${17 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${17 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x5678, wasm_v32x4_shuffle(vw4567, vw4567, 0, 0, 0, 0)));
429
430      $for M in range(ROW_TILE):
431        vo${M}p${18 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${18 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x5678, wasm_v32x4_shuffle(vw89AB, vw89AB, 1, 1, 1, 1)));
432
433      $for M in range(ROW_TILE):
434        vo${M}p${19 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${19 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x5678, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 2, 2, 2, 2)));
435
436      $for M in range(ROW_TILE):
437        vo${M}p${20 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${20 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x5678, wasm_v32x4_shuffle(vwGHIJ, vwGHIJ, 3, 3, 3, 3)));
438
439      $for M in range(ROW_TILE):
440        vo${M}p${21 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${21 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x5678, wasm_v32x4_shuffle(vwOP, vwOP, 0, 0, 0, 0)));
441
442      $for M in range(4 + ROW_TILE):
443        const v128_t vi${M}x6789 = wasm_v32x4_shuffle(vi${M}x5678, vzero, 1, 2, 3, 4);
444
445      $for M in range(ROW_TILE):
446        vo${M}p${22 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${22 % ACCUMULATORS}, wasm_f32x4_mul(vi${M}x6789, wasm_v32x4_shuffle(vw4567, vw4567, 1, 1, 1, 1)));
447
448      $for M in range(ROW_TILE):
449        vo${M}p${23 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${23 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+1}x6789, wasm_v32x4_shuffle(vw89AB, vw89AB, 2, 2, 2, 2)));
450
451      $for M in range(ROW_TILE):
452        vo${M}p${24 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${24 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+2}x6789, wasm_v32x4_shuffle(vwCDEF, vwCDEF, 3, 3, 3, 3)));
453
454      $for M in range(ROW_TILE):
455        vo${M}p${25 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${25 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+3}x6789, wasm_v32x4_shuffle(vwKLMN, vwKLMN, 0, 0, 0, 0)));
456
457      $for M in range(ROW_TILE):
458        vo${M}p${26 % ACCUMULATORS} = wasm_f32x4_add(vo${M}p${26 % ACCUMULATORS}, wasm_f32x4_mul(vi${M+4}x6789, wasm_v32x4_shuffle(vwOP, vwOP, 1, 1, 1, 1)));
459
460      $if ACCUMULATORS > 1:
461        $ACC_SLICE = 1
462        $while ACC_SLICE < ACCUMULATORS:
463          $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
464            $if A + ACC_SLICE < ACCUMULATORS:
465              $for M in range(ROW_TILE):
466                vo${M}p${A} = wasm_f32x4_add(vo${M}p${A}, vo${M}p${A + ACC_SLICE});
467          $ACC_SLICE *= 2
468
469      $if X86:
470        $for M in range(ROW_TILE):
471          v128_t vo${M} = wasm_v128_bitselect(vmin, vo${M}p0, wasm_f32x4_lt(vo${M}p0, vmin));
472        $for M in range(ROW_TILE):
473          vo${M} = wasm_v128_bitselect(vo${M}, vmax, wasm_f32x4_le(vo${M}, vmax));
474      $else:
475        $for M in range(ROW_TILE):
476          v128_t vo${M} = wasm_f32x4_max(vo${M}p0, vmin);
477        $for M in range(ROW_TILE):
478          vo${M} = wasm_f32x4_min(vo${M}, vmax);
479
480      if XNN_LIKELY(w & (4 * sizeof(float))) {
481        $for M in reversed(range(ROW_TILE)):
482          wasm_v128_store(o${M}, vo${M}); o${M} += 4;
483      } else {
484        if (w & (2 * sizeof(float))) {
485          $for M in reversed(range(ROW_TILE)):
486            *((double*) o${M}) = wasm_f64x2_extract_lane(vo${M}, 0); o${M} += 2;
487
488          $for M in range(ROW_TILE):
489            vo${M} = wasm_v32x4_shuffle(vo${M}, vo${M}, 2, 3, 0, 1);
490        }
491        if (w & (1 * sizeof(float))) {
492          $for M in reversed(range(ROW_TILE)):
493            *o${M} = wasm_f32x4_extract_lane(vo${M}, 0); o${M} += 1;
494        }
495      }
496    }
497
498    i0 = (const float*) ((uintptr_t) i${ROW_TILE} - input_decrement);
499    i1 = (const float*) ((uintptr_t) i${ROW_TILE+1} - input_decrement);
500    $for M in range(2, 4 + ROW_TILE):
501      i${M} = (const float*) ((uintptr_t) i${M-1} + input_width);
502
503    $if ROW_TILE > 1:
504      o0 = o${ROW_TILE - 1};
505      $for M in range(1, ROW_TILE):
506        o${M} = (float*) ((uintptr_t) o${M-1} + input_width);
507
508    $if ROW_TILE > 1:
509      output_height = doz(output_height, ${ROW_TILE});
510  } while (${"--" if ROW_TILE == 1 else ""}output_height != 0);
511}
512