• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <xnnpack/conv.h>
9 #include <xnnpack/math.h>
10 
11 
xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1(size_t input_height,size_t input_width,size_t output_y_start,size_t output_y_end,const float * input,const float * zero,const float * weights,float * output,size_t input_padding_top,size_t output_channels,size_t output_height_stride,size_t output_width_stride,const union xnn_f32_output_params params[restrict static1])12 void xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1(
13     size_t input_height,
14     size_t input_width,
15     size_t output_y_start,
16     size_t output_y_end,
17     const float* input,
18     const float* zero,
19     const float* weights,
20     float* output,
21     size_t input_padding_top,
22     size_t output_channels,
23     size_t output_height_stride,
24     size_t output_width_stride,
25     const union xnn_f32_output_params params[restrict static 1])
26 {
27   assert(input_width != 0);
28   assert(output_y_end > output_y_start);
29   assert(input_padding_top <= 1);
30   assert(output_channels != 0);
31 
32   const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
33   const size_t input_width_decrement = round_down_po2(input_width, 2) * 3 /* channels */ * sizeof(float);
34   const size_t output_width = (input_width + 1) / 2;
35   const size_t output_channel_increment = 4 * sizeof(float) - output_width * output_width_stride;
36 
37 
38   // Adjustment for padding processed below
39   const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
40   const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
41   const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
42   float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
43 
44   if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
45     i0 = zero;
46   }
47 
48   const float voutput_max = params->scalar.max;
49   const float voutput_min = params->scalar.min;
50 
51   for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 1) {
52     const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
53     if XNN_UNPREDICTABLE(input_y2 >= input_height) {
54       i2 = zero;
55     }
56 
57     const float* w = weights;
58     size_t c = output_channels;
59     float* o0 = output0;
60     do {
61       float vi00c0 = 0.0f;
62       float vi00c1 = 0.0f;
63       float vi00c2 = 0.0f;
64       float vi10c0 = 0.0f;
65       float vi10c1 = 0.0f;
66       float vi10c2 = 0.0f;
67       float vi20c0 = 0.0f;
68       float vi20c1 = 0.0f;
69       float vi20c2 = 0.0f;
70 
71       size_t iw = input_width;
72       for (; iw >= 2; iw -= 2) {
73         // start with biases
74         float voc0 = w[0];
75         float voc1 = w[1];
76         float voc2 = w[2];
77         float voc3 = w[3];
78 
79         const float vk00c0x0 = w[4];
80         const float vk00c0x1 = w[5];
81         const float vk00c0x2 = w[6];
82         const float vk00c0x3 = w[7];
83 
84         voc0 += vk00c0x0 * vi00c0;
85         voc1 += vk00c0x1 * vi00c0;
86         voc2 += vk00c0x2 * vi00c0;
87         voc3 += vk00c0x3 * vi00c0;
88 
89         const float vk10c0x0 = w[8];
90         const float vk10c0x1 = w[9];
91         const float vk10c0x2 = w[10];
92         const float vk10c0x3 = w[11];
93 
94         voc0 += vk10c0x0 * vi10c0;
95         voc1 += vk10c0x1 * vi10c0;
96         voc2 += vk10c0x2 * vi10c0;
97         voc3 += vk10c0x3 * vi10c0;
98 
99         const float vk20c0x0 = w[12];
100         const float vk20c0x1 = w[13];
101         const float vk20c0x2 = w[14];
102         const float vk20c0x3 = w[15];
103 
104         voc0 += vk20c0x0 * vi20c0;
105         voc1 += vk20c0x1 * vi20c0;
106         voc2 += vk20c0x2 * vi20c0;
107         voc3 += vk20c0x3 * vi20c0;
108 
109         const float vk00c1x0 = w[16];
110         const float vk00c1x1 = w[17];
111         const float vk00c1x2 = w[18];
112         const float vk00c1x3 = w[19];
113 
114         voc0 += vk00c1x0 * vi00c1;
115         voc1 += vk00c1x1 * vi00c1;
116         voc2 += vk00c1x2 * vi00c1;
117         voc3 += vk00c1x3 * vi00c1;
118 
119         const float vk10c1x0 = w[20];
120         const float vk10c1x1 = w[21];
121         const float vk10c1x2 = w[22];
122         const float vk10c1x3 = w[23];
123 
124         voc0 += vk10c1x0 * vi10c1;
125         voc1 += vk10c1x1 * vi10c1;
126         voc2 += vk10c1x2 * vi10c1;
127         voc3 += vk10c1x3 * vi10c1;
128 
129         const float vk20c1x0 = w[24];
130         const float vk20c1x1 = w[25];
131         const float vk20c1x2 = w[26];
132         const float vk20c1x3 = w[27];
133 
134         voc0 += vk20c1x0 * vi20c1;
135         voc1 += vk20c1x1 * vi20c1;
136         voc2 += vk20c1x2 * vi20c1;
137         voc3 += vk20c1x3 * vi20c1;
138 
139         const float vk00c2x0 = w[28];
140         const float vk00c2x1 = w[29];
141         const float vk00c2x2 = w[30];
142         const float vk00c2x3 = w[31];
143 
144         voc0 += vk00c2x0 * vi00c2;
145         voc1 += vk00c2x1 * vi00c2;
146         voc2 += vk00c2x2 * vi00c2;
147         voc3 += vk00c2x3 * vi00c2;
148 
149         const float vk10c2x0 = w[32];
150         const float vk10c2x1 = w[33];
151         const float vk10c2x2 = w[34];
152         const float vk10c2x3 = w[35];
153 
154         voc0 += vk10c2x0 * vi10c2;
155         voc1 += vk10c2x1 * vi10c2;
156         voc2 += vk10c2x2 * vi10c2;
157         voc3 += vk10c2x3 * vi10c2;
158 
159         const float vk20c2x0 = w[36];
160         const float vk20c2x1 = w[37];
161         const float vk20c2x2 = w[38];
162         const float vk20c2x3 = w[39];
163 
164         voc0 += vk20c2x0 * vi20c2;
165         voc1 += vk20c2x1 * vi20c2;
166         voc2 += vk20c2x2 * vi20c2;
167         voc3 += vk20c2x3 * vi20c2;
168 
169         const float vk01c0x0 = w[40];
170         const float vk01c0x1 = w[41];
171         const float vk01c0x2 = w[42];
172         const float vk01c0x3 = w[43];
173 
174         const float vi01c0 = i0[0];
175 
176         voc0 += vk01c0x0 * vi01c0;
177         voc1 += vk01c0x1 * vi01c0;
178         voc2 += vk01c0x2 * vi01c0;
179         voc3 += vk01c0x3 * vi01c0;
180 
181         const float vk11c0x0 = w[44];
182         const float vk11c0x1 = w[45];
183         const float vk11c0x2 = w[46];
184         const float vk11c0x3 = w[47];
185 
186         const float vi11c0 = i1[0];
187 
188         voc0 += vk11c0x0 * vi11c0;
189         voc1 += vk11c0x1 * vi11c0;
190         voc2 += vk11c0x2 * vi11c0;
191         voc3 += vk11c0x3 * vi11c0;
192 
193         const float vk21c0x0 = w[48];
194         const float vk21c0x1 = w[49];
195         const float vk21c0x2 = w[50];
196         const float vk21c0x3 = w[51];
197 
198         const float vi21c0 = i2[0];
199 
200         voc0 += vk21c0x0 * vi21c0;
201         voc1 += vk21c0x1 * vi21c0;
202         voc2 += vk21c0x2 * vi21c0;
203         voc3 += vk21c0x3 * vi21c0;
204 
205         const float vk01c1x0 = w[52];
206         const float vk01c1x1 = w[53];
207         const float vk01c1x2 = w[54];
208         const float vk01c1x3 = w[55];
209 
210         const float vi01c1 = i0[1];
211 
212         voc0 += vk01c1x0 * vi01c1;
213         voc1 += vk01c1x1 * vi01c1;
214         voc2 += vk01c1x2 * vi01c1;
215         voc3 += vk01c1x3 * vi01c1;
216 
217         const float vk11c1x0 = w[56];
218         const float vk11c1x1 = w[57];
219         const float vk11c1x2 = w[58];
220         const float vk11c1x3 = w[59];
221 
222         const float vi11c1 = i1[1];
223 
224         voc0 += vk11c1x0 * vi11c1;
225         voc1 += vk11c1x1 * vi11c1;
226         voc2 += vk11c1x2 * vi11c1;
227         voc3 += vk11c1x3 * vi11c1;
228 
229         const float vk21c1x0 = w[60];
230         const float vk21c1x1 = w[61];
231         const float vk21c1x2 = w[62];
232         const float vk21c1x3 = w[63];
233 
234         const float vi21c1 = i2[1];
235 
236         voc0 += vk21c1x0 * vi21c1;
237         voc1 += vk21c1x1 * vi21c1;
238         voc2 += vk21c1x2 * vi21c1;
239         voc3 += vk21c1x3 * vi21c1;
240 
241         const float vk01c2x0 = w[64];
242         const float vk01c2x1 = w[65];
243         const float vk01c2x2 = w[66];
244         const float vk01c2x3 = w[67];
245 
246         const float vi01c2 = i0[2];
247 
248         voc0 += vk01c2x0 * vi01c2;
249         voc1 += vk01c2x1 * vi01c2;
250         voc2 += vk01c2x2 * vi01c2;
251         voc3 += vk01c2x3 * vi01c2;
252 
253         const float vk11c2x0 = w[68];
254         const float vk11c2x1 = w[69];
255         const float vk11c2x2 = w[70];
256         const float vk11c2x3 = w[71];
257 
258         const float vi11c2 = i1[2];
259 
260         voc0 += vk11c2x0 * vi11c2;
261         voc1 += vk11c2x1 * vi11c2;
262         voc2 += vk11c2x2 * vi11c2;
263         voc3 += vk11c2x3 * vi11c2;
264 
265         const float vk21c2x0 = w[72];
266         const float vk21c2x1 = w[73];
267         const float vk21c2x2 = w[74];
268         const float vk21c2x3 = w[75];
269 
270         const float vi21c2 = i2[2];
271 
272         voc0 += vk21c2x0 * vi21c2;
273         voc1 += vk21c2x1 * vi21c2;
274         voc2 += vk21c2x2 * vi21c2;
275         voc3 += vk21c2x3 * vi21c2;
276 
277         const float vk02c0x0 = w[76];
278         const float vk02c0x1 = w[77];
279         const float vk02c0x2 = w[78];
280         const float vk02c0x3 = w[79];
281 
282         const float vi02c0 = i0[3];
283 
284         voc0 += vk02c0x0 * vi02c0;
285         voc1 += vk02c0x1 * vi02c0;
286         voc2 += vk02c0x2 * vi02c0;
287         voc3 += vk02c0x3 * vi02c0;
288 
289         const float vk12c0x0 = w[80];
290         const float vk12c0x1 = w[81];
291         const float vk12c0x2 = w[82];
292         const float vk12c0x3 = w[83];
293 
294         const float vi12c0 = i1[3];
295 
296         voc0 += vk12c0x0 * vi12c0;
297         voc1 += vk12c0x1 * vi12c0;
298         voc2 += vk12c0x2 * vi12c0;
299         voc3 += vk12c0x3 * vi12c0;
300 
301         const float vk22c0x0 = w[84];
302         const float vk22c0x1 = w[85];
303         const float vk22c0x2 = w[86];
304         const float vk22c0x3 = w[87];
305 
306         const float vi22c0 = i2[3];
307 
308         voc0 += vk22c0x0 * vi22c0;
309         voc1 += vk22c0x1 * vi22c0;
310         voc2 += vk22c0x2 * vi22c0;
311         voc3 += vk22c0x3 * vi22c0;
312 
313         vi00c0 = vi02c0;
314         vi10c0 = vi12c0;
315         vi20c0 = vi22c0;
316 
317         const float vk02c1x0 = w[88];
318         const float vk02c1x1 = w[89];
319         const float vk02c1x2 = w[90];
320         const float vk02c1x3 = w[91];
321 
322         const float vi02c1 = i0[4];
323 
324         voc0 += vk02c1x0 * vi02c1;
325         voc1 += vk02c1x1 * vi02c1;
326         voc2 += vk02c1x2 * vi02c1;
327         voc3 += vk02c1x3 * vi02c1;
328 
329         const float vk12c1x0 = w[92];
330         const float vk12c1x1 = w[93];
331         const float vk12c1x2 = w[94];
332         const float vk12c1x3 = w[95];
333 
334         const float vi12c1 = i1[4];
335 
336         voc0 += vk12c1x0 * vi12c1;
337         voc1 += vk12c1x1 * vi12c1;
338         voc2 += vk12c1x2 * vi12c1;
339         voc3 += vk12c1x3 * vi12c1;
340 
341         const float vk22c1x0 = w[96];
342         const float vk22c1x1 = w[97];
343         const float vk22c1x2 = w[98];
344         const float vk22c1x3 = w[99];
345 
346         const float vi22c1 = i2[4];
347 
348         voc0 += vk22c1x0 * vi22c1;
349         voc1 += vk22c1x1 * vi22c1;
350         voc2 += vk22c1x2 * vi22c1;
351         voc3 += vk22c1x3 * vi22c1;
352 
353         vi00c1 = vi02c1;
354         vi10c1 = vi12c1;
355         vi20c1 = vi22c1;
356 
357         const float vk02c2x0 = w[100];
358         const float vk02c2x1 = w[101];
359         const float vk02c2x2 = w[102];
360         const float vk02c2x3 = w[103];
361 
362         const float vi02c2 = i0[5];
363 
364         voc0 += vk02c2x0 * vi02c2;
365         voc1 += vk02c2x1 * vi02c2;
366         voc2 += vk02c2x2 * vi02c2;
367         voc3 += vk02c2x3 * vi02c2;
368 
369         const float vk12c2x0 = w[104];
370         const float vk12c2x1 = w[105];
371         const float vk12c2x2 = w[106];
372         const float vk12c2x3 = w[107];
373 
374         const float vi12c2 = i1[5];
375 
376         voc0 += vk12c2x0 * vi12c2;
377         voc1 += vk12c2x1 * vi12c2;
378         voc2 += vk12c2x2 * vi12c2;
379         voc3 += vk12c2x3 * vi12c2;
380 
381         const float vk22c2x0 = w[108];
382         const float vk22c2x1 = w[109];
383         const float vk22c2x2 = w[110];
384         const float vk22c2x3 = w[111];
385 
386         const float vi22c2 = i2[5];
387 
388         voc0 += vk22c2x0 * vi22c2;
389         voc1 += vk22c2x1 * vi22c2;
390         voc2 += vk22c2x2 * vi22c2;
391         voc3 += vk22c2x3 * vi22c2;
392 
393         vi00c2 = vi02c2;
394         vi10c2 = vi12c2;
395         vi20c2 = vi22c2;
396 
397         voc0 = math_min_f32(voc0, voutput_max);
398         voc1 = math_min_f32(voc1, voutput_max);
399         voc2 = math_min_f32(voc2, voutput_max);
400         voc3 = math_min_f32(voc3, voutput_max);
401 
402         voc0 = math_max_f32(voc0, voutput_min);
403         voc1 = math_max_f32(voc1, voutput_min);
404         voc2 = math_max_f32(voc2, voutput_min);
405         voc3 = math_max_f32(voc3, voutput_min);
406 
407         if XNN_LIKELY(c >= 4) {
408           o0[0] = voc0;
409           o0[1] = voc1;
410           o0[2] = voc2;
411           o0[3] = voc3;
412           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
413         } else {
414           float* o0_tmp = o0;
415           if (c & 2) {
416             o0_tmp[0] = voc0;
417             o0_tmp[1] = voc1;
418             o0_tmp += 2;
419             voc0 = voc2;
420           }
421           if (c & 1) {
422             *o0_tmp++ = voc0;
423           }
424           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
425         }
426 
427         i0 += 6;
428         i1 += 6;
429         i2 += 6;
430       }
431       assert(iw < 2);
432       if XNN_UNLIKELY(iw != 0) {
433         float voc0 = w[0];
434         float voc1 = w[1];
435         float voc2 = w[2];
436         float voc3 = w[3];
437 
438         const float vk00c0x0 = w[4];
439         const float vk00c0x1 = w[5];
440         const float vk00c0x2 = w[6];
441         const float vk00c0x3 = w[7];
442 
443         voc0 += vk00c0x0 * vi00c0;
444         voc1 += vk00c0x1 * vi00c0;
445         voc2 += vk00c0x2 * vi00c0;
446         voc3 += vk00c0x3 * vi00c0;
447 
448         const float vk10c0x0 = w[8];
449         const float vk10c0x1 = w[9];
450         const float vk10c0x2 = w[10];
451         const float vk10c0x3 = w[11];
452 
453         voc0 += vk10c0x0 * vi10c0;
454         voc1 += vk10c0x1 * vi10c0;
455         voc2 += vk10c0x2 * vi10c0;
456         voc3 += vk10c0x3 * vi10c0;
457 
458         const float vk20c0x0 = w[12];
459         const float vk20c0x1 = w[13];
460         const float vk20c0x2 = w[14];
461         const float vk20c0x3 = w[15];
462 
463         voc0 += vk20c0x0 * vi20c0;
464         voc1 += vk20c0x1 * vi20c0;
465         voc2 += vk20c0x2 * vi20c0;
466         voc3 += vk20c0x3 * vi20c0;
467 
468         const float vk00c1x0 = w[16];
469         const float vk00c1x1 = w[17];
470         const float vk00c1x2 = w[18];
471         const float vk00c1x3 = w[19];
472 
473         voc0 += vk00c1x0 * vi00c1;
474         voc1 += vk00c1x1 * vi00c1;
475         voc2 += vk00c1x2 * vi00c1;
476         voc3 += vk00c1x3 * vi00c1;
477 
478         const float vk10c1x0 = w[20];
479         const float vk10c1x1 = w[21];
480         const float vk10c1x2 = w[22];
481         const float vk10c1x3 = w[23];
482 
483         voc0 += vk10c1x0 * vi10c1;
484         voc1 += vk10c1x1 * vi10c1;
485         voc2 += vk10c1x2 * vi10c1;
486         voc3 += vk10c1x3 * vi10c1;
487 
488         const float vk20c1x0 = w[24];
489         const float vk20c1x1 = w[25];
490         const float vk20c1x2 = w[26];
491         const float vk20c1x3 = w[27];
492 
493         voc0 += vk20c1x0 * vi20c1;
494         voc1 += vk20c1x1 * vi20c1;
495         voc2 += vk20c1x2 * vi20c1;
496         voc3 += vk20c1x3 * vi20c1;
497 
498         const float vk00c2x0 = w[28];
499         const float vk00c2x1 = w[29];
500         const float vk00c2x2 = w[30];
501         const float vk00c2x3 = w[31];
502 
503         voc0 += vk00c2x0 * vi00c2;
504         voc1 += vk00c2x1 * vi00c2;
505         voc2 += vk00c2x2 * vi00c2;
506         voc3 += vk00c2x3 * vi00c2;
507 
508         const float vk10c2x0 = w[32];
509         const float vk10c2x1 = w[33];
510         const float vk10c2x2 = w[34];
511         const float vk10c2x3 = w[35];
512 
513         voc0 += vk10c2x0 * vi10c2;
514         voc1 += vk10c2x1 * vi10c2;
515         voc2 += vk10c2x2 * vi10c2;
516         voc3 += vk10c2x3 * vi10c2;
517 
518         const float vk20c2x0 = w[36];
519         const float vk20c2x1 = w[37];
520         const float vk20c2x2 = w[38];
521         const float vk20c2x3 = w[39];
522 
523         voc0 += vk20c2x0 * vi20c2;
524         voc1 += vk20c2x1 * vi20c2;
525         voc2 += vk20c2x2 * vi20c2;
526         voc3 += vk20c2x3 * vi20c2;
527 
528         const float vk01c0x0 = w[40];
529         const float vk01c0x1 = w[41];
530         const float vk01c0x2 = w[42];
531         const float vk01c0x3 = w[43];
532 
533         const float vi01c0 = i0[0];
534 
535         voc0 += vk01c0x0 * vi01c0;
536         voc1 += vk01c0x1 * vi01c0;
537         voc2 += vk01c0x2 * vi01c0;
538         voc3 += vk01c0x3 * vi01c0;
539 
540         const float vk11c0x0 = w[44];
541         const float vk11c0x1 = w[45];
542         const float vk11c0x2 = w[46];
543         const float vk11c0x3 = w[47];
544 
545         const float vi11c0 = i1[0];
546 
547         voc0 += vk11c0x0 * vi11c0;
548         voc1 += vk11c0x1 * vi11c0;
549         voc2 += vk11c0x2 * vi11c0;
550         voc3 += vk11c0x3 * vi11c0;
551 
552         const float vk21c0x0 = w[48];
553         const float vk21c0x1 = w[49];
554         const float vk21c0x2 = w[50];
555         const float vk21c0x3 = w[51];
556 
557         const float vi21c0 = i2[0];
558 
559         voc0 += vk21c0x0 * vi21c0;
560         voc1 += vk21c0x1 * vi21c0;
561         voc2 += vk21c0x2 * vi21c0;
562         voc3 += vk21c0x3 * vi21c0;
563 
564         const float vk01c1x0 = w[52];
565         const float vk01c1x1 = w[53];
566         const float vk01c1x2 = w[54];
567         const float vk01c1x3 = w[55];
568 
569         const float vi01c1 = i0[1];
570 
571         voc0 += vk01c1x0 * vi01c1;
572         voc1 += vk01c1x1 * vi01c1;
573         voc2 += vk01c1x2 * vi01c1;
574         voc3 += vk01c1x3 * vi01c1;
575 
576         const float vk11c1x0 = w[56];
577         const float vk11c1x1 = w[57];
578         const float vk11c1x2 = w[58];
579         const float vk11c1x3 = w[59];
580 
581         const float vi11c1 = i1[1];
582 
583         voc0 += vk11c1x0 * vi11c1;
584         voc1 += vk11c1x1 * vi11c1;
585         voc2 += vk11c1x2 * vi11c1;
586         voc3 += vk11c1x3 * vi11c1;
587 
588         const float vk21c1x0 = w[60];
589         const float vk21c1x1 = w[61];
590         const float vk21c1x2 = w[62];
591         const float vk21c1x3 = w[63];
592 
593         const float vi21c1 = i2[1];
594 
595         voc0 += vk21c1x0 * vi21c1;
596         voc1 += vk21c1x1 * vi21c1;
597         voc2 += vk21c1x2 * vi21c1;
598         voc3 += vk21c1x3 * vi21c1;
599 
600         const float vk01c2x0 = w[64];
601         const float vk01c2x1 = w[65];
602         const float vk01c2x2 = w[66];
603         const float vk01c2x3 = w[67];
604 
605         const float vi01c2 = i0[2];
606 
607         voc0 += vk01c2x0 * vi01c2;
608         voc1 += vk01c2x1 * vi01c2;
609         voc2 += vk01c2x2 * vi01c2;
610         voc3 += vk01c2x3 * vi01c2;
611 
612         const float vk11c2x0 = w[68];
613         const float vk11c2x1 = w[69];
614         const float vk11c2x2 = w[70];
615         const float vk11c2x3 = w[71];
616 
617         const float vi11c2 = i1[2];
618 
619         voc0 += vk11c2x0 * vi11c2;
620         voc1 += vk11c2x1 * vi11c2;
621         voc2 += vk11c2x2 * vi11c2;
622         voc3 += vk11c2x3 * vi11c2;
623 
624         const float vk21c2x0 = w[72];
625         const float vk21c2x1 = w[73];
626         const float vk21c2x2 = w[74];
627         const float vk21c2x3 = w[75];
628 
629         const float vi21c2 = i2[2];
630 
631         voc0 += vk21c2x0 * vi21c2;
632         voc1 += vk21c2x1 * vi21c2;
633         voc2 += vk21c2x2 * vi21c2;
634         voc3 += vk21c2x3 * vi21c2;
635 
636         voc0 = math_min_f32(voc0, voutput_max);
637         voc1 = math_min_f32(voc1, voutput_max);
638         voc2 = math_min_f32(voc2, voutput_max);
639         voc3 = math_min_f32(voc3, voutput_max);
640 
641         voc0 = math_max_f32(voc0, voutput_min);
642         voc1 = math_max_f32(voc1, voutput_min);
643         voc2 = math_max_f32(voc2, voutput_min);
644         voc3 = math_max_f32(voc3, voutput_min);
645 
646         if XNN_LIKELY(c >= 4) {
647           o0[0] = voc0;
648           o0[1] = voc1;
649           o0[2] = voc2;
650           o0[3] = voc3;
651           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
652         } else {
653           float* o0_tmp = o0;
654           if (c & 2) {
655             o0_tmp[0] = voc0;
656             o0_tmp[1] = voc1;
657             o0_tmp += 2;
658             voc0 = voc2;
659           }
660           if (c & 1) {
661             *o0_tmp++ = voc0;
662           }
663           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
664         }
665       }
666       // Move output pointers back to the position of the first pixel in a row,
667       // and forward to the next block of output channels
668       o0 = (float*) ((uintptr_t) o0 + output_channel_increment);
669       // Revert input pointers to the position of the first pixel in a row
670       i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
671       i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
672       i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
673       // Move to the block of weights for the next 4 output channels
674       w += 112;
675       c = doz(c, 4);
676     } while (c != 0);
677     // Move output pointers forward to the next row
678     output0 = (float*) ((uintptr_t) output0 + output_height_stride);
679     // Move input pointers forward to the next row
680     i0 = i2;
681     i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
682     i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
683   }
684 }
685