• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <xnnpack/conv.h>
9 #include <xnnpack/math.h>
10 
11 
xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1(size_t input_height,size_t input_width,size_t output_y_start,size_t output_y_end,const float * input,const float * zero,const float * weights,float * output,size_t input_padding_top,size_t output_channels,size_t output_height_stride,size_t output_width_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])12 void xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1(
13     size_t input_height,
14     size_t input_width,
15     size_t output_y_start,
16     size_t output_y_end,
17     const float* input,
18     const float* zero,
19     const float* weights,
20     float* output,
21     size_t input_padding_top,
22     size_t output_channels,
23     size_t output_height_stride,
24     size_t output_width_stride,
25     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26 {
27   assert(input_width != 0);
28   assert(output_y_end > output_y_start);
29   assert(input_padding_top <= 1);
30   assert(output_channels != 0);
31 
32   const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
33   const size_t input_width_decrement = round_down_po2(input_width - 1, 2) * 3 /* channels */ * sizeof(float);
34   const size_t output_width = input_width / 2;
35   const size_t output_channel_decrement = output_width * output_width_stride - 4 * sizeof(float);
36   const size_t output_height_increment = output_height_stride - round_up_po2(output_channels, 4) * sizeof(float);
37 
38   // Adjustment for padding processed below
39   const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
40   const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
41   const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
42   float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
43 
44   if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
45     i0 = zero;
46   }
47 
48   const float voutput_max = params->scalar.max;
49   const float voutput_min = params->scalar.min;
50 
51   for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 1) {
52     const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
53     if XNN_UNPREDICTABLE(input_y2 >= input_height) {
54       i2 = zero;
55     }
56 
57     const float* w = weights;
58     size_t c = output_channels;
59     do {
60       float vi00c0 = i0[0];
61       float vi00c1 = i0[1];
62       float vi00c2 = i0[2];
63       float vi10c0 = i1[0];
64       float vi10c1 = i1[1];
65       float vi10c2 = i1[2];
66       float vi20c0 = i2[0];
67       float vi20c1 = i2[1];
68       float vi20c2 = i2[2];
69 
70       size_t iw = input_width - 1;
71       for (; iw >= 2; iw -= 2) {
72         // start with biases
73         float voc0 = w[0];
74         float voc1 = w[1];
75         float voc2 = w[2];
76         float voc3 = w[3];
77 
78         const float vk00c0x0 = w[4];
79         const float vk00c0x1 = w[5];
80         const float vk00c0x2 = w[6];
81         const float vk00c0x3 = w[7];
82 
83         voc0 += vk00c0x0 * vi00c0;
84         voc1 += vk00c0x1 * vi00c0;
85         voc2 += vk00c0x2 * vi00c0;
86         voc3 += vk00c0x3 * vi00c0;
87 
88         const float vk10c0x0 = w[8];
89         const float vk10c0x1 = w[9];
90         const float vk10c0x2 = w[10];
91         const float vk10c0x3 = w[11];
92 
93         voc0 += vk10c0x0 * vi10c0;
94         voc1 += vk10c0x1 * vi10c0;
95         voc2 += vk10c0x2 * vi10c0;
96         voc3 += vk10c0x3 * vi10c0;
97 
98         const float vk20c0x0 = w[12];
99         const float vk20c0x1 = w[13];
100         const float vk20c0x2 = w[14];
101         const float vk20c0x3 = w[15];
102 
103         voc0 += vk20c0x0 * vi20c0;
104         voc1 += vk20c0x1 * vi20c0;
105         voc2 += vk20c0x2 * vi20c0;
106         voc3 += vk20c0x3 * vi20c0;
107 
108         const float vk00c1x0 = w[16];
109         const float vk00c1x1 = w[17];
110         const float vk00c1x2 = w[18];
111         const float vk00c1x3 = w[19];
112 
113         voc0 += vk00c1x0 * vi00c1;
114         voc1 += vk00c1x1 * vi00c1;
115         voc2 += vk00c1x2 * vi00c1;
116         voc3 += vk00c1x3 * vi00c1;
117 
118         const float vk10c1x0 = w[20];
119         const float vk10c1x1 = w[21];
120         const float vk10c1x2 = w[22];
121         const float vk10c1x3 = w[23];
122 
123         voc0 += vk10c1x0 * vi10c1;
124         voc1 += vk10c1x1 * vi10c1;
125         voc2 += vk10c1x2 * vi10c1;
126         voc3 += vk10c1x3 * vi10c1;
127 
128         const float vk20c1x0 = w[24];
129         const float vk20c1x1 = w[25];
130         const float vk20c1x2 = w[26];
131         const float vk20c1x3 = w[27];
132 
133         voc0 += vk20c1x0 * vi20c1;
134         voc1 += vk20c1x1 * vi20c1;
135         voc2 += vk20c1x2 * vi20c1;
136         voc3 += vk20c1x3 * vi20c1;
137 
138         const float vk00c2x0 = w[28];
139         const float vk00c2x1 = w[29];
140         const float vk00c2x2 = w[30];
141         const float vk00c2x3 = w[31];
142 
143         voc0 += vk00c2x0 * vi00c2;
144         voc1 += vk00c2x1 * vi00c2;
145         voc2 += vk00c2x2 * vi00c2;
146         voc3 += vk00c2x3 * vi00c2;
147 
148         const float vk10c2x0 = w[32];
149         const float vk10c2x1 = w[33];
150         const float vk10c2x2 = w[34];
151         const float vk10c2x3 = w[35];
152 
153         voc0 += vk10c2x0 * vi10c2;
154         voc1 += vk10c2x1 * vi10c2;
155         voc2 += vk10c2x2 * vi10c2;
156         voc3 += vk10c2x3 * vi10c2;
157 
158         const float vk20c2x0 = w[36];
159         const float vk20c2x1 = w[37];
160         const float vk20c2x2 = w[38];
161         const float vk20c2x3 = w[39];
162 
163         voc0 += vk20c2x0 * vi20c2;
164         voc1 += vk20c2x1 * vi20c2;
165         voc2 += vk20c2x2 * vi20c2;
166         voc3 += vk20c2x3 * vi20c2;
167 
168         const float vk01c0x0 = w[40];
169         const float vk01c0x1 = w[41];
170         const float vk01c0x2 = w[42];
171         const float vk01c0x3 = w[43];
172 
173         const float vi01c0 = i0[3];
174 
175         voc0 += vk01c0x0 * vi01c0;
176         voc1 += vk01c0x1 * vi01c0;
177         voc2 += vk01c0x2 * vi01c0;
178         voc3 += vk01c0x3 * vi01c0;
179 
180         const float vk11c0x0 = w[44];
181         const float vk11c0x1 = w[45];
182         const float vk11c0x2 = w[46];
183         const float vk11c0x3 = w[47];
184 
185         const float vi11c0 = i1[3];
186 
187         voc0 += vk11c0x0 * vi11c0;
188         voc1 += vk11c0x1 * vi11c0;
189         voc2 += vk11c0x2 * vi11c0;
190         voc3 += vk11c0x3 * vi11c0;
191 
192         const float vk21c0x0 = w[48];
193         const float vk21c0x1 = w[49];
194         const float vk21c0x2 = w[50];
195         const float vk21c0x3 = w[51];
196 
197         const float vi21c0 = i2[3];
198 
199         voc0 += vk21c0x0 * vi21c0;
200         voc1 += vk21c0x1 * vi21c0;
201         voc2 += vk21c0x2 * vi21c0;
202         voc3 += vk21c0x3 * vi21c0;
203 
204         const float vk01c1x0 = w[52];
205         const float vk01c1x1 = w[53];
206         const float vk01c1x2 = w[54];
207         const float vk01c1x3 = w[55];
208 
209         const float vi01c1 = i0[4];
210 
211         voc0 += vk01c1x0 * vi01c1;
212         voc1 += vk01c1x1 * vi01c1;
213         voc2 += vk01c1x2 * vi01c1;
214         voc3 += vk01c1x3 * vi01c1;
215 
216         const float vk11c1x0 = w[56];
217         const float vk11c1x1 = w[57];
218         const float vk11c1x2 = w[58];
219         const float vk11c1x3 = w[59];
220 
221         const float vi11c1 = i1[4];
222 
223         voc0 += vk11c1x0 * vi11c1;
224         voc1 += vk11c1x1 * vi11c1;
225         voc2 += vk11c1x2 * vi11c1;
226         voc3 += vk11c1x3 * vi11c1;
227 
228         const float vk21c1x0 = w[60];
229         const float vk21c1x1 = w[61];
230         const float vk21c1x2 = w[62];
231         const float vk21c1x3 = w[63];
232 
233         const float vi21c1 = i2[4];
234 
235         voc0 += vk21c1x0 * vi21c1;
236         voc1 += vk21c1x1 * vi21c1;
237         voc2 += vk21c1x2 * vi21c1;
238         voc3 += vk21c1x3 * vi21c1;
239 
240         const float vk01c2x0 = w[64];
241         const float vk01c2x1 = w[65];
242         const float vk01c2x2 = w[66];
243         const float vk01c2x3 = w[67];
244 
245         const float vi01c2 = i0[5];
246 
247         voc0 += vk01c2x0 * vi01c2;
248         voc1 += vk01c2x1 * vi01c2;
249         voc2 += vk01c2x2 * vi01c2;
250         voc3 += vk01c2x3 * vi01c2;
251 
252         const float vk11c2x0 = w[68];
253         const float vk11c2x1 = w[69];
254         const float vk11c2x2 = w[70];
255         const float vk11c2x3 = w[71];
256 
257         const float vi11c2 = i1[5];
258 
259         voc0 += vk11c2x0 * vi11c2;
260         voc1 += vk11c2x1 * vi11c2;
261         voc2 += vk11c2x2 * vi11c2;
262         voc3 += vk11c2x3 * vi11c2;
263 
264         const float vk21c2x0 = w[72];
265         const float vk21c2x1 = w[73];
266         const float vk21c2x2 = w[74];
267         const float vk21c2x3 = w[75];
268 
269         const float vi21c2 = i2[5];
270 
271         voc0 += vk21c2x0 * vi21c2;
272         voc1 += vk21c2x1 * vi21c2;
273         voc2 += vk21c2x2 * vi21c2;
274         voc3 += vk21c2x3 * vi21c2;
275 
276         const float vk02c0x0 = w[76];
277         const float vk02c0x1 = w[77];
278         const float vk02c0x2 = w[78];
279         const float vk02c0x3 = w[79];
280 
281         const float vi02c0 = i0[6];
282 
283         voc0 += vk02c0x0 * vi02c0;
284         voc1 += vk02c0x1 * vi02c0;
285         voc2 += vk02c0x2 * vi02c0;
286         voc3 += vk02c0x3 * vi02c0;
287 
288         const float vk12c0x0 = w[80];
289         const float vk12c0x1 = w[81];
290         const float vk12c0x2 = w[82];
291         const float vk12c0x3 = w[83];
292 
293         const float vi12c0 = i1[6];
294 
295         voc0 += vk12c0x0 * vi12c0;
296         voc1 += vk12c0x1 * vi12c0;
297         voc2 += vk12c0x2 * vi12c0;
298         voc3 += vk12c0x3 * vi12c0;
299 
300         const float vk22c0x0 = w[84];
301         const float vk22c0x1 = w[85];
302         const float vk22c0x2 = w[86];
303         const float vk22c0x3 = w[87];
304 
305         const float vi22c0 = i2[6];
306 
307         voc0 += vk22c0x0 * vi22c0;
308         voc1 += vk22c0x1 * vi22c0;
309         voc2 += vk22c0x2 * vi22c0;
310         voc3 += vk22c0x3 * vi22c0;
311 
312         vi00c0 = vi02c0;
313         vi10c0 = vi12c0;
314         vi20c0 = vi22c0;
315 
316         const float vk02c1x0 = w[88];
317         const float vk02c1x1 = w[89];
318         const float vk02c1x2 = w[90];
319         const float vk02c1x3 = w[91];
320 
321         const float vi02c1 = i0[7];
322 
323         voc0 += vk02c1x0 * vi02c1;
324         voc1 += vk02c1x1 * vi02c1;
325         voc2 += vk02c1x2 * vi02c1;
326         voc3 += vk02c1x3 * vi02c1;
327 
328         const float vk12c1x0 = w[92];
329         const float vk12c1x1 = w[93];
330         const float vk12c1x2 = w[94];
331         const float vk12c1x3 = w[95];
332 
333         const float vi12c1 = i1[7];
334 
335         voc0 += vk12c1x0 * vi12c1;
336         voc1 += vk12c1x1 * vi12c1;
337         voc2 += vk12c1x2 * vi12c1;
338         voc3 += vk12c1x3 * vi12c1;
339 
340         const float vk22c1x0 = w[96];
341         const float vk22c1x1 = w[97];
342         const float vk22c1x2 = w[98];
343         const float vk22c1x3 = w[99];
344 
345         const float vi22c1 = i2[7];
346 
347         voc0 += vk22c1x0 * vi22c1;
348         voc1 += vk22c1x1 * vi22c1;
349         voc2 += vk22c1x2 * vi22c1;
350         voc3 += vk22c1x3 * vi22c1;
351 
352         vi00c1 = vi02c1;
353         vi10c1 = vi12c1;
354         vi20c1 = vi22c1;
355 
356         const float vk02c2x0 = w[100];
357         const float vk02c2x1 = w[101];
358         const float vk02c2x2 = w[102];
359         const float vk02c2x3 = w[103];
360 
361         const float vi02c2 = i0[8];
362 
363         voc0 += vk02c2x0 * vi02c2;
364         voc1 += vk02c2x1 * vi02c2;
365         voc2 += vk02c2x2 * vi02c2;
366         voc3 += vk02c2x3 * vi02c2;
367 
368         const float vk12c2x0 = w[104];
369         const float vk12c2x1 = w[105];
370         const float vk12c2x2 = w[106];
371         const float vk12c2x3 = w[107];
372 
373         const float vi12c2 = i1[8];
374 
375         voc0 += vk12c2x0 * vi12c2;
376         voc1 += vk12c2x1 * vi12c2;
377         voc2 += vk12c2x2 * vi12c2;
378         voc3 += vk12c2x3 * vi12c2;
379 
380         const float vk22c2x0 = w[108];
381         const float vk22c2x1 = w[109];
382         const float vk22c2x2 = w[110];
383         const float vk22c2x3 = w[111];
384 
385         const float vi22c2 = i2[8];
386 
387         voc0 += vk22c2x0 * vi22c2;
388         voc1 += vk22c2x1 * vi22c2;
389         voc2 += vk22c2x2 * vi22c2;
390         voc3 += vk22c2x3 * vi22c2;
391 
392         vi00c2 = vi02c2;
393         vi10c2 = vi12c2;
394         vi20c2 = vi22c2;
395 
396         voc0 = math_min_f32(voc0, voutput_max);
397         voc1 = math_min_f32(voc1, voutput_max);
398         voc2 = math_min_f32(voc2, voutput_max);
399         voc3 = math_min_f32(voc3, voutput_max);
400 
401         voc0 = math_max_f32(voc0, voutput_min);
402         voc1 = math_max_f32(voc1, voutput_min);
403         voc2 = math_max_f32(voc2, voutput_min);
404         voc3 = math_max_f32(voc3, voutput_min);
405 
406         if XNN_LIKELY(c >= 4) {
407           o0[0] = voc0;
408           o0[1] = voc1;
409           o0[2] = voc2;
410           o0[3] = voc3;
411           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
412         } else {
413           float* o0_tmp = o0;
414           if (c & 2) {
415             o0_tmp[0] = voc0;
416             o0_tmp[1] = voc1;
417             o0_tmp += 2;
418             voc0 = voc2;
419           }
420           if (c & 1) {
421             *o0_tmp++ = voc0;
422           }
423           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
424         }
425 
426         i0 += 6;
427         i1 += 6;
428         i2 += 6;
429       }
430       assert(iw < 2);
431       if XNN_UNLIKELY(iw != 0) {
432         float voc0 = w[0];
433         float voc1 = w[1];
434         float voc2 = w[2];
435         float voc3 = w[3];
436 
437         const float vk00c0x0 = w[4];
438         const float vk00c0x1 = w[5];
439         const float vk00c0x2 = w[6];
440         const float vk00c0x3 = w[7];
441 
442         voc0 += vk00c0x0 * vi00c0;
443         voc1 += vk00c0x1 * vi00c0;
444         voc2 += vk00c0x2 * vi00c0;
445         voc3 += vk00c0x3 * vi00c0;
446 
447         const float vk10c0x0 = w[8];
448         const float vk10c0x1 = w[9];
449         const float vk10c0x2 = w[10];
450         const float vk10c0x3 = w[11];
451 
452         voc0 += vk10c0x0 * vi10c0;
453         voc1 += vk10c0x1 * vi10c0;
454         voc2 += vk10c0x2 * vi10c0;
455         voc3 += vk10c0x3 * vi10c0;
456 
457         const float vk20c0x0 = w[12];
458         const float vk20c0x1 = w[13];
459         const float vk20c0x2 = w[14];
460         const float vk20c0x3 = w[15];
461 
462         voc0 += vk20c0x0 * vi20c0;
463         voc1 += vk20c0x1 * vi20c0;
464         voc2 += vk20c0x2 * vi20c0;
465         voc3 += vk20c0x3 * vi20c0;
466 
467         const float vk00c1x0 = w[16];
468         const float vk00c1x1 = w[17];
469         const float vk00c1x2 = w[18];
470         const float vk00c1x3 = w[19];
471 
472         voc0 += vk00c1x0 * vi00c1;
473         voc1 += vk00c1x1 * vi00c1;
474         voc2 += vk00c1x2 * vi00c1;
475         voc3 += vk00c1x3 * vi00c1;
476 
477         const float vk10c1x0 = w[20];
478         const float vk10c1x1 = w[21];
479         const float vk10c1x2 = w[22];
480         const float vk10c1x3 = w[23];
481 
482         voc0 += vk10c1x0 * vi10c1;
483         voc1 += vk10c1x1 * vi10c1;
484         voc2 += vk10c1x2 * vi10c1;
485         voc3 += vk10c1x3 * vi10c1;
486 
487         const float vk20c1x0 = w[24];
488         const float vk20c1x1 = w[25];
489         const float vk20c1x2 = w[26];
490         const float vk20c1x3 = w[27];
491 
492         voc0 += vk20c1x0 * vi20c1;
493         voc1 += vk20c1x1 * vi20c1;
494         voc2 += vk20c1x2 * vi20c1;
495         voc3 += vk20c1x3 * vi20c1;
496 
497         const float vk00c2x0 = w[28];
498         const float vk00c2x1 = w[29];
499         const float vk00c2x2 = w[30];
500         const float vk00c2x3 = w[31];
501 
502         voc0 += vk00c2x0 * vi00c2;
503         voc1 += vk00c2x1 * vi00c2;
504         voc2 += vk00c2x2 * vi00c2;
505         voc3 += vk00c2x3 * vi00c2;
506 
507         const float vk10c2x0 = w[32];
508         const float vk10c2x1 = w[33];
509         const float vk10c2x2 = w[34];
510         const float vk10c2x3 = w[35];
511 
512         voc0 += vk10c2x0 * vi10c2;
513         voc1 += vk10c2x1 * vi10c2;
514         voc2 += vk10c2x2 * vi10c2;
515         voc3 += vk10c2x3 * vi10c2;
516 
517         const float vk20c2x0 = w[36];
518         const float vk20c2x1 = w[37];
519         const float vk20c2x2 = w[38];
520         const float vk20c2x3 = w[39];
521 
522         voc0 += vk20c2x0 * vi20c2;
523         voc1 += vk20c2x1 * vi20c2;
524         voc2 += vk20c2x2 * vi20c2;
525         voc3 += vk20c2x3 * vi20c2;
526 
527         const float vk01c0x0 = w[40];
528         const float vk01c0x1 = w[41];
529         const float vk01c0x2 = w[42];
530         const float vk01c0x3 = w[43];
531 
532         const float vi01c0 = i0[3];
533 
534         voc0 += vk01c0x0 * vi01c0;
535         voc1 += vk01c0x1 * vi01c0;
536         voc2 += vk01c0x2 * vi01c0;
537         voc3 += vk01c0x3 * vi01c0;
538 
539         const float vk11c0x0 = w[44];
540         const float vk11c0x1 = w[45];
541         const float vk11c0x2 = w[46];
542         const float vk11c0x3 = w[47];
543 
544         const float vi11c0 = i1[3];
545 
546         voc0 += vk11c0x0 * vi11c0;
547         voc1 += vk11c0x1 * vi11c0;
548         voc2 += vk11c0x2 * vi11c0;
549         voc3 += vk11c0x3 * vi11c0;
550 
551         const float vk21c0x0 = w[48];
552         const float vk21c0x1 = w[49];
553         const float vk21c0x2 = w[50];
554         const float vk21c0x3 = w[51];
555 
556         const float vi21c0 = i2[3];
557 
558         voc0 += vk21c0x0 * vi21c0;
559         voc1 += vk21c0x1 * vi21c0;
560         voc2 += vk21c0x2 * vi21c0;
561         voc3 += vk21c0x3 * vi21c0;
562 
563         const float vk01c1x0 = w[52];
564         const float vk01c1x1 = w[53];
565         const float vk01c1x2 = w[54];
566         const float vk01c1x3 = w[55];
567 
568         const float vi01c1 = i0[4];
569 
570         voc0 += vk01c1x0 * vi01c1;
571         voc1 += vk01c1x1 * vi01c1;
572         voc2 += vk01c1x2 * vi01c1;
573         voc3 += vk01c1x3 * vi01c1;
574 
575         const float vk11c1x0 = w[56];
576         const float vk11c1x1 = w[57];
577         const float vk11c1x2 = w[58];
578         const float vk11c1x3 = w[59];
579 
580         const float vi11c1 = i1[4];
581 
582         voc0 += vk11c1x0 * vi11c1;
583         voc1 += vk11c1x1 * vi11c1;
584         voc2 += vk11c1x2 * vi11c1;
585         voc3 += vk11c1x3 * vi11c1;
586 
587         const float vk21c1x0 = w[60];
588         const float vk21c1x1 = w[61];
589         const float vk21c1x2 = w[62];
590         const float vk21c1x3 = w[63];
591 
592         const float vi21c1 = i2[4];
593 
594         voc0 += vk21c1x0 * vi21c1;
595         voc1 += vk21c1x1 * vi21c1;
596         voc2 += vk21c1x2 * vi21c1;
597         voc3 += vk21c1x3 * vi21c1;
598 
599         const float vk01c2x0 = w[64];
600         const float vk01c2x1 = w[65];
601         const float vk01c2x2 = w[66];
602         const float vk01c2x3 = w[67];
603 
604         const float vi01c2 = i0[5];
605 
606         voc0 += vk01c2x0 * vi01c2;
607         voc1 += vk01c2x1 * vi01c2;
608         voc2 += vk01c2x2 * vi01c2;
609         voc3 += vk01c2x3 * vi01c2;
610 
611         const float vk11c2x0 = w[68];
612         const float vk11c2x1 = w[69];
613         const float vk11c2x2 = w[70];
614         const float vk11c2x3 = w[71];
615 
616         const float vi11c2 = i1[5];
617 
618         voc0 += vk11c2x0 * vi11c2;
619         voc1 += vk11c2x1 * vi11c2;
620         voc2 += vk11c2x2 * vi11c2;
621         voc3 += vk11c2x3 * vi11c2;
622 
623         const float vk21c2x0 = w[72];
624         const float vk21c2x1 = w[73];
625         const float vk21c2x2 = w[74];
626         const float vk21c2x3 = w[75];
627 
628         const float vi21c2 = i2[5];
629 
630         voc0 += vk21c2x0 * vi21c2;
631         voc1 += vk21c2x1 * vi21c2;
632         voc2 += vk21c2x2 * vi21c2;
633         voc3 += vk21c2x3 * vi21c2;
634 
635         voc0 = math_min_f32(voc0, voutput_max);
636         voc1 = math_min_f32(voc1, voutput_max);
637         voc2 = math_min_f32(voc2, voutput_max);
638         voc3 = math_min_f32(voc3, voutput_max);
639 
640         voc0 = math_max_f32(voc0, voutput_min);
641         voc1 = math_max_f32(voc1, voutput_min);
642         voc2 = math_max_f32(voc2, voutput_min);
643         voc3 = math_max_f32(voc3, voutput_min);
644 
645         if XNN_LIKELY(c >= 4) {
646           o0[0] = voc0;
647           o0[1] = voc1;
648           o0[2] = voc2;
649           o0[3] = voc3;
650           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
651         } else {
652           float* o0_tmp = o0;
653           if (c & 2) {
654             o0_tmp[0] = voc0;
655             o0_tmp[1] = voc1;
656             o0_tmp += 2;
657             voc0 = voc2;
658           }
659           if (c & 1) {
660             *o0_tmp++ = voc0;
661           }
662           o0 = (float*) ((uintptr_t) o0 + output_width_stride);
663         }
664       }
665       // Move output pointers back to the position of the first pixel in a row,
666       // and forward to the next block of output channels
667       o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
668       // Revert input pointers to the position of the first pixel in a row
669       i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
670       i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
671       i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
672       // Move to the block of weights for the next 4 output channels
673       w += 112;
674       c = doz(c, 4);
675     } while (c != 0);
676     // Move output pointers back to the position of the first channel, and forward to the next block of rows
677     o0 = (float*) ((uintptr_t) o0 + output_height_increment);
678     // Move input pointers forward to the next row
679     i0 = i2;
680     i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
681     i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
682   }
683 }
684