1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <xnnpack/conv.h>
9 #include <xnnpack/math.h>
10
11
xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1(size_t input_height,size_t input_width,size_t output_y_start,size_t output_y_end,const float * input,const float * zero,const float * weights,float * output,size_t input_padding_top,size_t output_channels,size_t output_height_stride,size_t output_width_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])12 void xnn_f32_conv_hwc_ukernel_3x3s2p0p1c3x4__scalar_1x1(
13 size_t input_height,
14 size_t input_width,
15 size_t output_y_start,
16 size_t output_y_end,
17 const float* input,
18 const float* zero,
19 const float* weights,
20 float* output,
21 size_t input_padding_top,
22 size_t output_channels,
23 size_t output_height_stride,
24 size_t output_width_stride,
25 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26 {
27 assert(input_width != 0);
28 assert(output_y_end > output_y_start);
29 assert(input_padding_top <= 1);
30 assert(output_channels != 0);
31
32 const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
33 const size_t input_width_decrement = round_down_po2(input_width - 1, 2) * 3 /* channels */ * sizeof(float);
34 const size_t output_width = input_width / 2;
35 const size_t output_channel_decrement = output_width * output_width_stride - 4 * sizeof(float);
36 const size_t output_height_increment = output_height_stride - round_up_po2(output_channels, 4) * sizeof(float);
37
38 // Adjustment for padding processed below
39 const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
40 const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
41 const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
42 float* o0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
43
44 if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
45 i0 = zero;
46 }
47
48 const float voutput_max = params->scalar.max;
49 const float voutput_min = params->scalar.min;
50
51 for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 1) {
52 const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
53 if XNN_UNPREDICTABLE(input_y2 >= input_height) {
54 i2 = zero;
55 }
56
57 const float* w = weights;
58 size_t c = output_channels;
59 do {
60 float vi00c0 = i0[0];
61 float vi00c1 = i0[1];
62 float vi00c2 = i0[2];
63 float vi10c0 = i1[0];
64 float vi10c1 = i1[1];
65 float vi10c2 = i1[2];
66 float vi20c0 = i2[0];
67 float vi20c1 = i2[1];
68 float vi20c2 = i2[2];
69
70 size_t iw = input_width - 1;
71 for (; iw >= 2; iw -= 2) {
72 // start with biases
73 float voc0 = w[0];
74 float voc1 = w[1];
75 float voc2 = w[2];
76 float voc3 = w[3];
77
78 const float vk00c0x0 = w[4];
79 const float vk00c0x1 = w[5];
80 const float vk00c0x2 = w[6];
81 const float vk00c0x3 = w[7];
82
83 voc0 += vk00c0x0 * vi00c0;
84 voc1 += vk00c0x1 * vi00c0;
85 voc2 += vk00c0x2 * vi00c0;
86 voc3 += vk00c0x3 * vi00c0;
87
88 const float vk10c0x0 = w[8];
89 const float vk10c0x1 = w[9];
90 const float vk10c0x2 = w[10];
91 const float vk10c0x3 = w[11];
92
93 voc0 += vk10c0x0 * vi10c0;
94 voc1 += vk10c0x1 * vi10c0;
95 voc2 += vk10c0x2 * vi10c0;
96 voc3 += vk10c0x3 * vi10c0;
97
98 const float vk20c0x0 = w[12];
99 const float vk20c0x1 = w[13];
100 const float vk20c0x2 = w[14];
101 const float vk20c0x3 = w[15];
102
103 voc0 += vk20c0x0 * vi20c0;
104 voc1 += vk20c0x1 * vi20c0;
105 voc2 += vk20c0x2 * vi20c0;
106 voc3 += vk20c0x3 * vi20c0;
107
108 const float vk00c1x0 = w[16];
109 const float vk00c1x1 = w[17];
110 const float vk00c1x2 = w[18];
111 const float vk00c1x3 = w[19];
112
113 voc0 += vk00c1x0 * vi00c1;
114 voc1 += vk00c1x1 * vi00c1;
115 voc2 += vk00c1x2 * vi00c1;
116 voc3 += vk00c1x3 * vi00c1;
117
118 const float vk10c1x0 = w[20];
119 const float vk10c1x1 = w[21];
120 const float vk10c1x2 = w[22];
121 const float vk10c1x3 = w[23];
122
123 voc0 += vk10c1x0 * vi10c1;
124 voc1 += vk10c1x1 * vi10c1;
125 voc2 += vk10c1x2 * vi10c1;
126 voc3 += vk10c1x3 * vi10c1;
127
128 const float vk20c1x0 = w[24];
129 const float vk20c1x1 = w[25];
130 const float vk20c1x2 = w[26];
131 const float vk20c1x3 = w[27];
132
133 voc0 += vk20c1x0 * vi20c1;
134 voc1 += vk20c1x1 * vi20c1;
135 voc2 += vk20c1x2 * vi20c1;
136 voc3 += vk20c1x3 * vi20c1;
137
138 const float vk00c2x0 = w[28];
139 const float vk00c2x1 = w[29];
140 const float vk00c2x2 = w[30];
141 const float vk00c2x3 = w[31];
142
143 voc0 += vk00c2x0 * vi00c2;
144 voc1 += vk00c2x1 * vi00c2;
145 voc2 += vk00c2x2 * vi00c2;
146 voc3 += vk00c2x3 * vi00c2;
147
148 const float vk10c2x0 = w[32];
149 const float vk10c2x1 = w[33];
150 const float vk10c2x2 = w[34];
151 const float vk10c2x3 = w[35];
152
153 voc0 += vk10c2x0 * vi10c2;
154 voc1 += vk10c2x1 * vi10c2;
155 voc2 += vk10c2x2 * vi10c2;
156 voc3 += vk10c2x3 * vi10c2;
157
158 const float vk20c2x0 = w[36];
159 const float vk20c2x1 = w[37];
160 const float vk20c2x2 = w[38];
161 const float vk20c2x3 = w[39];
162
163 voc0 += vk20c2x0 * vi20c2;
164 voc1 += vk20c2x1 * vi20c2;
165 voc2 += vk20c2x2 * vi20c2;
166 voc3 += vk20c2x3 * vi20c2;
167
168 const float vk01c0x0 = w[40];
169 const float vk01c0x1 = w[41];
170 const float vk01c0x2 = w[42];
171 const float vk01c0x3 = w[43];
172
173 const float vi01c0 = i0[3];
174
175 voc0 += vk01c0x0 * vi01c0;
176 voc1 += vk01c0x1 * vi01c0;
177 voc2 += vk01c0x2 * vi01c0;
178 voc3 += vk01c0x3 * vi01c0;
179
180 const float vk11c0x0 = w[44];
181 const float vk11c0x1 = w[45];
182 const float vk11c0x2 = w[46];
183 const float vk11c0x3 = w[47];
184
185 const float vi11c0 = i1[3];
186
187 voc0 += vk11c0x0 * vi11c0;
188 voc1 += vk11c0x1 * vi11c0;
189 voc2 += vk11c0x2 * vi11c0;
190 voc3 += vk11c0x3 * vi11c0;
191
192 const float vk21c0x0 = w[48];
193 const float vk21c0x1 = w[49];
194 const float vk21c0x2 = w[50];
195 const float vk21c0x3 = w[51];
196
197 const float vi21c0 = i2[3];
198
199 voc0 += vk21c0x0 * vi21c0;
200 voc1 += vk21c0x1 * vi21c0;
201 voc2 += vk21c0x2 * vi21c0;
202 voc3 += vk21c0x3 * vi21c0;
203
204 const float vk01c1x0 = w[52];
205 const float vk01c1x1 = w[53];
206 const float vk01c1x2 = w[54];
207 const float vk01c1x3 = w[55];
208
209 const float vi01c1 = i0[4];
210
211 voc0 += vk01c1x0 * vi01c1;
212 voc1 += vk01c1x1 * vi01c1;
213 voc2 += vk01c1x2 * vi01c1;
214 voc3 += vk01c1x3 * vi01c1;
215
216 const float vk11c1x0 = w[56];
217 const float vk11c1x1 = w[57];
218 const float vk11c1x2 = w[58];
219 const float vk11c1x3 = w[59];
220
221 const float vi11c1 = i1[4];
222
223 voc0 += vk11c1x0 * vi11c1;
224 voc1 += vk11c1x1 * vi11c1;
225 voc2 += vk11c1x2 * vi11c1;
226 voc3 += vk11c1x3 * vi11c1;
227
228 const float vk21c1x0 = w[60];
229 const float vk21c1x1 = w[61];
230 const float vk21c1x2 = w[62];
231 const float vk21c1x3 = w[63];
232
233 const float vi21c1 = i2[4];
234
235 voc0 += vk21c1x0 * vi21c1;
236 voc1 += vk21c1x1 * vi21c1;
237 voc2 += vk21c1x2 * vi21c1;
238 voc3 += vk21c1x3 * vi21c1;
239
240 const float vk01c2x0 = w[64];
241 const float vk01c2x1 = w[65];
242 const float vk01c2x2 = w[66];
243 const float vk01c2x3 = w[67];
244
245 const float vi01c2 = i0[5];
246
247 voc0 += vk01c2x0 * vi01c2;
248 voc1 += vk01c2x1 * vi01c2;
249 voc2 += vk01c2x2 * vi01c2;
250 voc3 += vk01c2x3 * vi01c2;
251
252 const float vk11c2x0 = w[68];
253 const float vk11c2x1 = w[69];
254 const float vk11c2x2 = w[70];
255 const float vk11c2x3 = w[71];
256
257 const float vi11c2 = i1[5];
258
259 voc0 += vk11c2x0 * vi11c2;
260 voc1 += vk11c2x1 * vi11c2;
261 voc2 += vk11c2x2 * vi11c2;
262 voc3 += vk11c2x3 * vi11c2;
263
264 const float vk21c2x0 = w[72];
265 const float vk21c2x1 = w[73];
266 const float vk21c2x2 = w[74];
267 const float vk21c2x3 = w[75];
268
269 const float vi21c2 = i2[5];
270
271 voc0 += vk21c2x0 * vi21c2;
272 voc1 += vk21c2x1 * vi21c2;
273 voc2 += vk21c2x2 * vi21c2;
274 voc3 += vk21c2x3 * vi21c2;
275
276 const float vk02c0x0 = w[76];
277 const float vk02c0x1 = w[77];
278 const float vk02c0x2 = w[78];
279 const float vk02c0x3 = w[79];
280
281 const float vi02c0 = i0[6];
282
283 voc0 += vk02c0x0 * vi02c0;
284 voc1 += vk02c0x1 * vi02c0;
285 voc2 += vk02c0x2 * vi02c0;
286 voc3 += vk02c0x3 * vi02c0;
287
288 const float vk12c0x0 = w[80];
289 const float vk12c0x1 = w[81];
290 const float vk12c0x2 = w[82];
291 const float vk12c0x3 = w[83];
292
293 const float vi12c0 = i1[6];
294
295 voc0 += vk12c0x0 * vi12c0;
296 voc1 += vk12c0x1 * vi12c0;
297 voc2 += vk12c0x2 * vi12c0;
298 voc3 += vk12c0x3 * vi12c0;
299
300 const float vk22c0x0 = w[84];
301 const float vk22c0x1 = w[85];
302 const float vk22c0x2 = w[86];
303 const float vk22c0x3 = w[87];
304
305 const float vi22c0 = i2[6];
306
307 voc0 += vk22c0x0 * vi22c0;
308 voc1 += vk22c0x1 * vi22c0;
309 voc2 += vk22c0x2 * vi22c0;
310 voc3 += vk22c0x3 * vi22c0;
311
312 vi00c0 = vi02c0;
313 vi10c0 = vi12c0;
314 vi20c0 = vi22c0;
315
316 const float vk02c1x0 = w[88];
317 const float vk02c1x1 = w[89];
318 const float vk02c1x2 = w[90];
319 const float vk02c1x3 = w[91];
320
321 const float vi02c1 = i0[7];
322
323 voc0 += vk02c1x0 * vi02c1;
324 voc1 += vk02c1x1 * vi02c1;
325 voc2 += vk02c1x2 * vi02c1;
326 voc3 += vk02c1x3 * vi02c1;
327
328 const float vk12c1x0 = w[92];
329 const float vk12c1x1 = w[93];
330 const float vk12c1x2 = w[94];
331 const float vk12c1x3 = w[95];
332
333 const float vi12c1 = i1[7];
334
335 voc0 += vk12c1x0 * vi12c1;
336 voc1 += vk12c1x1 * vi12c1;
337 voc2 += vk12c1x2 * vi12c1;
338 voc3 += vk12c1x3 * vi12c1;
339
340 const float vk22c1x0 = w[96];
341 const float vk22c1x1 = w[97];
342 const float vk22c1x2 = w[98];
343 const float vk22c1x3 = w[99];
344
345 const float vi22c1 = i2[7];
346
347 voc0 += vk22c1x0 * vi22c1;
348 voc1 += vk22c1x1 * vi22c1;
349 voc2 += vk22c1x2 * vi22c1;
350 voc3 += vk22c1x3 * vi22c1;
351
352 vi00c1 = vi02c1;
353 vi10c1 = vi12c1;
354 vi20c1 = vi22c1;
355
356 const float vk02c2x0 = w[100];
357 const float vk02c2x1 = w[101];
358 const float vk02c2x2 = w[102];
359 const float vk02c2x3 = w[103];
360
361 const float vi02c2 = i0[8];
362
363 voc0 += vk02c2x0 * vi02c2;
364 voc1 += vk02c2x1 * vi02c2;
365 voc2 += vk02c2x2 * vi02c2;
366 voc3 += vk02c2x3 * vi02c2;
367
368 const float vk12c2x0 = w[104];
369 const float vk12c2x1 = w[105];
370 const float vk12c2x2 = w[106];
371 const float vk12c2x3 = w[107];
372
373 const float vi12c2 = i1[8];
374
375 voc0 += vk12c2x0 * vi12c2;
376 voc1 += vk12c2x1 * vi12c2;
377 voc2 += vk12c2x2 * vi12c2;
378 voc3 += vk12c2x3 * vi12c2;
379
380 const float vk22c2x0 = w[108];
381 const float vk22c2x1 = w[109];
382 const float vk22c2x2 = w[110];
383 const float vk22c2x3 = w[111];
384
385 const float vi22c2 = i2[8];
386
387 voc0 += vk22c2x0 * vi22c2;
388 voc1 += vk22c2x1 * vi22c2;
389 voc2 += vk22c2x2 * vi22c2;
390 voc3 += vk22c2x3 * vi22c2;
391
392 vi00c2 = vi02c2;
393 vi10c2 = vi12c2;
394 vi20c2 = vi22c2;
395
396 voc0 = math_min_f32(voc0, voutput_max);
397 voc1 = math_min_f32(voc1, voutput_max);
398 voc2 = math_min_f32(voc2, voutput_max);
399 voc3 = math_min_f32(voc3, voutput_max);
400
401 voc0 = math_max_f32(voc0, voutput_min);
402 voc1 = math_max_f32(voc1, voutput_min);
403 voc2 = math_max_f32(voc2, voutput_min);
404 voc3 = math_max_f32(voc3, voutput_min);
405
406 if XNN_LIKELY(c >= 4) {
407 o0[0] = voc0;
408 o0[1] = voc1;
409 o0[2] = voc2;
410 o0[3] = voc3;
411 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
412 } else {
413 float* o0_tmp = o0;
414 if (c & 2) {
415 o0_tmp[0] = voc0;
416 o0_tmp[1] = voc1;
417 o0_tmp += 2;
418 voc0 = voc2;
419 }
420 if (c & 1) {
421 *o0_tmp++ = voc0;
422 }
423 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
424 }
425
426 i0 += 6;
427 i1 += 6;
428 i2 += 6;
429 }
430 assert(iw < 2);
431 if XNN_UNLIKELY(iw != 0) {
432 float voc0 = w[0];
433 float voc1 = w[1];
434 float voc2 = w[2];
435 float voc3 = w[3];
436
437 const float vk00c0x0 = w[4];
438 const float vk00c0x1 = w[5];
439 const float vk00c0x2 = w[6];
440 const float vk00c0x3 = w[7];
441
442 voc0 += vk00c0x0 * vi00c0;
443 voc1 += vk00c0x1 * vi00c0;
444 voc2 += vk00c0x2 * vi00c0;
445 voc3 += vk00c0x3 * vi00c0;
446
447 const float vk10c0x0 = w[8];
448 const float vk10c0x1 = w[9];
449 const float vk10c0x2 = w[10];
450 const float vk10c0x3 = w[11];
451
452 voc0 += vk10c0x0 * vi10c0;
453 voc1 += vk10c0x1 * vi10c0;
454 voc2 += vk10c0x2 * vi10c0;
455 voc3 += vk10c0x3 * vi10c0;
456
457 const float vk20c0x0 = w[12];
458 const float vk20c0x1 = w[13];
459 const float vk20c0x2 = w[14];
460 const float vk20c0x3 = w[15];
461
462 voc0 += vk20c0x0 * vi20c0;
463 voc1 += vk20c0x1 * vi20c0;
464 voc2 += vk20c0x2 * vi20c0;
465 voc3 += vk20c0x3 * vi20c0;
466
467 const float vk00c1x0 = w[16];
468 const float vk00c1x1 = w[17];
469 const float vk00c1x2 = w[18];
470 const float vk00c1x3 = w[19];
471
472 voc0 += vk00c1x0 * vi00c1;
473 voc1 += vk00c1x1 * vi00c1;
474 voc2 += vk00c1x2 * vi00c1;
475 voc3 += vk00c1x3 * vi00c1;
476
477 const float vk10c1x0 = w[20];
478 const float vk10c1x1 = w[21];
479 const float vk10c1x2 = w[22];
480 const float vk10c1x3 = w[23];
481
482 voc0 += vk10c1x0 * vi10c1;
483 voc1 += vk10c1x1 * vi10c1;
484 voc2 += vk10c1x2 * vi10c1;
485 voc3 += vk10c1x3 * vi10c1;
486
487 const float vk20c1x0 = w[24];
488 const float vk20c1x1 = w[25];
489 const float vk20c1x2 = w[26];
490 const float vk20c1x3 = w[27];
491
492 voc0 += vk20c1x0 * vi20c1;
493 voc1 += vk20c1x1 * vi20c1;
494 voc2 += vk20c1x2 * vi20c1;
495 voc3 += vk20c1x3 * vi20c1;
496
497 const float vk00c2x0 = w[28];
498 const float vk00c2x1 = w[29];
499 const float vk00c2x2 = w[30];
500 const float vk00c2x3 = w[31];
501
502 voc0 += vk00c2x0 * vi00c2;
503 voc1 += vk00c2x1 * vi00c2;
504 voc2 += vk00c2x2 * vi00c2;
505 voc3 += vk00c2x3 * vi00c2;
506
507 const float vk10c2x0 = w[32];
508 const float vk10c2x1 = w[33];
509 const float vk10c2x2 = w[34];
510 const float vk10c2x3 = w[35];
511
512 voc0 += vk10c2x0 * vi10c2;
513 voc1 += vk10c2x1 * vi10c2;
514 voc2 += vk10c2x2 * vi10c2;
515 voc3 += vk10c2x3 * vi10c2;
516
517 const float vk20c2x0 = w[36];
518 const float vk20c2x1 = w[37];
519 const float vk20c2x2 = w[38];
520 const float vk20c2x3 = w[39];
521
522 voc0 += vk20c2x0 * vi20c2;
523 voc1 += vk20c2x1 * vi20c2;
524 voc2 += vk20c2x2 * vi20c2;
525 voc3 += vk20c2x3 * vi20c2;
526
527 const float vk01c0x0 = w[40];
528 const float vk01c0x1 = w[41];
529 const float vk01c0x2 = w[42];
530 const float vk01c0x3 = w[43];
531
532 const float vi01c0 = i0[3];
533
534 voc0 += vk01c0x0 * vi01c0;
535 voc1 += vk01c0x1 * vi01c0;
536 voc2 += vk01c0x2 * vi01c0;
537 voc3 += vk01c0x3 * vi01c0;
538
539 const float vk11c0x0 = w[44];
540 const float vk11c0x1 = w[45];
541 const float vk11c0x2 = w[46];
542 const float vk11c0x3 = w[47];
543
544 const float vi11c0 = i1[3];
545
546 voc0 += vk11c0x0 * vi11c0;
547 voc1 += vk11c0x1 * vi11c0;
548 voc2 += vk11c0x2 * vi11c0;
549 voc3 += vk11c0x3 * vi11c0;
550
551 const float vk21c0x0 = w[48];
552 const float vk21c0x1 = w[49];
553 const float vk21c0x2 = w[50];
554 const float vk21c0x3 = w[51];
555
556 const float vi21c0 = i2[3];
557
558 voc0 += vk21c0x0 * vi21c0;
559 voc1 += vk21c0x1 * vi21c0;
560 voc2 += vk21c0x2 * vi21c0;
561 voc3 += vk21c0x3 * vi21c0;
562
563 const float vk01c1x0 = w[52];
564 const float vk01c1x1 = w[53];
565 const float vk01c1x2 = w[54];
566 const float vk01c1x3 = w[55];
567
568 const float vi01c1 = i0[4];
569
570 voc0 += vk01c1x0 * vi01c1;
571 voc1 += vk01c1x1 * vi01c1;
572 voc2 += vk01c1x2 * vi01c1;
573 voc3 += vk01c1x3 * vi01c1;
574
575 const float vk11c1x0 = w[56];
576 const float vk11c1x1 = w[57];
577 const float vk11c1x2 = w[58];
578 const float vk11c1x3 = w[59];
579
580 const float vi11c1 = i1[4];
581
582 voc0 += vk11c1x0 * vi11c1;
583 voc1 += vk11c1x1 * vi11c1;
584 voc2 += vk11c1x2 * vi11c1;
585 voc3 += vk11c1x3 * vi11c1;
586
587 const float vk21c1x0 = w[60];
588 const float vk21c1x1 = w[61];
589 const float vk21c1x2 = w[62];
590 const float vk21c1x3 = w[63];
591
592 const float vi21c1 = i2[4];
593
594 voc0 += vk21c1x0 * vi21c1;
595 voc1 += vk21c1x1 * vi21c1;
596 voc2 += vk21c1x2 * vi21c1;
597 voc3 += vk21c1x3 * vi21c1;
598
599 const float vk01c2x0 = w[64];
600 const float vk01c2x1 = w[65];
601 const float vk01c2x2 = w[66];
602 const float vk01c2x3 = w[67];
603
604 const float vi01c2 = i0[5];
605
606 voc0 += vk01c2x0 * vi01c2;
607 voc1 += vk01c2x1 * vi01c2;
608 voc2 += vk01c2x2 * vi01c2;
609 voc3 += vk01c2x3 * vi01c2;
610
611 const float vk11c2x0 = w[68];
612 const float vk11c2x1 = w[69];
613 const float vk11c2x2 = w[70];
614 const float vk11c2x3 = w[71];
615
616 const float vi11c2 = i1[5];
617
618 voc0 += vk11c2x0 * vi11c2;
619 voc1 += vk11c2x1 * vi11c2;
620 voc2 += vk11c2x2 * vi11c2;
621 voc3 += vk11c2x3 * vi11c2;
622
623 const float vk21c2x0 = w[72];
624 const float vk21c2x1 = w[73];
625 const float vk21c2x2 = w[74];
626 const float vk21c2x3 = w[75];
627
628 const float vi21c2 = i2[5];
629
630 voc0 += vk21c2x0 * vi21c2;
631 voc1 += vk21c2x1 * vi21c2;
632 voc2 += vk21c2x2 * vi21c2;
633 voc3 += vk21c2x3 * vi21c2;
634
635 voc0 = math_min_f32(voc0, voutput_max);
636 voc1 = math_min_f32(voc1, voutput_max);
637 voc2 = math_min_f32(voc2, voutput_max);
638 voc3 = math_min_f32(voc3, voutput_max);
639
640 voc0 = math_max_f32(voc0, voutput_min);
641 voc1 = math_max_f32(voc1, voutput_min);
642 voc2 = math_max_f32(voc2, voutput_min);
643 voc3 = math_max_f32(voc3, voutput_min);
644
645 if XNN_LIKELY(c >= 4) {
646 o0[0] = voc0;
647 o0[1] = voc1;
648 o0[2] = voc2;
649 o0[3] = voc3;
650 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
651 } else {
652 float* o0_tmp = o0;
653 if (c & 2) {
654 o0_tmp[0] = voc0;
655 o0_tmp[1] = voc1;
656 o0_tmp += 2;
657 voc0 = voc2;
658 }
659 if (c & 1) {
660 *o0_tmp++ = voc0;
661 }
662 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
663 }
664 }
665 // Move output pointers back to the position of the first pixel in a row,
666 // and forward to the next block of output channels
667 o0 = (float*) ((uintptr_t) o0 - output_channel_decrement);
668 // Revert input pointers to the position of the first pixel in a row
669 i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
670 i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
671 i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
672 // Move to the block of weights for the next 4 output channels
673 w += 112;
674 c = doz(c, 4);
675 } while (c != 0);
676 // Move output pointers back to the position of the first channel, and forward to the next block of rows
677 o0 = (float*) ((uintptr_t) o0 + output_height_increment);
678 // Move input pointers forward to the next row
679 i0 = i2;
680 i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
681 i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
682 }
683 }
684