// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xnnpack/conv.h>
#include <xnnpack/math.h>


xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1(size_t input_height,size_t input_width,size_t output_y_start,size_t output_y_end,const float * input,const float * zero,const float * weights,float * output,size_t input_padding_top,size_t output_channels,size_t output_height_stride,size_t output_width_stride,const union xnn_f32_output_params params[restrict static1])12 void xnn_f32_conv_hwc_ukernel_3x3s2p1c3x4__scalar_1x1(
13 size_t input_height,
14 size_t input_width,
15 size_t output_y_start,
16 size_t output_y_end,
17 const float* input,
18 const float* zero,
19 const float* weights,
20 float* output,
21 size_t input_padding_top,
22 size_t output_channels,
23 size_t output_height_stride,
24 size_t output_width_stride,
25 const union xnn_f32_output_params params[restrict static 1])
26 {
27 assert(input_width != 0);
28 assert(output_y_end > output_y_start);
29 assert(input_padding_top <= 1);
30 assert(output_channels != 0);
31
32 const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
33 const size_t input_width_decrement = round_down_po2(input_width, 2) * 3 /* channels */ * sizeof(float);
34 const size_t output_width = (input_width + 1) / 2;
35 const size_t output_channel_increment = 4 * sizeof(float) - output_width * output_width_stride;
36
37
38 // Adjustment for padding processed below
39 const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
40 const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
41 const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
42 float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
43
44 if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
45 i0 = zero;
46 }
47
48 const float voutput_max = params->scalar.max;
49 const float voutput_min = params->scalar.min;
50
51 for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 1) {
52 const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
53 if XNN_UNPREDICTABLE(input_y2 >= input_height) {
54 i2 = zero;
55 }
56
57 const float* w = weights;
58 size_t c = output_channels;
59 float* o0 = output0;
60 do {
61 float vi00c0 = 0.0f;
62 float vi00c1 = 0.0f;
63 float vi00c2 = 0.0f;
64 float vi10c0 = 0.0f;
65 float vi10c1 = 0.0f;
66 float vi10c2 = 0.0f;
67 float vi20c0 = 0.0f;
68 float vi20c1 = 0.0f;
69 float vi20c2 = 0.0f;
70
71 size_t iw = input_width;
72 for (; iw >= 2; iw -= 2) {
73 // start with biases
74 float voc0 = w[0];
75 float voc1 = w[1];
76 float voc2 = w[2];
77 float voc3 = w[3];
78
79 const float vk00c0x0 = w[4];
80 const float vk00c0x1 = w[5];
81 const float vk00c0x2 = w[6];
82 const float vk00c0x3 = w[7];
83
84 voc0 += vk00c0x0 * vi00c0;
85 voc1 += vk00c0x1 * vi00c0;
86 voc2 += vk00c0x2 * vi00c0;
87 voc3 += vk00c0x3 * vi00c0;
88
89 const float vk10c0x0 = w[8];
90 const float vk10c0x1 = w[9];
91 const float vk10c0x2 = w[10];
92 const float vk10c0x3 = w[11];
93
94 voc0 += vk10c0x0 * vi10c0;
95 voc1 += vk10c0x1 * vi10c0;
96 voc2 += vk10c0x2 * vi10c0;
97 voc3 += vk10c0x3 * vi10c0;
98
99 const float vk20c0x0 = w[12];
100 const float vk20c0x1 = w[13];
101 const float vk20c0x2 = w[14];
102 const float vk20c0x3 = w[15];
103
104 voc0 += vk20c0x0 * vi20c0;
105 voc1 += vk20c0x1 * vi20c0;
106 voc2 += vk20c0x2 * vi20c0;
107 voc3 += vk20c0x3 * vi20c0;
108
109 const float vk00c1x0 = w[16];
110 const float vk00c1x1 = w[17];
111 const float vk00c1x2 = w[18];
112 const float vk00c1x3 = w[19];
113
114 voc0 += vk00c1x0 * vi00c1;
115 voc1 += vk00c1x1 * vi00c1;
116 voc2 += vk00c1x2 * vi00c1;
117 voc3 += vk00c1x3 * vi00c1;
118
119 const float vk10c1x0 = w[20];
120 const float vk10c1x1 = w[21];
121 const float vk10c1x2 = w[22];
122 const float vk10c1x3 = w[23];
123
124 voc0 += vk10c1x0 * vi10c1;
125 voc1 += vk10c1x1 * vi10c1;
126 voc2 += vk10c1x2 * vi10c1;
127 voc3 += vk10c1x3 * vi10c1;
128
129 const float vk20c1x0 = w[24];
130 const float vk20c1x1 = w[25];
131 const float vk20c1x2 = w[26];
132 const float vk20c1x3 = w[27];
133
134 voc0 += vk20c1x0 * vi20c1;
135 voc1 += vk20c1x1 * vi20c1;
136 voc2 += vk20c1x2 * vi20c1;
137 voc3 += vk20c1x3 * vi20c1;
138
139 const float vk00c2x0 = w[28];
140 const float vk00c2x1 = w[29];
141 const float vk00c2x2 = w[30];
142 const float vk00c2x3 = w[31];
143
144 voc0 += vk00c2x0 * vi00c2;
145 voc1 += vk00c2x1 * vi00c2;
146 voc2 += vk00c2x2 * vi00c2;
147 voc3 += vk00c2x3 * vi00c2;
148
149 const float vk10c2x0 = w[32];
150 const float vk10c2x1 = w[33];
151 const float vk10c2x2 = w[34];
152 const float vk10c2x3 = w[35];
153
154 voc0 += vk10c2x0 * vi10c2;
155 voc1 += vk10c2x1 * vi10c2;
156 voc2 += vk10c2x2 * vi10c2;
157 voc3 += vk10c2x3 * vi10c2;
158
159 const float vk20c2x0 = w[36];
160 const float vk20c2x1 = w[37];
161 const float vk20c2x2 = w[38];
162 const float vk20c2x3 = w[39];
163
164 voc0 += vk20c2x0 * vi20c2;
165 voc1 += vk20c2x1 * vi20c2;
166 voc2 += vk20c2x2 * vi20c2;
167 voc3 += vk20c2x3 * vi20c2;
168
169 const float vk01c0x0 = w[40];
170 const float vk01c0x1 = w[41];
171 const float vk01c0x2 = w[42];
172 const float vk01c0x3 = w[43];
173
174 const float vi01c0 = i0[0];
175
176 voc0 += vk01c0x0 * vi01c0;
177 voc1 += vk01c0x1 * vi01c0;
178 voc2 += vk01c0x2 * vi01c0;
179 voc3 += vk01c0x3 * vi01c0;
180
181 const float vk11c0x0 = w[44];
182 const float vk11c0x1 = w[45];
183 const float vk11c0x2 = w[46];
184 const float vk11c0x3 = w[47];
185
186 const float vi11c0 = i1[0];
187
188 voc0 += vk11c0x0 * vi11c0;
189 voc1 += vk11c0x1 * vi11c0;
190 voc2 += vk11c0x2 * vi11c0;
191 voc3 += vk11c0x3 * vi11c0;
192
193 const float vk21c0x0 = w[48];
194 const float vk21c0x1 = w[49];
195 const float vk21c0x2 = w[50];
196 const float vk21c0x3 = w[51];
197
198 const float vi21c0 = i2[0];
199
200 voc0 += vk21c0x0 * vi21c0;
201 voc1 += vk21c0x1 * vi21c0;
202 voc2 += vk21c0x2 * vi21c0;
203 voc3 += vk21c0x3 * vi21c0;
204
205 const float vk01c1x0 = w[52];
206 const float vk01c1x1 = w[53];
207 const float vk01c1x2 = w[54];
208 const float vk01c1x3 = w[55];
209
210 const float vi01c1 = i0[1];
211
212 voc0 += vk01c1x0 * vi01c1;
213 voc1 += vk01c1x1 * vi01c1;
214 voc2 += vk01c1x2 * vi01c1;
215 voc3 += vk01c1x3 * vi01c1;
216
217 const float vk11c1x0 = w[56];
218 const float vk11c1x1 = w[57];
219 const float vk11c1x2 = w[58];
220 const float vk11c1x3 = w[59];
221
222 const float vi11c1 = i1[1];
223
224 voc0 += vk11c1x0 * vi11c1;
225 voc1 += vk11c1x1 * vi11c1;
226 voc2 += vk11c1x2 * vi11c1;
227 voc3 += vk11c1x3 * vi11c1;
228
229 const float vk21c1x0 = w[60];
230 const float vk21c1x1 = w[61];
231 const float vk21c1x2 = w[62];
232 const float vk21c1x3 = w[63];
233
234 const float vi21c1 = i2[1];
235
236 voc0 += vk21c1x0 * vi21c1;
237 voc1 += vk21c1x1 * vi21c1;
238 voc2 += vk21c1x2 * vi21c1;
239 voc3 += vk21c1x3 * vi21c1;
240
241 const float vk01c2x0 = w[64];
242 const float vk01c2x1 = w[65];
243 const float vk01c2x2 = w[66];
244 const float vk01c2x3 = w[67];
245
246 const float vi01c2 = i0[2];
247
248 voc0 += vk01c2x0 * vi01c2;
249 voc1 += vk01c2x1 * vi01c2;
250 voc2 += vk01c2x2 * vi01c2;
251 voc3 += vk01c2x3 * vi01c2;
252
253 const float vk11c2x0 = w[68];
254 const float vk11c2x1 = w[69];
255 const float vk11c2x2 = w[70];
256 const float vk11c2x3 = w[71];
257
258 const float vi11c2 = i1[2];
259
260 voc0 += vk11c2x0 * vi11c2;
261 voc1 += vk11c2x1 * vi11c2;
262 voc2 += vk11c2x2 * vi11c2;
263 voc3 += vk11c2x3 * vi11c2;
264
265 const float vk21c2x0 = w[72];
266 const float vk21c2x1 = w[73];
267 const float vk21c2x2 = w[74];
268 const float vk21c2x3 = w[75];
269
270 const float vi21c2 = i2[2];
271
272 voc0 += vk21c2x0 * vi21c2;
273 voc1 += vk21c2x1 * vi21c2;
274 voc2 += vk21c2x2 * vi21c2;
275 voc3 += vk21c2x3 * vi21c2;
276
277 const float vk02c0x0 = w[76];
278 const float vk02c0x1 = w[77];
279 const float vk02c0x2 = w[78];
280 const float vk02c0x3 = w[79];
281
282 const float vi02c0 = i0[3];
283
284 voc0 += vk02c0x0 * vi02c0;
285 voc1 += vk02c0x1 * vi02c0;
286 voc2 += vk02c0x2 * vi02c0;
287 voc3 += vk02c0x3 * vi02c0;
288
289 const float vk12c0x0 = w[80];
290 const float vk12c0x1 = w[81];
291 const float vk12c0x2 = w[82];
292 const float vk12c0x3 = w[83];
293
294 const float vi12c0 = i1[3];
295
296 voc0 += vk12c0x0 * vi12c0;
297 voc1 += vk12c0x1 * vi12c0;
298 voc2 += vk12c0x2 * vi12c0;
299 voc3 += vk12c0x3 * vi12c0;
300
301 const float vk22c0x0 = w[84];
302 const float vk22c0x1 = w[85];
303 const float vk22c0x2 = w[86];
304 const float vk22c0x3 = w[87];
305
306 const float vi22c0 = i2[3];
307
308 voc0 += vk22c0x0 * vi22c0;
309 voc1 += vk22c0x1 * vi22c0;
310 voc2 += vk22c0x2 * vi22c0;
311 voc3 += vk22c0x3 * vi22c0;
312
313 vi00c0 = vi02c0;
314 vi10c0 = vi12c0;
315 vi20c0 = vi22c0;
316
317 const float vk02c1x0 = w[88];
318 const float vk02c1x1 = w[89];
319 const float vk02c1x2 = w[90];
320 const float vk02c1x3 = w[91];
321
322 const float vi02c1 = i0[4];
323
324 voc0 += vk02c1x0 * vi02c1;
325 voc1 += vk02c1x1 * vi02c1;
326 voc2 += vk02c1x2 * vi02c1;
327 voc3 += vk02c1x3 * vi02c1;
328
329 const float vk12c1x0 = w[92];
330 const float vk12c1x1 = w[93];
331 const float vk12c1x2 = w[94];
332 const float vk12c1x3 = w[95];
333
334 const float vi12c1 = i1[4];
335
336 voc0 += vk12c1x0 * vi12c1;
337 voc1 += vk12c1x1 * vi12c1;
338 voc2 += vk12c1x2 * vi12c1;
339 voc3 += vk12c1x3 * vi12c1;
340
341 const float vk22c1x0 = w[96];
342 const float vk22c1x1 = w[97];
343 const float vk22c1x2 = w[98];
344 const float vk22c1x3 = w[99];
345
346 const float vi22c1 = i2[4];
347
348 voc0 += vk22c1x0 * vi22c1;
349 voc1 += vk22c1x1 * vi22c1;
350 voc2 += vk22c1x2 * vi22c1;
351 voc3 += vk22c1x3 * vi22c1;
352
353 vi00c1 = vi02c1;
354 vi10c1 = vi12c1;
355 vi20c1 = vi22c1;
356
357 const float vk02c2x0 = w[100];
358 const float vk02c2x1 = w[101];
359 const float vk02c2x2 = w[102];
360 const float vk02c2x3 = w[103];
361
362 const float vi02c2 = i0[5];
363
364 voc0 += vk02c2x0 * vi02c2;
365 voc1 += vk02c2x1 * vi02c2;
366 voc2 += vk02c2x2 * vi02c2;
367 voc3 += vk02c2x3 * vi02c2;
368
369 const float vk12c2x0 = w[104];
370 const float vk12c2x1 = w[105];
371 const float vk12c2x2 = w[106];
372 const float vk12c2x3 = w[107];
373
374 const float vi12c2 = i1[5];
375
376 voc0 += vk12c2x0 * vi12c2;
377 voc1 += vk12c2x1 * vi12c2;
378 voc2 += vk12c2x2 * vi12c2;
379 voc3 += vk12c2x3 * vi12c2;
380
381 const float vk22c2x0 = w[108];
382 const float vk22c2x1 = w[109];
383 const float vk22c2x2 = w[110];
384 const float vk22c2x3 = w[111];
385
386 const float vi22c2 = i2[5];
387
388 voc0 += vk22c2x0 * vi22c2;
389 voc1 += vk22c2x1 * vi22c2;
390 voc2 += vk22c2x2 * vi22c2;
391 voc3 += vk22c2x3 * vi22c2;
392
393 vi00c2 = vi02c2;
394 vi10c2 = vi12c2;
395 vi20c2 = vi22c2;
396
397 voc0 = math_min_f32(voc0, voutput_max);
398 voc1 = math_min_f32(voc1, voutput_max);
399 voc2 = math_min_f32(voc2, voutput_max);
400 voc3 = math_min_f32(voc3, voutput_max);
401
402 voc0 = math_max_f32(voc0, voutput_min);
403 voc1 = math_max_f32(voc1, voutput_min);
404 voc2 = math_max_f32(voc2, voutput_min);
405 voc3 = math_max_f32(voc3, voutput_min);
406
407 if XNN_LIKELY(c >= 4) {
408 o0[0] = voc0;
409 o0[1] = voc1;
410 o0[2] = voc2;
411 o0[3] = voc3;
412 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
413 } else {
414 float* o0_tmp = o0;
415 if (c & 2) {
416 o0_tmp[0] = voc0;
417 o0_tmp[1] = voc1;
418 o0_tmp += 2;
419 voc0 = voc2;
420 }
421 if (c & 1) {
422 *o0_tmp++ = voc0;
423 }
424 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
425 }
426
427 i0 += 6;
428 i1 += 6;
429 i2 += 6;
430 }
431 assert(iw < 2);
432 if XNN_UNLIKELY(iw != 0) {
433 float voc0 = w[0];
434 float voc1 = w[1];
435 float voc2 = w[2];
436 float voc3 = w[3];
437
438 const float vk00c0x0 = w[4];
439 const float vk00c0x1 = w[5];
440 const float vk00c0x2 = w[6];
441 const float vk00c0x3 = w[7];
442
443 voc0 += vk00c0x0 * vi00c0;
444 voc1 += vk00c0x1 * vi00c0;
445 voc2 += vk00c0x2 * vi00c0;
446 voc3 += vk00c0x3 * vi00c0;
447
448 const float vk10c0x0 = w[8];
449 const float vk10c0x1 = w[9];
450 const float vk10c0x2 = w[10];
451 const float vk10c0x3 = w[11];
452
453 voc0 += vk10c0x0 * vi10c0;
454 voc1 += vk10c0x1 * vi10c0;
455 voc2 += vk10c0x2 * vi10c0;
456 voc3 += vk10c0x3 * vi10c0;
457
458 const float vk20c0x0 = w[12];
459 const float vk20c0x1 = w[13];
460 const float vk20c0x2 = w[14];
461 const float vk20c0x3 = w[15];
462
463 voc0 += vk20c0x0 * vi20c0;
464 voc1 += vk20c0x1 * vi20c0;
465 voc2 += vk20c0x2 * vi20c0;
466 voc3 += vk20c0x3 * vi20c0;
467
468 const float vk00c1x0 = w[16];
469 const float vk00c1x1 = w[17];
470 const float vk00c1x2 = w[18];
471 const float vk00c1x3 = w[19];
472
473 voc0 += vk00c1x0 * vi00c1;
474 voc1 += vk00c1x1 * vi00c1;
475 voc2 += vk00c1x2 * vi00c1;
476 voc3 += vk00c1x3 * vi00c1;
477
478 const float vk10c1x0 = w[20];
479 const float vk10c1x1 = w[21];
480 const float vk10c1x2 = w[22];
481 const float vk10c1x3 = w[23];
482
483 voc0 += vk10c1x0 * vi10c1;
484 voc1 += vk10c1x1 * vi10c1;
485 voc2 += vk10c1x2 * vi10c1;
486 voc3 += vk10c1x3 * vi10c1;
487
488 const float vk20c1x0 = w[24];
489 const float vk20c1x1 = w[25];
490 const float vk20c1x2 = w[26];
491 const float vk20c1x3 = w[27];
492
493 voc0 += vk20c1x0 * vi20c1;
494 voc1 += vk20c1x1 * vi20c1;
495 voc2 += vk20c1x2 * vi20c1;
496 voc3 += vk20c1x3 * vi20c1;
497
498 const float vk00c2x0 = w[28];
499 const float vk00c2x1 = w[29];
500 const float vk00c2x2 = w[30];
501 const float vk00c2x3 = w[31];
502
503 voc0 += vk00c2x0 * vi00c2;
504 voc1 += vk00c2x1 * vi00c2;
505 voc2 += vk00c2x2 * vi00c2;
506 voc3 += vk00c2x3 * vi00c2;
507
508 const float vk10c2x0 = w[32];
509 const float vk10c2x1 = w[33];
510 const float vk10c2x2 = w[34];
511 const float vk10c2x3 = w[35];
512
513 voc0 += vk10c2x0 * vi10c2;
514 voc1 += vk10c2x1 * vi10c2;
515 voc2 += vk10c2x2 * vi10c2;
516 voc3 += vk10c2x3 * vi10c2;
517
518 const float vk20c2x0 = w[36];
519 const float vk20c2x1 = w[37];
520 const float vk20c2x2 = w[38];
521 const float vk20c2x3 = w[39];
522
523 voc0 += vk20c2x0 * vi20c2;
524 voc1 += vk20c2x1 * vi20c2;
525 voc2 += vk20c2x2 * vi20c2;
526 voc3 += vk20c2x3 * vi20c2;
527
528 const float vk01c0x0 = w[40];
529 const float vk01c0x1 = w[41];
530 const float vk01c0x2 = w[42];
531 const float vk01c0x3 = w[43];
532
533 const float vi01c0 = i0[0];
534
535 voc0 += vk01c0x0 * vi01c0;
536 voc1 += vk01c0x1 * vi01c0;
537 voc2 += vk01c0x2 * vi01c0;
538 voc3 += vk01c0x3 * vi01c0;
539
540 const float vk11c0x0 = w[44];
541 const float vk11c0x1 = w[45];
542 const float vk11c0x2 = w[46];
543 const float vk11c0x3 = w[47];
544
545 const float vi11c0 = i1[0];
546
547 voc0 += vk11c0x0 * vi11c0;
548 voc1 += vk11c0x1 * vi11c0;
549 voc2 += vk11c0x2 * vi11c0;
550 voc3 += vk11c0x3 * vi11c0;
551
552 const float vk21c0x0 = w[48];
553 const float vk21c0x1 = w[49];
554 const float vk21c0x2 = w[50];
555 const float vk21c0x3 = w[51];
556
557 const float vi21c0 = i2[0];
558
559 voc0 += vk21c0x0 * vi21c0;
560 voc1 += vk21c0x1 * vi21c0;
561 voc2 += vk21c0x2 * vi21c0;
562 voc3 += vk21c0x3 * vi21c0;
563
564 const float vk01c1x0 = w[52];
565 const float vk01c1x1 = w[53];
566 const float vk01c1x2 = w[54];
567 const float vk01c1x3 = w[55];
568
569 const float vi01c1 = i0[1];
570
571 voc0 += vk01c1x0 * vi01c1;
572 voc1 += vk01c1x1 * vi01c1;
573 voc2 += vk01c1x2 * vi01c1;
574 voc3 += vk01c1x3 * vi01c1;
575
576 const float vk11c1x0 = w[56];
577 const float vk11c1x1 = w[57];
578 const float vk11c1x2 = w[58];
579 const float vk11c1x3 = w[59];
580
581 const float vi11c1 = i1[1];
582
583 voc0 += vk11c1x0 * vi11c1;
584 voc1 += vk11c1x1 * vi11c1;
585 voc2 += vk11c1x2 * vi11c1;
586 voc3 += vk11c1x3 * vi11c1;
587
588 const float vk21c1x0 = w[60];
589 const float vk21c1x1 = w[61];
590 const float vk21c1x2 = w[62];
591 const float vk21c1x3 = w[63];
592
593 const float vi21c1 = i2[1];
594
595 voc0 += vk21c1x0 * vi21c1;
596 voc1 += vk21c1x1 * vi21c1;
597 voc2 += vk21c1x2 * vi21c1;
598 voc3 += vk21c1x3 * vi21c1;
599
600 const float vk01c2x0 = w[64];
601 const float vk01c2x1 = w[65];
602 const float vk01c2x2 = w[66];
603 const float vk01c2x3 = w[67];
604
605 const float vi01c2 = i0[2];
606
607 voc0 += vk01c2x0 * vi01c2;
608 voc1 += vk01c2x1 * vi01c2;
609 voc2 += vk01c2x2 * vi01c2;
610 voc3 += vk01c2x3 * vi01c2;
611
612 const float vk11c2x0 = w[68];
613 const float vk11c2x1 = w[69];
614 const float vk11c2x2 = w[70];
615 const float vk11c2x3 = w[71];
616
617 const float vi11c2 = i1[2];
618
619 voc0 += vk11c2x0 * vi11c2;
620 voc1 += vk11c2x1 * vi11c2;
621 voc2 += vk11c2x2 * vi11c2;
622 voc3 += vk11c2x3 * vi11c2;
623
624 const float vk21c2x0 = w[72];
625 const float vk21c2x1 = w[73];
626 const float vk21c2x2 = w[74];
627 const float vk21c2x3 = w[75];
628
629 const float vi21c2 = i2[2];
630
631 voc0 += vk21c2x0 * vi21c2;
632 voc1 += vk21c2x1 * vi21c2;
633 voc2 += vk21c2x2 * vi21c2;
634 voc3 += vk21c2x3 * vi21c2;
635
636 voc0 = math_min_f32(voc0, voutput_max);
637 voc1 = math_min_f32(voc1, voutput_max);
638 voc2 = math_min_f32(voc2, voutput_max);
639 voc3 = math_min_f32(voc3, voutput_max);
640
641 voc0 = math_max_f32(voc0, voutput_min);
642 voc1 = math_max_f32(voc1, voutput_min);
643 voc2 = math_max_f32(voc2, voutput_min);
644 voc3 = math_max_f32(voc3, voutput_min);
645
646 if XNN_LIKELY(c >= 4) {
647 o0[0] = voc0;
648 o0[1] = voc1;
649 o0[2] = voc2;
650 o0[3] = voc3;
651 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
652 } else {
653 float* o0_tmp = o0;
654 if (c & 2) {
655 o0_tmp[0] = voc0;
656 o0_tmp[1] = voc1;
657 o0_tmp += 2;
658 voc0 = voc2;
659 }
660 if (c & 1) {
661 *o0_tmp++ = voc0;
662 }
663 o0 = (float*) ((uintptr_t) o0 + output_width_stride);
664 }
665 }
666 // Move output pointers back to the position of the first pixel in a row,
667 // and forward to the next block of output channels
668 o0 = (float*) ((uintptr_t) o0 + output_channel_increment);
669 // Revert input pointers to the position of the first pixel in a row
670 i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
671 i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
672 i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
673 // Move to the block of weights for the next 4 output channels
674 w += 112;
675 c = doz(c, 4);
676 } while (c != 0);
677 // Move output pointers forward to the next row
678 output0 = (float*) ((uintptr_t) output0 + output_height_stride);
679 // Move input pointers forward to the next row
680 i0 = i2;
681 i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
682 i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
683 }
684 }
685