1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <xnnpack/conv.h>
9 #include <xnnpack/math.h>
10
11
xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1(size_t input_height,size_t input_width,size_t output_y_start,size_t output_y_end,const float * input,const float * zero,const float * weights,float * output,size_t input_padding_top,size_t output_channels,size_t output_height_stride,size_t output_channel_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])12 void xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1(
13 size_t input_height,
14 size_t input_width,
15 size_t output_y_start,
16 size_t output_y_end,
17 const float* input,
18 const float* zero,
19 const float* weights,
20 float* output,
21 size_t input_padding_top,
22 size_t output_channels,
23 size_t output_height_stride,
24 size_t output_channel_stride,
25 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
26 {
27 assert(input_width != 0);
28 assert(output_y_end > output_y_start);
29 assert(input_padding_top <= 1);
30 assert(output_channels != 0);
31
32 const size_t input_height_stride = input_width * 3 /* channels */ * sizeof(float);
33 const size_t input_width_decrement = round_down_po2(input_width, 2) * 3 /* channels */ * sizeof(float);
34 const size_t output_width = (input_width + 1) / 2;
35 const size_t output_channel_increment = output_channel_stride * 4 - output_width * sizeof(float);
36
37 // Adjustment for padding processed below
38 const float* i0 = (const float*) ((uintptr_t) input + input_height_stride * (output_y_start * 2 - input_padding_top));
39 const float* i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
40 const float* i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
41 float* output0 = (float*) ((uintptr_t) output + output_height_stride * output_y_start);
42
43 if XNN_UNPREDICTABLE(output_y_start < input_padding_top) {
44 i0 = zero;
45 }
46
47 const float voutput_max = params->scalar.max;
48 const float voutput_min = params->scalar.min;
49
50 for (size_t output_y = output_y_start; output_y < output_y_end; output_y += 1) {
51 const size_t input_y2 = output_y * 2 + 2 - input_padding_top;
52 if XNN_UNPREDICTABLE(input_y2 >= input_height) {
53 i2 = zero;
54 }
55
56 const float* w = weights;
57 size_t c = output_channels;
58 float* o0c0 = output0;
59 float* o0c1 = (float*) ((uintptr_t) o0c0 + output_channel_stride);
60 float* o0c2 = (float*) ((uintptr_t) o0c1 + output_channel_stride);
61 float* o0c3 = (float*) ((uintptr_t) o0c2 + output_channel_stride);
62 do {
63 if XNN_UNPREDICTABLE(c < 2) {
64 o0c1 = o0c0;
65 }
66 if XNN_UNPREDICTABLE(c <= 2) {
67 o0c2 = o0c1;
68 }
69 if XNN_UNPREDICTABLE(c < 4) {
70 o0c3 = o0c2;
71 }
72
73 // Left edge padding
74 float vi00c0 = 0.0f;
75 float vi00c1 = 0.0f;
76 float vi00c2 = 0.0f;
77 float vi10c0 = 0.0f;
78 float vi10c1 = 0.0f;
79 float vi10c2 = 0.0f;
80 float vi20c0 = 0.0f;
81 float vi20c1 = 0.0f;
82 float vi20c2 = 0.0f;
83
84 size_t iw = input_width;
85 for (; iw >= 2; iw -= 2) {
86 float voc0 = w[0];
87 float voc1 = w[1];
88 float voc2 = w[2];
89 float voc3 = w[3];
90
91 const float vk00c0x0 = w[4];
92 const float vk00c0x1 = w[5];
93 const float vk00c0x2 = w[6];
94 const float vk00c0x3 = w[7];
95
96 voc0 += vk00c0x0 * vi00c0;
97 voc1 += vk00c0x1 * vi00c0;
98 voc2 += vk00c0x2 * vi00c0;
99 voc3 += vk00c0x3 * vi00c0;
100
101 const float vk10c0x0 = w[8];
102 const float vk10c0x1 = w[9];
103 const float vk10c0x2 = w[10];
104 const float vk10c0x3 = w[11];
105
106 voc0 += vk10c0x0 * vi10c0;
107 voc1 += vk10c0x1 * vi10c0;
108 voc2 += vk10c0x2 * vi10c0;
109 voc3 += vk10c0x3 * vi10c0;
110
111 const float vk20c0x0 = w[12];
112 const float vk20c0x1 = w[13];
113 const float vk20c0x2 = w[14];
114 const float vk20c0x3 = w[15];
115
116 voc0 += vk20c0x0 * vi20c0;
117 voc1 += vk20c0x1 * vi20c0;
118 voc2 += vk20c0x2 * vi20c0;
119 voc3 += vk20c0x3 * vi20c0;
120
121 const float vk00c1x0 = w[16];
122 const float vk00c1x1 = w[17];
123 const float vk00c1x2 = w[18];
124 const float vk00c1x3 = w[19];
125
126 voc0 += vk00c1x0 * vi00c1;
127 voc1 += vk00c1x1 * vi00c1;
128 voc2 += vk00c1x2 * vi00c1;
129 voc3 += vk00c1x3 * vi00c1;
130
131 const float vk10c1x0 = w[20];
132 const float vk10c1x1 = w[21];
133 const float vk10c1x2 = w[22];
134 const float vk10c1x3 = w[23];
135
136 voc0 += vk10c1x0 * vi10c1;
137 voc1 += vk10c1x1 * vi10c1;
138 voc2 += vk10c1x2 * vi10c1;
139 voc3 += vk10c1x3 * vi10c1;
140
141 const float vk20c1x0 = w[24];
142 const float vk20c1x1 = w[25];
143 const float vk20c1x2 = w[26];
144 const float vk20c1x3 = w[27];
145
146 voc0 += vk20c1x0 * vi20c1;
147 voc1 += vk20c1x1 * vi20c1;
148 voc2 += vk20c1x2 * vi20c1;
149 voc3 += vk20c1x3 * vi20c1;
150
151 const float vk00c2x0 = w[28];
152 const float vk00c2x1 = w[29];
153 const float vk00c2x2 = w[30];
154 const float vk00c2x3 = w[31];
155
156 voc0 += vk00c2x0 * vi00c2;
157 voc1 += vk00c2x1 * vi00c2;
158 voc2 += vk00c2x2 * vi00c2;
159 voc3 += vk00c2x3 * vi00c2;
160
161 const float vk10c2x0 = w[32];
162 const float vk10c2x1 = w[33];
163 const float vk10c2x2 = w[34];
164 const float vk10c2x3 = w[35];
165
166 voc0 += vk10c2x0 * vi10c2;
167 voc1 += vk10c2x1 * vi10c2;
168 voc2 += vk10c2x2 * vi10c2;
169 voc3 += vk10c2x3 * vi10c2;
170
171 const float vk20c2x0 = w[36];
172 const float vk20c2x1 = w[37];
173 const float vk20c2x2 = w[38];
174 const float vk20c2x3 = w[39];
175
176 voc0 += vk20c2x0 * vi20c2;
177 voc1 += vk20c2x1 * vi20c2;
178 voc2 += vk20c2x2 * vi20c2;
179 voc3 += vk20c2x3 * vi20c2;
180
181 const float vk01c0x0 = w[40];
182 const float vk01c0x1 = w[41];
183 const float vk01c0x2 = w[42];
184 const float vk01c0x3 = w[43];
185
186 const float vi01c0 = i0[0];
187
188 voc0 += vk01c0x0 * vi01c0;
189 voc1 += vk01c0x1 * vi01c0;
190 voc2 += vk01c0x2 * vi01c0;
191 voc3 += vk01c0x3 * vi01c0;
192
193 const float vk11c0x0 = w[44];
194 const float vk11c0x1 = w[45];
195 const float vk11c0x2 = w[46];
196 const float vk11c0x3 = w[47];
197
198 const float vi11c0 = i1[0];
199
200 voc0 += vk11c0x0 * vi11c0;
201 voc1 += vk11c0x1 * vi11c0;
202 voc2 += vk11c0x2 * vi11c0;
203 voc3 += vk11c0x3 * vi11c0;
204
205 const float vk21c0x0 = w[48];
206 const float vk21c0x1 = w[49];
207 const float vk21c0x2 = w[50];
208 const float vk21c0x3 = w[51];
209
210 const float vi21c0 = i2[0];
211
212 voc0 += vk21c0x0 * vi21c0;
213 voc1 += vk21c0x1 * vi21c0;
214 voc2 += vk21c0x2 * vi21c0;
215 voc3 += vk21c0x3 * vi21c0;
216
217 const float vk01c1x0 = w[52];
218 const float vk01c1x1 = w[53];
219 const float vk01c1x2 = w[54];
220 const float vk01c1x3 = w[55];
221
222 const float vi01c1 = i0[1];
223
224 voc0 += vk01c1x0 * vi01c1;
225 voc1 += vk01c1x1 * vi01c1;
226 voc2 += vk01c1x2 * vi01c1;
227 voc3 += vk01c1x3 * vi01c1;
228
229 const float vk11c1x0 = w[56];
230 const float vk11c1x1 = w[57];
231 const float vk11c1x2 = w[58];
232 const float vk11c1x3 = w[59];
233
234 const float vi11c1 = i1[1];
235
236 voc0 += vk11c1x0 * vi11c1;
237 voc1 += vk11c1x1 * vi11c1;
238 voc2 += vk11c1x2 * vi11c1;
239 voc3 += vk11c1x3 * vi11c1;
240
241 const float vk21c1x0 = w[60];
242 const float vk21c1x1 = w[61];
243 const float vk21c1x2 = w[62];
244 const float vk21c1x3 = w[63];
245
246 const float vi21c1 = i2[1];
247
248 voc0 += vk21c1x0 * vi21c1;
249 voc1 += vk21c1x1 * vi21c1;
250 voc2 += vk21c1x2 * vi21c1;
251 voc3 += vk21c1x3 * vi21c1;
252
253 const float vk01c2x0 = w[64];
254 const float vk01c2x1 = w[65];
255 const float vk01c2x2 = w[66];
256 const float vk01c2x3 = w[67];
257
258 const float vi01c2 = i0[2];
259
260 voc0 += vk01c2x0 * vi01c2;
261 voc1 += vk01c2x1 * vi01c2;
262 voc2 += vk01c2x2 * vi01c2;
263 voc3 += vk01c2x3 * vi01c2;
264
265 const float vk11c2x0 = w[68];
266 const float vk11c2x1 = w[69];
267 const float vk11c2x2 = w[70];
268 const float vk11c2x3 = w[71];
269
270 const float vi11c2 = i1[2];
271
272 voc0 += vk11c2x0 * vi11c2;
273 voc1 += vk11c2x1 * vi11c2;
274 voc2 += vk11c2x2 * vi11c2;
275 voc3 += vk11c2x3 * vi11c2;
276
277 const float vk21c2x0 = w[72];
278 const float vk21c2x1 = w[73];
279 const float vk21c2x2 = w[74];
280 const float vk21c2x3 = w[75];
281
282 const float vi21c2 = i2[2];
283
284 voc0 += vk21c2x0 * vi21c2;
285 voc1 += vk21c2x1 * vi21c2;
286 voc2 += vk21c2x2 * vi21c2;
287 voc3 += vk21c2x3 * vi21c2;
288
289 const float vk02c0x0 = w[76];
290 const float vk02c0x1 = w[77];
291 const float vk02c0x2 = w[78];
292 const float vk02c0x3 = w[79];
293
294 const float vi02c0 = i0[3];
295
296 voc0 += vk02c0x0 * vi02c0;
297 voc1 += vk02c0x1 * vi02c0;
298 voc2 += vk02c0x2 * vi02c0;
299 voc3 += vk02c0x3 * vi02c0;
300
301 const float vk12c0x0 = w[80];
302 const float vk12c0x1 = w[81];
303 const float vk12c0x2 = w[82];
304 const float vk12c0x3 = w[83];
305
306 const float vi12c0 = i1[3];
307
308 voc0 += vk12c0x0 * vi12c0;
309 voc1 += vk12c0x1 * vi12c0;
310 voc2 += vk12c0x2 * vi12c0;
311 voc3 += vk12c0x3 * vi12c0;
312
313 const float vk22c0x0 = w[84];
314 const float vk22c0x1 = w[85];
315 const float vk22c0x2 = w[86];
316 const float vk22c0x3 = w[87];
317
318 const float vi22c0 = i2[3];
319
320 voc0 += vk22c0x0 * vi22c0;
321 voc1 += vk22c0x1 * vi22c0;
322 voc2 += vk22c0x2 * vi22c0;
323 voc3 += vk22c0x3 * vi22c0;
324
325 vi00c0 = vi02c0;
326 vi10c0 = vi12c0;
327 vi20c0 = vi22c0;
328
329 const float vk02c1x0 = w[88];
330 const float vk02c1x1 = w[89];
331 const float vk02c1x2 = w[90];
332 const float vk02c1x3 = w[91];
333
334 const float vi02c1 = i0[4];
335
336 voc0 += vk02c1x0 * vi02c1;
337 voc1 += vk02c1x1 * vi02c1;
338 voc2 += vk02c1x2 * vi02c1;
339 voc3 += vk02c1x3 * vi02c1;
340
341 const float vk12c1x0 = w[92];
342 const float vk12c1x1 = w[93];
343 const float vk12c1x2 = w[94];
344 const float vk12c1x3 = w[95];
345
346 const float vi12c1 = i1[4];
347
348 voc0 += vk12c1x0 * vi12c1;
349 voc1 += vk12c1x1 * vi12c1;
350 voc2 += vk12c1x2 * vi12c1;
351 voc3 += vk12c1x3 * vi12c1;
352
353 const float vk22c1x0 = w[96];
354 const float vk22c1x1 = w[97];
355 const float vk22c1x2 = w[98];
356 const float vk22c1x3 = w[99];
357
358 const float vi22c1 = i2[4];
359
360 voc0 += vk22c1x0 * vi22c1;
361 voc1 += vk22c1x1 * vi22c1;
362 voc2 += vk22c1x2 * vi22c1;
363 voc3 += vk22c1x3 * vi22c1;
364
365 vi00c1 = vi02c1;
366 vi10c1 = vi12c1;
367 vi20c1 = vi22c1;
368
369 const float vk02c2x0 = w[100];
370 const float vk02c2x1 = w[101];
371 const float vk02c2x2 = w[102];
372 const float vk02c2x3 = w[103];
373
374 const float vi02c2 = i0[5];
375
376 voc0 += vk02c2x0 * vi02c2;
377 voc1 += vk02c2x1 * vi02c2;
378 voc2 += vk02c2x2 * vi02c2;
379 voc3 += vk02c2x3 * vi02c2;
380
381 const float vk12c2x0 = w[104];
382 const float vk12c2x1 = w[105];
383 const float vk12c2x2 = w[106];
384 const float vk12c2x3 = w[107];
385
386 const float vi12c2 = i1[5];
387
388 voc0 += vk12c2x0 * vi12c2;
389 voc1 += vk12c2x1 * vi12c2;
390 voc2 += vk12c2x2 * vi12c2;
391 voc3 += vk12c2x3 * vi12c2;
392
393 const float vk22c2x0 = w[108];
394 const float vk22c2x1 = w[109];
395 const float vk22c2x2 = w[110];
396 const float vk22c2x3 = w[111];
397
398 const float vi22c2 = i2[5];
399
400 voc0 += vk22c2x0 * vi22c2;
401 voc1 += vk22c2x1 * vi22c2;
402 voc2 += vk22c2x2 * vi22c2;
403 voc3 += vk22c2x3 * vi22c2;
404
405 vi00c2 = vi02c2;
406 vi10c2 = vi12c2;
407 vi20c2 = vi22c2;
408
409 voc0 = math_min_f32(voc0, voutput_max);
410 voc1 = math_min_f32(voc1, voutput_max);
411 voc2 = math_min_f32(voc2, voutput_max);
412 voc3 = math_min_f32(voc3, voutput_max);
413
414 voc0 = math_max_f32(voc0, voutput_min);
415 voc1 = math_max_f32(voc1, voutput_min);
416 voc2 = math_max_f32(voc2, voutput_min);
417 voc3 = math_max_f32(voc3, voutput_min);
418
419 *o0c0++ = voc0;
420 *o0c1++ = voc1;
421 *o0c2++ = voc2;
422 *o0c3++ = voc3;
423
424 i0 += 6;
425 i1 += 6;
426 i2 += 6;
427 }
428 assert(iw < 2);
429 if XNN_UNLIKELY(iw != 0) {
430 float voc0 = w[0];
431 float voc1 = w[1];
432 float voc2 = w[2];
433 float voc3 = w[3];
434
435 const float vk00c0x0 = w[4];
436 const float vk00c0x1 = w[5];
437 const float vk00c0x2 = w[6];
438 const float vk00c0x3 = w[7];
439
440 voc0 += vk00c0x0 * vi00c0;
441 voc1 += vk00c0x1 * vi00c0;
442 voc2 += vk00c0x2 * vi00c0;
443 voc3 += vk00c0x3 * vi00c0;
444
445 const float vk10c0x0 = w[8];
446 const float vk10c0x1 = w[9];
447 const float vk10c0x2 = w[10];
448 const float vk10c0x3 = w[11];
449
450 voc0 += vk10c0x0 * vi10c0;
451 voc1 += vk10c0x1 * vi10c0;
452 voc2 += vk10c0x2 * vi10c0;
453 voc3 += vk10c0x3 * vi10c0;
454
455 const float vk20c0x0 = w[12];
456 const float vk20c0x1 = w[13];
457 const float vk20c0x2 = w[14];
458 const float vk20c0x3 = w[15];
459
460 voc0 += vk20c0x0 * vi20c0;
461 voc1 += vk20c0x1 * vi20c0;
462 voc2 += vk20c0x2 * vi20c0;
463 voc3 += vk20c0x3 * vi20c0;
464
465 const float vk00c1x0 = w[16];
466 const float vk00c1x1 = w[17];
467 const float vk00c1x2 = w[18];
468 const float vk00c1x3 = w[19];
469
470 voc0 += vk00c1x0 * vi00c1;
471 voc1 += vk00c1x1 * vi00c1;
472 voc2 += vk00c1x2 * vi00c1;
473 voc3 += vk00c1x3 * vi00c1;
474
475 const float vk10c1x0 = w[20];
476 const float vk10c1x1 = w[21];
477 const float vk10c1x2 = w[22];
478 const float vk10c1x3 = w[23];
479
480 voc0 += vk10c1x0 * vi10c1;
481 voc1 += vk10c1x1 * vi10c1;
482 voc2 += vk10c1x2 * vi10c1;
483 voc3 += vk10c1x3 * vi10c1;
484
485 const float vk20c1x0 = w[24];
486 const float vk20c1x1 = w[25];
487 const float vk20c1x2 = w[26];
488 const float vk20c1x3 = w[27];
489
490 voc0 += vk20c1x0 * vi20c1;
491 voc1 += vk20c1x1 * vi20c1;
492 voc2 += vk20c1x2 * vi20c1;
493 voc3 += vk20c1x3 * vi20c1;
494
495 const float vk00c2x0 = w[28];
496 const float vk00c2x1 = w[29];
497 const float vk00c2x2 = w[30];
498 const float vk00c2x3 = w[31];
499
500 voc0 += vk00c2x0 * vi00c2;
501 voc1 += vk00c2x1 * vi00c2;
502 voc2 += vk00c2x2 * vi00c2;
503 voc3 += vk00c2x3 * vi00c2;
504
505 const float vk10c2x0 = w[32];
506 const float vk10c2x1 = w[33];
507 const float vk10c2x2 = w[34];
508 const float vk10c2x3 = w[35];
509
510 voc0 += vk10c2x0 * vi10c2;
511 voc1 += vk10c2x1 * vi10c2;
512 voc2 += vk10c2x2 * vi10c2;
513 voc3 += vk10c2x3 * vi10c2;
514
515 const float vk20c2x0 = w[36];
516 const float vk20c2x1 = w[37];
517 const float vk20c2x2 = w[38];
518 const float vk20c2x3 = w[39];
519
520 voc0 += vk20c2x0 * vi20c2;
521 voc1 += vk20c2x1 * vi20c2;
522 voc2 += vk20c2x2 * vi20c2;
523 voc3 += vk20c2x3 * vi20c2;
524
525 const float vk01c0x0 = w[40];
526 const float vk01c0x1 = w[41];
527 const float vk01c0x2 = w[42];
528 const float vk01c0x3 = w[43];
529
530 const float vi01c0 = i0[0];
531
532 voc0 += vk01c0x0 * vi01c0;
533 voc1 += vk01c0x1 * vi01c0;
534 voc2 += vk01c0x2 * vi01c0;
535 voc3 += vk01c0x3 * vi01c0;
536
537 const float vk11c0x0 = w[44];
538 const float vk11c0x1 = w[45];
539 const float vk11c0x2 = w[46];
540 const float vk11c0x3 = w[47];
541
542 const float vi11c0 = i1[0];
543
544 voc0 += vk11c0x0 * vi11c0;
545 voc1 += vk11c0x1 * vi11c0;
546 voc2 += vk11c0x2 * vi11c0;
547 voc3 += vk11c0x3 * vi11c0;
548
549 const float vk21c0x0 = w[48];
550 const float vk21c0x1 = w[49];
551 const float vk21c0x2 = w[50];
552 const float vk21c0x3 = w[51];
553
554 const float vi21c0 = i2[0];
555
556 voc0 += vk21c0x0 * vi21c0;
557 voc1 += vk21c0x1 * vi21c0;
558 voc2 += vk21c0x2 * vi21c0;
559 voc3 += vk21c0x3 * vi21c0;
560
561 const float vk01c1x0 = w[52];
562 const float vk01c1x1 = w[53];
563 const float vk01c1x2 = w[54];
564 const float vk01c1x3 = w[55];
565
566 const float vi01c1 = i0[1];
567
568 voc0 += vk01c1x0 * vi01c1;
569 voc1 += vk01c1x1 * vi01c1;
570 voc2 += vk01c1x2 * vi01c1;
571 voc3 += vk01c1x3 * vi01c1;
572
573 const float vk11c1x0 = w[56];
574 const float vk11c1x1 = w[57];
575 const float vk11c1x2 = w[58];
576 const float vk11c1x3 = w[59];
577
578 const float vi11c1 = i1[1];
579
580 voc0 += vk11c1x0 * vi11c1;
581 voc1 += vk11c1x1 * vi11c1;
582 voc2 += vk11c1x2 * vi11c1;
583 voc3 += vk11c1x3 * vi11c1;
584
585 const float vk21c1x0 = w[60];
586 const float vk21c1x1 = w[61];
587 const float vk21c1x2 = w[62];
588 const float vk21c1x3 = w[63];
589
590 const float vi21c1 = i2[1];
591
592 voc0 += vk21c1x0 * vi21c1;
593 voc1 += vk21c1x1 * vi21c1;
594 voc2 += vk21c1x2 * vi21c1;
595 voc3 += vk21c1x3 * vi21c1;
596
597 const float vk01c2x0 = w[64];
598 const float vk01c2x1 = w[65];
599 const float vk01c2x2 = w[66];
600 const float vk01c2x3 = w[67];
601
602 const float vi01c2 = i0[2];
603
604 voc0 += vk01c2x0 * vi01c2;
605 voc1 += vk01c2x1 * vi01c2;
606 voc2 += vk01c2x2 * vi01c2;
607 voc3 += vk01c2x3 * vi01c2;
608
609 const float vk11c2x0 = w[68];
610 const float vk11c2x1 = w[69];
611 const float vk11c2x2 = w[70];
612 const float vk11c2x3 = w[71];
613
614 const float vi11c2 = i1[2];
615
616 voc0 += vk11c2x0 * vi11c2;
617 voc1 += vk11c2x1 * vi11c2;
618 voc2 += vk11c2x2 * vi11c2;
619 voc3 += vk11c2x3 * vi11c2;
620
621 const float vk21c2x0 = w[72];
622 const float vk21c2x1 = w[73];
623 const float vk21c2x2 = w[74];
624 const float vk21c2x3 = w[75];
625
626 const float vi21c2 = i2[2];
627
628 voc0 += vk21c2x0 * vi21c2;
629 voc1 += vk21c2x1 * vi21c2;
630 voc2 += vk21c2x2 * vi21c2;
631 voc3 += vk21c2x3 * vi21c2;
632
633 voc0 = math_min_f32(voc0, voutput_max);
634 voc1 = math_min_f32(voc1, voutput_max);
635 voc2 = math_min_f32(voc2, voutput_max);
636 voc3 = math_min_f32(voc3, voutput_max);
637
638 voc0 = math_max_f32(voc0, voutput_min);
639 voc1 = math_max_f32(voc1, voutput_min);
640 voc2 = math_max_f32(voc2, voutput_min);
641 voc3 = math_max_f32(voc3, voutput_min);
642
643 *o0c0++ = voc0;
644 *o0c1++ = voc1;
645 *o0c2++ = voc2;
646 *o0c3++ = voc3;
647 }
648 // Move output pointers back to the position of the first pixel in a row,
649 // and forward to the next block of output channels.
650 o0c0 = (float*) ((uintptr_t) o0c0 + output_channel_increment);
651 o0c1 = (float*) ((uintptr_t) o0c1 + output_channel_increment);
652 o0c2 = (float*) ((uintptr_t) o0c2 + output_channel_increment);
653 o0c3 = (float*) ((uintptr_t) o0c3 + output_channel_increment);
654 // Revert input pointers to the position of the first pixel in a row
655 i0 = (const float*) ((uintptr_t) i0 - input_width_decrement);
656 i1 = (const float*) ((uintptr_t) i1 - input_width_decrement);
657 i2 = (const float*) ((uintptr_t) i2 - input_width_decrement);
658 // Move to the block of weights for the next 4 output channels
659 w += 112;
660 c = doz(c, 4);
661 } while (c != 0);
662 // Move output pointers forward to the next row
663 output0 = (float*) ((uintptr_t) output0 + output_height_stride);
664 // Move input pointers forward to the next row
665 i0 = i2;
666 i1 = (const float*) ((uintptr_t) i0 + input_height_stride);
667 i2 = (const float*) ((uintptr_t) i1 + input_height_stride);
668 }
669 }
670