1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gavgpool/multipass-scalar.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2021 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <fp16.h>
13
14 #include <xnnpack/gavgpool.h>
15 #include <xnnpack/math.h>
16
17
xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4(size_t rows,size_t channels,const uint8_t * input,size_t input_stride,const uint8_t * zero,int32_t * buffer,uint8_t * output,const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4(
19 size_t rows,
20 size_t channels,
21 const uint8_t* input,
22 size_t input_stride,
23 const uint8_t* zero,
24 int32_t* buffer,
25 uint8_t* output,
26 const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
27 {
28 assert(rows > 7);
29 assert(channels != 0);
30
31 const uint8_t* i0 = input;
32 const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
33 const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
34 const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
35 const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
36 const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
37 const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
38 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(uint8_t);
39
40 const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
41 int32_t* b = buffer;
42 for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
43 const int32_t vi0x0 = (int32_t) i0[0];
44 const int32_t vi0x1 = (int32_t) i0[1];
45 const int32_t vi0x2 = (int32_t) i0[2];
46 const int32_t vi0x3 = (int32_t) i0[3];
47 i0 += 4;
48
49 int32_t vacc0 = vi0x0 + vinit_bias;
50 const int32_t vi1x0 = (int32_t) i1[0];
51 int32_t vacc1 = vi0x1 + vinit_bias;
52 const int32_t vi1x1 = (int32_t) i1[1];
53 int32_t vacc2 = vi0x2 + vinit_bias;
54 const int32_t vi1x2 = (int32_t) i1[2];
55 int32_t vacc3 = vi0x3 + vinit_bias;
56 const int32_t vi1x3 = (int32_t) i1[3];
57 i1 += 4;
58
59 vacc0 += vi1x0;
60 const int32_t vi2x0 = (int32_t) i2[0];
61 vacc1 += vi1x1;
62 const int32_t vi2x1 = (int32_t) i2[1];
63 vacc2 += vi1x2;
64 const int32_t vi2x2 = (int32_t) i2[2];
65 vacc3 += vi1x3;
66 const int32_t vi2x3 = (int32_t) i2[3];
67 i2 += 4;
68 vacc0 += vi2x0;
69 const int32_t vi3x0 = (int32_t) i3[0];
70 vacc1 += vi2x1;
71 const int32_t vi3x1 = (int32_t) i3[1];
72 vacc2 += vi2x2;
73 const int32_t vi3x2 = (int32_t) i3[2];
74 vacc3 += vi2x3;
75 const int32_t vi3x3 = (int32_t) i3[3];
76 i3 += 4;
77 vacc0 += vi3x0;
78 const int32_t vi4x0 = (int32_t) i4[0];
79 vacc1 += vi3x1;
80 const int32_t vi4x1 = (int32_t) i4[1];
81 vacc2 += vi3x2;
82 const int32_t vi4x2 = (int32_t) i4[2];
83 vacc3 += vi3x3;
84 const int32_t vi4x3 = (int32_t) i4[3];
85 i4 += 4;
86 vacc0 += vi4x0;
87 const int32_t vi5x0 = (int32_t) i5[0];
88 vacc1 += vi4x1;
89 const int32_t vi5x1 = (int32_t) i5[1];
90 vacc2 += vi4x2;
91 const int32_t vi5x2 = (int32_t) i5[2];
92 vacc3 += vi4x3;
93 const int32_t vi5x3 = (int32_t) i5[3];
94 i5 += 4;
95 vacc0 += vi5x0;
96 const int32_t vi6x0 = (int32_t) i6[0];
97 vacc1 += vi5x1;
98 const int32_t vi6x1 = (int32_t) i6[1];
99 vacc2 += vi5x2;
100 const int32_t vi6x2 = (int32_t) i6[2];
101 vacc3 += vi5x3;
102 const int32_t vi6x3 = (int32_t) i6[3];
103 i6 += 4;
104
105 vacc0 += vi6x0;
106 vacc1 += vi6x1;
107 vacc2 += vi6x2;
108 vacc3 += vi6x3;
109
110 b[0] = vacc0;
111 b[1] = vacc1;
112 b[2] = vacc2;
113 b[3] = vacc3;
114 b += 4;
115 }
116
117 for (rows -= 7; rows > 7; rows -= 7) {
118 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
119 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
120 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
121 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
122 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
123 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
124 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
125
126 int32_t* b = buffer;
127 for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
128 int32_t vacc0 = b[0];
129 const int32_t vi0x0 = (int32_t) i0[0];
130 int32_t vacc1 = b[1];
131 const int32_t vi0x1 = (int32_t) i0[1];
132 int32_t vacc2 = b[2];
133 const int32_t vi0x2 = (int32_t) i0[2];
134 int32_t vacc3 = b[3];
135 const int32_t vi0x3 = (int32_t) i0[3];
136 i0 += 4;
137
138 vacc0 += vi0x0;
139 const int32_t vi1x0 = (int32_t) i1[0];
140 vacc1 += vi0x1;
141 const int32_t vi1x1 = (int32_t) i1[1];
142 vacc2 += vi0x2;
143 const int32_t vi1x2 = (int32_t) i1[2];
144 vacc3 += vi0x3;
145 const int32_t vi1x3 = (int32_t) i1[3];
146 i1 += 4;
147 vacc0 += vi1x0;
148 const int32_t vi2x0 = (int32_t) i2[0];
149 vacc1 += vi1x1;
150 const int32_t vi2x1 = (int32_t) i2[1];
151 vacc2 += vi1x2;
152 const int32_t vi2x2 = (int32_t) i2[2];
153 vacc3 += vi1x3;
154 const int32_t vi2x3 = (int32_t) i2[3];
155 i2 += 4;
156 vacc0 += vi2x0;
157 const int32_t vi3x0 = (int32_t) i3[0];
158 vacc1 += vi2x1;
159 const int32_t vi3x1 = (int32_t) i3[1];
160 vacc2 += vi2x2;
161 const int32_t vi3x2 = (int32_t) i3[2];
162 vacc3 += vi2x3;
163 const int32_t vi3x3 = (int32_t) i3[3];
164 i3 += 4;
165 vacc0 += vi3x0;
166 const int32_t vi4x0 = (int32_t) i4[0];
167 vacc1 += vi3x1;
168 const int32_t vi4x1 = (int32_t) i4[1];
169 vacc2 += vi3x2;
170 const int32_t vi4x2 = (int32_t) i4[2];
171 vacc3 += vi3x3;
172 const int32_t vi4x3 = (int32_t) i4[3];
173 i4 += 4;
174 vacc0 += vi4x0;
175 const int32_t vi5x0 = (int32_t) i5[0];
176 vacc1 += vi4x1;
177 const int32_t vi5x1 = (int32_t) i5[1];
178 vacc2 += vi4x2;
179 const int32_t vi5x2 = (int32_t) i5[2];
180 vacc3 += vi4x3;
181 const int32_t vi5x3 = (int32_t) i5[3];
182 i5 += 4;
183 vacc0 += vi5x0;
184 const int32_t vi6x0 = (int32_t) i6[0];
185 vacc1 += vi5x1;
186 const int32_t vi6x1 = (int32_t) i6[1];
187 vacc2 += vi5x2;
188 const int32_t vi6x2 = (int32_t) i6[2];
189 vacc3 += vi5x3;
190 const int32_t vi6x3 = (int32_t) i6[3];
191 i6 += 4;
192
193 vacc0 += vi6x0;
194 vacc1 += vi6x1;
195 vacc2 += vi6x2;
196 vacc3 += vi6x3;
197
198 b[0] = vacc0;
199 b[1] = vacc1;
200 b[2] = vacc2;
201 b[3] = vacc3;
202 b += 4;
203 }
204 }
205
206 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
207 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
208 if XNN_UNPREDICTABLE(rows < 2) {
209 i1 = zero;
210 }
211 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
212 if XNN_UNPREDICTABLE(rows <= 2) {
213 i2 = zero;
214 }
215 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
216 if XNN_UNPREDICTABLE(rows < 4) {
217 i3 = zero;
218 }
219 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
220 if XNN_UNPREDICTABLE(rows <= 4) {
221 i4 = zero;
222 }
223 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
224 if XNN_UNPREDICTABLE(rows < 6) {
225 i5 = zero;
226 }
227 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
228 if XNN_UNPREDICTABLE(rows <= 6) {
229 i6 = zero;
230 }
231
232 const float vscale = params->fp32_scalar_imagic.scale;
233 const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
234 const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
235 const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
236 const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
237 for (; channels >= 4; channels -= 4) {
238 int32_t vacc0 = buffer[0];
239 const int32_t vi0x0 = (int32_t) i0[0];
240 int32_t vacc1 = buffer[1];
241 const int32_t vi0x1 = (int32_t) i0[1];
242 int32_t vacc2 = buffer[2];
243 const int32_t vi0x2 = (int32_t) i0[2];
244 int32_t vacc3 = buffer[3];
245 const int32_t vi0x3 = (int32_t) i0[3];
246 buffer += 4;
247 i0 += 4;
248
249 vacc0 += vi0x0;
250 const int32_t vi1x0 = (int32_t) i1[0];
251 vacc1 += vi0x1;
252 const int32_t vi1x1 = (int32_t) i1[1];
253 vacc2 += vi0x2;
254 const int32_t vi1x2 = (int32_t) i1[2];
255 vacc3 += vi0x3;
256 const int32_t vi1x3 = (int32_t) i1[3];
257 i1 += 4;
258 vacc0 += vi1x0;
259 const int32_t vi2x0 = (int32_t) i2[0];
260 vacc1 += vi1x1;
261 const int32_t vi2x1 = (int32_t) i2[1];
262 vacc2 += vi1x2;
263 const int32_t vi2x2 = (int32_t) i2[2];
264 vacc3 += vi1x3;
265 const int32_t vi2x3 = (int32_t) i2[3];
266 i2 += 4;
267 vacc0 += vi2x0;
268 const int32_t vi3x0 = (int32_t) i3[0];
269 vacc1 += vi2x1;
270 const int32_t vi3x1 = (int32_t) i3[1];
271 vacc2 += vi2x2;
272 const int32_t vi3x2 = (int32_t) i3[2];
273 vacc3 += vi2x3;
274 const int32_t vi3x3 = (int32_t) i3[3];
275 i3 += 4;
276 vacc0 += vi3x0;
277 const int32_t vi4x0 = (int32_t) i4[0];
278 vacc1 += vi3x1;
279 const int32_t vi4x1 = (int32_t) i4[1];
280 vacc2 += vi3x2;
281 const int32_t vi4x2 = (int32_t) i4[2];
282 vacc3 += vi3x3;
283 const int32_t vi4x3 = (int32_t) i4[3];
284 i4 += 4;
285 vacc0 += vi4x0;
286 const int32_t vi5x0 = (int32_t) i5[0];
287 vacc1 += vi4x1;
288 const int32_t vi5x1 = (int32_t) i5[1];
289 vacc2 += vi4x2;
290 const int32_t vi5x2 = (int32_t) i5[2];
291 vacc3 += vi4x3;
292 const int32_t vi5x3 = (int32_t) i5[3];
293 i5 += 4;
294 vacc0 += vi5x0;
295 const int32_t vi6x0 = (int32_t) i6[0];
296 vacc1 += vi5x1;
297 const int32_t vi6x1 = (int32_t) i6[1];
298 vacc2 += vi5x2;
299 const int32_t vi6x2 = (int32_t) i6[2];
300 vacc3 += vi5x3;
301 const int32_t vi6x3 = (int32_t) i6[3];
302 i6 += 4;
303
304 vacc0 += vi6x0;
305 vacc1 += vi6x1;
306 vacc2 += vi6x2;
307 vacc3 += vi6x3;
308
309 float vfpacc0 = (float) vacc0 * vscale;
310 float vfpacc1 = (float) vacc1 * vscale;
311 float vfpacc2 = (float) vacc2 * vscale;
312 float vfpacc3 = (float) vacc3 * vscale;
313
314 vfpacc0 += vmagic_bias;
315 vfpacc1 += vmagic_bias;
316 vfpacc2 += vmagic_bias;
317 vfpacc3 += vmagic_bias;
318
319 int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0);
320 int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1);
321 int32_t vout2 = (int32_t) fp32_to_bits(vfpacc2);
322 int32_t vout3 = (int32_t) fp32_to_bits(vfpacc3);
323
324 vout0 = math_max_s32(vout0, vmagic_min);
325 vout1 = math_max_s32(vout1, vmagic_min);
326 vout2 = math_max_s32(vout2, vmagic_min);
327 vout3 = math_max_s32(vout3, vmagic_min);
328
329 vout0 = math_min_s32(vout0, vmagic_max);
330 vout1 = math_min_s32(vout1, vmagic_max);
331 vout2 = math_min_s32(vout2, vmagic_max);
332 vout3 = math_min_s32(vout3, vmagic_max);
333
334 vout0 -= vmagic_bias_less_zero_point;
335 vout1 -= vmagic_bias_less_zero_point;
336 vout2 -= vmagic_bias_less_zero_point;
337 vout3 -= vmagic_bias_less_zero_point;
338
339 output[0] = (uint8_t) vout0;
340 output[1] = (uint8_t) vout1;
341 output[2] = (uint8_t) vout2;
342 output[3] = (uint8_t) vout3;
343 output += 4;
344 }
345 if XNN_UNLIKELY(channels != 0) {
346 do {
347 int32_t vacc = *buffer++;
348 const int32_t vi0 = (int32_t) *i0++;
349 const int32_t vi1 = (int32_t) *i1++;
350
351 vacc += vi0;
352 const int32_t vi2 = (int32_t) *i2++;
353 vacc += vi1;
354 const int32_t vi3 = (int32_t) *i3++;
355 vacc += vi2;
356 const int32_t vi4 = (int32_t) *i4++;
357 vacc += vi3;
358 const int32_t vi5 = (int32_t) *i5++;
359 vacc += vi4;
360 const int32_t vi6 = (int32_t) *i6++;
361
362 vacc += vi5;
363 vacc += vi6;
364
365 float vfpacc = (float) vacc * vscale;
366 vfpacc += vmagic_bias;
367 int32_t vout = (int32_t) fp32_to_bits(vfpacc);
368 vout = math_max_s32(vout, vmagic_min);
369 vout = math_min_s32(vout, vmagic_max);
370 vout -= vmagic_bias_less_zero_point;
371
372 *output++ = (uint8_t) vout;
373 } while (--channels != 0);
374 }
375 }
376