// Auto-generated file. Do not edit!
//   Template: src/qs8-gavgpool/multipass-scalar.c.in
//   Generator: tools/xngen
//
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9 
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xnnpack/gavgpool.h>
#include <xnnpack/math.h>
14 
15 
xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4(size_t rows,size_t channels,const int8_t * input,size_t input_stride,const int8_t * zero,int32_t * buffer,int8_t * output,const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_fmagic_c4(
17     size_t rows,
18     size_t channels,
19     const int8_t* input,
20     size_t input_stride,
21     const int8_t* zero,
22     int32_t* buffer,
23     int8_t* output,
24     const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
25 {
26   assert(rows > 7);
27   assert(channels != 0);
28 
29   const int8_t* i0 = input;
30   const int8_t* i1 = (const int8_t*) ((uintptr_t) i0 + input_stride);
31   const int8_t* i2 = (const int8_t*) ((uintptr_t) i1 + input_stride);
32   const int8_t* i3 = (const int8_t*) ((uintptr_t) i2 + input_stride);
33   const int8_t* i4 = (const int8_t*) ((uintptr_t) i3 + input_stride);
34   const int8_t* i5 = (const int8_t*) ((uintptr_t) i4 + input_stride);
35   const int8_t* i6 = (const int8_t*) ((uintptr_t) i5 + input_stride);
36   const size_t input_increment = 7 * input_stride - round_up_po2(channels, 4) * sizeof(int8_t);
37 
38   const int32_t vinit_bias = params->fp32_scalar_fmagic.init_bias;
39   int32_t* b = buffer;
40   for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
41     const int32_t vi0x0 = (int32_t) i0[0];
42     const int32_t vi0x1 = (int32_t) i0[1];
43     const int32_t vi0x2 = (int32_t) i0[2];
44     const int32_t vi0x3 = (int32_t) i0[3];
45     i0 += 4;
46 
47     int32_t vacc0 = vi0x0 + vinit_bias;
48     const int32_t vi1x0 = (int32_t) i1[0];
49     int32_t vacc1 = vi0x1 + vinit_bias;
50     const int32_t vi1x1 = (int32_t) i1[1];
51     int32_t vacc2 = vi0x2 + vinit_bias;
52     const int32_t vi1x2 = (int32_t) i1[2];
53     int32_t vacc3 = vi0x3 + vinit_bias;
54     const int32_t vi1x3 = (int32_t) i1[3];
55     i1 += 4;
56 
57     vacc0 += vi1x0;
58     const int32_t vi2x0 = (int32_t) i2[0];
59     vacc1 += vi1x1;
60     const int32_t vi2x1 = (int32_t) i2[1];
61     vacc2 += vi1x2;
62     const int32_t vi2x2 = (int32_t) i2[2];
63     vacc3 += vi1x3;
64     const int32_t vi2x3 = (int32_t) i2[3];
65     i2 += 4;
66     vacc0 += vi2x0;
67     const int32_t vi3x0 = (int32_t) i3[0];
68     vacc1 += vi2x1;
69     const int32_t vi3x1 = (int32_t) i3[1];
70     vacc2 += vi2x2;
71     const int32_t vi3x2 = (int32_t) i3[2];
72     vacc3 += vi2x3;
73     const int32_t vi3x3 = (int32_t) i3[3];
74     i3 += 4;
75     vacc0 += vi3x0;
76     const int32_t vi4x0 = (int32_t) i4[0];
77     vacc1 += vi3x1;
78     const int32_t vi4x1 = (int32_t) i4[1];
79     vacc2 += vi3x2;
80     const int32_t vi4x2 = (int32_t) i4[2];
81     vacc3 += vi3x3;
82     const int32_t vi4x3 = (int32_t) i4[3];
83     i4 += 4;
84     vacc0 += vi4x0;
85     const int32_t vi5x0 = (int32_t) i5[0];
86     vacc1 += vi4x1;
87     const int32_t vi5x1 = (int32_t) i5[1];
88     vacc2 += vi4x2;
89     const int32_t vi5x2 = (int32_t) i5[2];
90     vacc3 += vi4x3;
91     const int32_t vi5x3 = (int32_t) i5[3];
92     i5 += 4;
93     vacc0 += vi5x0;
94     const int32_t vi6x0 = (int32_t) i6[0];
95     vacc1 += vi5x1;
96     const int32_t vi6x1 = (int32_t) i6[1];
97     vacc2 += vi5x2;
98     const int32_t vi6x2 = (int32_t) i6[2];
99     vacc3 += vi5x3;
100     const int32_t vi6x3 = (int32_t) i6[3];
101     i6 += 4;
102 
103     vacc0 += vi6x0;
104     vacc1 += vi6x1;
105     vacc2 += vi6x2;
106     vacc3 += vi6x3;
107 
108     b[0] = vacc0;
109     b[1] = vacc1;
110     b[2] = vacc2;
111     b[3] = vacc3;
112     b += 4;
113   }
114 
115   for (rows -= 7; rows > 7; rows -= 7) {
116     i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
117     i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
118     i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
119     i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
120     i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
121     i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
122     i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
123 
124     int32_t* b = buffer;
125     for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 4) {
126       int32_t vacc0 = b[0];
127       const int32_t vi0x0 = (int32_t) i0[0];
128       int32_t vacc1 = b[1];
129       const int32_t vi0x1 = (int32_t) i0[1];
130       int32_t vacc2 = b[2];
131       const int32_t vi0x2 = (int32_t) i0[2];
132       int32_t vacc3 = b[3];
133       const int32_t vi0x3 = (int32_t) i0[3];
134       i0 += 4;
135 
136       vacc0 += vi0x0;
137       const int32_t vi1x0 = (int32_t) i1[0];
138       vacc1 += vi0x1;
139       const int32_t vi1x1 = (int32_t) i1[1];
140       vacc2 += vi0x2;
141       const int32_t vi1x2 = (int32_t) i1[2];
142       vacc3 += vi0x3;
143       const int32_t vi1x3 = (int32_t) i1[3];
144       i1 += 4;
145       vacc0 += vi1x0;
146       const int32_t vi2x0 = (int32_t) i2[0];
147       vacc1 += vi1x1;
148       const int32_t vi2x1 = (int32_t) i2[1];
149       vacc2 += vi1x2;
150       const int32_t vi2x2 = (int32_t) i2[2];
151       vacc3 += vi1x3;
152       const int32_t vi2x3 = (int32_t) i2[3];
153       i2 += 4;
154       vacc0 += vi2x0;
155       const int32_t vi3x0 = (int32_t) i3[0];
156       vacc1 += vi2x1;
157       const int32_t vi3x1 = (int32_t) i3[1];
158       vacc2 += vi2x2;
159       const int32_t vi3x2 = (int32_t) i3[2];
160       vacc3 += vi2x3;
161       const int32_t vi3x3 = (int32_t) i3[3];
162       i3 += 4;
163       vacc0 += vi3x0;
164       const int32_t vi4x0 = (int32_t) i4[0];
165       vacc1 += vi3x1;
166       const int32_t vi4x1 = (int32_t) i4[1];
167       vacc2 += vi3x2;
168       const int32_t vi4x2 = (int32_t) i4[2];
169       vacc3 += vi3x3;
170       const int32_t vi4x3 = (int32_t) i4[3];
171       i4 += 4;
172       vacc0 += vi4x0;
173       const int32_t vi5x0 = (int32_t) i5[0];
174       vacc1 += vi4x1;
175       const int32_t vi5x1 = (int32_t) i5[1];
176       vacc2 += vi4x2;
177       const int32_t vi5x2 = (int32_t) i5[2];
178       vacc3 += vi4x3;
179       const int32_t vi5x3 = (int32_t) i5[3];
180       i5 += 4;
181       vacc0 += vi5x0;
182       const int32_t vi6x0 = (int32_t) i6[0];
183       vacc1 += vi5x1;
184       const int32_t vi6x1 = (int32_t) i6[1];
185       vacc2 += vi5x2;
186       const int32_t vi6x2 = (int32_t) i6[2];
187       vacc3 += vi5x3;
188       const int32_t vi6x3 = (int32_t) i6[3];
189       i6 += 4;
190 
191       vacc0 += vi6x0;
192       vacc1 += vi6x1;
193       vacc2 += vi6x2;
194       vacc3 += vi6x3;
195 
196       b[0] = vacc0;
197       b[1] = vacc1;
198       b[2] = vacc2;
199       b[3] = vacc3;
200       b += 4;
201     }
202   }
203 
204   i0 = (const int8_t*) ((uintptr_t) i0 + input_increment);
205   i1 = (const int8_t*) ((uintptr_t) i1 + input_increment);
206   if XNN_UNPREDICTABLE(rows < 2) {
207     i1 = zero;
208   }
209   i2 = (const int8_t*) ((uintptr_t) i2 + input_increment);
210   if XNN_UNPREDICTABLE(rows <= 2) {
211     i2 = zero;
212   }
213   i3 = (const int8_t*) ((uintptr_t) i3 + input_increment);
214   if XNN_UNPREDICTABLE(rows < 4) {
215     i3 = zero;
216   }
217   i4 = (const int8_t*) ((uintptr_t) i4 + input_increment);
218   if XNN_UNPREDICTABLE(rows <= 4) {
219     i4 = zero;
220   }
221   i5 = (const int8_t*) ((uintptr_t) i5 + input_increment);
222   if XNN_UNPREDICTABLE(rows < 6) {
223     i5 = zero;
224   }
225   i6 = (const int8_t*) ((uintptr_t) i6 + input_increment);
226   if XNN_UNPREDICTABLE(rows <= 6) {
227     i6 = zero;
228   }
229 
230   const float vscale = params->fp32_scalar_fmagic.scale;
231   const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
232   const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
233   const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
234   const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
235   for (; channels >= 4; channels -= 4) {
236     int32_t vacc0 = buffer[0];
237     const int32_t vi0x0 = (int32_t) i0[0];
238     int32_t vacc1 = buffer[1];
239     const int32_t vi0x1 = (int32_t) i0[1];
240     int32_t vacc2 = buffer[2];
241     const int32_t vi0x2 = (int32_t) i0[2];
242     int32_t vacc3 = buffer[3];
243     const int32_t vi0x3 = (int32_t) i0[3];
244     buffer += 4;
245     i0 += 4;
246 
247     vacc0 += vi0x0;
248     const int32_t vi1x0 = (int32_t) i1[0];
249     vacc1 += vi0x1;
250     const int32_t vi1x1 = (int32_t) i1[1];
251     vacc2 += vi0x2;
252     const int32_t vi1x2 = (int32_t) i1[2];
253     vacc3 += vi0x3;
254     const int32_t vi1x3 = (int32_t) i1[3];
255     i1 += 4;
256     vacc0 += vi1x0;
257     const int32_t vi2x0 = (int32_t) i2[0];
258     vacc1 += vi1x1;
259     const int32_t vi2x1 = (int32_t) i2[1];
260     vacc2 += vi1x2;
261     const int32_t vi2x2 = (int32_t) i2[2];
262     vacc3 += vi1x3;
263     const int32_t vi2x3 = (int32_t) i2[3];
264     i2 += 4;
265     vacc0 += vi2x0;
266     const int32_t vi3x0 = (int32_t) i3[0];
267     vacc1 += vi2x1;
268     const int32_t vi3x1 = (int32_t) i3[1];
269     vacc2 += vi2x2;
270     const int32_t vi3x2 = (int32_t) i3[2];
271     vacc3 += vi2x3;
272     const int32_t vi3x3 = (int32_t) i3[3];
273     i3 += 4;
274     vacc0 += vi3x0;
275     const int32_t vi4x0 = (int32_t) i4[0];
276     vacc1 += vi3x1;
277     const int32_t vi4x1 = (int32_t) i4[1];
278     vacc2 += vi3x2;
279     const int32_t vi4x2 = (int32_t) i4[2];
280     vacc3 += vi3x3;
281     const int32_t vi4x3 = (int32_t) i4[3];
282     i4 += 4;
283     vacc0 += vi4x0;
284     const int32_t vi5x0 = (int32_t) i5[0];
285     vacc1 += vi4x1;
286     const int32_t vi5x1 = (int32_t) i5[1];
287     vacc2 += vi4x2;
288     const int32_t vi5x2 = (int32_t) i5[2];
289     vacc3 += vi4x3;
290     const int32_t vi5x3 = (int32_t) i5[3];
291     i5 += 4;
292     vacc0 += vi5x0;
293     const int32_t vi6x0 = (int32_t) i6[0];
294     vacc1 += vi5x1;
295     const int32_t vi6x1 = (int32_t) i6[1];
296     vacc2 += vi5x2;
297     const int32_t vi6x2 = (int32_t) i6[2];
298     vacc3 += vi5x3;
299     const int32_t vi6x3 = (int32_t) i6[3];
300     i6 += 4;
301 
302     vacc0 += vi6x0;
303     vacc1 += vi6x1;
304     vacc2 += vi6x2;
305     vacc3 += vi6x3;
306 
307     float vfpacc0 = (float) vacc0 * vscale;
308     float vfpacc1 = (float) vacc1 * vscale;
309     float vfpacc2 = (float) vacc2 * vscale;
310     float vfpacc3 = (float) vacc3 * vscale;
311 
312     vfpacc0 = math_max_f32(vfpacc0, voutput_min_less_zero_point);
313     vfpacc1 = math_max_f32(vfpacc1, voutput_min_less_zero_point);
314     vfpacc2 = math_max_f32(vfpacc2, voutput_min_less_zero_point);
315     vfpacc3 = math_max_f32(vfpacc3, voutput_min_less_zero_point);
316 
317     vfpacc0 = math_min_f32(vfpacc0, voutput_max_less_zero_point);
318     vfpacc1 = math_min_f32(vfpacc1, voutput_max_less_zero_point);
319     vfpacc2 = math_min_f32(vfpacc2, voutput_max_less_zero_point);
320     vfpacc3 = math_min_f32(vfpacc3, voutput_max_less_zero_point);
321 
322     vfpacc0 += vmagic_bias;
323     vfpacc1 += vmagic_bias;
324     vfpacc2 += vmagic_bias;
325     vfpacc3 += vmagic_bias;
326 
327     int32_t vout0 = (int32_t) float_as_uint32(vfpacc0) - vmagic_bias_less_output_zero_point;
328     int32_t vout1 = (int32_t) float_as_uint32(vfpacc1) - vmagic_bias_less_output_zero_point;
329     int32_t vout2 = (int32_t) float_as_uint32(vfpacc2) - vmagic_bias_less_output_zero_point;
330     int32_t vout3 = (int32_t) float_as_uint32(vfpacc3) - vmagic_bias_less_output_zero_point;
331 
332     output[0] = (int8_t) vout0;
333     output[1] = (int8_t) vout1;
334     output[2] = (int8_t) vout2;
335     output[3] = (int8_t) vout3;
336     output += 4;
337   }
338   if XNN_UNLIKELY(channels != 0) {
339     do {
340       int32_t vacc = *buffer++;
341       const int32_t vi0 = (int32_t) *i0++;
342       const int32_t vi1 = (int32_t) *i1++;
343 
344       vacc += vi0;
345       const int32_t vi2 = (int32_t) *i2++;
346       vacc += vi1;
347       const int32_t vi3 = (int32_t) *i3++;
348       vacc += vi2;
349       const int32_t vi4 = (int32_t) *i4++;
350       vacc += vi3;
351       const int32_t vi5 = (int32_t) *i5++;
352       vacc += vi4;
353       const int32_t vi6 = (int32_t) *i6++;
354 
355       vacc += vi5;
356       vacc += vi6;
357 
358       float vfpacc = (float) vacc * vscale;
359       vfpacc = math_max_f32(vfpacc, voutput_min_less_zero_point);
360       vfpacc = math_min_f32(vfpacc, voutput_max_less_zero_point);
361       vfpacc += vmagic_bias;
362       int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
363 
364       *output++ = (int8_t) vout;
365     } while (--channels != 0);
366   }
367 }
368