• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Auto-generated file. Do not edit!
2 //   Template: src/qs8-gavgpool/multipass-scalar.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2021 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <fp16.h>
13 
14 #include <xnnpack/gavgpool.h>
15 #include <xnnpack/math.h>
16 
17 
xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2(size_t rows,size_t channels,const uint8_t * input,size_t input_stride,const uint8_t * zero,int32_t * buffer,uint8_t * output,const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2(
19     size_t rows,
20     size_t channels,
21     const uint8_t* input,
22     size_t input_stride,
23     const uint8_t* zero,
24     int32_t* buffer,
25     uint8_t* output,
26     const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
27 {
28   assert(rows > 7);
29   assert(channels != 0);
30 
31   const uint8_t* i0 = input;
32   const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
33   const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
34   const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
35   const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
36   const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
37   const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
38   const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t);
39 
40   const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
41   int32_t* b = buffer;
42   for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
43     const int32_t vi0x0 = (int32_t) i0[0];
44     const int32_t vi0x1 = (int32_t) i0[1];
45     i0 += 2;
46 
47     int32_t vacc0 = vi0x0 + vinit_bias;
48     const int32_t vi1x0 = (int32_t) i1[0];
49     int32_t vacc1 = vi0x1 + vinit_bias;
50     const int32_t vi1x1 = (int32_t) i1[1];
51     i1 += 2;
52 
53     vacc0 += vi1x0;
54     const int32_t vi2x0 = (int32_t) i2[0];
55     vacc1 += vi1x1;
56     const int32_t vi2x1 = (int32_t) i2[1];
57     i2 += 2;
58     vacc0 += vi2x0;
59     const int32_t vi3x0 = (int32_t) i3[0];
60     vacc1 += vi2x1;
61     const int32_t vi3x1 = (int32_t) i3[1];
62     i3 += 2;
63     vacc0 += vi3x0;
64     const int32_t vi4x0 = (int32_t) i4[0];
65     vacc1 += vi3x1;
66     const int32_t vi4x1 = (int32_t) i4[1];
67     i4 += 2;
68     vacc0 += vi4x0;
69     const int32_t vi5x0 = (int32_t) i5[0];
70     vacc1 += vi4x1;
71     const int32_t vi5x1 = (int32_t) i5[1];
72     i5 += 2;
73     vacc0 += vi5x0;
74     const int32_t vi6x0 = (int32_t) i6[0];
75     vacc1 += vi5x1;
76     const int32_t vi6x1 = (int32_t) i6[1];
77     i6 += 2;
78 
79     vacc0 += vi6x0;
80     vacc1 += vi6x1;
81 
82     b[0] = vacc0;
83     b[1] = vacc1;
84     b += 2;
85   }
86 
87   for (rows -= 7; rows > 7; rows -= 7) {
88     i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
89     i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
90     i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
91     i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
92     i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
93     i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
94     i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
95 
96     int32_t* b = buffer;
97     for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
98       int32_t vacc0 = b[0];
99       const int32_t vi0x0 = (int32_t) i0[0];
100       int32_t vacc1 = b[1];
101       const int32_t vi0x1 = (int32_t) i0[1];
102       i0 += 2;
103 
104       vacc0 += vi0x0;
105       const int32_t vi1x0 = (int32_t) i1[0];
106       vacc1 += vi0x1;
107       const int32_t vi1x1 = (int32_t) i1[1];
108       i1 += 2;
109       vacc0 += vi1x0;
110       const int32_t vi2x0 = (int32_t) i2[0];
111       vacc1 += vi1x1;
112       const int32_t vi2x1 = (int32_t) i2[1];
113       i2 += 2;
114       vacc0 += vi2x0;
115       const int32_t vi3x0 = (int32_t) i3[0];
116       vacc1 += vi2x1;
117       const int32_t vi3x1 = (int32_t) i3[1];
118       i3 += 2;
119       vacc0 += vi3x0;
120       const int32_t vi4x0 = (int32_t) i4[0];
121       vacc1 += vi3x1;
122       const int32_t vi4x1 = (int32_t) i4[1];
123       i4 += 2;
124       vacc0 += vi4x0;
125       const int32_t vi5x0 = (int32_t) i5[0];
126       vacc1 += vi4x1;
127       const int32_t vi5x1 = (int32_t) i5[1];
128       i5 += 2;
129       vacc0 += vi5x0;
130       const int32_t vi6x0 = (int32_t) i6[0];
131       vacc1 += vi5x1;
132       const int32_t vi6x1 = (int32_t) i6[1];
133       i6 += 2;
134 
135       vacc0 += vi6x0;
136       vacc1 += vi6x1;
137 
138       b[0] = vacc0;
139       b[1] = vacc1;
140       b += 2;
141     }
142   }
143 
144   i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
145   i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
146   if XNN_UNPREDICTABLE(rows < 2) {
147     i1 = zero;
148   }
149   i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
150   if XNN_UNPREDICTABLE(rows <= 2) {
151     i2 = zero;
152   }
153   i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
154   if XNN_UNPREDICTABLE(rows < 4) {
155     i3 = zero;
156   }
157   i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
158   if XNN_UNPREDICTABLE(rows <= 4) {
159     i4 = zero;
160   }
161   i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
162   if XNN_UNPREDICTABLE(rows < 6) {
163     i5 = zero;
164   }
165   i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
166   if XNN_UNPREDICTABLE(rows <= 6) {
167     i6 = zero;
168   }
169 
170   const float vscale = params->fp32_scalar_imagic.scale;
171   const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
172   const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
173   const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
174   const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
175   for (; channels >= 2; channels -= 2) {
176     int32_t vacc0 = buffer[0];
177     const int32_t vi0x0 = (int32_t) i0[0];
178     int32_t vacc1 = buffer[1];
179     const int32_t vi0x1 = (int32_t) i0[1];
180     buffer += 2;
181     i0 += 2;
182 
183     vacc0 += vi0x0;
184     const int32_t vi1x0 = (int32_t) i1[0];
185     vacc1 += vi0x1;
186     const int32_t vi1x1 = (int32_t) i1[1];
187     i1 += 2;
188     vacc0 += vi1x0;
189     const int32_t vi2x0 = (int32_t) i2[0];
190     vacc1 += vi1x1;
191     const int32_t vi2x1 = (int32_t) i2[1];
192     i2 += 2;
193     vacc0 += vi2x0;
194     const int32_t vi3x0 = (int32_t) i3[0];
195     vacc1 += vi2x1;
196     const int32_t vi3x1 = (int32_t) i3[1];
197     i3 += 2;
198     vacc0 += vi3x0;
199     const int32_t vi4x0 = (int32_t) i4[0];
200     vacc1 += vi3x1;
201     const int32_t vi4x1 = (int32_t) i4[1];
202     i4 += 2;
203     vacc0 += vi4x0;
204     const int32_t vi5x0 = (int32_t) i5[0];
205     vacc1 += vi4x1;
206     const int32_t vi5x1 = (int32_t) i5[1];
207     i5 += 2;
208     vacc0 += vi5x0;
209     const int32_t vi6x0 = (int32_t) i6[0];
210     vacc1 += vi5x1;
211     const int32_t vi6x1 = (int32_t) i6[1];
212     i6 += 2;
213 
214     vacc0 += vi6x0;
215     vacc1 += vi6x1;
216 
217     float vfpacc0 = (float) vacc0 * vscale;
218     float vfpacc1 = (float) vacc1 * vscale;
219 
220     vfpacc0 += vmagic_bias;
221     vfpacc1 += vmagic_bias;
222 
223     int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0);
224     int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1);
225 
226     vout0 = math_max_s32(vout0, vmagic_min);
227     vout1 = math_max_s32(vout1, vmagic_min);
228 
229     vout0 = math_min_s32(vout0, vmagic_max);
230     vout1 = math_min_s32(vout1, vmagic_max);
231 
232     vout0 -= vmagic_bias_less_zero_point;
233     vout1 -= vmagic_bias_less_zero_point;
234 
235     output[0] = (uint8_t) vout0;
236     output[1] = (uint8_t) vout1;
237     output += 2;
238   }
239   if XNN_UNLIKELY(channels != 0) {
240     int32_t vacc = *buffer;
241     const int32_t vi0 = (int32_t) *i0;
242     const int32_t vi1 = (int32_t) *i1;
243 
244     vacc += vi0;
245     const int32_t vi2 = (int32_t) *i2;
246     vacc += vi1;
247     const int32_t vi3 = (int32_t) *i3;
248     vacc += vi2;
249     const int32_t vi4 = (int32_t) *i4;
250     vacc += vi3;
251     const int32_t vi5 = (int32_t) *i5;
252     vacc += vi4;
253     const int32_t vi6 = (int32_t) *i6;
254 
255     vacc += vi5;
256     vacc += vi6;
257 
258     float vfpacc = (float) vacc * vscale;
259     vfpacc += vmagic_bias;
260     int32_t vout = (int32_t) fp32_to_bits(vfpacc);
261     vout = math_max_s32(vout, vmagic_min);
262     vout = math_min_s32(vout, vmagic_max);
263     vout -= vmagic_bias_less_zero_point;
264 
265     *output = (uint8_t) vout;
266   }
267 }
268