1 // Auto-generated file. Do not edit!
2 // Template: src/qs8-gavgpool/multipass-scalar.c.in
3 // Generator: tools/xngen
4 //
5 // Copyright 2021 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9
10 #include <assert.h>
11
12 #include <fp16.h>
13
14 #include <xnnpack/gavgpool.h>
15 #include <xnnpack/math.h>
16
17
xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2(size_t rows,size_t channels,const uint8_t * input,size_t input_stride,const uint8_t * zero,int32_t * buffer,uint8_t * output,const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])18 void xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c2(
19 size_t rows,
20 size_t channels,
21 const uint8_t* input,
22 size_t input_stride,
23 const uint8_t* zero,
24 int32_t* buffer,
25 uint8_t* output,
26 const union xnn_qu8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
27 {
28 assert(rows > 7);
29 assert(channels != 0);
30
31 const uint8_t* i0 = input;
32 const uint8_t* i1 = (const uint8_t*) ((uintptr_t) i0 + input_stride);
33 const uint8_t* i2 = (const uint8_t*) ((uintptr_t) i1 + input_stride);
34 const uint8_t* i3 = (const uint8_t*) ((uintptr_t) i2 + input_stride);
35 const uint8_t* i4 = (const uint8_t*) ((uintptr_t) i3 + input_stride);
36 const uint8_t* i5 = (const uint8_t*) ((uintptr_t) i4 + input_stride);
37 const uint8_t* i6 = (const uint8_t*) ((uintptr_t) i5 + input_stride);
38 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 2) * sizeof(uint8_t);
39
40 const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
41 int32_t* b = buffer;
42 for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
43 const int32_t vi0x0 = (int32_t) i0[0];
44 const int32_t vi0x1 = (int32_t) i0[1];
45 i0 += 2;
46
47 int32_t vacc0 = vi0x0 + vinit_bias;
48 const int32_t vi1x0 = (int32_t) i1[0];
49 int32_t vacc1 = vi0x1 + vinit_bias;
50 const int32_t vi1x1 = (int32_t) i1[1];
51 i1 += 2;
52
53 vacc0 += vi1x0;
54 const int32_t vi2x0 = (int32_t) i2[0];
55 vacc1 += vi1x1;
56 const int32_t vi2x1 = (int32_t) i2[1];
57 i2 += 2;
58 vacc0 += vi2x0;
59 const int32_t vi3x0 = (int32_t) i3[0];
60 vacc1 += vi2x1;
61 const int32_t vi3x1 = (int32_t) i3[1];
62 i3 += 2;
63 vacc0 += vi3x0;
64 const int32_t vi4x0 = (int32_t) i4[0];
65 vacc1 += vi3x1;
66 const int32_t vi4x1 = (int32_t) i4[1];
67 i4 += 2;
68 vacc0 += vi4x0;
69 const int32_t vi5x0 = (int32_t) i5[0];
70 vacc1 += vi4x1;
71 const int32_t vi5x1 = (int32_t) i5[1];
72 i5 += 2;
73 vacc0 += vi5x0;
74 const int32_t vi6x0 = (int32_t) i6[0];
75 vacc1 += vi5x1;
76 const int32_t vi6x1 = (int32_t) i6[1];
77 i6 += 2;
78
79 vacc0 += vi6x0;
80 vacc1 += vi6x1;
81
82 b[0] = vacc0;
83 b[1] = vacc1;
84 b += 2;
85 }
86
87 for (rows -= 7; rows > 7; rows -= 7) {
88 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
89 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
90 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
91 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
92 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
93 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
94 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
95
96 int32_t* b = buffer;
97 for (ptrdiff_t c = (ptrdiff_t) channels; c > 0; c -= 2) {
98 int32_t vacc0 = b[0];
99 const int32_t vi0x0 = (int32_t) i0[0];
100 int32_t vacc1 = b[1];
101 const int32_t vi0x1 = (int32_t) i0[1];
102 i0 += 2;
103
104 vacc0 += vi0x0;
105 const int32_t vi1x0 = (int32_t) i1[0];
106 vacc1 += vi0x1;
107 const int32_t vi1x1 = (int32_t) i1[1];
108 i1 += 2;
109 vacc0 += vi1x0;
110 const int32_t vi2x0 = (int32_t) i2[0];
111 vacc1 += vi1x1;
112 const int32_t vi2x1 = (int32_t) i2[1];
113 i2 += 2;
114 vacc0 += vi2x0;
115 const int32_t vi3x0 = (int32_t) i3[0];
116 vacc1 += vi2x1;
117 const int32_t vi3x1 = (int32_t) i3[1];
118 i3 += 2;
119 vacc0 += vi3x0;
120 const int32_t vi4x0 = (int32_t) i4[0];
121 vacc1 += vi3x1;
122 const int32_t vi4x1 = (int32_t) i4[1];
123 i4 += 2;
124 vacc0 += vi4x0;
125 const int32_t vi5x0 = (int32_t) i5[0];
126 vacc1 += vi4x1;
127 const int32_t vi5x1 = (int32_t) i5[1];
128 i5 += 2;
129 vacc0 += vi5x0;
130 const int32_t vi6x0 = (int32_t) i6[0];
131 vacc1 += vi5x1;
132 const int32_t vi6x1 = (int32_t) i6[1];
133 i6 += 2;
134
135 vacc0 += vi6x0;
136 vacc1 += vi6x1;
137
138 b[0] = vacc0;
139 b[1] = vacc1;
140 b += 2;
141 }
142 }
143
144 i0 = (const uint8_t*) ((uintptr_t) i0 + input_increment);
145 i1 = (const uint8_t*) ((uintptr_t) i1 + input_increment);
146 if XNN_UNPREDICTABLE(rows < 2) {
147 i1 = zero;
148 }
149 i2 = (const uint8_t*) ((uintptr_t) i2 + input_increment);
150 if XNN_UNPREDICTABLE(rows <= 2) {
151 i2 = zero;
152 }
153 i3 = (const uint8_t*) ((uintptr_t) i3 + input_increment);
154 if XNN_UNPREDICTABLE(rows < 4) {
155 i3 = zero;
156 }
157 i4 = (const uint8_t*) ((uintptr_t) i4 + input_increment);
158 if XNN_UNPREDICTABLE(rows <= 4) {
159 i4 = zero;
160 }
161 i5 = (const uint8_t*) ((uintptr_t) i5 + input_increment);
162 if XNN_UNPREDICTABLE(rows < 6) {
163 i5 = zero;
164 }
165 i6 = (const uint8_t*) ((uintptr_t) i6 + input_increment);
166 if XNN_UNPREDICTABLE(rows <= 6) {
167 i6 = zero;
168 }
169
170 const float vscale = params->fp32_scalar_imagic.scale;
171 const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
172 const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
173 const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
174 const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
175 for (; channels >= 2; channels -= 2) {
176 int32_t vacc0 = buffer[0];
177 const int32_t vi0x0 = (int32_t) i0[0];
178 int32_t vacc1 = buffer[1];
179 const int32_t vi0x1 = (int32_t) i0[1];
180 buffer += 2;
181 i0 += 2;
182
183 vacc0 += vi0x0;
184 const int32_t vi1x0 = (int32_t) i1[0];
185 vacc1 += vi0x1;
186 const int32_t vi1x1 = (int32_t) i1[1];
187 i1 += 2;
188 vacc0 += vi1x0;
189 const int32_t vi2x0 = (int32_t) i2[0];
190 vacc1 += vi1x1;
191 const int32_t vi2x1 = (int32_t) i2[1];
192 i2 += 2;
193 vacc0 += vi2x0;
194 const int32_t vi3x0 = (int32_t) i3[0];
195 vacc1 += vi2x1;
196 const int32_t vi3x1 = (int32_t) i3[1];
197 i3 += 2;
198 vacc0 += vi3x0;
199 const int32_t vi4x0 = (int32_t) i4[0];
200 vacc1 += vi3x1;
201 const int32_t vi4x1 = (int32_t) i4[1];
202 i4 += 2;
203 vacc0 += vi4x0;
204 const int32_t vi5x0 = (int32_t) i5[0];
205 vacc1 += vi4x1;
206 const int32_t vi5x1 = (int32_t) i5[1];
207 i5 += 2;
208 vacc0 += vi5x0;
209 const int32_t vi6x0 = (int32_t) i6[0];
210 vacc1 += vi5x1;
211 const int32_t vi6x1 = (int32_t) i6[1];
212 i6 += 2;
213
214 vacc0 += vi6x0;
215 vacc1 += vi6x1;
216
217 float vfpacc0 = (float) vacc0 * vscale;
218 float vfpacc1 = (float) vacc1 * vscale;
219
220 vfpacc0 += vmagic_bias;
221 vfpacc1 += vmagic_bias;
222
223 int32_t vout0 = (int32_t) fp32_to_bits(vfpacc0);
224 int32_t vout1 = (int32_t) fp32_to_bits(vfpacc1);
225
226 vout0 = math_max_s32(vout0, vmagic_min);
227 vout1 = math_max_s32(vout1, vmagic_min);
228
229 vout0 = math_min_s32(vout0, vmagic_max);
230 vout1 = math_min_s32(vout1, vmagic_max);
231
232 vout0 -= vmagic_bias_less_zero_point;
233 vout1 -= vmagic_bias_less_zero_point;
234
235 output[0] = (uint8_t) vout0;
236 output[1] = (uint8_t) vout1;
237 output += 2;
238 }
239 if XNN_UNLIKELY(channels != 0) {
240 int32_t vacc = *buffer;
241 const int32_t vi0 = (int32_t) *i0;
242 const int32_t vi1 = (int32_t) *i1;
243
244 vacc += vi0;
245 const int32_t vi2 = (int32_t) *i2;
246 vacc += vi1;
247 const int32_t vi3 = (int32_t) *i3;
248 vacc += vi2;
249 const int32_t vi4 = (int32_t) *i4;
250 vacc += vi3;
251 const int32_t vi5 = (int32_t) *i5;
252 vacc += vi4;
253 const int32_t vi6 = (int32_t) *i6;
254
255 vacc += vi5;
256 vacc += vi6;
257
258 float vfpacc = (float) vacc * vscale;
259 vfpacc += vmagic_bias;
260 int32_t vout = (int32_t) fp32_to_bits(vfpacc);
261 vout = math_max_s32(vout, vmagic_min);
262 vout = math_min_s32(vout, vmagic_max);
263 vout -= vmagic_bias_less_zero_point;
264
265 *output = (uint8_t) vout;
266 }
267 }
268