1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <stdint.h>
7 #include <stddef.h>
8 #include <assert.h>
9 #include <math.h>
10
11 #include <fp16.h>
12
13 #include <xnnpack/math.h>
14 #include <xnnpack/params-init.h>
15
16
// Fills the fp32_scalar_fmagic member of a QU8 convolution min/max params union.
//
// scale is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// floats relative to output_zero_point, and the zero point itself is folded
// into magic_bias_less_output_zero_point.
void xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.kernel_zero_point = (int32_t) kernel_zero_point;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
}
35
// Fills the fp32_scalar_imagic member of a QU8 convolution min/max params union.
//
// scale is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// the integer bit patterns of (magic bias + bound - output_zero_point).
void xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const float max_less_zero_point = (float) ((int32_t) output_max - zero_point);
  const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);
  const int32_t magic_max = (int32_t) fp32_to_bits(12582912.0f + max_less_zero_point);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.kernel_zero_point = (int32_t) kernel_zero_point;
  params->fp32_scalar_imagic.magic_bias = 12582912.0f;
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_imagic.magic_min = magic_min;
  params->fp32_scalar_imagic.magic_max = magic_max;
}
56
// Fills the fp32_scalar_lrintf member of a QU8 convolution min/max params union.
//
// scale is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// floats relative to output_zero_point; the zero point is stored separately.
void xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_max_less_zero_point = (float) max_less_zero_point;
  params->fp32_scalar_lrintf.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.kernel_zero_point = (int32_t) kernel_zero_point;
}
74
75 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the fp32_sse2 member of a QU8 convolution min/max params union.
//
// Each scalar is replicated across an array: 4 entries for scale and the
// zero-point-relative upper bound, 8 entries for the kernel and output zero
// points, and 16 entries for output_min. scale is asserted to lie in [2^-32, 256).
void xnn_init_qu8_conv_minmax_fp32_sse2_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t kernel_zp = (int16_t) kernel_zero_point;
  const int16_t output_zp = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_sse2.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_sse2.output_zero_point[lane] = output_zp;
    params->fp32_sse2.kernel_zero_point[lane] = kernel_zp;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_sse2.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_sse2.scale[lane] = scale;
  }
}
100
// Fills the fp32_avx2 member of a QU8 convolution min/max params union.
//
// Each scalar is replicated across an array: 8 entries for scale and the
// zero-point-relative upper bound, 16 entries for the kernel and output zero
// points, and 32 entries for output_min. scale is asserted to lie in [2^-32, 256).
void xnn_init_qu8_conv_minmax_fp32_avx2_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t kernel_zp = (int16_t) kernel_zero_point;
  const int16_t output_zp = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx2.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx2.output_zero_point[lane] = output_zp;
    params->fp32_avx2.kernel_zero_point[lane] = kernel_zp;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_avx2.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_avx2.scale[lane] = scale;
  }
}
125
// Fills the fp32_avx512 member of a QU8 convolution min/max params union.
//
// Each scalar is replicated across an array: 16 entries for scale and the
// zero-point-relative upper bound, 32 entries for the kernel and output zero
// points, and 64 entries for output_min. scale is asserted to lie in [2^-32, 256).
void xnn_init_qu8_conv_minmax_fp32_avx512_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t kernel_zp = (int16_t) (uint16_t) kernel_zero_point;
  const int16_t output_zp = (int16_t) (uint16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 64; lane++) {
    params->fp32_avx512.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx512.output_zero_point[lane] = output_zp;
    params->fp32_avx512.kernel_zero_point[lane] = kernel_zp;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx512.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_avx512.scale[lane] = scale;
  }
}
150 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
151
152 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the fp32_neon member of a QU8 convolution min/max params union.
//
// kernel_zero_point is replicated into a 4-entry array; output_min and
// output_max are stored unmodified. scale is asserted to lie in [2^-32, 256).
void xnn_init_qu8_conv_minmax_fp32_neon_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_neon.kernel_zero_point[k] = kernel_zero_point;
  }

  const int32_t zero_point = (int32_t) output_zero_point;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.output_min = output_min;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.scale = scale;
}
174
// Fills the fp32_neonv8 member of a QU8 convolution min/max params union.
//
// kernel_zero_point is replicated into a 4-entry array; the output zero point
// is widened to 16 bits. scale is asserted to lie in [2^-32, 256).
void xnn_init_qu8_conv_minmax_fp32_neonv8_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_neonv8.kernel_zero_point[k] = kernel_zero_point;
  }

  const int16_t zero_point = (int16_t) (uint16_t) output_zero_point;
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = zero_point;
  params->fp32_neonv8.scale = scale;
}
195
// Fills the rndnu_neon member of a QU8 convolution min/max params union.
//
// The floating-point scale (asserted to lie in [2^-32, 256)) is decomposed into
// an integer multiplier built from its mantissa and a shift derived from its
// exponent. The shift is split into a pre-shift and a post-shift, both stored
// negated. kernel_zero_point is replicated into a 4-entry array.
void xnn_init_qu8_conv_minmax_rndnu_neon_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 31] range.
  // Convert the biased exponent to a signed type BEFORE subtracting: doing the
  // subtraction in uint32_t would wrap around for negative shifts (scale >= 1)
  // and rely on implementation-defined unsigned-to-signed conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.kernel_zero_point[0] = kernel_zero_point;
  params->rndnu_neon.kernel_zero_point[1] = kernel_zero_point;
  params->rndnu_neon.kernel_zero_point[2] = kernel_zero_point;
  params->rndnu_neon.kernel_zero_point[3] = kernel_zero_point;
  params->rndnu_neon.right_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.right_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
235 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
236
237 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the fp32_wasmsimd member of a QU8 convolution min/max params union.
//
// kernel_zero_point is replicated 4 times, output_max 8 times, and the scale
// and magic-bias quantities twice each. The lower bound is stored as the bit
// pattern of (magic bias + output_min - output_zero_point).
// scale is asserted to lie in [2^-32, 256).
void xnn_init_qu8_conv_minmax_fp32_wasmsimd_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);
  const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  const int16_t kernel_zp = (int16_t) (uint16_t) kernel_zero_point;

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_wasmsimd.kernel_zero_point[lane] = kernel_zp;
  }
  for (uint32_t lane = 0; lane < 2; lane++) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = magic_bias_less_zero_point;
    params->fp32_wasmsimd.magic_min[lane] = magic_min;
    params->fp32_wasmsimd.magic_bias[lane] = 12582912.0f;
    params->fp32_wasmsimd.scale[lane] = scale;
  }
}
265 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
266
// Fills the fp32_scalar_fmagic member of a QS8 convolution min/max params union.
//
// scale is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// floats relative to output_zero_point.
void xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_fmagic.scale = scale;
}
283
// Fills the fp32_scalar_imagic member of a QS8 convolution min/max params union.
//
// scale is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// the integer bit patterns of (magic bias + bound - output_zero_point).
void xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const float max_less_zero_point = (float) ((int32_t) output_max - zero_point);
  const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);
  const int32_t magic_max = (int32_t) fp32_to_bits(12582912.0f + max_less_zero_point);

  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_imagic.magic_max = magic_max;
  params->fp32_scalar_imagic.magic_min = magic_min;
  params->fp32_scalar_imagic.magic_bias = 12582912.0f;
  params->fp32_scalar_imagic.scale = scale;
}
302
// Fills the fp32_scalar_lrintf member of a QS8 convolution min/max params union.
//
// scale is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// floats relative to output_zero_point; the zero point is stored separately.
void xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_max_less_zero_point = (float) max_less_zero_point;
  params->fp32_scalar_lrintf.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_lrintf.scale = scale;
}
318
319 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the fp32_sse2 member of a QS8 convolution min/max params union.
//
// scale and the zero-point-relative upper bound are replicated 4 times; the
// output zero point and output_min are widened to 16 bits and replicated 8
// times. scale is asserted to lie in [2^-32, 256).
void xnn_init_qs8_conv_minmax_fp32_sse2_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point16 = (int16_t) output_zero_point;
  const int16_t min16 = (int16_t) output_min;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_sse2.output_min[lane] = min16;
    params->fp32_sse2.output_zero_point[lane] = zero_point16;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_sse2.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_sse2.scale[lane] = scale;
  }
}
340
// Fills the fp32_sse4 member of a QS8 convolution min/max params union.
//
// scale and the zero-point-relative upper bound are replicated 4 times, the
// 16-bit output zero point 8 times, and output_min 16 times.
// scale is asserted to lie in [2^-32, 256).
void xnn_init_qs8_conv_minmax_fp32_sse4_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_sse4.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_sse4.output_zero_point[lane] = zero_point16;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_sse4.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_sse4.scale[lane] = scale;
  }
}
363
// Fills the fp32_avx2 member of a QS8 convolution min/max params union.
//
// scale and the zero-point-relative upper bound are replicated 8 times, the
// 16-bit output zero point 16 times, and output_min 32 times.
// scale is asserted to lie in [2^-32, 256).
void xnn_init_qs8_conv_minmax_fp32_avx2_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx2.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx2.output_zero_point[lane] = zero_point16;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_avx2.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_avx2.scale[lane] = scale;
  }
}
386
// Fills the fp32_avx512 member of a QS8 convolution min/max params union.
//
// scale and the zero-point-relative upper bound are replicated 16 times, the
// 16-bit output zero point 32 times, and output_min 64 times.
// scale is asserted to lie in [2^-32, 256).
void xnn_init_qs8_conv_minmax_fp32_avx512_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 64; lane++) {
    params->fp32_avx512.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx512.output_zero_point[lane] = zero_point16;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx512.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_avx512.scale[lane] = scale;
  }
}
409 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
410
411 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the fp32_neon member of a QS8 convolution min/max params union.
//
// output_min and output_max are stored unmodified; the output zero point is
// folded into magic_bias_less_output_zero_point.
// scale is asserted to lie in [2^-32, 256).
void xnn_init_qs8_conv_minmax_fp32_neon_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;

  params->fp32_neon.output_max = output_max;
  params->fp32_neon.output_min = output_min;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.scale = scale;
}
428
// Fills the fp32_neonv8 member of a QS8 convolution min/max params union.
//
// The output zero point is widened to 16 bits; output_min and output_max are
// stored unmodified. scale is asserted to lie in [2^-32, 256).
void xnn_init_qs8_conv_minmax_fp32_neonv8_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point16 = (int16_t) output_zero_point;

  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = zero_point16;
  params->fp32_neonv8.scale = scale;
}
444
// Fills the rndnu_neon member of a QS8 convolution min/max params union.
//
// The floating-point scale (asserted to lie in [2^-32, 256)) is decomposed into
// an integer multiplier built from its mantissa and a shift derived from its
// exponent. The shift is split into a pre-shift and a post-shift, both stored
// negated.
void xnn_init_qs8_conv_minmax_rndnu_neon_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 31] range.
  // Convert the biased exponent to a signed type BEFORE subtracting: doing the
  // subtraction in uint32_t would wrap around for negative shifts (scale >= 1)
  // and rely on implementation-defined unsigned-to-signed conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.right_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.right_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
479 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
480
481 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the fp32_wasmsimd member of a QS8 convolution min/max params union.
//
// output_max is replicated 8 times; the scale and magic-bias quantities are
// replicated twice each. The lower bound is stored as the bit pattern of
// (magic bias + output_min - output_zero_point).
// scale is asserted to lie in [2^-32, 256).
void xnn_init_qs8_conv_minmax_fp32_wasmsimd_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);
  const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
  for (uint32_t lane = 0; lane < 2; lane++) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = magic_bias_less_zero_point;
    params->fp32_wasmsimd.magic_min[lane] = magic_min;
    params->fp32_wasmsimd.magic_bias[lane] = 12582912.0f;
    params->fp32_wasmsimd.scale[lane] = scale;
  }
}
505 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
506
// Copies per-channel scales into a packed weights buffer, one tile at a time.
//
// channels      - total number of scale values to copy from `scale`.
// channels_tile - maximum number of scales written per tile; must be non-zero
//                 whenever channels is non-zero.
// stride        - byte distance between the start of consecutive tiles in
//                 `packed_w`.
// scale         - source array, read at indices [0, channels).
// packed_w      - destination; each tile's scales are written as consecutive
//                 floats starting at the tile's base address.
void xnn_init_qc8_scale_fp32_params(
  size_t channels,
  size_t channels_tile,
  size_t stride,
  const float scale[XNN_MIN_ELEMENTS(1)],
  void* packed_w)
{
  // A zero tile width would never advance tile_start, so the loop below would
  // spin forever; reject it up front like the other preconditions in this file.
  assert(channels == 0 || channels_tile != 0);

  for (size_t tile_start = 0; tile_start < channels; tile_start += channels_tile) {
    // The last tile may be partial.
    const size_t tile_size = min(channels - tile_start, channels_tile);
    for (size_t tile_offset = 0; tile_offset < tile_size; tile_offset++) {
      ((float*) packed_w)[tile_offset] = scale[tile_start + tile_offset];
    }
    // Advance by bytes, not by floats: stride is a byte count.
    packed_w = (void*) ((uintptr_t) packed_w + stride);
  }
}
522
// Fills the scalar_fmagic member of a QS8 min/max params union.
//
// The clamping bounds are stored as floats relative to output_zero_point, and
// the zero point is folded into magic_bias_less_output_zero_point.
void xnn_init_qs8_minmax_scalar_fmagic_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->scalar_fmagic.magic_bias = 12582912.0f;
  params->scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
  params->scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
}
534
// Fills the scalar_imagic member of a QS8 min/max params union.
//
// The clamping bounds are stored as the integer bit patterns of
// (magic bias + bound - output_zero_point).
void xnn_init_qs8_minmax_scalar_imagic_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const float max_less_zero_point = (float) ((int32_t) output_max - zero_point);
  const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);
  const int32_t magic_max = (int32_t) fp32_to_bits(12582912.0f + max_less_zero_point);

  params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  params->scalar_imagic.magic_max = magic_max;
  params->scalar_imagic.magic_min = magic_min;
  params->scalar_imagic.magic_bias = 12582912.0f;
}
548
// Fills the scalar_lrintf member of a QS8 min/max params union.
//
// The clamping bounds are stored relative to output_zero_point (converted via
// long); the zero point itself is stored separately.
void xnn_init_qs8_minmax_scalar_lrintf_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->scalar_lrintf.output_zero_point = zero_point;
  params->scalar_lrintf.output_max_less_zero_point = (long) max_less_zero_point;
  params->scalar_lrintf.output_min_less_zero_point = (long) min_less_zero_point;
}
559
560 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse2 member of a QS8 min/max params union.
//
// The zero-point-relative upper bound is replicated 4 times; the output zero
// point and output_min are widened to 16 bits and replicated 8 times.
void xnn_init_qs8_minmax_sse2_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const int16_t min16 = (int16_t) output_min;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->sse2.output_min[lane] = min16;
    params->sse2.output_zero_point[lane] = zero_point16;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->sse2.output_max_less_zero_point[lane] = max_less_zero_point;
  }
}
576
// Fills the sse4 member of a QS8 min/max params union.
//
// The zero-point-relative upper bound is replicated 4 times, the 16-bit output
// zero point 8 times, and output_min 16 times.
void xnn_init_qs8_minmax_sse4_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 16; lane++) {
    params->sse4.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->sse4.output_zero_point[lane] = zero_point16;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->sse4.output_max_less_zero_point[lane] = max_less_zero_point;
  }
}
594
// Fills the avx2 member of a QS8 min/max params union.
//
// The zero-point-relative upper bound is replicated 8 times, the 16-bit output
// zero point 16 times, and output_min 32 times.
void xnn_init_qs8_minmax_avx2_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 32; lane++) {
    params->avx2.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->avx2.output_zero_point[lane] = zero_point16;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->avx2.output_max_less_zero_point[lane] = max_less_zero_point;
  }
}
612
// Fills the avx512 member of a QS8 min/max params union.
//
// The zero-point-relative upper bound is replicated 16 times, the 16-bit
// output zero point 32 times, and output_min 64 times.
void xnn_init_qs8_minmax_avx512_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 64; lane++) {
    params->avx512.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 32; lane++) {
    params->avx512.output_zero_point[lane] = zero_point16;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->avx512.output_max_less_zero_point[lane] = max_less_zero_point;
  }
}
630 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
631
632 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the neon member of a QS8 min/max params union.
//
// output_min and output_max are stored unmodified; the output zero point is
// folded into magic_bias_less_output_zero_point.
void xnn_init_qs8_minmax_neon_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;

  params->neon.output_max = output_max;
  params->neon.output_min = output_min;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->neon.magic_bias = 12582912.0f;
}
644
// Fills the neonv8 member of a QS8 min/max params union.
//
// The output zero point is widened to 16 bits; output_min and output_max are
// stored unmodified.
void xnn_init_qs8_minmax_neonv8_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;

  params->neonv8.output_max = output_max;
  params->neonv8.output_min = output_min;
  params->neonv8.output_zero_point = zero_point16;
}
655 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
656
657 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd member of a QS8 min/max params union.
//
// output_max is replicated 8 times; the magic-bias quantities are replicated
// twice each. The lower bound is stored as the bit pattern of
// (magic bias + output_min - output_zero_point).
void xnn_init_qs8_minmax_wasmsimd_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);
  const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->wasmsimd.output_max[lane] = output_max;
  }
  for (uint32_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.magic_bias_less_output_zero_point[lane] = magic_bias_less_zero_point;
    params->wasmsimd.magic_min[lane] = magic_min;
    params->wasmsimd.magic_bias[lane] = 12582912.0f;
  }
}
676 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
677
// Fills the `fp32_scalar_fmagic` variant of the QS8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// floats relative to the output zero point, alongside the magic bias and the
// integer 0x4B400000 (binary32 encoding of 12582912.0f) minus the zero point.
void xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    int8_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_offset = (int32_t) output_min - zero_point;
  const int32_t max_offset = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) min_offset;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) max_offset;
  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
}
696
// Overwrites only `init_bias` and `scale` in the `fp32_scalar_fmagic` variant;
// every other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qs8_avgpool_minmax_fp32_scalar_fmagic_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
}
708
// Fills the `fp32_scalar_imagic` variant of the QS8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// the bit patterns of (magic bias + bound - zero point); the zero point itself
// is folded into 0x4B400000, the binary32 encoding of the magic bias.
void xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    int8_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const float max_less_zero_point = (float) ((int32_t) output_max - zero_point);
  const int32_t min_bits = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);
  const int32_t max_bits = (int32_t) fp32_to_bits(12582912.0f + max_less_zero_point);

  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_imagic.magic_max = max_bits;
  params->fp32_scalar_imagic.magic_min = min_bits;
  params->fp32_scalar_imagic.magic_bias = 12582912.0f;
  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
}
729
// Overwrites only `init_bias` and `scale` in the `fp32_scalar_imagic` variant;
// every other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
}
741
// Fills the `fp32_scalar_lrintf` variant of the QS8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// floats relative to the output zero point; the zero point is stored as int32.
void xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    int8_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_offset = (int32_t) output_min - zero_point;
  const int32_t max_offset = (int32_t) output_max - zero_point;

  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_max_less_zero_point = (float) max_offset;
  params->fp32_scalar_lrintf.output_min_less_zero_point = (float) min_offset;
  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
}
759
// Overwrites only `init_bias` and `scale` in the `fp32_scalar_lrintf` variant;
// every other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qs8_avgpool_minmax_fp32_scalar_lrintf_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
}
771
772 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `fp32_sse2` variant of the QS8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). init_bias, scale and
// (output_max - zero point) as float are replicated into 4 elements each; the
// zero point and output_min are widened to int16 and replicated into 8.
void xnn_init_qs8_avgpool_minmax_fp32_sse2_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    int8_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const int16_t min_s16 = (int16_t) output_min;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 8; ++lane) {
    params->fp32_sse2.output_min[lane] = min_s16;
    params->fp32_sse2.output_zero_point[lane] = zero_point_s16;
  }
  for (size_t lane = 0; lane < 4; ++lane) {
    params->fp32_sse2.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_sse2.scale[lane] = scale;
    params->fp32_sse2.init_bias[lane] = init_bias;
  }
}
795
// Overwrites only the 4-element `init_bias` and `scale` arrays of the
// `fp32_sse2` variant. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qs8_avgpool_minmax_fp32_sse2_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t lane = 0; lane < 4; ++lane) {
    params->fp32_sse2.scale[lane] = scale;
    params->fp32_sse2.init_bias[lane] = init_bias;
  }
}
809
// Fills the `fp32_sse4` variant of the QS8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). init_bias, scale and
// (output_max - zero point) as float are replicated into 4 elements each; the
// zero point is widened to int16 and replicated into 8; output_min is
// replicated into 16.
void xnn_init_qs8_avgpool_minmax_fp32_sse4_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    int8_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 16; ++lane) {
    params->fp32_sse4.output_min[lane] = output_min;
  }
  for (size_t lane = 0; lane < 8; ++lane) {
    params->fp32_sse4.output_zero_point[lane] = zero_point_s16;
  }
  for (size_t lane = 0; lane < 4; ++lane) {
    params->fp32_sse4.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_sse4.scale[lane] = scale;
    params->fp32_sse4.init_bias[lane] = init_bias;
  }
}
834
// Overwrites only the 4-element `init_bias` and `scale` arrays of the
// `fp32_sse4` variant. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qs8_avgpool_minmax_fp32_sse4_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t lane = 0; lane < 4; ++lane) {
    params->fp32_sse4.scale[lane] = scale;
    params->fp32_sse4.init_bias[lane] = init_bias;
  }
}
848 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
849
850 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `fp32_neon` variant of the QS8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). Stores the bias and scale, the
// magic bias 12582912.0f, its binary32 encoding (0x4B400000) minus the output
// zero point, and the output clamping bounds.
void xnn_init_qs8_avgpool_minmax_fp32_neon_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    int8_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;

  params->fp32_neon.output_max = output_max;
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
}
869
// Overwrites only `init_bias` and `scale` in the `fp32_neon` variant; every
// other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qs8_avgpool_minmax_fp32_neon_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
}
881
// Fills the `fp32_neonv8` variant of the QS8 average-pooling parameters: bias,
// scale, the output zero point widened to int16, and the clamping bounds.
// `scale` is asserted to lie in [2^-32, 256).
void xnn_init_qs8_avgpool_minmax_fp32_neonv8_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    int8_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;

  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = zero_point_s16;
  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
}
899
// Overwrites only `init_bias` and `scale` in the `fp32_neonv8` variant; every
// other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qs8_avgpool_minmax_fp32_neonv8_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
}
911
// Fills the `rndnu_neon` variant of the QS8 average-pooling parameters.
//
// `scale` (asserted to lie in [2^-32, 256)) is decomposed into an integer
// multiplier built from its mantissa and a shift derived from its exponent. The
// shift is split into a pre-shift and a post-shift, both stored negated as
// "left" shifts. The zero point (as int16) and clamping bounds are stored too.
void xnn_init_qs8_avgpool_minmax_rndnu_neon_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    int8_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Split the float into its raw mantissa and (sign + biased exponent) fields.
  const uint32_t scale_bits = fp32_to_bits(scale);
  const uint32_t mantissa_field = scale_bits & UINT32_C(0x007FFFFF);
  const uint32_t exponent_field = scale_bits >> 23;

  // Mantissa with the implicit leading one restored, moved up by 7 bits:
  // the result lies in [0x40000000, 0x7FFFFF80].
  const int32_t multiplier = (int32_t) ((mantissa_field | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Total shift lies in [-8, 31].
  const int32_t shift = 127 + 31 - 32 - exponent_field;
  assert(shift >= -8);
  assert(shift < 32);

  // post_shift is at least 1; whatever remains (zero or negative) is pre_shift.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.output_max = output_max;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.init_bias = init_bias;
}
948
// Recomputes the scale-dependent fields of the `rndnu_neon` variant
// (multiplier, left_pre_shift, left_post_shift) and overwrites init_bias.
// The zero point and clamping bounds are not touched.
// `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qs8_avgpool_minmax_rndnu_neon_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Split the float into its raw mantissa and (sign + biased exponent) fields.
  const uint32_t scale_bits = fp32_to_bits(scale);
  const uint32_t mantissa_field = scale_bits & UINT32_C(0x007FFFFF);
  const uint32_t exponent_field = scale_bits >> 23;

  // Mantissa with the implicit leading one restored, moved up by 7 bits:
  // the result lies in [0x40000000, 0x7FFFFF80].
  const int32_t multiplier = (int32_t) ((mantissa_field | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Total shift lies in [-8, 31].
  const int32_t shift = 127 + 31 - 32 - exponent_field;
  assert(shift >= -8);
  assert(shift < 32);

  // post_shift is at least 1; whatever remains (zero or negative) is pre_shift.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.init_bias = init_bias;
}
979 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
980
981 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `fp32_wasmsimd` variant of the QS8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). init_bias, scale, the magic
// bias, the bit pattern of (magic bias + output_min - zero point) and
// 0x4B400000 minus the zero point are replicated into 2 elements each;
// output_max is replicated into 8 elements.
void xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    int8_t output_zero_point,
    int8_t output_min,
    int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  // 0x4B400000 is the IEEE-754 binary32 encoding of 12582912.0f.
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;
  const int32_t min_bits = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);

  for (size_t lane = 0; lane < 8; ++lane) {
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
  for (size_t lane = 0; lane < 2; ++lane) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = bias_bits_less_zero_point;
    params->fp32_wasmsimd.magic_min[lane] = min_bits;
    params->fp32_wasmsimd.magic_bias[lane] = 12582912.0f;
    params->fp32_wasmsimd.scale[lane] = scale;
    params->fp32_wasmsimd.init_bias[lane] = init_bias;
  }
}
1007
// Overwrites only the 2-element `init_bias` and `scale` arrays of the
// `fp32_wasmsimd` variant. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params(
    union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t lane = 0; lane < 2; ++lane) {
    params->fp32_wasmsimd.scale[lane] = scale;
    params->fp32_wasmsimd.init_bias[lane] = init_bias;
  }
}
1021 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1022
// Fills the `fp32_scalar_fmagic` variant of the QU8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// floats relative to the output zero point, alongside the magic bias and the
// integer 0x4B400000 (binary32 encoding of 12582912.0f) minus the zero point.
void xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_offset = (int32_t) output_min - zero_point;
  const int32_t max_offset = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) min_offset;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) max_offset;
  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
}
1041
// Overwrites only `init_bias` and `scale` in the `fp32_scalar_fmagic` variant;
// every other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
}
1053
// Fills the `fp32_scalar_imagic` variant of the QU8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// the bit patterns of (magic bias + bound - zero point); the zero point itself
// is folded into 0x4B400000, the binary32 encoding of the magic bias.
void xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const float max_less_zero_point = (float) ((int32_t) output_max - zero_point);
  const int32_t min_bits = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);
  const int32_t max_bits = (int32_t) fp32_to_bits(12582912.0f + max_less_zero_point);

  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_imagic.magic_max = max_bits;
  params->fp32_scalar_imagic.magic_min = min_bits;
  params->fp32_scalar_imagic.magic_bias = 12582912.0f;
  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
}
1074
// Overwrites only `init_bias` and `scale` in the `fp32_scalar_imagic` variant;
// every other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
}
1086
// Fills the `fp32_scalar_lrintf` variant of the QU8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). The clamping bounds are stored as
// floats relative to the output zero point; the zero point is stored as int32.
void xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_offset = (int32_t) output_min - zero_point;
  const int32_t max_offset = (int32_t) output_max - zero_point;

  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_max_less_zero_point = (float) max_offset;
  params->fp32_scalar_lrintf.output_min_less_zero_point = (float) min_offset;
  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
}
1104
// Overwrites only `init_bias` and `scale` in the `fp32_scalar_lrintf` variant;
// every other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
}
1116
1117 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `fp32_sse2` variant of the QU8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). init_bias, scale and
// (output_max - zero point) as float are replicated into 4 elements each; the
// zero point is converted to int16 and replicated into 8; output_min is
// replicated into 16.
void xnn_init_qu8_avgpool_minmax_fp32_sse2_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 16; ++lane) {
    params->fp32_sse2.output_min[lane] = output_min;
  }
  for (size_t lane = 0; lane < 8; ++lane) {
    params->fp32_sse2.output_zero_point[lane] = zero_point_s16;
  }
  for (size_t lane = 0; lane < 4; ++lane) {
    params->fp32_sse2.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_sse2.scale[lane] = scale;
    params->fp32_sse2.init_bias[lane] = init_bias;
  }
}
1142
// Overwrites only the 4-element `init_bias` and `scale` arrays of the
// `fp32_sse2` variant. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qu8_avgpool_minmax_fp32_sse2_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t lane = 0; lane < 4; ++lane) {
    params->fp32_sse2.scale[lane] = scale;
    params->fp32_sse2.init_bias[lane] = init_bias;
  }
}
1156
// Fills the `fp32_sse4` variant of the QU8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). init_bias, scale and
// (output_max - zero point) as float are replicated into 4 elements each; the
// zero point is converted to int16 and replicated into 8; output_min is
// replicated into 16.
void xnn_init_qu8_avgpool_minmax_fp32_sse4_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 16; ++lane) {
    params->fp32_sse4.output_min[lane] = output_min;
  }
  for (size_t lane = 0; lane < 8; ++lane) {
    params->fp32_sse4.output_zero_point[lane] = zero_point_s16;
  }
  for (size_t lane = 0; lane < 4; ++lane) {
    params->fp32_sse4.output_max_less_zero_point[lane] = max_less_zero_point;
    params->fp32_sse4.scale[lane] = scale;
    params->fp32_sse4.init_bias[lane] = init_bias;
  }
}
1181
// Overwrites only the 4-element `init_bias` and `scale` arrays of the
// `fp32_sse4` variant. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qu8_avgpool_minmax_fp32_sse4_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t lane = 0; lane < 4; ++lane) {
    params->fp32_sse4.scale[lane] = scale;
    params->fp32_sse4.init_bias[lane] = init_bias;
  }
}
1195 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1196
1197 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `fp32_neon` variant of the QU8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). Stores the bias and scale, the
// magic bias 12582912.0f, its binary32 encoding (0x4B400000) minus the output
// zero point, and the output clamping bounds.
void xnn_init_qu8_avgpool_minmax_fp32_neon_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;

  params->fp32_neon.output_max = output_max;
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
}
1216
// Overwrites only `init_bias` and `scale` in the `fp32_neon` variant; every
// other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qu8_avgpool_minmax_fp32_neon_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
}
1228
// Fills the `fp32_neonv8` variant of the QU8 average-pooling parameters: bias,
// scale, the output zero point converted to int16, and the clamping bounds.
// `scale` is asserted to lie in [2^-32, 256).
void xnn_init_qu8_avgpool_minmax_fp32_neonv8_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;

  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = zero_point_s16;
  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
}
1246
// Overwrites only `init_bias` and `scale` in the `fp32_neonv8` variant; every
// other field is left as is. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qu8_avgpool_minmax_fp32_neonv8_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
}
1258
// Fills the `rndnu_neon` variant of the QU8 average-pooling parameters.
//
// `scale` (asserted to lie in [2^-32, 256)) is decomposed into an integer
// multiplier built from its mantissa and a shift derived from its exponent. The
// shift is split into a pre-shift and a post-shift, both stored negated as
// "left" shifts. The zero point (as int16) and clamping bounds are stored too.
void xnn_init_qu8_avgpool_minmax_rndnu_neon_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Split the float into its raw mantissa and (sign + biased exponent) fields.
  const uint32_t scale_bits = fp32_to_bits(scale);
  const uint32_t mantissa_field = scale_bits & UINT32_C(0x007FFFFF);
  const uint32_t exponent_field = scale_bits >> 23;

  // Mantissa with the implicit leading one restored, moved up by 7 bits:
  // the result lies in [0x40000000, 0x7FFFFF80].
  const int32_t multiplier = (int32_t) ((mantissa_field | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Total shift lies in [-8, 31].
  const int32_t shift = 127 + 31 - 32 - exponent_field;
  assert(shift >= -8);
  assert(shift < 32);

  // post_shift is at least 1; whatever remains (zero or negative) is pre_shift.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.output_max = output_max;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.init_bias = init_bias;
}
1295
// Recomputes the scale-dependent fields of the `rndnu_neon` variant
// (multiplier, left_pre_shift, left_post_shift) and overwrites init_bias.
// The zero point and clamping bounds are not touched.
// `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qu8_avgpool_minmax_rndnu_neon_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Split the float into its raw mantissa and (sign + biased exponent) fields.
  const uint32_t scale_bits = fp32_to_bits(scale);
  const uint32_t mantissa_field = scale_bits & UINT32_C(0x007FFFFF);
  const uint32_t exponent_field = scale_bits >> 23;

  // Mantissa with the implicit leading one restored, moved up by 7 bits:
  // the result lies in [0x40000000, 0x7FFFFF80].
  const int32_t multiplier = (int32_t) ((mantissa_field | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Total shift lies in [-8, 31].
  const int32_t shift = 127 + 31 - 32 - exponent_field;
  assert(shift >= -8);
  assert(shift < 32);

  // post_shift is at least 1; whatever remains (zero or negative) is pre_shift.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.init_bias = init_bias;
}
1326 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1327
1328 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `fp32_wasmsimd` variant of the QU8 average-pooling parameters.
//
// `scale` is asserted to lie in [2^-32, 256). init_bias, scale, the magic
// bias, the bit pattern of (magic bias + output_min - zero point) and
// 0x4B400000 minus the zero point are replicated into 2 elements each;
// output_max is replicated into 8 elements.
void xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  // 0x4B400000 is the IEEE-754 binary32 encoding of 12582912.0f.
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;
  const int32_t min_bits = (int32_t) fp32_to_bits(12582912.0f + min_less_zero_point);

  for (size_t lane = 0; lane < 8; ++lane) {
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
  for (size_t lane = 0; lane < 2; ++lane) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = bias_bits_less_zero_point;
    params->fp32_wasmsimd.magic_min[lane] = min_bits;
    params->fp32_wasmsimd.magic_bias[lane] = 12582912.0f;
    params->fp32_wasmsimd.scale[lane] = scale;
    params->fp32_wasmsimd.init_bias[lane] = init_bias;
  }
}
1354
// Overwrites only the 2-element `init_bias` and `scale` arrays of the
// `fp32_wasmsimd` variant. `scale` is asserted to lie in [2^-32, 256).
void xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t init_bias,
    float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t lane = 0; lane < 2; ++lane) {
    params->fp32_wasmsimd.scale[lane] = scale;
    params->fp32_wasmsimd.init_bias[lane] = init_bias;
  }
}
1368 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1369
// Fills the `scalar` variant of the QU8 average-pooling parameters.
//
// `scale` (asserted to lie in [2^-32, 256)) is decomposed into a 24-bit integer
// multiplier and a right shift. Also stored: the bias, a rounding term equal to
// half of 2^right_shift, the clamping bounds relative to the output zero point,
// and the zero point itself.
void xnn_init_qu8_avgpool_minmax_scalar_params(
    union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int32_t bias,
    float scale,
    uint8_t output_zero_point,
    uint8_t output_min,
    uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Raw bits of the float scale; the top 9 bits hold sign + biased exponent.
  const uint32_t scale_bits = fp32_to_bits(scale);
  const uint32_t exponent_field = scale_bits >> 23;

  // Mantissa with the implicit leading one restored:
  // the result lies in [0x00800000, 0x00FFFFFF].
  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
  assert(multiplier >= INT32_C(0x00800000));
  assert(multiplier <= INT32_C(0x00FFFFFF));

  // Shift lies in [16, 55].
  const int32_t shift = 127 + 23 - exponent_field;
  assert(shift >= 16);
  assert(shift < 64);

  const uint32_t right_shift = (uint32_t) shift;
  const int64_t rounding = INT64_C(1) << (right_shift - 1);

  const int32_t zero_point = (int32_t) (uint32_t) output_zero_point;
  const int32_t min_offset = (int32_t) (uint32_t) output_min - zero_point;
  const int32_t max_offset = (int32_t) (uint32_t) output_max - zero_point;

  params->scalar.output_zero_point = zero_point;
  params->scalar.output_max_less_zero_point = max_offset;
  params->scalar.output_min_less_zero_point = min_offset;
  params->scalar.right_shift = right_shift;
  params->scalar.multiplier = multiplier;
  params->scalar.rounding = rounding;
  params->scalar.bias = bias;
}
1405
1406 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the NEON requantization parameters for QU8 average pooling.
//
// The scale (asserted to lie in [2**-32, 256)) is split, using the bit pattern
// returned by fp32_to_bits, into an integer multiplier and a shift amount. The
// shift is stored negated in `left_shift`. The zero point and clamping bounds
// are stored as given.
void xnn_init_qu8_avgpool_minmax_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);
  const uint32_t bits = fp32_to_bits(scale);

  // Low 23 bits of the pattern with bit 23 forced on, i.e. a value in
  // [0x00800000, 0x00FFFFFF].
  const int32_t low_bits = (int32_t) bits & INT32_C(0x007FFFFF);
  const int32_t multiplier = low_bits | INT32_C(0x00800000);
  assert(multiplier >= INT32_C(0x00800000));
  assert(multiplier <= INT32_C(0x00FFFFFF));

  // The asserts below bound the shift to [16, 64).
  const uint32_t high_bits = bits >> 23;
  const int32_t shift = 127 + 23 - high_bits;
  assert(shift >= 16);
  assert(shift < 64);

  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
  params->neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  params->neon.left_shift = (int64_t) -shift;
  params->neon.multiplier = multiplier;
  params->neon.bias = bias;
}
1437 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1438
1439 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the SSE2 requantization parameters for QU8 average pooling.
//
// The scale (asserted to lie in [2**-32, 256)) is split, using the bit pattern
// returned by fp32_to_bits, into an integer multiplier and a right shift. Each
// value is then replicated across its array: 4 entries for bias and
// multiplier, 2 for rounding and right_shift, 8 for the zero point, and 16 for
// the clamping bounds.
void xnn_init_qu8_avgpool_minmax_sse2_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);
  const uint32_t bits = fp32_to_bits(scale);

  // Low 23 bits of the pattern with bit 23 forced on, i.e. a value in
  // [0x00800000, 0x00FFFFFF].
  const int32_t low_bits = (int32_t) bits & INT32_C(0x007FFFFF);
  const int32_t multiplier = low_bits | INT32_C(0x00800000);
  assert(multiplier >= INT32_C(0x00800000));
  assert(multiplier <= INT32_C(0x00FFFFFF));

  // The asserts below bound the shift to [16, 64).
  const uint32_t high_bits = bits >> 23;
  const int32_t shift = 127 + 23 - high_bits;
  assert(shift >= 16);
  assert(shift < 64);

  const uint32_t right_shift = (uint32_t) shift;
  // Rounding term: 2**(right_shift - 1).
  const uint64_t rounding = UINT64_C(1) << (right_shift - 1);

  for (uint32_t k = 0; k < 4; k++) {
    params->sse2.bias[k] = bias;
    params->sse2.multiplier[k] = (uint32_t) multiplier;
  }
  for (uint32_t k = 0; k < 2; k++) {
    params->sse2.rounding[k] = rounding;
    params->sse2.right_shift[k] = (uint64_t) right_shift;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->sse2.output_zero_point[k] = (int16_t) (uint16_t) output_zero_point;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->sse2.output_min[k] = output_min;
    params->sse2.output_max[k] = output_max;
  }
}
1485 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1486
xnn_update_qu8_avgpool_minmax_scalar_params(union xnn_qu8_avgpool_minmax_params * params,int32_t bias,float scale)1487 void xnn_update_qu8_avgpool_minmax_scalar_params(
1488 union xnn_qu8_avgpool_minmax_params* params,
1489 int32_t bias,
1490 float scale)
1491 {
1492 // Compute requantization parameters.
1493 assert(scale >= 0x1.0p-32f);
1494 assert(scale < 256.0f);
1495 const uint32_t scale_bits = fp32_to_bits(scale);
1496
1497 // Multiplier is in [0x00800000, 0x00FFFFFF] range.
1498 const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
1499 assert(multiplier >= INT32_C(0x00800000));
1500 assert(multiplier <= INT32_C(0x00FFFFFF));
1501
1502 // Shift is in [16, 55] range.
1503 const int32_t shift = 127 + 23 - (scale_bits >> 23);
1504 assert(shift >= 16);
1505 assert(shift < 64);
1506
1507 const int64_t rounding = INT64_C(1) << ((uint32_t) shift - 1);
1508 params->scalar.bias = bias;
1509 params->scalar.multiplier = multiplier;
1510 params->scalar.rounding = rounding;
1511 params->scalar.right_shift = (uint32_t) shift;
1512 }
1513
1514 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_update_qu8_avgpool_minmax_neon_params(union xnn_qu8_avgpool_minmax_params * params,int32_t bias,float scale)1515 void xnn_update_qu8_avgpool_minmax_neon_params(
1516 union xnn_qu8_avgpool_minmax_params* params,
1517 int32_t bias,
1518 float scale)
1519 {
1520 // Compute requantization parameters.
1521 assert(scale >= 0x1.0p-32f);
1522 assert(scale < 256.0f);
1523 const uint32_t scale_bits = fp32_to_bits(scale);
1524
1525 // Multiplier is in [0x00800000, 0x00FFFFFF] range.
1526 const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
1527 assert(multiplier >= INT32_C(0x00800000));
1528 assert(multiplier <= INT32_C(0x00FFFFFF));
1529
1530 // Shift is in [16, 55] range.
1531 const int32_t shift = 127 + 23 - (scale_bits >> 23);
1532 assert(shift >= 16);
1533 assert(shift < 64);
1534
1535 params->neon.bias = bias;
1536 params->neon.multiplier = multiplier;
1537 params->neon.left_shift = (int64_t) -shift;
1538 }
1539 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1540
1541 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_update_qu8_avgpool_minmax_sse2_params(union xnn_qu8_avgpool_minmax_params * params,int32_t bias,float scale)1542 void xnn_update_qu8_avgpool_minmax_sse2_params(
1543 union xnn_qu8_avgpool_minmax_params* params,
1544 int32_t bias,
1545 float scale)
1546 {
1547 // Compute requantization parameters.
1548 assert(scale >= 0x1.0p-32f);
1549 assert(scale < 256.0f);
1550 const uint32_t scale_bits = fp32_to_bits(scale);
1551
1552 // Multiplier is in [0x00800000, 0x00FFFFFF] range.
1553 const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
1554 assert(multiplier >= INT32_C(0x00800000));
1555 assert(multiplier <= INT32_C(0x00FFFFFF));
1556
1557 // Shift is in [16, 55] range.
1558 const int32_t shift = 127 + 23 - (scale_bits >> 23);
1559 assert(shift >= 16);
1560 assert(shift < 64);
1561
1562 const uint64_t rounding = UINT64_C(1) << ((uint32_t) shift - 1);
1563 params->sse2.bias[0] = bias;
1564 params->sse2.bias[1] = bias;
1565 params->sse2.bias[2] = bias;
1566 params->sse2.bias[3] = bias;
1567 params->sse2.multiplier[0] = (uint32_t) multiplier;
1568 params->sse2.multiplier[1] = (uint32_t) multiplier;
1569 params->sse2.multiplier[2] = (uint32_t) multiplier;
1570 params->sse2.multiplier[3] = (uint32_t) multiplier;
1571 params->sse2.rounding[0] = rounding;
1572 params->sse2.rounding[1] = rounding;
1573 params->sse2.right_shift[0] = (uint64_t) (uint32_t) shift;
1574 params->sse2.right_shift[1] = (uint64_t) (uint32_t) shift;
1575 }
1576 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1577
xnn_update_f32_scaleminmax_scalar_params(union xnn_f32_scaleminmax_params * params,float scale)1578 void xnn_update_f32_scaleminmax_scalar_params(
1579 union xnn_f32_scaleminmax_params* params,
1580 float scale)
1581 {
1582 params->scalar.scale = scale;
1583 }
1584
1585 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_update_f32_scaleminmax_sse_params(union xnn_f32_scaleminmax_params * params,float scale)1586 void xnn_update_f32_scaleminmax_sse_params(
1587 union xnn_f32_scaleminmax_params* params,
1588 float scale)
1589 {
1590 for (uint32_t i = 0; i < 4; i++) {
1591 params->sse.scale[i] = scale;
1592 }
1593 }
1594 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1595
1596 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the scale and the clamping bounds into the NEON variant of the
// parameters. The 16-bit arguments are copied through unchanged.
// NOTE(review): presumably these are IEEE half-precision bit patterns, as
// the AVX sibling decodes them with fp16_ieee_to_fp32_value; confirm.
void xnn_init_f16_scaleminmax_neon_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale,
  uint16_t min,
  uint16_t max)
{
  params->neon.min = min;
  params->neon.max = max;
  params->neon.scale = scale;
}
1607 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1608
1609 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Converts the 16-bit scale and clamping bounds to float with
// fp16_ieee_to_fp32_value and replicates each into the first 8 entries of the
// corresponding AVX array.
void xnn_init_f16_scaleminmax_avx_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale,
  uint16_t min,
  uint16_t max)
{
  const float scale_value = fp16_ieee_to_fp32_value(scale);
  const float lower_bound = fp16_ieee_to_fp32_value(min);
  const float upper_bound = fp16_ieee_to_fp32_value(max);

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->avx.scale[lane] = scale_value;
    params->avx.min[lane] = lower_bound;
    params->avx.max[lane] = upper_bound;
  }
}
1625 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1626
1627 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Overwrites the scale stored in the NEON variant of the parameters with the
// given 16-bit value, copied through unchanged. No other field is written.
void xnn_update_f16_scaleminmax_neon_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale)
{
  params->neon.scale = scale;
}
1634 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1635
1636 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Converts the 16-bit scale to float with fp16_ieee_to_fp32_value and
// replicates it into the first 8 entries of the AVX scale array. No other
// field is written.
void xnn_update_f16_scaleminmax_avx_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale)
{
  const float scale_value = fp16_ieee_to_fp32_value(scale);
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->avx.scale[lane] = scale_value;
  }
}
1646 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1647
// Stores the scale and the clamping bounds into the scalar variant of the
// parameters.
void xnn_init_f32_scaleminmax_scalar_params(
  union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  float min,
  float max)
{
  params->scalar.min = min;
  params->scalar.max = max;
  params->scalar.scale = scale;
}
1658
1659 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Replicates the scale and the clamping bounds into the first 4 entries of
// the corresponding SSE arrays.
void xnn_init_f32_scaleminmax_sse_params(
  union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  float min,
  float max)
{
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->sse.min[lane] = min;
    params->sse.max[lane] = max;
    params->sse.scale[lane] = scale;
  }
}
1672 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1673
// Initializes the global average pooling parameters for the variant selected
// at compile time (SSE on x86, NEON on ARM, scalar otherwise).
//
// Besides the multiplier and the clamping bounds, a 4-entry mask is written:
// entry 0 is always all-ones, and entry k (k = 1..3) is all-ones when
// ((width - 1) & 3) >= k and zero otherwise.
void xnn_init_f32_gavgpool_params(
  union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
  float multiplier,
  float output_min,
  float output_max,
  uint32_t width)
{
  // Identical in every variant, so computed once up front.
  const uint32_t w = (width - 1) & 3;

  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (uint32_t lane = 0; lane < 4; lane++) {
      params->sse.multiplier[lane] = multiplier;
      params->sse.output_min[lane] = output_min;
      params->sse.output_max[lane] = output_max;
    }

    params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->sse.mask[k] = -(uint32_t) (w >= k);
    }
  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
    params->neon.multiplier = multiplier;
    params->neon.output_min = output_min;
    params->neon.output_max = output_max;

    params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->neon.mask[k] = -(uint32_t) (w >= k);
    }
  #else
    params->scalar.multiplier = multiplier;
    params->scalar.output_min = output_min;
    params->scalar.output_max = output_max;

    params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->scalar.mask[k] = -(int32_t) (w >= k);
    }
  #endif
}
1715
xnn_update_f32_gavgpool_params(union xnn_f32_gavgpool_params * params,float multiplier,uint32_t width)1716 void xnn_update_f32_gavgpool_params(
1717 union xnn_f32_gavgpool_params* params,
1718 float multiplier,
1719 uint32_t width)
1720 {
1721 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1722 for (uint32_t i = 0; i < 4; i++) {
1723 params->sse.multiplier[i] = multiplier;
1724 }
1725
1726 const uint32_t w = (width - 1) & 3;
1727 params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
1728 params->sse.mask[1] = -(uint32_t) (w >= 1);
1729 params->sse.mask[2] = -(uint32_t) (w >= 2);
1730 params->sse.mask[3] = -(uint32_t) (w >= 3);
1731 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
1732 params->neon.multiplier = multiplier;
1733
1734 const uint32_t w = (width - 1) & 3;
1735 params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
1736 params->neon.mask[1] = -(uint32_t) (w >= 1);
1737 params->neon.mask[2] = -(uint32_t) (w >= 2);
1738 params->neon.mask[3] = -(uint32_t) (w >= 3);
1739 #else
1740 params->scalar.multiplier = multiplier;
1741
1742 const uint32_t w = (width - 1) & 3;
1743 params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
1744 params->scalar.mask[1] = -(int32_t) (w >= 1);
1745 params->scalar.mask[2] = -(int32_t) (w >= 2);
1746 params->scalar.mask[3] = -(int32_t) (w >= 3);
1747 #endif
1748 }
1749
// Initializes the scalar variant of the global average pooling parameters
// regardless of the target architecture.
//
// Mask entry 0 is always all-ones; entry k (k = 1..3) is all-ones when
// ((width - 1) & 3) >= k and zero otherwise.
void xnn_init_scalar_f32_gavgpool_params(
  union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
  float multiplier,
  float output_min,
  float output_max,
  uint32_t width)
{
  const uint32_t w = (width - 1) & 3;

  params->scalar.output_min = output_min;
  params->scalar.output_max = output_max;
  params->scalar.multiplier = multiplier;

  params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
  for (uint32_t k = 1; k < 4; k++) {
    params->scalar.mask[k] = -(int32_t) (w >= k);
  }
}
1767
1768 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the 16-bit clamping bounds, unchanged, into the NEON variant of the
// parameters.
void xnn_init_f16_minmax_neon_params(
  union xnn_f16_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t min,
  uint16_t max)
{
  params->neon.max = max;
  params->neon.min = min;
}
1777 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1778
1779 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Converts the 16-bit clamping bounds to float with fp16_ieee_to_fp32_value
// and replicates each into the first 8 entries of the corresponding AVX array.
void xnn_init_f16_minmax_avx_params(
  union xnn_f16_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t min,
  uint16_t max)
{
  const float lower_bound = fp16_ieee_to_fp32_value(min);
  const float upper_bound = fp16_ieee_to_fp32_value(max);

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->avx.min[lane] = lower_bound;
    params->avx.max[lane] = upper_bound;
  }
}
1792 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1793
1794 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the 14-entry AVX mask table: entries 0..6 are set to -1 and
// entries 7..13 are set to 0.
void xnn_init_f32_default_avx_params(
  union xnn_f32_default_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
1805 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1806
// Stores the output clamping bounds into the variant selected at compile
// time: 4 replicated entries for SSE on x86, 2 replicated entries for
// WAsm SIMD, and single values for the scalar fallback.
void xnn_init_f32_minmax_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (uint32_t lane = 0; lane < 4; lane++) {
      params->sse.max[lane] = output_max;
      params->sse.min[lane] = output_min;
    }
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
    for (uint32_t lane = 0; lane < 2; lane++) {
      params->wasmsimd.min[lane] = output_min;
      params->wasmsimd.max[lane] = output_max;
    }
  #else
    params->scalar.max = output_max;
    params->scalar.min = output_min;
  #endif
}
1827
1828 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Replicates the output clamping bounds into the first 4 entries of the SSE
// min and max arrays.
void xnn_init_f32_minmax_sse_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->sse.max[lane] = output_max;
    params->sse.min[lane] = output_min;
  }
}
1839
// Replicates the output clamping bounds into the first 8 entries of the AVX
// min and max arrays and fills the 14-entry mask table (entries 0..6 are -1,
// entries 7..13 are 0).
void xnn_init_f32_minmax_avx_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->avx.max[lane] = output_max;
    params->avx.min[lane] = output_min;
  }
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
1856 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1857
1858 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Replicates the output clamping bounds into entries 0 and 1 of the
// WAsm SIMD min and max arrays.
void xnn_init_f32_minmax_wasmsimd_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  for (uint32_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.min[lane] = output_min;
    params->wasmsimd.max[lane] = output_max;
  }
}
1869 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1870
// Stores the output clamping bounds into the scalar variant of the
// parameters.
void xnn_init_f32_minmax_scalar_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  params->scalar.max = output_max;
  params->scalar.min = output_min;
}
1879
1880 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the NEON variant of the f16 hswish parameters with three fixed
// 16-bit constants.
// NOTE(review): read as IEEE half-precision, 0x3155 is about 1/6, 0x4200 is
// 3.0 and 0x4600 is 6.0; confirm that the kernels interpret them this way.
void xnn_init_f16_hswish_neon_params(
  union xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neon.six = UINT16_C(0x4600);
  params->neon.three = UINT16_C(0x4200);
  params->neon.sixth = UINT16_C(0x3155);
}
1888 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
1889
1890 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the first 8 entries of each array in the AVX variant of the f16
// hswish parameters with fixed constants.
void xnn_init_f16_hswish_avx_params(
  union xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    // NOTE(review): unlike `sixth` and `three`, `six` receives the integer
    // pattern 0x4600 rather than a float value. Presumably the field holds a
    // 16-bit half-precision encoding of 6.0; confirm against the struct
    // definition.
    params->avx.six[lane] = UINT16_C(0x4600);
    params->avx.three[lane] = 3.0f;
    params->avx.sixth[lane] = 0x1.554000p-3f;
  }
}
1900 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1901
// Fills the scalar variant of the f32 hswish parameters with the constants
// 1/6 (rounded to float), 3 and 6.
void xnn_init_f32_hswish_scalar_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->scalar.six = 6.0f;
  params->scalar.three = 3.0f;
  params->scalar.sixth = 0x1.555556p-3f;
}
1909
1910 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the first 4 entries of each array in the SSE variant of the f32
// hswish parameters with the constants 1/6 (rounded to float), 0.5 and 1.
void xnn_init_f32_hswish_sse_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->sse.one[lane] = 1.0f;
    params->sse.half[lane] = 0.5f;
    params->sse.sixth[lane] = 0x1.555556p-3f;
  }
}
1920
// Fills the first 8 entries of each constant array in the AVX variant of the
// f32 hswish parameters (1/6 rounded to float, 0.5 and 1), then the 14-entry
// mask table (entries 0..6 are -1, entries 7..13 are 0).
void xnn_init_f32_hswish_avx_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->avx.one[lane] = 1.0f;
    params->avx.half[lane] = 0.5f;
    params->avx.sixth[lane] = 0x1.555556p-3f;
  }
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
1936
// Fills the AVX512 variant of the f32 hswish parameters with the constants
// 1/6 (rounded to float), 0.5 and 1, one value per field.
void xnn_init_f32_hswish_avx512_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->avx512.one = 1.0f;
  params->avx512.half = 0.5f;
  params->avx512.sixth = 0x1.555556p-3f;
}
1944 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1945
1946 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills entries 0 and 1 of each array in the WAsm SIMD variant of the f32
// hswish parameters with the constants 1/6 (rounded to float), 3 and 6.
void xnn_init_f32_hswish_wasmsimd_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->wasmsimd.sixth[0] = 0x1.555556p-3f;
  params->wasmsimd.sixth[1] = 0x1.555556p-3f;
  params->wasmsimd.three[0] = 3.0f;
  params->wasmsimd.three[1] = 3.0f;
  params->wasmsimd.six[0] = 6.0f;
  params->wasmsimd.six[1] = 6.0f;
}
1956 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1957
// Fills the `scalar_rr2_lut64_p2` member of the sigmoid parameters with its
// fixed constants. The function has no inputs besides the destination.
void xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // -log2(e), and ln(2) split into a high and a low part.
  params->scalar_rr2_lut64_p2.minus_log2e = -0x1.715476p0f;
  params->scalar_rr2_lut64_p2.ln2_hi = 0x1.630000p-1f;
  params->scalar_rr2_lut64_p2.ln2_lo = -0x1.BD0106p-13f;

  params->scalar_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  params->scalar_rr2_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;

  params->scalar_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->scalar_rr2_lut64_p2.one = 1.0f;
}
1969
// Fills the `scalar_rr2_lut2048_p1` member of the sigmoid parameters with its
// fixed constants. The function has no inputs besides the destination.
void xnn_init_f32_sigmoid_scalar_rr2_lut2048_p1_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // -log2(e), and ln(2) split into a high and a low part.
  params->scalar_rr2_lut2048_p1.minus_log2e = -0x1.715476p0f;
  params->scalar_rr2_lut2048_p1.ln2_hi = 0x1.600000p-1f;
  params->scalar_rr2_lut2048_p1.ln2_lo = 0x1.7217F8p-8f;

  params->scalar_rr2_lut2048_p1.magic_bias = 0x1.800000p12f;
  params->scalar_rr2_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;

  params->scalar_rr2_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
  params->scalar_rr2_lut2048_p1.one = 1.0f;
}
1981
// Fills the `scalar_rr2_p5` member of the sigmoid parameters with its fixed
// constants. The function has no inputs besides the destination.
void xnn_init_f32_sigmoid_scalar_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // -log2(e), and ln(2) split into a high and a low part.
  params->scalar_rr2_p5.minus_log2e = -0x1.715476p0f;
  params->scalar_rr2_p5.ln2_hi = 0x1.62E400p-1f;
  params->scalar_rr2_p5.ln2_lo = 0x1.7F7D1Cp-20f;

  params->scalar_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params->scalar_rr2_p5.denorm_cutoff = 0x1.5D589Ep+6f;

  // Coefficients c1..c5, listed in ascending order.
  params->scalar_rr2_p5.c1 = -0x1.FFFFF6p-1f;
  params->scalar_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->scalar_rr2_p5.c3 = -0x1.555A80p-3f;
  params->scalar_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->scalar_rr2_p5.c5 = -0x1.0F9F9Cp-7f;

  params->scalar_rr2_p5.one = 1.0f;
}
1997
1998 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `neon_rr2_lut64_p2` member of the sigmoid parameters with its
// fixed constants. The function has no inputs besides the destination.
void xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // -log2(e), and ln(2) split into a high and a low part.
  params->neon_rr2_lut64_p2.minus_log2e = -0x1.715476p0f;
  params->neon_rr2_lut64_p2.ln2_hi = 0x1.630000p-1f;
  params->neon_rr2_lut64_p2.ln2_lo = -0x1.BD0106p-13f;

  params->neon_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  params->neon_rr2_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;

  params->neon_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
}
2009
// Fills the `neon_rr2_lut2048_p1` member of the sigmoid parameters with its
// fixed constants. The function has no inputs besides the destination.
void xnn_init_f32_sigmoid_neon_rr2_lut2048_p1_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // -log2(e), and ln(2) split into a high and a low part.
  params->neon_rr2_lut2048_p1.minus_log2e = -0x1.715476p0f;
  params->neon_rr2_lut2048_p1.ln2_hi = 0x1.600000p-1f;
  params->neon_rr2_lut2048_p1.ln2_lo = 0x1.7217F8p-8f;

  params->neon_rr2_lut2048_p1.magic_bias = 0x1.800000p12f;
  params->neon_rr2_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;

  params->neon_rr2_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
}
2020
// Fills the `neon_rr2_p5` member of the sigmoid parameters with its fixed
// constants. The function has no inputs besides the destination.
void xnn_init_f32_sigmoid_neon_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // -log2(e), and ln(2) split into a high and a low part.
  params->neon_rr2_p5.minus_log2e = -0x1.715476p0f;
  params->neon_rr2_p5.ln2_hi = 0x1.62E400p-1f;
  params->neon_rr2_p5.ln2_lo = 0x1.7F7D1Cp-20f;

  params->neon_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params->neon_rr2_p5.denorm_cutoff = 0x1.5D589Ep+6f;

  // Coefficients c1..c5, listed in ascending order.
  params->neon_rr2_p5.c1 = -0x1.FFFFF6p-1f;
  params->neon_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->neon_rr2_p5.c3 = -0x1.555A80p-3f;
  params->neon_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->neon_rr2_p5.c5 = -0x1.0F9F9Cp-7f;
}
2035
// Fills the `neonfma_rr1_lut2048_p1` member of the sigmoid parameters with
// its fixed constants. The function has no inputs besides the destination.
void xnn_init_f32_sigmoid_neonfma_rr1_lut2048_p1_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // -log2(e) and ln(2), each rounded to float.
  params->neonfma_rr1_lut2048_p1.minus_log2e = -0x1.715476p0f;
  params->neonfma_rr1_lut2048_p1.ln2 = 0x1.62E430p-1f;

  params->neonfma_rr1_lut2048_p1.magic_bias = 0x1.800000p12f;
  params->neonfma_rr1_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;

  params->neonfma_rr1_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
}
2045
// Fills the `neonfma_rr1_lut64_p2` member of the sigmoid parameters with its
// fixed constants. The function has no inputs besides the destination.
void xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // -log2(e) and ln(2), each rounded to float.
  params->neonfma_rr1_lut64_p2.minus_log2e = -0x1.715476p0f;
  params->neonfma_rr1_lut64_p2.ln2 = 0x1.62E430p-1f;

  params->neonfma_rr1_lut64_p2.magic_bias = 0x1.800000p17f;
  params->neonfma_rr1_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;

  params->neonfma_rr1_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
}
2055
// Fills the `neonfma_rr1_p5` member of the sigmoid parameters with its fixed
// constants. The function has no inputs besides the destination.
void xnn_init_f32_sigmoid_neonfma_rr1_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // -log2(e) and ln(2), each rounded to float.
  params->neonfma_rr1_p5.minus_log2e = -0x1.715476p0f;
  params->neonfma_rr1_p5.ln2 = 0x1.62E430p-1f;

  params->neonfma_rr1_p5.magic_bias = 0x1.8000FEp23f;
  params->neonfma_rr1_p5.denorm_cutoff = 0x1.5D589Ep+6f;

  // Coefficients c1..c5, listed in ascending order.
  params->neonfma_rr1_p5.c1 = -0x1.FFFFF6p-1f;
  params->neonfma_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params->neonfma_rr1_p5.c3 = -0x1.555A80p-3f;
  params->neonfma_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params->neonfma_rr1_p5.c5 = -0x1.0F9F9Cp-7f;
}
2069 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2070
2071 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the first 4 entries of every array in the `sse2_rr2_lut64_p2` member
// of the sigmoid parameters with fixed constants. The function has no inputs
// besides the destination.
void xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 4; lane++) {
    // Masks.
    params->sse2_rr2_lut64_p2.sign_mask[lane] = -0.0f;
    params->sse2_rr2_lut64_p2.index_mask[lane] = UINT32_C(0x3F);

    // log2(e), and -ln(2) split into a high and a low part.
    params->sse2_rr2_lut64_p2.log2e[lane] = 0x1.715476p0f;
    params->sse2_rr2_lut64_p2.minus_ln2_hi[lane] = -0x1.630000p-1f;
    params->sse2_rr2_lut64_p2.minus_ln2_lo[lane] = 0x1.BD0106p-13f;

    params->sse2_rr2_lut64_p2.magic_bias[lane] = 0x1.800000p17f;
    params->sse2_rr2_lut64_p2.denorm_cutoff[lane] = -0x1.5D589Ep+6f;

    params->sse2_rr2_lut64_p2.c2[lane] = 0x1.FFFF0Ap-2f;
    params->sse2_rr2_lut64_p2.one[lane] = 1.0f;
  }
}
2087
// Fills the first 4 entries of every array in the `sse2_rr2_p5` member of the
// sigmoid parameters with fixed constants. The function has no inputs besides
// the destination.
void xnn_init_f32_sigmoid_sse2_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->sse2_rr2_p5.sign_mask[lane] = -0.0f;

    // log2(e), and -ln(2) split into a high and a low part.
    params->sse2_rr2_p5.log2e[lane] = 0x1.715476p0f;
    params->sse2_rr2_p5.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params->sse2_rr2_p5.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;

    params->sse2_rr2_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params->sse2_rr2_p5.denorm_cutoff[lane] = -0x1.5D589Ep+6f;

    // Coefficients c1..c5, listed in ascending order.
    params->sse2_rr2_p5.c1[lane] = 0x1.FFFFF6p-1f;
    params->sse2_rr2_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params->sse2_rr2_p5.c3[lane] = 0x1.555A80p-3f;
    params->sse2_rr2_p5.c4[lane] = 0x1.573A1Ap-5f;
    params->sse2_rr2_p5.c5[lane] = 0x1.0F9F9Cp-7f;

    params->sse2_rr2_p5.one[lane] = 1.0f;
  }
}
2106
// Fills the first 8 entries of every constant array in the `avx_rr2_p5`
// member of the sigmoid parameters, then the 14-entry mask table (entries
// 0..6 are -1, entries 7..13 are 0). The function has no inputs besides the
// destination.
void xnn_init_f32_sigmoid_avx_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->avx_rr2_p5.sign_mask[lane] = -0.0f;

    // log2(e), and -ln(2) split into a high and a low part.
    params->avx_rr2_p5.log2e[lane] = 0x1.715476p0f;
    params->avx_rr2_p5.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params->avx_rr2_p5.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;

    params->avx_rr2_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params->avx_rr2_p5.denorm_cutoff[lane] = -0x1.5D589Ep+6f;

    // Coefficients c1..c5, listed in ascending order.
    params->avx_rr2_p5.c1[lane] = 0x1.FFFFF6p-1f;
    params->avx_rr2_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params->avx_rr2_p5.c3[lane] = 0x1.555A80p-3f;
    params->avx_rr2_p5.c4[lane] = 0x1.573A1Ap-5f;
    params->avx_rr2_p5.c5[lane] = 0x1.0F9F9Cp-7f;

    params->avx_rr2_p5.one[lane] = 1.0f;
    params->avx_rr2_p5.two[lane] = 2.0f;
  }
  for (uint32_t k = 0; k < 14; k++) {
    params->avx_rr2_p5.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2132
// Fills the avx2_rr1_p5 member of the f32 sigmoid parameters.
// Elements 0..7 of every constant array receive the same value; mask_table
// gets -1 in entries 0..6 and 0 in entries 7..13.
void xnn_init_f32_sigmoid_avx2_rr1_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    // Masks and simple constants.
    params->avx2_rr1_p5.sign_mask[lane] = -0.0f;
    params->avx2_rr1_p5.one[lane] = 1.0f;
    params->avx2_rr1_p5.denorm_cutoff[lane] = -0x1.5D589Ep+6f;
    // Range-reduction constants.
    params->avx2_rr1_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params->avx2_rr1_p5.log2e[lane] = 0x1.715476p0f;
    params->avx2_rr1_p5.minus_ln2[lane] = -0x1.62E430p-1f;
    // Coefficients c1..c5.
    params->avx2_rr1_p5.c1[lane] = 0x1.FFFFF6p-1f;
    params->avx2_rr1_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params->avx2_rr1_p5.c3[lane] = 0x1.555A80p-3f;
    params->avx2_rr1_p5.c4[lane] = 0x1.573A1Ap-5f;
    params->avx2_rr1_p5.c5[lane] = 0x1.0F9F9Cp-7f;
  }
  // First seven entries are all-ones (-1), remaining seven are zero.
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx2_rr1_p5.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2156
// Fills the avx512_rr1_lut16_p3 member of the f32 sigmoid parameters:
// scalar constants plus a 16-entry lookup table.
void xnn_init_f32_sigmoid_avx512_rr1_lut16_p3_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // Table contents, copied verbatim into params below.
  static const float table_values[16] = {
    0x1.000000p+0f, 0x1.0B5586p+0f, 0x1.172B84p+0f, 0x1.2387A6p+0f,
    0x1.306FE0p+0f, 0x1.3DEA64p+0f, 0x1.4BFDAEp+0f, 0x1.5AB07Ep+0f,
    0x1.6A09E6p+0f, 0x1.7A1148p+0f, 0x1.8ACE54p+0f, 0x1.9C4918p+0f,
    0x1.AE89FAp+0f, 0x1.C199BEp+0f, 0x1.D5818Ep+0f, 0x1.EA4AFAp+0f,
  };

  params->avx512_rr1_lut16_p3.sign_mask = UINT32_C(0x80000000);
  params->avx512_rr1_lut16_p3.one = 1.0f;
  params->avx512_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
  params->avx512_rr1_lut16_p3.log2e = 0x1.715476p0f;
  params->avx512_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
  params->avx512_rr1_lut16_p3.c2 = 0x1.00021Ep-1f;
  params->avx512_rr1_lut16_p3.c3 = 0x1.55559Ap-3f;

  for (uint32_t entry = 0; entry < 16; entry++) {
    params->avx512_rr1_lut16_p3.table[entry] = table_values[entry];
  }
}
2184
// Fills the avx512_rr2_lut32_p2 member of the f32 sigmoid parameters:
// scalar constants plus two 16-entry tables (table_lo and table_hi).
void xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // Contents of table_lo, copied verbatim into params below.
  static const float lo_values[16] = {
    0x1.000000p+0f, 0x1.059B0Ep+0f, 0x1.0B5586p+0f, 0x1.11301Ep+0f,
    0x1.172B84p+0f, 0x1.1D4874p+0f, 0x1.2387A6p+0f, 0x1.29E9E0p+0f,
    0x1.306FE0p+0f, 0x1.371A74p+0f, 0x1.3DEA64p+0f, 0x1.44E086p+0f,
    0x1.4BFDAEp+0f, 0x1.5342B6p+0f, 0x1.5AB07Ep+0f, 0x1.6247ECp+0f,
  };
  // Contents of table_hi, copied verbatim into params below.
  static const float hi_values[16] = {
    0x1.6A09E6p+0f, 0x1.71F75Ep+0f, 0x1.7A1148p+0f, 0x1.82589Ap+0f,
    0x1.8ACE54p+0f, 0x1.93737Cp+0f, 0x1.9C4918p+0f, 0x1.A5503Cp+0f,
    0x1.AE89FAp+0f, 0x1.B7F770p+0f, 0x1.C199BEp+0f, 0x1.CB720Ep+0f,
    0x1.D5818Ep+0f, 0x1.DFC974p+0f, 0x1.EA4AFAp+0f, 0x1.F50766p+0f,
  };

  params->avx512_rr2_lut32_p2.sign_mask = UINT32_C(0x80000000);
  params->avx512_rr2_lut32_p2.one = 1.0f;
  params->avx512_rr2_lut32_p2.magic_bias = 0x1.800000p18f;
  params->avx512_rr2_lut32_p2.log2e = 0x1.715476p0f;
  params->avx512_rr2_lut32_p2.minus_ln2_hi = -0x1.62E430p-1f;
  params->avx512_rr2_lut32_p2.minus_ln2_lo = 0x1.05C61p-29f;
  params->avx512_rr2_lut32_p2.c1 = 0x1.0000F6p-0f;
  params->avx512_rr2_lut32_p2.c2 = 0x1.000000p-1f;

  for (uint32_t entry = 0; entry < 16; entry++) {
    params->avx512_rr2_lut32_p2.table_lo[entry] = lo_values[entry];
  }
  for (uint32_t entry = 0; entry < 16; entry++) {
    params->avx512_rr2_lut32_p2.table_hi[entry] = hi_values[entry];
  }
}
2231
// Fills the avx512_rr1_p5 member of the f32 sigmoid parameters.
// All fields are scalars in this variant.
void xnn_init_f32_sigmoid_avx512_rr1_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // Mask and unit constant.
  params->avx512_rr1_p5.sign_mask = UINT32_C(0x80000000);
  params->avx512_rr1_p5.one = 1.0f;

  // Range-reduction constants.
  params->avx512_rr1_p5.log2e = 0x1.715476p0f;
  params->avx512_rr1_p5.minus_ln2 = -0x1.62E430p-1f;

  // Coefficients c1..c5.
  params->avx512_rr1_p5.c1 = 0x1.FFFFF6p-1f;
  params->avx512_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params->avx512_rr1_p5.c3 = 0x1.555A80p-3f;
  params->avx512_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params->avx512_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
}
2245 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2246
2247 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd_rr2_lut64_p2 member of the f32 sigmoid parameters.
// Elements 0 and 1 of every field receive the same constant.
void xnn_init_f32_sigmoid_wasmsimd_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  uint32_t lane = 0;
  while (lane < 2) {
    params->wasmsimd_rr2_lut64_p2.one[lane] = 1.0f;
    params->wasmsimd_rr2_lut64_p2.denorm_cutoff[lane] = 0x1.5D589Ep+6f;
    params->wasmsimd_rr2_lut64_p2.index_mask[lane] = UINT32_C(0x3F);
    params->wasmsimd_rr2_lut64_p2.magic_bias[lane] = 0x1.800000p17f;
    params->wasmsimd_rr2_lut64_p2.minus_log2e[lane] = -0x1.715476p0f;
    params->wasmsimd_rr2_lut64_p2.ln2_hi[lane] = 0x1.630000p-1f;
    params->wasmsimd_rr2_lut64_p2.ln2_lo[lane] = -0x1.BD0106p-13f;
    params->wasmsimd_rr2_lut64_p2.c2[lane] = 0x1.FFFF0Ap-2f;
    lane += 1;
  }
}
2262
// Fills the wasmsimd_rr2_p5 member of the f32 sigmoid parameters.
// Elements 0 and 1 of every field receive the same constant.
void xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  uint32_t lane = 0;
  while (lane < 2) {
    params->wasmsimd_rr2_p5.one[lane] = 1.0f;
    params->wasmsimd_rr2_p5.denorm_cutoff[lane] = 0x1.5D589Ep+6f;
    // Range-reduction constants.
    params->wasmsimd_rr2_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params->wasmsimd_rr2_p5.minus_log2e[lane] = -0x1.715476p+0f;
    params->wasmsimd_rr2_p5.ln2_hi[lane] = 0x1.62E400p-1f;
    params->wasmsimd_rr2_p5.ln2_lo[lane] = 0x1.7F7D1Cp-20f;
    // Coefficients c1..c5; the odd-numbered ones are negative here.
    params->wasmsimd_rr2_p5.c1[lane] = -0x1.FFFFF6p-1f;
    params->wasmsimd_rr2_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params->wasmsimd_rr2_p5.c3[lane] = -0x1.555A80p-3f;
    params->wasmsimd_rr2_p5.c4[lane] = 0x1.573A1Ap-5f;
    params->wasmsimd_rr2_p5.c5[lane] = -0x1.0F9F9Cp-7f;
    lane += 1;
  }
}
2280 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2281
2282 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse member of the f32 abs parameters: elements 0..3 of
// nonsign_mask each receive the value returned by math_nonsign_mask_f32().
void xnn_init_f32_abs_sse_params(
  union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  uint32_t lane = 0;
  while (lane < 4) {
    params->sse.nonsign_mask[lane] = math_nonsign_mask_f32();
    lane += 1;
  }
}
2290
// Fills the avx member of the f32 abs parameters: elements 0..7 of
// nonsign_mask receive math_nonsign_mask_f32(); mask_table gets -1 in
// entries 0..6 and 0 in entries 7..13.
void xnn_init_f32_abs_avx_params(
  union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  uint32_t lane = 0;
  while (lane < 8) {
    params->avx.nonsign_mask[lane] = math_nonsign_mask_f32();
    lane += 1;
  }
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2304
// Fills the avx512 member of the f32 abs parameters with a single scalar
// mask whose bits are all set except bit 31.
void xnn_init_f32_abs_avx512_params(
  union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  const uint32_t nonsign_bits = UINT32_C(0x7FFFFFFF);
  params->avx512.nonsign_mask = nonsign_bits;
}
2310 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2311
2312 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd member of the f32 abs parameters: elements 0 and 1 of
// nonsign_mask each receive the value returned by math_nonsign_mask_f32().
void xnn_init_f32_abs_wasmsimd_params(
  union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.nonsign_mask[lane] = math_nonsign_mask_f32();
  }
}
2319 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2320
2321 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse member of the f32 neg parameters: elements 0..3 of
// sign_mask are set to negative zero.
void xnn_init_f32_neg_sse_params(
  union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  uint32_t lane = 0;
  while (lane < 4) {
    params->sse.sign_mask[lane] = -0.0f;
    lane += 1;
  }
}
2329
// Fills the avx member of the f32 neg parameters: elements 0..7 of
// sign_mask are set to negative zero; mask_table gets -1 in entries 0..6
// and 0 in entries 7..13.
void xnn_init_f32_neg_avx_params(
  union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  uint32_t lane = 0;
  while (lane < 8) {
    params->avx.sign_mask[lane] = -0.0f;
    lane += 1;
  }
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2343
// Fills the avx512 member of the f32 neg parameters with a single scalar
// mask that has only bit 31 set.
void xnn_init_f32_neg_avx512_params(
  union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  const uint32_t sign_bit = UINT32_C(0x80000000);
  params->avx512.sign_mask = sign_bit;
}
2349 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2350
2351 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd member of the f32 neg parameters: elements 0 and 1 of
// sign_mask are set to negative zero.
void xnn_init_f32_neg_wasmsimd_params(
  union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.sign_mask[lane] = -0.0f;
  }
}
2358 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2359
2360 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse2 member of the f32 rounding parameters: elements 0..3 of
// sign_mask are set to negative zero and elements 0..3 of one to 1.0f.
void xnn_init_f32_rnd_sse2_params(
  union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
  uint32_t lane = 0;
  while (lane < 4) {
    params->sse2.one[lane] = 1.0f;
    params->sse2.sign_mask[lane] = -0.0f;
    lane += 1;
  }
}
2369
// Fills the avx member of the f32 rounding parameters: mask_table gets -1
// in entries 0..6 and 0 in entries 7..13.
void xnn_init_f32_rnd_avx_params(
  union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2380 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2381
2382 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd member of the f32 rounding parameters: elements 0 and
// 1 of sign_mask, magic_bias and one each receive the same constant.
void xnn_init_f32_rnd_wasmsimd_params(
  union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.sign_mask[lane] = -0.0f;
    params->wasmsimd.magic_bias[lane] = 0x1.000000p+23f;
    params->wasmsimd.one[lane] = 1.0f;
  }
}
2393 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2394
// Fills the scalar_rr2_lut16_p3 member of the f32 ELU parameters.
// prescale, alpha and beta are stored as given; the remaining fields are
// fixed constants.
void xnn_init_f32_elu_scalar_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants.
  params->scalar_rr2_lut16_p3.one = 1.0f;
  params->scalar_rr2_lut16_p3.sat_cutoff = -0x1.154246p+4f;
  params->scalar_rr2_lut16_p3.magic_bias = 0x1.800000p19f;
  params->scalar_rr2_lut16_p3.log2e = 0x1.715476p+0f;
  params->scalar_rr2_lut16_p3.minus_ln2_hi = -0x1.62E400p-1f;
  params->scalar_rr2_lut16_p3.minus_ln2_lo = -0x1.7F7D1Cp-20f;
  params->scalar_rr2_lut16_p3.c2 = 0x1.0001ECp-1f;
  params->scalar_rr2_lut16_p3.c3 = 0x1.55561Cp-3f;

  // Caller-supplied values.
  params->scalar_rr2_lut16_p3.prescale = prescale;
  params->scalar_rr2_lut16_p3.alpha = alpha;
  params->scalar_rr2_lut16_p3.beta = beta;
}
2413
// Fills the scalar_rr2_p6 member of the f32 ELU parameters.
// prescale, alpha and beta are stored as given; the remaining fields are
// fixed constants.
void xnn_init_f32_elu_scalar_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants.
  params->scalar_rr2_p6.one = 1.0f;
  params->scalar_rr2_p6.sat_cutoff = -0x1.154246p+4f;
  params->scalar_rr2_p6.magic_bias = 0x1.8000FEp23f;
  params->scalar_rr2_p6.log2e = 0x1.715476p+0f;
  params->scalar_rr2_p6.minus_ln2_hi = -0x1.62E440p-1f;
  params->scalar_rr2_p6.minus_ln2_lo = 0x1.0105C6p-21f;

  // Coefficients c2..c6.
  params->scalar_rr2_p6.c2 = 0x1.FFFFFEp-2f;
  params->scalar_rr2_p6.c3 = 0x1.5554B0p-3f;
  params->scalar_rr2_p6.c4 = 0x1.555716p-5f;
  params->scalar_rr2_p6.c5 = 0x1.12278Ep-7f;
  params->scalar_rr2_p6.c6 = 0x1.6b7338p-10f;

  // Caller-supplied values.
  params->scalar_rr2_p6.prescale = prescale;
  params->scalar_rr2_p6.alpha = alpha;
  params->scalar_rr2_p6.beta = beta;
}
2435
2436 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the neon_rr2_lut16_p3 member of the f32 ELU parameters.
// prescale, alpha and beta are stored as given; the remaining fields are
// fixed constants.
void xnn_init_f32_elu_neon_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants.
  params->neon_rr2_lut16_p3.sat_cutoff = -0x1.154246p+4f;
  params->neon_rr2_lut16_p3.magic_bias = 0x1.800000p19f;
  params->neon_rr2_lut16_p3.log2e = 0x1.715476p+0f;
  params->neon_rr2_lut16_p3.minus_ln2_hi = -0x1.62E400p-1f;
  params->neon_rr2_lut16_p3.minus_ln2_lo = -0x1.7F7D1Cp-20f;
  params->neon_rr2_lut16_p3.c2 = 0x1.0001ECp-1f;
  params->neon_rr2_lut16_p3.c3 = 0x1.55561Cp-3f;

  // Caller-supplied values.
  params->neon_rr2_lut16_p3.prescale = prescale;
  params->neon_rr2_lut16_p3.alpha = alpha;
  params->neon_rr2_lut16_p3.beta = beta;
}
2454
// Fills the neon_rr2_p6 member of the f32 ELU parameters.
// prescale, alpha and beta are stored as given; the remaining fields are
// fixed constants.
void xnn_init_f32_elu_neon_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants.
  params->neon_rr2_p6.sat_cutoff = -0x1.154246p+4f;
  params->neon_rr2_p6.magic_bias = 0x1.8000FEp23f;
  params->neon_rr2_p6.log2e = 0x1.715476p+0f;
  params->neon_rr2_p6.minus_ln2_hi = -0x1.62E440p-1f;
  params->neon_rr2_p6.minus_ln2_lo = 0x1.0105C6p-21f;

  // Coefficients c2..c6.
  params->neon_rr2_p6.c2 = 0x1.FFFFFEp-2f;
  params->neon_rr2_p6.c3 = 0x1.5554B0p-3f;
  params->neon_rr2_p6.c4 = 0x1.555716p-5f;
  params->neon_rr2_p6.c5 = 0x1.12278Ep-7f;
  params->neon_rr2_p6.c6 = 0x1.6b7338p-10f;

  // Caller-supplied values.
  params->neon_rr2_p6.prescale = prescale;
  params->neon_rr2_p6.alpha = alpha;
  params->neon_rr2_p6.beta = beta;
}
2475
// Fills the neonfma_rr1_lut16_p3 member of the f32 ELU parameters.
// prescale, alpha and beta are stored as given; the remaining fields are
// fixed constants.
void xnn_init_f32_elu_neonfma_rr1_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants.
  params->neonfma_rr1_lut16_p3.sat_cutoff = -0x1.154246p+4f;
  params->neonfma_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
  params->neonfma_rr1_lut16_p3.log2e = 0x1.715476p+0f;
  params->neonfma_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
  params->neonfma_rr1_lut16_p3.c2 = 0x1.0001ECp-1f;
  params->neonfma_rr1_lut16_p3.c3 = 0x1.55561Cp-3f;

  // Caller-supplied values.
  params->neonfma_rr1_lut16_p3.prescale = prescale;
  params->neonfma_rr1_lut16_p3.alpha = alpha;
  params->neonfma_rr1_lut16_p3.beta = beta;
}
2492
// Fills the neonfma_rr1_p6 member of the f32 ELU parameters.
// prescale, alpha and beta are stored as given; the remaining fields are
// fixed constants.
void xnn_init_f32_elu_neonfma_rr1_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants.
  params->neonfma_rr1_p6.sat_cutoff = -0x1.154246p+4f;
  params->neonfma_rr1_p6.magic_bias = 0x1.8000FEp23f;
  params->neonfma_rr1_p6.log2e = 0x1.715476p+0f;
  params->neonfma_rr1_p6.minus_ln2 = -0x1.62E430p-1f;

  // Coefficients c2..c6.
  params->neonfma_rr1_p6.c2 = 0x1.FFFFFEp-2f;
  params->neonfma_rr1_p6.c3 = 0x1.5554B0p-3f;
  params->neonfma_rr1_p6.c4 = 0x1.555716p-5f;
  params->neonfma_rr1_p6.c5 = 0x1.12278Ep-7f;
  params->neonfma_rr1_p6.c6 = 0x1.6b7338p-10f;

  // Caller-supplied values.
  params->neonfma_rr1_p6.prescale = prescale;
  params->neonfma_rr1_p6.alpha = alpha;
  params->neonfma_rr1_p6.beta = beta;
}
2512 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2513
2514 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse2_rr2_lut16_p3 member of the f32 ELU parameters.
// Elements 0..3 of every field receive the same value: prescale, alpha and
// beta as given, everything else a fixed constant.
void xnn_init_f32_elu_sse2_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  uint32_t lane = 0;
  while (lane < 4) {
    // Caller-supplied values.
    params->sse2_rr2_lut16_p3.prescale[lane] = prescale;
    params->sse2_rr2_lut16_p3.alpha[lane] = alpha;
    params->sse2_rr2_lut16_p3.beta[lane] = beta;
    // Fixed constants.
    params->sse2_rr2_lut16_p3.one[lane] = 1.0f;
    params->sse2_rr2_lut16_p3.index_mask[lane] = UINT32_C(0xF);
    params->sse2_rr2_lut16_p3.sat_cutoff[lane] = -0x1.154246p+4f;
    params->sse2_rr2_lut16_p3.magic_bias[lane] = 0x1.800000p19f;
    params->sse2_rr2_lut16_p3.log2e[lane] = 0x1.715476p+0f;
    params->sse2_rr2_lut16_p3.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params->sse2_rr2_lut16_p3.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    params->sse2_rr2_lut16_p3.c2[lane] = 0x1.0001ECp-1f;
    params->sse2_rr2_lut16_p3.c3[lane] = 0x1.55561Cp-3f;
    lane += 1;
  }
}
2536
// Fills the sse2_rr2_p6 member of the f32 ELU parameters.
// Elements 0..3 of every field receive the same value: prescale, alpha and
// beta as given, everything else a fixed constant.
void xnn_init_f32_elu_sse2_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  uint32_t lane = 0;
  while (lane < 4) {
    // Caller-supplied values.
    params->sse2_rr2_p6.prescale[lane] = prescale;
    params->sse2_rr2_p6.alpha[lane] = alpha;
    params->sse2_rr2_p6.beta[lane] = beta;
    // Fixed constants.
    params->sse2_rr2_p6.one[lane] = 1.0f;
    params->sse2_rr2_p6.sat_cutoff[lane] = -0x1.154246p+4f;
    params->sse2_rr2_p6.magic_bias[lane] = 0x1.8000FEp23f;
    params->sse2_rr2_p6.log2e[lane] = 0x1.715476p+0f;
    params->sse2_rr2_p6.minus_ln2_hi[lane] = -0x1.62E440p-1f;
    params->sse2_rr2_p6.minus_ln2_lo[lane] = 0x1.0105C6p-21f;
    // Coefficients c2..c6.
    params->sse2_rr2_p6.c2[lane] = 0x1.FFFFFEp-2f;
    params->sse2_rr2_p6.c3[lane] = 0x1.5554B0p-3f;
    params->sse2_rr2_p6.c4[lane] = 0x1.555716p-5f;
    params->sse2_rr2_p6.c5[lane] = 0x1.12278Ep-7f;
    params->sse2_rr2_p6.c6[lane] = 0x1.6b7338p-10f;
    lane += 1;
  }
}
2560
// Fills the avx_rr2_lut16_p3 member of the f32 ELU parameters.
// Elements 0..7 of every constant array receive the same value (prescale,
// alpha and beta as given, the rest fixed); mask_table gets -1 in entries
// 0..6 and 0 in entries 7..13.
void xnn_init_f32_elu_avx_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    // Caller-supplied values.
    params->avx_rr2_lut16_p3.prescale[lane] = prescale;
    params->avx_rr2_lut16_p3.alpha[lane] = alpha;
    params->avx_rr2_lut16_p3.beta[lane] = beta;
    // Fixed constants.
    params->avx_rr2_lut16_p3.one[lane] = 1.0f;
    params->avx_rr2_lut16_p3.index_mask[lane] = UINT32_C(0xF);
    params->avx_rr2_lut16_p3.sat_cutoff[lane] = -0x1.154246p+4f;
    params->avx_rr2_lut16_p3.magic_bias[lane] = 0x1.800000p19f;
    params->avx_rr2_lut16_p3.log2e[lane] = 0x1.715476p+0f;
    params->avx_rr2_lut16_p3.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params->avx_rr2_lut16_p3.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    params->avx_rr2_lut16_p3.c2[lane] = 0x1.0001ECp-1f;
    params->avx_rr2_lut16_p3.c3[lane] = 0x1.55561Cp-3f;
  }
  // First seven entries are all-ones (-1), remaining seven are zero.
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx_rr2_lut16_p3.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2588
// Fills the avx_rr2_lut4_p4 member of the f32 ELU parameters.
// Elements 0..7 of every constant array receive the same value (prescale,
// alpha and beta as given, the rest fixed). The 8-entry table holds the same
// four values twice; mask_table gets -1 in entries 0..6 and 0 in 7..13.
void xnn_init_f32_elu_avx_rr2_lut4_p4_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // The four distinct table values; written to table[k] and table[k + 4].
  static const float table_values[4] = {
    0x1.000000p+0f, 0x1.306FE0p+0f, 0x1.6A09E6p+0f, 0x1.AE89FAp+0f,
  };

  for (uint32_t lane = 0; lane < 8; lane++) {
    // Caller-supplied values.
    params->avx_rr2_lut4_p4.prescale[lane] = prescale;
    params->avx_rr2_lut4_p4.alpha[lane] = alpha;
    params->avx_rr2_lut4_p4.beta[lane] = beta;
    // Fixed constants.
    params->avx_rr2_lut4_p4.one[lane] = 1.0f;
    params->avx_rr2_lut4_p4.index_mask[lane] = UINT32_C(0x3);
    params->avx_rr2_lut4_p4.sat_cutoff[lane] = -0x1.154246p+4f;
    params->avx_rr2_lut4_p4.magic_bias[lane] = 0x1.8003F8p21f;
    params->avx_rr2_lut4_p4.log2e[lane] = 0x1.715476p+0f;
    params->avx_rr2_lut4_p4.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params->avx_rr2_lut4_p4.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    params->avx_rr2_lut4_p4.c2[lane] = 0x1.000002p-1f;
    params->avx_rr2_lut4_p4.c3[lane] = 0x1.557082p-3f;
    params->avx_rr2_lut4_p4.c4[lane] = 0x1.554F9Ap-5f;
  }

  for (uint32_t k = 0; k < 4; k++) {
    params->avx_rr2_lut4_p4.table[k] = table_values[k];
    params->avx_rr2_lut4_p4.table[k + 4] = table_values[k];
  }

  // First seven entries are all-ones (-1), remaining seven are zero.
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx_rr2_lut4_p4.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2627
// Fills the avx_rr2_p6 member of the f32 ELU parameters.
// Elements 0..7 of every constant array receive the same value (prescale,
// alpha and beta as given, the rest fixed); mask_table gets -1 in entries
// 0..6 and 0 in entries 7..13.
void xnn_init_f32_elu_avx_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    // Caller-supplied values.
    params->avx_rr2_p6.prescale[lane] = prescale;
    params->avx_rr2_p6.alpha[lane] = alpha;
    params->avx_rr2_p6.beta[lane] = beta;
    // Fixed constants.
    params->avx_rr2_p6.one[lane] = 1.0f;
    params->avx_rr2_p6.sat_cutoff[lane] = -0x1.154246p+4f;
    params->avx_rr2_p6.magic_bias[lane] = 0x1.8000FEp23f;
    params->avx_rr2_p6.log2e[lane] = 0x1.715476p+0f;
    params->avx_rr2_p6.minus_ln2_hi[lane] = -0x1.62E440p-1f;
    params->avx_rr2_p6.minus_ln2_lo[lane] = 0x1.0105C6p-21f;
    // Coefficients c2..c6.
    params->avx_rr2_p6.c2[lane] = 0x1.FFFFFEp-2f;
    params->avx_rr2_p6.c3[lane] = 0x1.5554B0p-3f;
    params->avx_rr2_p6.c4[lane] = 0x1.555716p-5f;
    params->avx_rr2_p6.c5[lane] = 0x1.12278Ep-7f;
    params->avx_rr2_p6.c6[lane] = 0x1.6b7338p-10f;
  }
  // First seven entries are all-ones (-1), remaining seven are zero.
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx_rr2_p6.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2657
// Fills the avx2_rr1_lut16_p3 member of the f32 ELU parameters.
// Elements 0..7 of every constant array receive the same value (prescale,
// alpha and beta as given, the rest fixed); mask_table gets -1 in entries
// 0..6 and 0 in entries 7..13.
void xnn_init_f32_elu_avx2_rr1_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    // Caller-supplied values.
    params->avx2_rr1_lut16_p3.prescale[lane] = prescale;
    params->avx2_rr1_lut16_p3.alpha[lane] = alpha;
    params->avx2_rr1_lut16_p3.beta[lane] = beta;
    // Fixed constants.
    params->avx2_rr1_lut16_p3.index_mask[lane] = UINT32_C(0xF);
    params->avx2_rr1_lut16_p3.sat_cutoff[lane] = -0x1.154246p+4f;
    params->avx2_rr1_lut16_p3.magic_bias[lane] = 0x1.800000p19f;
    params->avx2_rr1_lut16_p3.log2e[lane] = 0x1.715476p+0f;
    params->avx2_rr1_lut16_p3.minus_ln2[lane] = -0x1.62E430p-1f;
    params->avx2_rr1_lut16_p3.c2[lane] = 0x1.0001ECp-1f;
    params->avx2_rr1_lut16_p3.c3[lane] = 0x1.55561Cp-3f;
  }
  // First seven entries are all-ones (-1), remaining seven are zero.
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx2_rr1_lut16_p3.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2683
// Fills the avx2_rr1_lut8_p4 member of the f32 ELU parameters.
// Elements 0..7 of every constant array receive the same value (prescale,
// alpha and beta as given, the rest fixed). The 8-entry table is stored as
// 32-bit integer patterns; mask_table gets -1 in entries 0..6 and 0 in 7..13.
void xnn_init_f32_elu_avx2_rr1_lut8_p4_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Table contents, copied verbatim into params below.
  static const uint32_t table_bits[8] = {
    UINT32_C(0x3F800000), UINT32_C(0x3F7B95C2),
    UINT32_C(0x3F7837F0), UINT32_C(0x3F75FED7),
    UINT32_C(0x3F7504F3), UINT32_C(0x3F75672A),
    UINT32_C(0x3F7744FD), UINT32_C(0x3F7AC0C7),
  };

  for (uint32_t lane = 0; lane < 8; lane++) {
    // Caller-supplied values.
    params->avx2_rr1_lut8_p4.prescale[lane] = prescale;
    params->avx2_rr1_lut8_p4.alpha[lane] = alpha;
    params->avx2_rr1_lut8_p4.beta[lane] = beta;
    // Fixed constants.
    params->avx2_rr1_lut8_p4.sat_cutoff[lane] = -0x1.154246p+4f;
    params->avx2_rr1_lut8_p4.magic_bias[lane] = 0x1.800000p20f;
    params->avx2_rr1_lut8_p4.log2e[lane] = 0x1.715476p+0f;
    params->avx2_rr1_lut8_p4.minus_ln2[lane] = -0x1.62E430p-1f;
    params->avx2_rr1_lut8_p4.c2[lane] = 0x1.000000p-1f;
    params->avx2_rr1_lut8_p4.c3[lane] = 0x1.555C20p-3f;
    params->avx2_rr1_lut8_p4.c4[lane] = 0x1.5558ECp-5f;
  }

  for (uint32_t entry = 0; entry < 8; entry++) {
    params->avx2_rr1_lut8_p4.table[entry] = table_bits[entry];
  }

  // First seven entries are all-ones (-1), remaining seven are zero.
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx2_rr1_lut8_p4.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2719
// Fills the avx2_rr1_lut4_p4 member of the f32 ELU parameters.
// Elements 0..7 of every constant array receive the same value (prescale,
// alpha and beta as given, the rest fixed). The 8-entry table holds the same
// four values twice; mask_table gets -1 in entries 0..6 and 0 in 7..13.
void xnn_init_f32_elu_avx2_rr1_lut4_p4_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // The four distinct table values; written to table[k] and table[k + 4].
  static const float table_values[4] = {
    0x1.000000p+0f, 0x1.F06FE0p-1f, 0x1.EA09E6p-1f, 0x1.EE89FAp-1f,
  };

  for (uint32_t lane = 0; lane < 8; lane++) {
    // Caller-supplied values.
    params->avx2_rr1_lut4_p4.prescale[lane] = prescale;
    params->avx2_rr1_lut4_p4.alpha[lane] = alpha;
    params->avx2_rr1_lut4_p4.beta[lane] = beta;
    // Fixed constants.
    params->avx2_rr1_lut4_p4.sat_cutoff[lane] = -0x1.154246p+4f;
    params->avx2_rr1_lut4_p4.magic_bias[lane] = 0x1.800000p21f;
    params->avx2_rr1_lut4_p4.log2e[lane] = 0x1.715476p+0f;
    params->avx2_rr1_lut4_p4.minus_ln2[lane] = -0x1.62E430p-1f;
    params->avx2_rr1_lut4_p4.c2[lane] = 0x1.000002p-1f;
    params->avx2_rr1_lut4_p4.c3[lane] = 0x1.557082p-3f;
    params->avx2_rr1_lut4_p4.c4[lane] = 0x1.554F9Ap-5f;
  }

  for (uint32_t k = 0; k < 4; k++) {
    params->avx2_rr1_lut4_p4.table[k] = table_values[k];
    params->avx2_rr1_lut4_p4.table[k + 4] = table_values[k];
  }

  // First seven entries are all-ones (-1), remaining seven are zero.
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx2_rr1_lut4_p4.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2755
// Fills the avx2_rr1_p6 member of the f32 ELU parameters.
// Elements 0..7 of every constant array receive the same value (prescale,
// alpha and beta as given, the rest fixed); mask_table gets -1 in entries
// 0..6 and 0 in entries 7..13.
void xnn_init_f32_elu_avx2_rr1_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    // Caller-supplied values.
    params->avx2_rr1_p6.prescale[lane] = prescale;
    params->avx2_rr1_p6.alpha[lane] = alpha;
    params->avx2_rr1_p6.beta[lane] = beta;
    // Fixed constants.
    params->avx2_rr1_p6.sat_cutoff[lane] = -0x1.154246p+4f;
    params->avx2_rr1_p6.magic_bias[lane] = 0x1.8000FEp23f;
    params->avx2_rr1_p6.log2e[lane] = 0x1.715476p+0f;
    params->avx2_rr1_p6.minus_ln2[lane] = -0x1.62E430p-1f;
    // Coefficients c2..c6.
    params->avx2_rr1_p6.c2[lane] = 0x1.FFFFFEp-2f;
    params->avx2_rr1_p6.c3[lane] = 0x1.5554B0p-3f;
    params->avx2_rr1_p6.c4[lane] = 0x1.555716p-5f;
    params->avx2_rr1_p6.c5[lane] = 0x1.12278Ep-7f;
    params->avx2_rr1_p6.c6[lane] = 0x1.6B7338p-10f;
  }
  // First seven entries are all-ones (-1), remaining seven are zero.
  for (uint32_t slot = 0; slot < 14; slot++) {
    params->avx2_rr1_p6.mask_table[slot] = (slot < 7) ? -1 : 0;
  }
}
2783
// Fills the avx512_rr1_lut16_p3 member of the f32 ELU parameters.
// prescale, alpha and beta are stored as given; the remaining scalars are
// fixed constants. The 16-entry table is stored as 32-bit integer patterns.
void xnn_init_f32_elu_avx512_rr1_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Table contents, copied verbatim into params below.
  static const uint32_t table_bits[16] = {
    UINT32_C(0x3F800000), UINT32_C(0x3F7DAAC3),
    UINT32_C(0x3F7B95C2), UINT32_C(0x3F79C3D3),
    UINT32_C(0x3F7837F0), UINT32_C(0x3F76F532),
    UINT32_C(0x3F75FED7), UINT32_C(0x3F75583F),
    UINT32_C(0x3F7504F3), UINT32_C(0x3F7508A4),
    UINT32_C(0x3F75672A), UINT32_C(0x3F76248C),
    UINT32_C(0x3F7744FD), UINT32_C(0x3F78CCDF),
    UINT32_C(0x3F7AC0C7), UINT32_C(0x3F7D257D),
  };

  // Fixed constants.
  params->avx512_rr1_lut16_p3.sat_cutoff = -0x1.154246p+4f;
  params->avx512_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
  params->avx512_rr1_lut16_p3.log2e = 0x1.715476p+0f;
  params->avx512_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
  params->avx512_rr1_lut16_p3.c2 = 0x1.0001ECp-1f;
  params->avx512_rr1_lut16_p3.c3 = 0x1.55561Cp-3f;

  // Caller-supplied values.
  params->avx512_rr1_lut16_p3.prescale = prescale;
  params->avx512_rr1_lut16_p3.alpha = alpha;
  params->avx512_rr1_lut16_p3.beta = beta;

  for (uint32_t entry = 0; entry < 16; entry++) {
    params->avx512_rr1_lut16_p3.table[entry] = table_bits[entry];
  }
}
2816
// Fills the avx512_rr1_p6 member of the f32 ELU parameters.
// prescale, alpha and beta are stored as given; the remaining fields are
// fixed scalar constants.
void xnn_init_f32_elu_avx512_rr1_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants.
  params->avx512_rr1_p6.sat_cutoff = -0x1.154246p+4f;
  params->avx512_rr1_p6.magic_bias = 0x1.8000FEp23f;
  params->avx512_rr1_p6.log2e = 0x1.715476p+0f;
  params->avx512_rr1_p6.minus_ln2 = -0x1.62E430p-1f;

  // Coefficients c2..c6.
  params->avx512_rr1_p6.c2 = 0x1.FFFFFEp-2f;
  params->avx512_rr1_p6.c3 = 0x1.5554B0p-3f;
  params->avx512_rr1_p6.c4 = 0x1.555716p-5f;
  params->avx512_rr1_p6.c5 = 0x1.12278Ep-7f;
  params->avx512_rr1_p6.c6 = 0x1.6B7338p-10f;

  // Caller-supplied values.
  params->avx512_rr1_p6.prescale = prescale;
  params->avx512_rr1_p6.alpha = alpha;
  params->avx512_rr1_p6.beta = beta;
}
2836 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
2837
2838 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the first two elements of every array in params->wasmsimd_rr2_lut16_p3:
// the caller-supplied prescale/alpha/beta and a set of fixed constants.
void xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Caller-supplied values.
  for (size_t k = 0; k != 2; k++) {
    params->wasmsimd_rr2_lut16_p3.prescale[k] = prescale;
    params->wasmsimd_rr2_lut16_p3.alpha[k] = alpha;
    params->wasmsimd_rr2_lut16_p3.beta[k] = beta;
  }

  // Fixed constants.
  for (size_t k = 0; k != 2; k++) {
    params->wasmsimd_rr2_lut16_p3.sat_cutoff[k] = -0x1.154246p+4f;
    params->wasmsimd_rr2_lut16_p3.magic_bias[k] = 0x1.800000p19f;
    params->wasmsimd_rr2_lut16_p3.log2e[k] = 0x1.715476p+0f;
    params->wasmsimd_rr2_lut16_p3.index_mask[k] = UINT32_C(0xF);
    params->wasmsimd_rr2_lut16_p3.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->wasmsimd_rr2_lut16_p3.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
    params->wasmsimd_rr2_lut16_p3.c3[k] = 0x1.55561Cp-3f;
    params->wasmsimd_rr2_lut16_p3.c2[k] = 0x1.0001ECp-1f;
    params->wasmsimd_rr2_lut16_p3.one[k] = 1.0f;
  }
}
2860
// Fills the first two elements of every array in params->wasmsimd_rr2_p6:
// the caller-supplied prescale/alpha/beta and a set of fixed constants.
void xnn_init_f32_elu_wasmsimd_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Caller-supplied values.
  for (size_t k = 0; k != 2; k++) {
    params->wasmsimd_rr2_p6.prescale[k] = prescale;
    params->wasmsimd_rr2_p6.alpha[k] = alpha;
    params->wasmsimd_rr2_p6.beta[k] = beta;
  }

  // Fixed constants.
  for (size_t k = 0; k != 2; k++) {
    params->wasmsimd_rr2_p6.sat_cutoff[k] = -0x1.154246p+4f;
    params->wasmsimd_rr2_p6.magic_bias[k] = 0x1.8000FEp23f;
    params->wasmsimd_rr2_p6.log2e[k] = 0x1.715476p+0f;
    params->wasmsimd_rr2_p6.minus_ln2_hi[k] = -0x1.62E440p-1f;
    params->wasmsimd_rr2_p6.minus_ln2_lo[k] = 0x1.0105C6p-21f;
    params->wasmsimd_rr2_p6.one[k] = 1.0f;
  }

  // Coefficients, lowest index first.
  for (size_t k = 0; k != 2; k++) {
    params->wasmsimd_rr2_p6.c2[k] = 0x1.FFFFFEp-2f;
    params->wasmsimd_rr2_p6.c3[k] = 0x1.5554B0p-3f;
    params->wasmsimd_rr2_p6.c4[k] = 0x1.555716p-5f;
    params->wasmsimd_rr2_p6.c5[k] = 0x1.12278Ep-7f;
    params->wasmsimd_rr2_p6.c6[k] = 0x1.6B7338p-10f;
  }
}
2884 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2885
// Fills params->scalar_rr2_p5 with fixed single-precision constants; the
// function takes no inputs other than the destination.
void xnn_init_f32_expminus_scalar_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].scalar_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params[0].scalar_rr2_p5.log2e = 0x1.715476p+0f;
  params[0].scalar_rr2_p5.denorm_cutoff = -0x1.5D589Ep6f;

  // The hi/lo pair is stored as two separate fields.
  params[0].scalar_rr2_p5.minus_ln2_hi = -0x1.62E400p-1f;
  params[0].scalar_rr2_p5.minus_ln2_lo = -0x1.7F7D1Cp-20f;

  // Coefficients, lowest index first.
  params[0].scalar_rr2_p5.c1 = 0x1.FFFFF6p-1f;
  params[0].scalar_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params[0].scalar_rr2_p5.c3 = 0x1.555A80p-3f;
  params[0].scalar_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params[0].scalar_rr2_p5.c5 = 0x1.0F9F9Cp-7f;
}
2900
// Fills params->scalar_rr2_lut64_p2 with fixed single-precision constants.
void xnn_init_f32_expminus_scalar_rr2_lut64_p2_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].scalar_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  params[0].scalar_rr2_lut64_p2.log2e = 0x1.715476p0f;
  params[0].scalar_rr2_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
  params[0].scalar_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;

  // The hi/lo pair is stored as two separate fields.
  params[0].scalar_rr2_lut64_p2.minus_ln2_hi = -0x1.630000p-1f;
  params[0].scalar_rr2_lut64_p2.minus_ln2_lo = 0x1.BD0106p-13f;
}
2911
2912 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills params->neon_rr2_p5 with fixed single-precision constants.
void xnn_init_f32_expminus_neon_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].neon_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params[0].neon_rr2_p5.log2e = 0x1.715476p+0f;
  params[0].neon_rr2_p5.denorm_cutoff = -0x1.5D589Ep6f;

  // The hi/lo pair is stored as two separate fields.
  params[0].neon_rr2_p5.minus_ln2_hi = -0x1.62E400p-1f;
  params[0].neon_rr2_p5.minus_ln2_lo = -0x1.7F7D1Cp-20f;

  // Coefficients, lowest index first.
  params[0].neon_rr2_p5.c1 = 0x1.FFFFF6p-1f;
  params[0].neon_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params[0].neon_rr2_p5.c3 = 0x1.555A80p-3f;
  params[0].neon_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params[0].neon_rr2_p5.c5 = 0x1.0F9F9Cp-7f;
}
2927
// Fills params->neon_rr2_lut64_p2 with fixed single-precision constants.
void xnn_init_f32_expminus_neon_rr2_lut64_p2_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].neon_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  params[0].neon_rr2_lut64_p2.log2e = 0x1.715476p+0f;
  params[0].neon_rr2_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
  params[0].neon_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;

  // The hi/lo pair is stored as two separate fields.
  params[0].neon_rr2_lut64_p2.minus_ln2_hi = -0x1.62E400p-1f;
  params[0].neon_rr2_lut64_p2.minus_ln2_lo = -0x1.7F7D1Cp-20f;
}
2938
// Fills params->neonfma_rr1_p5 with fixed single-precision constants.
void xnn_init_f32_expminus_neonfma_rr1_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].neonfma_rr1_p5.magic_bias = 0x1.8000FEp23f;
  params[0].neonfma_rr1_p5.log2e = 0x1.715476p+0f;
  params[0].neonfma_rr1_p5.minus_ln2 = -0x1.62E430p-1f;
  params[0].neonfma_rr1_p5.denorm_cutoff = -0x1.5D589Ep6f;

  // Coefficients, lowest index first.
  params[0].neonfma_rr1_p5.c1 = 0x1.FFFFF6p-1f;
  params[0].neonfma_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params[0].neonfma_rr1_p5.c3 = 0x1.555A80p-3f;
  params[0].neonfma_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params[0].neonfma_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
}
2952
// Fills params->neonfma_rr1_lut64_p2 with fixed single-precision constants.
void xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].neonfma_rr1_lut64_p2.magic_bias = 0x1.800000p17f;
  params[0].neonfma_rr1_lut64_p2.log2e = 0x1.715476p+0f;
  params[0].neonfma_rr1_lut64_p2.minus_ln2 = -0x1.62E430p-1f;
  params[0].neonfma_rr1_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
  params[0].neonfma_rr1_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
}
2962 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
2963
2964 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the first four elements of every array in params->sse2_rr2_p5 with
// fixed single-precision constants.
void xnn_init_f32_expminus_sse2_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k != 4; k++) {
    params->sse2_rr2_p5.magic_bias[k] = 0x1.8000FEp23f;
    params->sse2_rr2_p5.log2e[k] = 0x1.715476p+0f;
    params->sse2_rr2_p5.denorm_cutoff[k] = -0x1.5D589Ep6f;
    params->sse2_rr2_p5.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->sse2_rr2_p5.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
  }

  // Coefficients, lowest index first.
  for (size_t k = 0; k != 4; k++) {
    params->sse2_rr2_p5.c1[k] = 0x1.FFFFF6p-1f;
    params->sse2_rr2_p5.c2[k] = 0x1.FFFDC6p-2f;
    params->sse2_rr2_p5.c3[k] = 0x1.555A80p-3f;
    params->sse2_rr2_p5.c4[k] = 0x1.573A1Ap-5f;
    params->sse2_rr2_p5.c5[k] = 0x1.0F9F9Cp-7f;
  }
}
2981
// Fills params->avx2_rr1_p5: eight copies of each fixed constant, plus a
// 14-entry mask table whose first 7 entries are -1 and last 7 are 0.
void xnn_init_f32_expminus_avx2_rr1_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k != 8; k++) {
    params->avx2_rr1_p5.magic_bias[k] = 0x1.8000FEp23f;
    params->avx2_rr1_p5.log2e[k] = 0x1.715476p+0f;
    params->avx2_rr1_p5.minus_ln2[k] = -0x1.62E430p-1f;
    params->avx2_rr1_p5.denorm_cutoff[k] = -0x1.5D589Ep6f;
    // Coefficients, lowest index first.
    params->avx2_rr1_p5.c1[k] = 0x1.FFFFF6p-1f;
    params->avx2_rr1_p5.c2[k] = 0x1.FFFDC6p-2f;
    params->avx2_rr1_p5.c3[k] = 0x1.555A80p-3f;
    params->avx2_rr1_p5.c4[k] = 0x1.573A1Ap-5f;
    params->avx2_rr1_p5.c5[k] = 0x1.0F9F9Cp-7f;
  }

  // Entries [0, 7) are -1; entries [7, 14) are 0.
  for (size_t k = 0; k != 14; k++) {
    params->avx2_rr1_p5.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
3003
// Fills params->avx512_rr1_p5 with fixed single-precision constants
// (log2e, minus_ln2 and the coefficients c0..c5).
void xnn_init_f32_expminus_avx512_rr1_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].avx512_rr1_p5.log2e = 0x1.715476p+0f;
  params[0].avx512_rr1_p5.minus_ln2 = -0x1.62E430p-1f;

  // Coefficients, lowest index first.
  params[0].avx512_rr1_p5.c0 = 1.0f;
  params[0].avx512_rr1_p5.c1 = 0x1.FFFFF6p-1f;
  params[0].avx512_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params[0].avx512_rr1_p5.c3 = 0x1.555A80p-3f;
  params[0].avx512_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params[0].avx512_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
}
3016 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3017
3018 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the first two elements of every array in params->wasmsimd_rr2_p5 with
// fixed single-precision constants.
void xnn_init_f32_expminus_wasmsimd_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k != 2; k++) {
    params->wasmsimd_rr2_p5.magic_bias[k] = 0x1.8000FEp23f;
    params->wasmsimd_rr2_p5.log2e[k] = 0x1.715476p+0f;
    params->wasmsimd_rr2_p5.denorm_cutoff[k] = -0x1.5D589Ep6f;
    params->wasmsimd_rr2_p5.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->wasmsimd_rr2_p5.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
  }

  // Coefficients, lowest index first.
  for (size_t k = 0; k != 2; k++) {
    params->wasmsimd_rr2_p5.c1[k] = 0x1.FFFFF6p-1f;
    params->wasmsimd_rr2_p5.c2[k] = 0x1.FFFDC6p-2f;
    params->wasmsimd_rr2_p5.c3[k] = 0x1.555A80p-3f;
    params->wasmsimd_rr2_p5.c4[k] = 0x1.573A1Ap-5f;
    params->wasmsimd_rr2_p5.c5[k] = 0x1.0F9F9Cp-7f;
  }
}
3035 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3036
// Stores the caller-supplied slope in params->scalar.slope.
void xnn_init_f32_lrelu_scalar_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  params[0].scalar.slope = slope;
}
3043
3044 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Writes the caller-supplied slope into the first four elements of
// params->sse.slope.
void xnn_init_f32_lrelu_sse_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  size_t k = 0;
  while (k != 4) {
    params[0].sse.slope[k] = slope;
    k++;
  }
}
3053
// Fills params->avx: eight copies of the caller-supplied slope, plus a
// 14-entry mask table whose first 7 entries are -1 and last 7 are 0.
void xnn_init_f32_lrelu_avx_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  for (size_t k = 0; k != 8; k++) {
    params[0].avx.slope[k] = slope;
  }

  // Entries [0, 7) are -1; entries [7, 14) are 0.
  for (size_t k = 0; k != 14; k++) {
    params[0].avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
3068 #endif
3069
3070 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes the caller-supplied slope into elements 0 and 1 of
// params->wasmsimd.slope.
void xnn_init_f32_lrelu_wasmsimd_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  for (size_t k = 0; k != 2; k++) {
    params[0].wasmsimd.slope[k] = slope;
  }
}
3078 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3079
3080 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the 14-entry params->avx.mask_table: the first 7 entries are -1 and
// the last 7 are 0.
void xnn_init_f32_sqrt_avx_params(
  union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k != 14; k++) {
    params[0].avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
3091
// Fills params->fma: eight copies of 0.5f in `half`, plus a 14-entry mask
// table whose first 7 entries are -1 and last 7 are 0.
void xnn_init_f32_sqrt_fma_params(
  union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k != 8; k++) {
    params[0].fma.half[k] = 0.5f;
  }

  // Entries [0, 7) are -1; entries [7, 14) are 0.
  for (size_t k = 0; k != 14; k++) {
    params[0].fma.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
3105
// Stores the constant 0.5f in params->avx512.half.
void xnn_init_f32_sqrt_avx512_params(
  union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].avx512.half = 0.5f;
}
3111 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3112
// Fills the architecture-specific member of `params` (sse on x86, neon on
// ARM, scalar otherwise) with the output clamp bounds and three 4-entry masks
// derived from `width`.
//
// Each mask entry is either all-ones (0xFFFFFFFF) or zero:
//   mask[k]      is set when ((width - 1) & 3) >= k
//   mask_even[k] is set when ((width - 1) & 7) >= 2 * k
//   mask_odd[k]  is set when ((width - 1) & 7) >= 2 * k + 1
// so mask[0] and mask_even[0] are always all-ones.
void xnn_init_f32_chw_params(
  union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
  uint32_t width,
  float output_min,
  float output_max)
{
  const uint32_t last_column = width - 1;
  const uint32_t rem4 = last_column & 3;
  const uint32_t rem8 = last_column & 7;

  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (size_t k = 0; k != 4; k++) {
      params->sse.min[k] = output_min;
      params->sse.max[k] = output_max;
    }

    params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
    params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k != 4; k++) {
      // Negating an unsigned 0/1 yields 0 or all-ones.
      params->sse.mask[k] = -(uint32_t) (rem4 >= k);
      params->sse.mask_even[k] = -(uint32_t) (rem8 >= 2 * k);
    }
    for (uint32_t k = 0; k != 4; k++) {
      params->sse.mask_odd[k] = -(uint32_t) (rem8 >= 2 * k + 1);
    }
  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
    params->neon.min = output_min;
    params->neon.max = output_max;

    params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
    params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k != 4; k++) {
      // Negating an unsigned 0/1 yields 0 or all-ones.
      params->neon.mask[k] = -(uint32_t) (rem4 >= k);
      params->neon.mask_even[k] = -(uint32_t) (rem8 >= 2 * k);
    }
    for (uint32_t k = 0; k != 4; k++) {
      params->neon.mask_odd[k] = -(uint32_t) (rem8 >= 2 * k + 1);
    }
  #else
    params->scalar.min = output_min;
    params->scalar.max = output_max;

    params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
    params->scalar.mask_even[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k != 4; k++) {
      // Negating an unsigned 0/1 yields 0 or all-ones.
      params->scalar.mask[k] = -(uint32_t) (rem4 >= k);
      params->scalar.mask_even[k] = -(uint32_t) (rem8 >= 2 * k);
    }
    for (uint32_t k = 0; k != 4; k++) {
      params->scalar.mask_odd[k] = -(uint32_t) (rem8 >= 2 * k + 1);
    }
  #endif
}
3180
xnn_update_f32_chw_params(union xnn_f32_chw_params * params,uint32_t width)3181 void xnn_update_f32_chw_params(
3182 union xnn_f32_chw_params* params,
3183 uint32_t width)
3184 {
3185 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
3186 const uint32_t w4 = (width - 1) & 3;
3187 params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
3188 params->sse.mask[1] = -(uint32_t) (w4 >= 1);
3189 params->sse.mask[2] = -(uint32_t) (w4 >= 2);
3190 params->sse.mask[3] = -(uint32_t) (w4 >= 3);
3191
3192 const uint32_t w8 = (width - 1) & 7;
3193 params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
3194 params->sse.mask_even[1] = -(uint32_t) (w8 >= 2);
3195 params->sse.mask_even[2] = -(uint32_t) (w8 >= 4);
3196 params->sse.mask_even[3] = -(uint32_t) (w8 >= 6);
3197 params->sse.mask_odd[0] = -(uint32_t) (w8 >= 1);
3198 params->sse.mask_odd[1] = -(uint32_t) (w8 >= 3);
3199 params->sse.mask_odd[2] = -(uint32_t) (w8 >= 5);
3200 params->sse.mask_odd[3] = -(uint32_t) (w8 >= 7);
3201 #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
3202 const uint32_t w4 = (width - 1) & 3;
3203 params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
3204 params->neon.mask[1] = -(uint32_t) (w4 >= 1);
3205 params->neon.mask[2] = -(uint32_t) (w4 >= 2);
3206 params->neon.mask[3] = -(uint32_t) (w4 >= 3);
3207
3208 const uint32_t w8 = (width - 1) & 7;
3209 params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
3210 params->neon.mask_even[1] = -(uint32_t) (w8 >= 2);
3211 params->neon.mask_even[2] = -(uint32_t) (w8 >= 4);
3212 params->neon.mask_even[3] = -(uint32_t) (w8 >= 6);
3213 params->neon.mask_odd[0] = -(uint32_t) (w8 >= 1);
3214 params->neon.mask_odd[1] = -(uint32_t) (w8 >= 3);
3215 params->neon.mask_odd[2] = -(uint32_t) (w8 >= 5);
3216 params->neon.mask_odd[3] = -(uint32_t) (w8 >= 7);
3217 #else
3218 const uint32_t w4 = (width - 1) & 3;
3219 params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
3220 params->scalar.mask[1] = -(uint32_t) (w4 >= 1);
3221 params->scalar.mask[2] = -(uint32_t) (w4 >= 2);
3222 params->scalar.mask[3] = -(uint32_t) (w4 >= 3);
3223
3224 const uint32_t w8 = (width - 1) & 7;
3225 params->scalar.mask_even[0] = UINT32_C(0xFFFFFFFF);
3226 params->scalar.mask_even[1] = -(uint32_t) (w8 >= 2);
3227 params->scalar.mask_even[2] = -(uint32_t) (w8 >= 4);
3228 params->scalar.mask_even[3] = -(uint32_t) (w8 >= 6);
3229 params->scalar.mask_odd[0] = -(uint32_t) (w8 >= 1);
3230 params->scalar.mask_odd[1] = -(uint32_t) (w8 >= 3);
3231 params->scalar.mask_odd[2] = -(uint32_t) (w8 >= 5);
3232 params->scalar.mask_odd[3] = -(uint32_t) (w8 >= 7);
3233 #endif
3234 }
3235
// Fills params->scalar with the output clamp bounds and three 4-entry masks
// derived from `width`, regardless of target architecture.
//
// Each mask entry is either all-ones (0xFFFFFFFF) or zero:
//   mask[k]      is set when ((width - 1) & 3) >= k
//   mask_even[k] is set when ((width - 1) & 7) >= 2 * k
//   mask_odd[k]  is set when ((width - 1) & 7) >= 2 * k + 1
// so mask[0] and mask_even[0] are always all-ones.
void xnn_init_scalar_f32_chw_params(
  union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
  uint32_t width,
  float output_min,
  float output_max)
{
  const uint32_t last_column = width - 1;
  const uint32_t rem4 = last_column & 3;
  const uint32_t rem8 = last_column & 7;

  params->scalar.min = output_min;
  params->scalar.max = output_max;

  params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
  params->scalar.mask_even[0] = UINT32_C(0xFFFFFFFF);
  for (uint32_t k = 1; k != 4; k++) {
    // Negating an unsigned 0/1 yields 0 or all-ones.
    params->scalar.mask[k] = -(uint32_t) (rem4 >= k);
    params->scalar.mask_even[k] = -(uint32_t) (rem8 >= 2 * k);
  }
  for (uint32_t k = 0; k != 4; k++) {
    params->scalar.mask_odd[k] = -(uint32_t) (rem8 >= 2 * k + 1);
  }
}
3261
3262 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills params->sse2 with 16 copies each of the byte 0x80 (`bias`) and of the
// two bounds with their top bit flipped (`min_with_bias`, `max_with_bias`).
// Asserts that output_min < output_max.
void xnn_init_s8_minmax_sse2_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  // XOR with 0x80 flips the top bit of each bound's byte representation.
  const uint8_t biased_min = (uint8_t) ((uint8_t) output_min ^ UINT8_C(0x80));
  const uint8_t biased_max = (uint8_t) ((uint8_t) output_max ^ UINT8_C(0x80));

  for (size_t k = 0; k != 16; k++) {
    params[0].sse2.bias[k] = UINT8_C(0x80);
    params[0].sse2.min_with_bias[k] = biased_min;
    params[0].sse2.max_with_bias[k] = biased_max;
  }
}
3278
// Writes 16 copies of each bound into params->sse4.min / params->sse4.max.
// Asserts that output_min < output_max.
void xnn_init_s8_minmax_sse4_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  for (size_t k = 0; k != 16; k++) {
    params[0].sse4.min[k] = output_min;
    params[0].sse4.max[k] = output_max;
  }
}
3291 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3292
3293 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the two bounds in params->neon.min / params->neon.max.
// Asserts that output_min < output_max.
void xnn_init_s8_minmax_neon_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  params[0].neon.max = output_max;
  params[0].neon.min = output_min;
}
3304 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3305
3306 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes 8 copies of each bound into params->wasmsimd.min / .max.
// Asserts that output_min < output_max.
void xnn_init_s8_minmax_wasmsimd_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  for (size_t k = 0; k != 8; k++) {
    params[0].wasmsimd.min[k] = output_min;
    params[0].wasmsimd.max[k] = output_max;
  }
}
3319 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3320
// Stores the two bounds, converted to int32_t, in params->scalar.min / .max.
// Asserts that output_min < output_max.
void xnn_init_s8_minmax_scalar_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  const int32_t lower = (int32_t) output_min;
  const int32_t upper = (int32_t) output_max;
  params[0].scalar.min = lower;
  params[0].scalar.max = upper;
}
3331
// Stores the two bounds in the architecture-specific member of `params`:
// 16 copies each in sse2 on x86, single values in neon on ARM, and
// uint32_t-converted single values in scalar otherwise.
// Asserts that output_min < output_max.
void xnn_init_u8_minmax_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (size_t k = 0; k != 16; k++) {
      params[0].sse2.min[k] = output_min;
      params[0].sse2.max[k] = output_max;
    }
  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
    params[0].neon.min = output_min;
    params[0].neon.max = output_max;
  #else
    params[0].scalar.min = (uint32_t) output_min;
    params[0].scalar.max = (uint32_t) output_max;
  #endif
}
3352
3353 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Writes 16 copies of each bound into params->sse2.min / params->sse2.max.
// Asserts that output_min < output_max.
void xnn_init_u8_minmax_sse2_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  for (size_t k = 0; k != 16; k++) {
    params[0].sse2.min[k] = output_min;
    params[0].sse2.max[k] = output_max;
  }
}
3366 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3367
3368 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes 8 copies of each bound into params->wasmsimd.min / .max.
// Asserts that output_min < output_max.
void xnn_init_u8_minmax_wasmsimd_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  for (size_t k = 0; k != 8; k++) {
    params[0].wasmsimd.min[k] = output_min;
    params[0].wasmsimd.max[k] = output_max;
  }
}
3381 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3382
3383 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the two bounds in params->neon.min / params->neon.max.
// Asserts that output_min < output_max.
void xnn_init_u8_minmax_neon_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  params[0].neon.max = output_max;
  params[0].neon.min = output_min;
}
3394 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3395
// Stores the two bounds, converted to uint32_t, in params->scalar.min / .max.
// Asserts that output_min < output_max.
void xnn_init_u8_minmax_scalar_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  const uint32_t lower = (uint32_t) output_min;
  const uint32_t upper = (uint32_t) output_max;
  params[0].scalar.min = lower;
  params[0].scalar.max = upper;
}
3406
3407 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Converts the two floating-point output scales into signed fixed-point
// multipliers sharing one right-shift amount, and stores them in params->sse2
// together with a precomputed bias, the output zero point and the output
// clamp bounds.
//
// Preconditions (checked with assert): |a_output_scale| and |b_output_scale|
// both lie in [2**-10, 2**8).
//
// NOTE(review): fp32_to_bits / fp32_from_bits are assumed to convert between a
// float and its IEEE-754 binary32 bit pattern - confirm against <fp16.h>.
void xnn_init_qu8_add_minmax_sse2_params(
  union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // Requantization is driven by the larger of the two scale magnitudes.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Bits 23 and up of the encoding, minus 127.
  const int32_t largest_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // 20 multiplier bits; the shift ends up in the [12, 30] range.
  const uint32_t shift = (uint32_t) (20 - largest_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` at bit 23 of each scale's encoding, then rounding to an
  // integer. Multipliers are in [0, 2**21) range, and the larger one is in
  // [2**20, 2**21) range.
  const uint32_t exponent_bump = shift << 23;
  const int32_t abs_a_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump));
  const int32_t abs_b_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump));
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    a_multiplier = -abs_a_multiplier;
  }
  int32_t b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift-1) minus each multiplier times its input zero point.
  const int32_t half = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t folded_bias = half - a_offset - b_offset;
  for (size_t k = 0; k != 4; k++) {
    params->sse2.bias[k] = folded_bias;
  }

  // Each multiplier is also stored split into its low and high 16-bit halves.
  const uint16_t a_lo = (uint16_t) a_multiplier;
  const uint16_t a_hi = (uint16_t) ((uint32_t) a_multiplier >> 16);
  const uint16_t b_lo = (uint16_t) b_multiplier;
  const uint16_t b_hi = (uint16_t) ((uint32_t) b_multiplier >> 16);
  const int16_t zero_point = (int16_t) (uint16_t) output_zero_point;
  for (size_t k = 0; k != 8; k++) {
    params->sse2.a_multiplier_lo[k] = a_lo;
    params->sse2.a_multiplier_hi[k] = a_hi;
    params->sse2.b_multiplier_lo[k] = b_lo;
    params->sse2.b_multiplier_hi[k] = b_hi;
    params->sse2.output_zero_point[k] = zero_point;
  }

  params->sse2.shift = shift;
  params->sse2.b_multiplier = (uint32_t) b_multiplier;

  for (size_t k = 0; k != 16; k++) {
    params->sse2.output_min[k] = output_min;
    params->sse2.output_max[k] = output_max;
  }
}
3472
// Converts the two floating-point output scales into signed fixed-point
// multipliers sharing one right-shift amount, and stores them in params->sse4
// together with a precomputed bias, the output zero point and the output
// clamp bounds.
//
// Preconditions (checked with assert): |a_output_scale| and |b_output_scale|
// both lie in [2**-10, 2**8).
//
// NOTE(review): fp32_to_bits / fp32_from_bits are assumed to convert between a
// float and its IEEE-754 binary32 bit pattern - confirm against <fp16.h>.
void xnn_init_qu8_add_minmax_sse4_params(
  union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // Requantization is driven by the larger of the two scale magnitudes.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Bits 23 and up of the encoding, minus 127.
  const int32_t largest_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // 20 multiplier bits; the shift ends up in the [12, 30] range.
  const uint32_t shift = (uint32_t) (20 - largest_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` at bit 23 of each scale's encoding, then rounding to an
  // integer. Multipliers are in [0, 2**21) range, and the larger one is in
  // [2**20, 2**21) range.
  const uint32_t exponent_bump = shift << 23;
  const int32_t abs_a_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump));
  const int32_t abs_b_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump));
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    a_multiplier = -abs_a_multiplier;
  }
  int32_t b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift-1) minus each multiplier times its input zero point.
  const int32_t half = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t folded_bias = half - a_offset - b_offset;
  for (size_t k = 0; k != 4; k++) {
    params->sse4.bias[k] = folded_bias;
    params->sse4.a_multiplier[k] = a_multiplier;
    params->sse4.b_multiplier[k] = b_multiplier;
    params->sse4.shift[k] = shift;
  }

  const int16_t zero_point = (int16_t) (uint16_t) output_zero_point;
  for (size_t k = 0; k != 8; k++) {
    params->sse4.output_zero_point[k] = zero_point;
  }

  for (size_t k = 0; k != 16; k++) {
    params->sse4.output_min[k] = output_min;
    params->sse4.output_max[k] = output_max;
  }
}
3528
// Populates params->avx2 for the QU8 add/sub min-max parameter union.
//
// The two scales are turned into signed integer multipliers that share a
// single shift; the shift is picked from the exponent of the larger magnitude
// so that the larger multiplier is asserted to land in [2**20, 2**21]. A
// rounding constant and both zero-point products are folded into one bias.
//
// a_output_scale and b_output_scale must each have a magnitude in
// [2**-10, 2**8) (checked with assert). bias, multipliers and shift are
// replicated 8 times; output zero point, min and max 16 times.
void xnn_init_qu8_add_minmax_avx2_params(
    union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    uint8_t a_zero_point,
    uint8_t b_zero_point,
    uint8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    uint8_t output_min,
    uint8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->avx2.bias[lane] = bias;
    params->avx2.a_multiplier[lane] = a_multiplier;
    params->avx2.b_multiplier[lane] = b_multiplier;
    params->avx2.shift[lane] = shift;
  }

  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->avx2.output_zero_point[lane] = zero_point_s16;
    params->avx2.output_min[lane] = output_min;
    params->avx2.output_max[lane] = output_max;
  }
}
3582
// Populates params->avx512 for the QU8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. The rounding constant and
// the zero-point products are combined into a single bias value.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert). bias, multipliers and shift are
// written 16 times; output zero point, min and max 32 times.
void xnn_init_qu8_add_minmax_avx512_params(
    union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    uint8_t a_zero_point,
    uint8_t b_zero_point,
    uint8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    uint8_t output_min,
    uint8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  for (uint32_t lane = 0; lane < 16; lane++) {
    params->avx512.bias[lane] = bias;
    params->avx512.a_multiplier[lane] = a_multiplier;
    params->avx512.b_multiplier[lane] = b_multiplier;
    params->avx512.shift[lane] = shift;
  }

  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  for (uint32_t lane = 0; lane < 32; lane++) {
    params->avx512.output_zero_point[lane] = zero_point_s16;
    params->avx512.output_min[lane] = output_min;
    params->avx512.output_max[lane] = output_max;
  }
}
3636 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
3637
3638 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates params->neon for the QU8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. Unlike the variants that
// precompute a bias, this one stores the input zero points as-is and stores
// the shift negated in `right_shift`.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert).
void xnn_init_qu8_add_minmax_neon_params(
    union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    uint8_t a_zero_point,
    uint8_t b_zero_point,
    uint8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    uint8_t output_min,
    uint8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // Input side.
  params->neon.a_zero_point = a_zero_point;
  params->neon.b_zero_point = b_zero_point;
  params->neon.a_multiplier = a_multiplier;
  params->neon.b_multiplier = b_multiplier;
  params->neon.right_shift = (int32_t) -shift;

  // Output side.
  params->neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
}
3687 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
3688
3689 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates params->wasmsimd for the QU8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. The rounding constant and
// the zero-point products are combined into a single bias value.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert). bias and multipliers are written
// twice, the shift once, the output zero point 4 times, and min/max 8 times.
void xnn_init_qu8_add_minmax_wasmsimd_params(
    union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    uint8_t a_zero_point,
    uint8_t b_zero_point,
    uint8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    uint8_t output_min,
    uint8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  for (uint32_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.bias[lane] = bias;
    params->wasmsimd.a_multiplier[lane] = a_multiplier;
    params->wasmsimd.b_multiplier[lane] = b_multiplier;
  }
  // The shift is a single value here, not a replicated array.
  params->wasmsimd.shift = shift;

  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->wasmsimd.output_zero_point[lane] = zero_point_s16;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->wasmsimd.output_min[lane] = output_min;
    params->wasmsimd.output_max[lane] = output_max;
  }
}
3745 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3746
// Populates params->scalar for the QU8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. The rounding constant and
// the zero-point products are combined into a single bias value. The clamping
// bounds are stored relative to the output zero point, which is also stored
// on its own.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert).
void xnn_init_qu8_add_minmax_scalar_params(
    union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    uint8_t a_zero_point,
    uint8_t b_zero_point,
    uint8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    uint8_t output_min,
    uint8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) (uint32_t) b_zero_point;
  params->scalar.bias = rounding - a_offset - b_offset;
  params->scalar.a_multiplier = a_multiplier;
  params->scalar.b_multiplier = b_multiplier;
  params->scalar.shift = shift;

  // Widen the 8-bit output quantities once, then store the bounds relative to
  // the zero point.
  const int32_t zero_point_s32 = (int32_t) (uint32_t) output_zero_point;
  const int32_t min_s32 = (int32_t) (uint32_t) output_min;
  const int32_t max_s32 = (int32_t) (uint32_t) output_max;
  params->scalar.output_min_less_zero_point = min_s32 - zero_point_s32;
  params->scalar.output_max_less_zero_point = max_s32 - zero_point_s32;
  params->scalar.output_zero_point = zero_point_s32;
}
3795
3796 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates params->sse2 for the QS8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. The rounding constant and
// the zero-point products are combined into a single bias value. Each
// multiplier is stored as separate low and high 16-bit halves, and the b
// multiplier is additionally stored whole.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert). The bias is written 4 times; the
// multiplier halves and the 16-bit output zero point, min and max 8 times.
void xnn_init_qs8_add_minmax_sse2_params(
    union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int8_t a_zero_point,
    int8_t b_zero_point,
    int8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    int8_t output_min,
    int8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->sse2.bias[lane] = bias;
  }

  // Split each 32-bit multiplier into its low and high 16-bit halves.
  const uint32_t a_multiplier_bits = (uint32_t) a_multiplier;
  const uint32_t b_multiplier_bits = (uint32_t) b_multiplier;
  const uint16_t a_low_half = (uint16_t) a_multiplier_bits;
  const uint16_t a_high_half = (uint16_t) (a_multiplier_bits >> 16);
  const uint16_t b_low_half = (uint16_t) b_multiplier_bits;
  const uint16_t b_high_half = (uint16_t) (b_multiplier_bits >> 16);
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->sse2.a_multiplier_lo[lane] = a_low_half;
    params->sse2.a_multiplier_hi[lane] = a_high_half;
    params->sse2.b_multiplier_lo[lane] = b_low_half;
    params->sse2.b_multiplier_hi[lane] = b_high_half;
  }
  params->sse2.shift = shift;
  params->sse2.b_multiplier = b_multiplier_bits;

  // Output zero point and clamping bounds are widened to 16 bits.
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const int16_t min_s16 = (int16_t) output_min;
  const int16_t max_s16 = (int16_t) output_max;
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->sse2.output_zero_point[lane] = zero_point_s16;
    params->sse2.output_min[lane] = min_s16;
    params->sse2.output_max[lane] = max_s16;
  }
}
3859
// Populates params->sse4_mul16 for the QS8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. The rounding constant and
// the zero-point products are combined into a single bias value. Each
// multiplier is stored as separate low and high 16-bit halves, and the b
// multiplier is additionally stored whole.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert). The bias is written 4 times, the
// multiplier halves and the output zero point 8 times, and min/max 16 times.
void xnn_init_qs8_add_minmax_sse4_mul16_params(
    union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int8_t a_zero_point,
    int8_t b_zero_point,
    int8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    int8_t output_min,
    int8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->sse4_mul16.bias[lane] = bias;
  }

  // Split each 32-bit multiplier into its low and high 16-bit halves.
  const uint32_t a_multiplier_bits = (uint32_t) a_multiplier;
  const uint32_t b_multiplier_bits = (uint32_t) b_multiplier;
  const uint16_t a_low_half = (uint16_t) a_multiplier_bits;
  const uint16_t a_high_half = (uint16_t) (a_multiplier_bits >> 16);
  const uint16_t b_low_half = (uint16_t) b_multiplier_bits;
  const uint16_t b_high_half = (uint16_t) (b_multiplier_bits >> 16);
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->sse4_mul16.a_multiplier_lo[lane] = a_low_half;
    params->sse4_mul16.a_multiplier_hi[lane] = a_high_half;
    params->sse4_mul16.b_multiplier_lo[lane] = b_low_half;
    params->sse4_mul16.b_multiplier_hi[lane] = b_high_half;
  }
  params->sse4_mul16.shift = shift;
  params->sse4_mul16.b_multiplier = b_multiplier_bits;

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->sse4_mul16.output_zero_point[lane] = zero_point_s16;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->sse4_mul16.output_min[lane] = output_min;
    params->sse4_mul16.output_max[lane] = output_max;
  }
}
3924
// Populates params->sse4_mul32 for the QS8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. The rounding constant and
// the zero-point products are combined into a single bias value.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert). bias, multipliers and shift are
// written 4 times, the output zero point 8 times, and min/max 16 times.
void xnn_init_qs8_add_minmax_sse4_mul32_params(
    union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int8_t a_zero_point,
    int8_t b_zero_point,
    int8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    int8_t output_min,
    int8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  for (uint32_t lane = 0; lane < 4; lane++) {
    params->sse4_mul32.bias[lane] = bias;
    params->sse4_mul32.a_multiplier[lane] = a_multiplier;
    params->sse4_mul32.b_multiplier[lane] = b_multiplier;
    params->sse4_mul32.shift[lane] = shift;
  }

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->sse4_mul32.output_zero_point[lane] = zero_point_s16;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->sse4_mul32.output_min[lane] = output_min;
    params->sse4_mul32.output_max[lane] = output_max;
  }
}
3980
// Populates params->avx2 for the QS8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. The rounding constant and
// the zero-point products are combined into a single bias value.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert). bias, multipliers and shift are
// written 8 times; output zero point, min and max 16 times.
void xnn_init_qs8_add_minmax_avx2_params(
    union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int8_t a_zero_point,
    int8_t b_zero_point,
    int8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    int8_t output_min,
    int8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->avx2.bias[lane] = bias;
    params->avx2.a_multiplier[lane] = a_multiplier;
    params->avx2.b_multiplier[lane] = b_multiplier;
    params->avx2.shift[lane] = shift;
  }

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->avx2.output_zero_point[lane] = zero_point_s16;
    params->avx2.output_min[lane] = output_min;
    params->avx2.output_max[lane] = output_max;
  }
}
4034
// Populates params->avx512 for the QS8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. The rounding constant and
// the zero-point products are combined into a single bias value.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert). bias, multipliers and shift are
// written 16 times; output zero point, min and max 32 times.
void xnn_init_qs8_add_minmax_avx512_params(
    union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int8_t a_zero_point,
    int8_t b_zero_point,
    int8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    int8_t output_min,
    int8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  for (uint32_t lane = 0; lane < 16; lane++) {
    params->avx512.bias[lane] = bias;
    params->avx512.a_multiplier[lane] = a_multiplier;
    params->avx512.b_multiplier[lane] = b_multiplier;
    params->avx512.shift[lane] = shift;
  }

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (uint32_t lane = 0; lane < 32; lane++) {
    params->avx512.output_zero_point[lane] = zero_point_s16;
    params->avx512.output_min[lane] = output_min;
    params->avx512.output_max[lane] = output_max;
  }
}
4088 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4089
4090 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates params->neon for the QS8 add/sub min-max parameter union.
//
// Each scale becomes a signed integer multiplier; both share one shift that
// is derived from the exponent of the larger magnitude, so the larger
// multiplier is asserted to fall in [2**20, 2**21]. Unlike the variants that
// precompute a bias, this one stores the input zero points as-is and stores
// the shift negated in `right_shift`.
//
// The magnitudes of a_output_scale and b_output_scale must lie in
// [2**-10, 2**8) (checked with assert).
void xnn_init_qs8_add_minmax_neon_params(
    union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
    int8_t a_zero_point,
    int8_t b_zero_point,
    int8_t output_zero_point,
    float a_output_scale,
    float b_output_scale,
    int8_t output_min,
    int8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger of the two magnitudes determines the shared shift.
  const float dominant_scale = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(dominant_scale >= 0x1.0p-10f);
  assert(dominant_scale < 0x1.0p+8f);
  const uint32_t dominant_bits = fp32_to_bits(dominant_scale);
  const int32_t dominant_exponent = (int32_t) (dominant_bits >> 23) - 127;

  // 20 multiplier bits; the shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bumping the exponent field by `shift` scales each magnitude by 2**shift
  // before rounding (assumes fp32_to_bits/fp32_from_bits reinterpret IEEE-754
  // single-precision bits).
  const uint32_t exponent_bump = shift << 23;
  const float a_scaled = fp32_from_bits(fp32_to_bits(a_scale_magnitude) + exponent_bump);
  const float b_scaled = fp32_from_bits(fp32_to_bits(b_scale_magnitude) + exponent_bump);
  const int32_t a_magnitude = (int32_t) lrintf(a_scaled);
  const int32_t b_magnitude = (int32_t) lrintf(b_scaled);
  assert(math_max_s32(a_magnitude, b_magnitude) >= INT32_C(0x00100000));
  assert(a_magnitude <= INT32_C(0x00200000));
  assert(b_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_magnitude;
  }
  int32_t b_multiplier = b_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_magnitude;
  }

  // Input side.
  params->neon.a_zero_point = a_zero_point;
  params->neon.b_zero_point = b_zero_point;
  params->neon.a_multiplier = a_multiplier;
  params->neon.b_multiplier = b_multiplier;
  params->neon.right_shift = (int32_t) -shift;

  // Output side.
  params->neon.output_zero_point = (int16_t) output_zero_point;
  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
}
4139 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4140
4141 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates the `wasmsimd` member of the QS8 add/sub min-max parameters.
//
// Both output scales are converted into signed fixed-point multipliers that
// share one right shift, selected from the larger of the two scale magnitudes.
// The rounding term and the zero-point contributions of both inputs are folded
// into a single bias value. Each scale magnitude must lie in [2**-10, 2**8);
// this is enforced through assert() only.
void xnn_init_qs8_add_minmax_wasmsimd_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The dominant (larger-magnitude) scale selects the shared shift.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // The biased exponent sits above the 23 mantissa bits; subtracting 127 unbiases it.
  const int32_t dominant_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // 20 is the number of multiplier bits; the asserts pin the shift to [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales a value by 2**shift; the scaled
  // value is then rounded to an integer multiplier.
  const uint32_t exponent_bump = shift << 23;
  const int32_t abs_a_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump));
  const int32_t abs_b_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump));
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // Give each multiplier the sign of the scale it was derived from.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point,
  // where rounding is half of 2**shift.
  int32_t folded_bias = INT32_C(1) << (shift - 1);
  folded_bias -= signed_a_multiplier * (int32_t) a_zero_point;
  folded_bias -= signed_b_multiplier * (int32_t) b_zero_point;

  // Replicate values: 2 entries for bias/multipliers, 4 for the output zero
  // point, 8 for the clamping bounds.
  for (size_t lane = 0; lane < 8; lane++) {
    if (lane < 2) {
      params->wasmsimd.bias[lane] = folded_bias;
      params->wasmsimd.a_multiplier[lane] = signed_a_multiplier;
      params->wasmsimd.b_multiplier[lane] = signed_b_multiplier;
    }
    if (lane < 4) {
      params->wasmsimd.output_zero_point[lane] = (int16_t) output_zero_point;
    }
    params->wasmsimd.output_min[lane] = output_min;
    params->wasmsimd.output_max[lane] = output_max;
  }
  params->wasmsimd.shift = shift;
}
4197 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4198
// Populates the `scalar` member of the QS8 add/sub min-max parameters.
//
// The two output scales become signed fixed-point multipliers with a common
// right shift chosen from the larger scale magnitude. The rounding term and
// both inputs' zero-point contributions are combined into `bias`, and the
// clamping bounds are stored relative to the output zero point. Each scale
// magnitude must lie in [2**-10, 2**8); this is checked via assert() only.
void xnn_init_qs8_add_minmax_scalar_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // Requantization is anchored on the larger of the two scale magnitudes.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Unbiased binary exponent of that scale (exponent field minus 127).
  const uint32_t anchor_bits = fp32_to_bits(max_abs_output_scale);
  const int32_t anchor_exponent = (int32_t) (anchor_bits >> 23) - 127;

  // 20 is the number of multiplier bits; the asserts pin the shift to [12, 30].
  const uint32_t shift = (uint32_t) (20 - anchor_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Scale each magnitude by 2**shift through its exponent field, then round to integer.
  const uint32_t a_scaled_bits = fp32_to_bits(abs_a_output_scale) + (shift << 23);
  const uint32_t b_scaled_bits = fp32_to_bits(abs_b_output_scale) + (shift << 23);
  const int32_t abs_a_multiplier = (int32_t) lrintf(fp32_from_bits(a_scaled_bits));
  const int32_t abs_b_multiplier = (int32_t) lrintf(fp32_from_bits(b_scaled_bits));
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // Multipliers inherit the sign of their source scale.
  int32_t a_signed = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    a_signed = -abs_a_multiplier;
  }
  int32_t b_signed = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    b_signed = -abs_b_multiplier;
  }

  // Start from the rounding term (half of 2**shift) and subtract both
  // zero-point contributions.
  int32_t bias = INT32_C(1) << (shift - 1);
  bias -= a_signed * (int32_t) a_zero_point;
  bias -= b_signed * (int32_t) b_zero_point;

  const int32_t zero_point = (int32_t) output_zero_point;

  params->scalar.bias = bias;
  params->scalar.a_multiplier = a_signed;
  params->scalar.b_multiplier = b_signed;
  params->scalar.shift = shift;
  params->scalar.output_min_less_zero_point = (int32_t) output_min - zero_point;
  params->scalar.output_max_less_zero_point = (int32_t) output_max - zero_point;
  params->scalar.output_zero_point = zero_point;
}
4247
// Populates the `fp32_scalar` member of the QU8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// The clamping bounds are stored as floats relative to the output zero point.
void xnn_init_qu8_mul_minmax_fp32_scalar_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // Differences are taken in unsigned arithmetic and then viewed as signed.
  const uint32_t zero_point = (uint32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) ((uint32_t) output_min - zero_point);
  const int32_t max_less_zero_point = (int32_t) ((uint32_t) output_max - zero_point);

  params->fp32_scalar.scale = product_output_scale;
  params->fp32_scalar.a_zero_point = (int16_t) (uint16_t) a_zero_point;
  params->fp32_scalar.b_zero_point = (int16_t) (uint16_t) b_zero_point;
  params->fp32_scalar.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar.output_max_less_zero_point = (float) max_less_zero_point;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_scalar.magic_bias = 12582912.0f;
  params->fp32_scalar.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) zero_point;
}
4268
4269 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates the `fp32_neon` member of the QU8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// Each input zero point is stored twice; the output bounds are stored as given.
void xnn_init_qu8_mul_minmax_fp32_neon_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  for (size_t k = 0; k < 2; k++) {
    params->fp32_neon.a_zero_point[k] = a_zero_point;
    params->fp32_neon.b_zero_point[k] = b_zero_point;
  }
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.scale = product_output_scale;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
}
4292
// Populates the `fp32_neonv8` member of the QU8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// Each input zero point is stored twice; the output zero point is stored as a
// 16-bit value and the bounds are stored as given.
void xnn_init_qu8_mul_minmax_fp32_neonv8_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  for (size_t k = 0; k < 2; k++) {
    params->fp32_neonv8.a_zero_point[k] = a_zero_point;
    params->fp32_neonv8.b_zero_point[k] = b_zero_point;
  }
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
  params->fp32_neonv8.scale = product_output_scale;
}
4314
// Populates the `rndnu_neon` member of the QU8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// The scale is decomposed into a 31-bit fixed-point multiplier plus a shift,
// and the shift is further split into a pre-shift and a post-shift. Both
// shifts are stored negated, in the `left_pre_shift` / `left_post_shift`
// fields.
void xnn_init_qu8_mul_minmax_rndnu_neon_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(product_output_scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range: the 23 mantissa bits with
  // the implicit leading one restored, moved up by 7 bits.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 15] range. The biased exponent is converted to a signed
  // type before the subtraction so that negative shifts are computed in signed
  // arithmetic rather than through unsigned wraparound followed by an
  // implementation-defined conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 16);

  // Split shift into pre_shift + post_shift, post_shift in [1, 15] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.a_zero_point[0] = a_zero_point;
  params->rndnu_neon.a_zero_point[1] = a_zero_point;
  params->rndnu_neon.b_zero_point[0] = b_zero_point;
  params->rndnu_neon.b_zero_point[1] = b_zero_point;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
4355 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4356
4357 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates the `fp32_sse2` member of the QU8 multiplication min-max
// parameters by replicating each value across its array.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
void xnn_init_qu8_mul_minmax_fp32_sse2_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // 16 entries: clamping bounds.
  for (size_t lane = 0; lane < 16; lane++) {
    params->fp32_sse2.output_min[lane] = output_min;
    params->fp32_sse2.output_max[lane] = output_max;
  }
  // 8 entries: all three zero points, widened to 16 bits.
  for (size_t lane = 0; lane < 8; lane++) {
    params->fp32_sse2.a_zero_point[lane] = (int16_t) (uint16_t) a_zero_point;
    params->fp32_sse2.b_zero_point[lane] = (int16_t) (uint16_t) b_zero_point;
    params->fp32_sse2.output_zero_point[lane] = (int16_t) (uint16_t) output_zero_point;
  }
  // 4 entries: requantization scale.
  for (size_t lane = 0; lane < 4; lane++) {
    params->fp32_sse2.scale[lane] = product_output_scale;
  }
}
4385 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4386
4387 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates the `fp32_wasmsimd` member of the QU8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// The lower bound is stored as `magic_min`: the bit pattern of the magic bias
// plus (output_min - output_zero_point). The upper bound is stored directly.
void xnn_init_qu8_mul_minmax_fp32_wasmsimd_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t biased_min_bits = (int32_t) fp32_to_bits(12582912.0f + min_offset);
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;

  // Replicate values: 2 entries for the 32-bit fields, 4 for the input zero
  // points, 8 for the upper bound.
  for (size_t lane = 0; lane < 8; lane++) {
    if (lane < 2) {
      params->fp32_wasmsimd.scale[lane] = product_output_scale;
      params->fp32_wasmsimd.magic_bias[lane] = 12582912.0f;
      params->fp32_wasmsimd.magic_min[lane] = biased_min_bits;
      params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = bias_bits_less_zero_point;
    }
    if (lane < 4) {
      params->fp32_wasmsimd.a_zero_point[lane] = (int16_t) a_zero_point;
      params->fp32_wasmsimd.b_zero_point[lane] = (int16_t) b_zero_point;
    }
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
}
4417 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4418
// Populates the `fp32_scalar` member of the QS8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// The clamping bounds are stored as floats relative to the output zero point.
void xnn_init_qs8_mul_minmax_fp32_scalar_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar.scale = product_output_scale;
  params->fp32_scalar.a_zero_point = (int16_t) a_zero_point;
  params->fp32_scalar.b_zero_point = (int16_t) b_zero_point;
  params->fp32_scalar.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar.output_max_less_zero_point = (float) max_less_zero_point;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_scalar.magic_bias = 12582912.0f;
  params->fp32_scalar.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
}
4439
4440 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates the `fp32_neon` member of the QS8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// Each input zero point is stored twice; the output bounds are stored as given.
void xnn_init_qs8_mul_minmax_fp32_neon_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  for (size_t k = 0; k < 2; k++) {
    params->fp32_neon.a_zero_point[k] = a_zero_point;
    params->fp32_neon.b_zero_point[k] = b_zero_point;
  }
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.scale = product_output_scale;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
}
4463
// Populates the `fp32_neonv8` member of the QS8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// Each input zero point is stored twice; the output zero point is stored as a
// 16-bit value and the bounds are stored as given.
void xnn_init_qs8_mul_minmax_fp32_neonv8_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  for (size_t k = 0; k < 2; k++) {
    params->fp32_neonv8.a_zero_point[k] = a_zero_point;
    params->fp32_neonv8.b_zero_point[k] = b_zero_point;
  }
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
  params->fp32_neonv8.scale = product_output_scale;
}
4485
// Populates the `rndnu_neon` member of the QS8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// The scale is decomposed into a 31-bit fixed-point multiplier plus a shift,
// and the shift is further split into a pre-shift and a post-shift. Both
// shifts are stored negated, in the `left_pre_shift` / `left_post_shift`
// fields.
void xnn_init_qs8_mul_minmax_rndnu_neon_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(product_output_scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range: the 23 mantissa bits with
  // the implicit leading one restored, moved up by 7 bits.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 15] range. The biased exponent is converted to a signed
  // type before the subtraction so that negative shifts are computed in signed
  // arithmetic rather than through unsigned wraparound followed by an
  // implementation-defined conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 16);

  // Split shift into pre_shift + post_shift, post_shift in [1, 15] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.a_zero_point[0] = a_zero_point;
  params->rndnu_neon.a_zero_point[1] = a_zero_point;
  params->rndnu_neon.b_zero_point[0] = b_zero_point;
  params->rndnu_neon.b_zero_point[1] = b_zero_point;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
4526 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4527
4528 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates the `fp32_sse2` member of the QS8 multiplication min-max
// parameters by replicating each value across its array.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// In this variant the clamping bounds are widened to 16 bits and stored in
// 8 entries each.
void xnn_init_qs8_mul_minmax_fp32_sse2_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // 8 entries: zero points and clamping bounds, all widened to 16 bits.
  for (size_t lane = 0; lane < 8; lane++) {
    params->fp32_sse2.a_zero_point[lane] = (int16_t) a_zero_point;
    params->fp32_sse2.b_zero_point[lane] = (int16_t) b_zero_point;
    params->fp32_sse2.output_zero_point[lane] = (int16_t) output_zero_point;
    params->fp32_sse2.output_min[lane] = (int16_t) output_min;
    params->fp32_sse2.output_max[lane] = (int16_t) output_max;
  }
  // 4 entries: requantization scale.
  for (size_t lane = 0; lane < 4; lane++) {
    params->fp32_sse2.scale[lane] = product_output_scale;
  }
}
4556
// Populates the `fp32_sse4` member of the QS8 multiplication min-max
// parameters by replicating each value across its array.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
void xnn_init_qs8_mul_minmax_fp32_sse4_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // 16 entries: clamping bounds, stored without widening.
  for (size_t lane = 0; lane < 16; lane++) {
    params->fp32_sse4.output_min[lane] = output_min;
    params->fp32_sse4.output_max[lane] = output_max;
  }
  // 8 entries: all three zero points, widened to 16 bits.
  for (size_t lane = 0; lane < 8; lane++) {
    params->fp32_sse4.a_zero_point[lane] = (int16_t) a_zero_point;
    params->fp32_sse4.b_zero_point[lane] = (int16_t) b_zero_point;
    params->fp32_sse4.output_zero_point[lane] = (int16_t) output_zero_point;
  }
  // 4 entries: requantization scale.
  for (size_t lane = 0; lane < 4; lane++) {
    params->fp32_sse4.scale[lane] = product_output_scale;
  }
}
4584 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4585
4586 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates the `fp32_wasmsimd` member of the QS8 multiplication min-max
// parameters.
//
// `product_output_scale` must lie in [2**-16, 2**8) (checked via assert()).
// The lower bound is stored as `magic_min`: the bit pattern of the magic bias
// plus (output_min - output_zero_point). The upper bound is stored directly.
void xnn_init_qs8_mul_minmax_fp32_wasmsimd_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t biased_min_bits = (int32_t) fp32_to_bits(12582912.0f + min_offset);
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;

  // Replicate values: 2 entries for the 32-bit fields, 4 for the input zero
  // points, 8 for the upper bound.
  for (size_t lane = 0; lane < 8; lane++) {
    if (lane < 2) {
      params->fp32_wasmsimd.scale[lane] = product_output_scale;
      params->fp32_wasmsimd.magic_bias[lane] = 12582912.0f;
      params->fp32_wasmsimd.magic_min[lane] = biased_min_bits;
      params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = bias_bits_less_zero_point;
    }
    if (lane < 4) {
      params->fp32_wasmsimd.a_zero_point[lane] = (int16_t) a_zero_point;
      params->fp32_wasmsimd.b_zero_point[lane] = (int16_t) b_zero_point;
    }
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
}
4616 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4617
xnn_init_f16_f32_cvt_scalar_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4618 XNN_INTERNAL void xnn_init_f16_f32_cvt_scalar_params(
4619 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4620 {
4621 params->scalar.sign_mask = UINT32_C(0x80000000);
4622 params->scalar.exp_offset = UINT32_C(0x70000000);
4623 params->scalar.exp_scale = 0x1.0p-112f;
4624 params->scalar.magic_mask = UINT32_C(0x3F000000);
4625 params->scalar.magic_bias = 0.5f;
4626 params->scalar.denorm_cutoff = UINT32_C(0x08000000);
4627 }
4628
4629 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_f16_f32_cvt_neon_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4630 XNN_INTERNAL void xnn_init_f16_f32_cvt_neon_params(
4631 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4632 {
4633 params->neon.exp_scale = 0x1.0p-112f;
4634 }
4635 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4636
4637 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_f16_f32_cvt_sse_int16_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4638 XNN_INTERNAL void xnn_init_f16_f32_cvt_sse_int16_params(
4639 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4640 {
4641 for (uint32_t i = 0; i < 8; i++) {
4642 params->sse_int16.sign_mask[i] = UINT16_C(0x8000);
4643 params->sse_int16.exp_offset[i] = UINT16_C(0x7000);
4644 }
4645 for (uint32_t i = 0; i < 4; i++) {
4646 params->sse_int16.exp_scale[i] = 0x1.0p-112f;
4647 }
4648 for (uint32_t i = 0; i < 8; i++) {
4649 params->sse_int16.magic_mask[i] = UINT16_C(0x3F00);
4650 }
4651 for (uint32_t i = 0; i < 4; i++) {
4652 params->sse_int16.magic_bias[i] = 0.5f;
4653 }
4654 for (uint32_t i = 0; i < 8; i++) {
4655 params->sse_int16.denorm_cutoff[i] = INT16_C(0x0400);
4656 }
4657 }
4658
xnn_init_f16_f32_cvt_sse_int32_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4659 XNN_INTERNAL void xnn_init_f16_f32_cvt_sse_int32_params(
4660 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4661 {
4662 for (uint32_t i = 0; i < 4; i++) {
4663 params->sse_int32.sign_mask[i] = UINT32_C(0x80000000);
4664 params->sse_int32.exp_offset[i] = UINT32_C(0x70000000);
4665 params->sse_int32.exp_scale[i] = 0x1.0p-112f;
4666 params->sse_int32.magic_bias[i] = UINT32_C(0x3F000000);
4667 params->sse_int32.denorm_cutoff[i] = INT32_C(0x04000000);
4668 }
4669 }
4670 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4671
4672 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_f16_f32_cvt_wasmsimd_int16_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4673 XNN_INTERNAL void xnn_init_f16_f32_cvt_wasmsimd_int16_params(
4674 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4675 {
4676 for (uint32_t i = 0; i < 4; i++) {
4677 params->wasmsimd_int16.sign_mask[i] = UINT16_C(0x8000);
4678 params->wasmsimd_int16.exp_offset[i] = UINT16_C(0x7000);
4679 }
4680 for (uint32_t i = 0; i < 2; i++) {
4681 params->wasmsimd_int16.exp_scale[i] = 0x1.0p-112f;
4682 }
4683 for (uint32_t i = 0; i < 4; i++) {
4684 params->wasmsimd_int16.magic_mask[i] = UINT16_C(0x3F00);
4685 }
4686 for (uint32_t i = 0; i < 2; i++) {
4687 params->wasmsimd_int16.magic_bias[i] = 0.5f;
4688 }
4689 for (uint32_t i = 0; i < 4; i++) {
4690 params->wasmsimd_int16.denorm_cutoff[i] = INT16_C(0x0400);
4691 }
4692 }
4693
xnn_init_f16_f32_cvt_wasmsimd_int32_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4694 XNN_INTERNAL void xnn_init_f16_f32_cvt_wasmsimd_int32_params(
4695 union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4696 {
4697 for (uint32_t i = 0; i < 2; i++) {
4698 params->wasmsimd_int32.sign_mask[i] = UINT32_C(0x80000000);
4699 params->wasmsimd_int32.exp_offset[i] = UINT32_C(0x70000000);
4700 params->wasmsimd_int32.exp_scale[i] = 0x1.0p-112f;
4701 params->wasmsimd_int32.magic_bias[i] = UINT32_C(0x3F000000);
4702 params->wasmsimd_int32.denorm_cutoff[i] = INT32_C(0x04000000);
4703 }
4704 }
4705 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4706
xnn_init_f32_f16_cvt_scalar_bitcast_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4707 XNN_INTERNAL void xnn_init_f32_f16_cvt_scalar_bitcast_params(
4708 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4709 {
4710 params->scalar_bitcast.nonsign_mask = UINT32_C(0x7FFFFFFF);
4711 params->scalar_bitcast.exp_bias = UINT32_C(0x07800000);
4712 params->scalar_bitcast.scale_to_inf = 0x1.0p+112f;
4713 params->scalar_bitcast.expw_max = UINT32_C(0x7F800000);
4714 params->scalar_bitcast.scale_to_zero = 0x1.0p-110f;
4715 params->scalar_bitcast.bias_min = UINT32_C(0x40000000);
4716 params->scalar_bitcast.exph_mask = UINT16_C(0x7C00);
4717 params->scalar_bitcast.manth_mask = UINT16_C(0x0FFF);
4718 params->scalar_bitcast.nanh = UINT16_C(0x7E00);
4719 }
4720
xnn_init_f32_f16_cvt_scalar_fabsf_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4721 XNN_INTERNAL void xnn_init_f32_f16_cvt_scalar_fabsf_params(
4722 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4723 {
4724 params->scalar_fabsf.scale_to_inf = 0x1.0p+112f;
4725 params->scalar_fabsf.exp_bias = UINT32_C(0x07800000);
4726 params->scalar_fabsf.scale_to_zero = 0x1.0p-110f;
4727 params->scalar_fabsf.expw_max = UINT32_C(0x7F800000);
4728 params->scalar_fabsf.bias_min = UINT32_C(0x40000000);
4729 params->scalar_fabsf.exph_mask = UINT16_C(0x7C00);
4730 params->scalar_fabsf.manth_mask = UINT16_C(0x0FFF);
4731 params->scalar_fabsf.nanh = UINT16_C(0x7E00);
4732 }
4733
4734 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_f32_f16_cvt_neon_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4735 XNN_INTERNAL void xnn_init_f32_f16_cvt_neon_params(
4736 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4737 {
4738 params->neon.exp_bias = UINT32_C(0x07800000);
4739 params->neon.scale_to_inf = 0x1.0p+112f;
4740 params->neon.expw_max = UINT32_C(0x7F800000);
4741 params->neon.scale_to_zero = 0x1.0p-110f;
4742 }
4743 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4744
4745 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_f32_f16_cvt_sse2_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4746 XNN_INTERNAL void xnn_init_f32_f16_cvt_sse2_params(
4747 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4748 {
4749 for (uint32_t i = 0; i < 4; i++) {
4750 params->sse2.nonsign_mask[i] = UINT32_C(0x7FFFFFFF);
4751 params->sse2.exp_bias[i] = UINT32_C(0x07800000);
4752 params->sse2.scale_to_inf[i] = 0x1.0p+112f;
4753 params->sse2.expw_max[i] = UINT32_C(0x7F800000);
4754 params->sse2.scale_to_zero[i] = 0x1.0p-110f;
4755 }
4756 params->sse2.bias_min[0] = INT16_C(0x8000);
4757 params->sse2.bias_min[1] = INT16_C(0x4000);
4758 params->sse2.bias_min[2] = INT16_C(0x8000);
4759 params->sse2.bias_min[3] = INT16_C(0x4000);
4760 params->sse2.bias_min[4] = INT16_C(0x8000);
4761 params->sse2.bias_min[5] = INT16_C(0x4000);
4762 params->sse2.bias_min[6] = INT16_C(0x8000);
4763 params->sse2.bias_min[7] = INT16_C(0x4000);
4764 for (uint32_t i = 0; i < 4; i++) {
4765 params->sse2.manth_mask[i] = UINT32_C(0x00000FFF);
4766 params->sse2.exph_mask[i] = UINT32_C(0x00007C00);
4767 }
4768 for (uint32_t i = 0; i < 8; i++) {
4769 params->sse2.nanh[i] = UINT16_C(0x7E00);
4770 }
4771 }
4772
xnn_init_f32_f16_cvt_f16c_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4773 XNN_INTERNAL void xnn_init_f32_f16_cvt_f16c_params(
4774 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4775 {
4776 for (uint32_t i = 0; i < 7; i++) {
4777 params->f16c.mask_table[i] = -1;
4778 }
4779 for (uint32_t i = 7; i < 14; i++) {
4780 params->f16c.mask_table[i] = 0;
4781 }
4782 }
4783 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
4784
4785 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_f32_f16_cvt_wasmsimd_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4786 XNN_INTERNAL void xnn_init_f32_f16_cvt_wasmsimd_params(
4787 union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4788 {
4789 for (uint32_t i = 0; i < 2; i++) {
4790 params->wasmsimd.exp_bias[i] = UINT32_C(0x07800000);
4791 params->wasmsimd.scale_to_inf[i] = 0x1.0p+112f;
4792 params->wasmsimd.expw_max[i] = UINT32_C(0x7F800000);
4793 params->wasmsimd.scale_to_zero[i] = 0x1.0p-110f;
4794 }
4795 params->wasmsimd.bias_min[0] = INT16_C(0x8000);
4796 params->wasmsimd.bias_min[1] = INT16_C(0x4000);
4797 params->wasmsimd.bias_min[2] = INT16_C(0x8000);
4798 params->wasmsimd.bias_min[3] = INT16_C(0x4000);
4799 for (uint32_t i = 0; i < 2; i++) {
4800 params->wasmsimd.manth_mask[i] = UINT32_C(0x00000FFF);
4801 params->wasmsimd.exph_mask[i] = UINT32_C(0x00007C00);
4802 }
4803 for (uint32_t i = 0; i < 4; i++) {
4804 params->wasmsimd.nanh[i] = UINT16_C(0x7E00);
4805 }
4806 }
4807 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4808
xnn_init_f32_qs8_cvt_scalar_fmagic_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4809 XNN_INTERNAL void xnn_init_f32_qs8_cvt_scalar_fmagic_params(
4810 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4811 float scale,
4812 int8_t output_zero_point,
4813 int8_t output_min,
4814 int8_t output_max)
4815 {
4816 params->scalar_fmagic.scale = scale;
4817 params->scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
4818 params->scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4819 params->scalar_fmagic.magic_bias = 12582912.0f;
4820 params->scalar_fmagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
4821 }
4822
xnn_init_f32_qs8_cvt_scalar_imagic_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4823 XNN_INTERNAL void xnn_init_f32_qs8_cvt_scalar_imagic_params(
4824 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4825 float scale,
4826 int8_t output_zero_point,
4827 int8_t output_min,
4828 int8_t output_max)
4829 {
4830 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
4831 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4832 params->scalar_imagic.scale = scale;
4833 params->scalar_imagic.magic_bias = 12582912.0f;
4834 params->scalar_imagic.magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
4835 params->scalar_imagic.magic_max = (int32_t) fp32_to_bits(12582912.0f + output_max_less_zero_point);
4836 params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
4837 }
4838
xnn_init_f32_qs8_cvt_scalar_lrintf_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4839 XNN_INTERNAL void xnn_init_f32_qs8_cvt_scalar_lrintf_params(
4840 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4841 float scale,
4842 int8_t output_zero_point,
4843 int8_t output_min,
4844 int8_t output_max)
4845 {
4846 params->scalar_lrintf.scale = scale;
4847 params->scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
4848 params->scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4849 params->scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
4850 }
4851
4852 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_f32_qs8_cvt_neon_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4853 XNN_INTERNAL void xnn_init_f32_qs8_cvt_neon_params(
4854 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4855 float scale,
4856 int8_t output_zero_point,
4857 int8_t output_min,
4858 int8_t output_max)
4859 {
4860 params->neon.scale = scale;
4861 params->neon.magic_bias = 12582912.0f;
4862 params->neon.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
4863 params->neon.output_min = output_min;
4864 params->neon.output_max = output_max;
4865 }
4866
xnn_init_f32_qs8_cvt_neonv8_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4867 XNN_INTERNAL void xnn_init_f32_qs8_cvt_neonv8_params(
4868 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4869 float scale,
4870 int8_t output_zero_point,
4871 int8_t output_min,
4872 int8_t output_max)
4873 {
4874 params->neonv8.scale = scale;
4875 params->neonv8.output_zero_point = (int16_t) output_zero_point;
4876 params->neonv8.output_min = output_min;
4877 params->neonv8.output_max = output_max;
4878 }
4879 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
4880
4881 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_f32_qs8_cvt_sse2_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4882 XNN_INTERNAL void xnn_init_f32_qs8_cvt_sse2_params(
4883 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4884 float scale,
4885 int8_t output_zero_point,
4886 int8_t output_min,
4887 int8_t output_max)
4888 {
4889 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4890 for (uint32_t i = 0; i < 4; i++) {
4891 params->sse2.scale[i] = scale;
4892 params->sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
4893 }
4894 for (uint32_t i = 0; i < 8; i++) {
4895 params->sse2.output_zero_point[i] = (int16_t) output_zero_point;
4896 params->sse2.output_min[i] = (int16_t) output_min;
4897 }
4898 }
4899
xnn_init_f32_qs8_cvt_sse4_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4900 XNN_INTERNAL void xnn_init_f32_qs8_cvt_sse4_params(
4901 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4902 float scale,
4903 int8_t output_zero_point,
4904 int8_t output_min,
4905 int8_t output_max)
4906 {
4907 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4908 for (uint32_t i = 0; i < 4; i++) {
4909 params->sse4.scale[i] = scale;
4910 params->sse4.output_max_less_zero_point[i] = output_max_less_zero_point;
4911 }
4912 for (uint32_t i = 0; i < 8; i++) {
4913 params->sse4.output_zero_point[i] = (int16_t) output_zero_point;
4914 }
4915 for (uint32_t i = 0; i < 16; i++) {
4916 params->sse4.output_min[i] = output_min;
4917 }
4918 }
4919
xnn_init_f32_qs8_cvt_avx_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4920 XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx_params(
4921 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4922 float scale,
4923 int8_t output_zero_point,
4924 int8_t output_min,
4925 int8_t output_max)
4926 {
4927 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4928 for (uint32_t i = 0; i < 8; i++) {
4929 params->avx.scale[i] = scale;
4930 params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
4931 }
4932 for (uint32_t i = 0; i < 8; i++) {
4933 params->avx.output_zero_point[i] = (int16_t) output_zero_point;
4934 }
4935 for (uint32_t i = 0; i < 16; i++) {
4936 params->avx.output_min[i] = output_min;
4937 }
4938 for (uint32_t i = 0; i < 7; i++) {
4939 params->avx.mask_table[i] = -1;
4940 }
4941 for (uint32_t i = 7; i < 14; i++) {
4942 params->avx.mask_table[i] = 0;
4943 }
4944 }
4945
xnn_init_f32_qs8_cvt_avx2_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4946 XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx2_params(
4947 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4948 float scale,
4949 int8_t output_zero_point,
4950 int8_t output_min,
4951 int8_t output_max)
4952 {
4953 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4954 for (uint32_t i = 0; i < 8; i++) {
4955 params->avx2.scale[i] = scale;
4956 params->avx2.output_max_less_zero_point[i] = output_max_less_zero_point;
4957 }
4958 for (uint32_t i = 0; i < 16; i++) {
4959 params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
4960 }
4961 params->avx2.shuffle_mask[0] = 0;
4962 params->avx2.shuffle_mask[1] = 4;
4963 params->avx2.shuffle_mask[2] = 1;
4964 params->avx2.shuffle_mask[3] = 5;
4965 params->avx2.shuffle_mask[4] = 2;
4966 params->avx2.shuffle_mask[5] = 6;
4967 params->avx2.shuffle_mask[6] = 3;
4968 params->avx2.shuffle_mask[7] = 7;
4969 for (uint32_t i = 0; i < 32; i++) {
4970 params->avx2.output_min[i] = output_min;
4971 }
4972 for (uint32_t i = 0; i < 7; i++) {
4973 params->avx2.mask_table[i] = -1;
4974 }
4975 for (uint32_t i = 7; i < 14; i++) {
4976 params->avx2.mask_table[i] = 0;
4977 }
4978 }
4979
xnn_init_f32_qs8_cvt_avx512_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4980 XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx512_params(
4981 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4982 float scale,
4983 int8_t output_zero_point,
4984 int8_t output_min,
4985 int8_t output_max)
4986 {
4987 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4988 for (uint32_t i = 0; i < 16; i++) {
4989 params->avx512.scale[i] = scale;
4990 params->avx512.output_max_less_zero_point[i] = output_max_less_zero_point;
4991 }
4992 for (uint32_t i = 0; i < 32; i++) {
4993 params->avx512.output_zero_point[i] = (int16_t) output_zero_point;
4994 }
4995 for (uint32_t i = 0; i < 64; i++) {
4996 params->avx512.output_min[i] = output_min;
4997 }
4998 params->avx512.shuffle512_mask[0] = 0;
4999 params->avx512.shuffle512_mask[1] = 4;
5000 params->avx512.shuffle512_mask[2] = 8;
5001 params->avx512.shuffle512_mask[3] = 12;
5002 params->avx512.shuffle512_mask[4] = 1;
5003 params->avx512.shuffle512_mask[5] = 5;
5004 params->avx512.shuffle512_mask[6] = 9;
5005 params->avx512.shuffle512_mask[7] = 13;
5006 params->avx512.shuffle512_mask[8] = 2;
5007 params->avx512.shuffle512_mask[9] = 6;
5008 params->avx512.shuffle512_mask[10] = 10;
5009 params->avx512.shuffle512_mask[11] = 14;
5010 params->avx512.shuffle512_mask[12] = 3;
5011 params->avx512.shuffle512_mask[13] = 7;
5012 params->avx512.shuffle512_mask[14] = 11;
5013 params->avx512.shuffle512_mask[15] = 15;
5014 params->avx512.shuffle256_mask[0] = 0;
5015 params->avx512.shuffle256_mask[1] = 4;
5016 params->avx512.shuffle256_mask[2] = 2;
5017 params->avx512.shuffle256_mask[3] = 6;
5018 params->avx512.shuffle256_mask[4] = 1;
5019 params->avx512.shuffle256_mask[5] = 5;
5020 params->avx512.shuffle256_mask[6] = 3;
5021 params->avx512.shuffle256_mask[7] = 7;
5022 }
5023 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5024
5025 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_f32_qs8_cvt_wasmsimd_cvt_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)5026 XNN_INTERNAL void xnn_init_f32_qs8_cvt_wasmsimd_cvt_params(
5027 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5028 float scale,
5029 int8_t output_zero_point,
5030 int8_t output_min,
5031 int8_t output_max)
5032 {
5033 for (uint32_t i = 0; i < 2; i++) {
5034 params->wasmsimd_cvt.scale[i] = scale;
5035 }
5036 for (uint32_t i = 0; i < 4; i++) {
5037 params->wasmsimd_cvt.output_zero_point[i] = (int16_t) output_zero_point;
5038 }
5039 for (uint32_t i = 0; i < 8; i++) {
5040 params->wasmsimd_cvt.output_min[i] = output_min;
5041 params->wasmsimd_cvt.output_max[i] = output_max;
5042 }
5043 }
5044
xnn_init_f32_qs8_cvt_wasmsimd_magic_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)5045 XNN_INTERNAL void xnn_init_f32_qs8_cvt_wasmsimd_magic_params(
5046 union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5047 float scale,
5048 int8_t output_zero_point,
5049 int8_t output_min,
5050 int8_t output_max)
5051 {
5052 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5053 const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
5054 const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5055 for (uint32_t i = 0; i < 2; i++) {
5056 params->wasmsimd_magic.scale[i] = scale;
5057 params->wasmsimd_magic.magic_bias[i] = 12582912.0f;
5058 params->wasmsimd_magic.magic_min[i] = magic_min;
5059 params->wasmsimd_magic.magic_bias_less_zero_point[i] = magic_bias_less_zero_point;
5060 }
5061 for (uint32_t i = 0; i < 8; i++) {
5062 params->wasmsimd_magic.output_max[i] = output_max;
5063 }
5064 }
5065 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5066
xnn_init_f32_qu8_cvt_scalar_fmagic_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5067 XNN_INTERNAL void xnn_init_f32_qu8_cvt_scalar_fmagic_params(
5068 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5069 float scale,
5070 uint8_t output_zero_point,
5071 uint8_t output_min,
5072 uint8_t output_max)
5073 {
5074 params->scalar_fmagic.scale = scale;
5075 params->scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5076 params->scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5077 params->scalar_fmagic.magic_bias = 12582912.0f;
5078 params->scalar_fmagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5079 }
5080
xnn_init_f32_qu8_cvt_scalar_imagic_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5081 XNN_INTERNAL void xnn_init_f32_qu8_cvt_scalar_imagic_params(
5082 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5083 float scale,
5084 uint8_t output_zero_point,
5085 uint8_t output_min,
5086 uint8_t output_max)
5087 {
5088 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5089 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5090 params->scalar_imagic.scale = scale;
5091 params->scalar_imagic.magic_bias = 12582912.0f;
5092 params->scalar_imagic.magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
5093 params->scalar_imagic.magic_max = (int32_t) fp32_to_bits(12582912.0f + output_max_less_zero_point);
5094 params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5095 }
5096
xnn_init_f32_qu8_cvt_scalar_lrintf_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5097 XNN_INTERNAL void xnn_init_f32_qu8_cvt_scalar_lrintf_params(
5098 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5099 float scale,
5100 uint8_t output_zero_point,
5101 uint8_t output_min,
5102 uint8_t output_max)
5103 {
5104 params->scalar_lrintf.scale = scale;
5105 params->scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5106 params->scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5107 params->scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
5108 }
5109
5110 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_f32_qu8_cvt_neon_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5111 XNN_INTERNAL void xnn_init_f32_qu8_cvt_neon_params(
5112 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5113 float scale,
5114 uint8_t output_zero_point,
5115 uint8_t output_min,
5116 uint8_t output_max)
5117 {
5118 params->neon.scale = scale;
5119 params->neon.magic_bias = 12582912.0f;
5120 params->neon.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5121 params->neon.output_min = output_min;
5122 params->neon.output_max = output_max;
5123 }
5124
xnn_init_f32_qu8_cvt_neonv8_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5125 XNN_INTERNAL void xnn_init_f32_qu8_cvt_neonv8_params(
5126 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5127 float scale,
5128 uint8_t output_zero_point,
5129 uint8_t output_min,
5130 uint8_t output_max)
5131 {
5132 params->neonv8.scale = scale;
5133 params->neonv8.output_zero_point = (int16_t) output_zero_point;
5134 params->neonv8.output_min = output_min;
5135 params->neonv8.output_max = output_max;
5136 }
5137 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5138
5139 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_f32_qu8_cvt_sse2_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5140 XNN_INTERNAL void xnn_init_f32_qu8_cvt_sse2_params(
5141 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5142 float scale,
5143 uint8_t output_zero_point,
5144 uint8_t output_min,
5145 uint8_t output_max)
5146 {
5147 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5148 for (uint32_t i = 0; i < 4; i++) {
5149 params->sse2.scale[i] = scale;
5150 params->sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
5151 }
5152 for (uint32_t i = 0; i < 8; i++) {
5153 params->sse2.output_zero_point[i] = (int16_t) output_zero_point;
5154 }
5155 for (uint32_t i = 0; i < 16; i++) {
5156 params->sse2.output_min[i] = output_min;
5157 }
5158 }
5159
xnn_init_f32_qu8_cvt_avx_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5160 XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx_params(
5161 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5162 float scale,
5163 uint8_t output_zero_point,
5164 uint8_t output_min,
5165 uint8_t output_max)
5166 {
5167 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5168 for (uint32_t i = 0; i < 8; i++) {
5169 params->avx.scale[i] = scale;
5170 params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
5171 }
5172 for (uint32_t i = 0; i < 8; i++) {
5173 params->avx.output_zero_point[i] = (int16_t) output_zero_point;
5174 }
5175 for (uint32_t i = 0; i < 16; i++) {
5176 params->avx.output_min[i] = output_min;
5177 }
5178 for (uint32_t i = 0; i < 7; i++) {
5179 params->avx.mask_table[i] = -1;
5180 }
5181 for (uint32_t i = 7; i < 14; i++) {
5182 params->avx.mask_table[i] = 0;
5183 }
5184 }
5185
xnn_init_f32_qu8_cvt_avx2_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5186 XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx2_params(
5187 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5188 float scale,
5189 uint8_t output_zero_point,
5190 uint8_t output_min,
5191 uint8_t output_max)
5192 {
5193 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5194 for (uint32_t i = 0; i < 8; i++) {
5195 params->avx2.scale[i] = scale;
5196 params->avx2.output_max_less_zero_point[i] = output_max_less_zero_point;
5197 }
5198 for (uint32_t i = 0; i < 16; i++) {
5199 params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
5200 }
5201 params->avx2.shuffle_mask[0] = 0;
5202 params->avx2.shuffle_mask[1] = 4;
5203 params->avx2.shuffle_mask[2] = 1;
5204 params->avx2.shuffle_mask[3] = 5;
5205 params->avx2.shuffle_mask[4] = 2;
5206 params->avx2.shuffle_mask[5] = 6;
5207 params->avx2.shuffle_mask[6] = 3;
5208 params->avx2.shuffle_mask[7] = 7;
5209 for (uint32_t i = 0; i < 32; i++) {
5210 params->avx2.output_min[i] = output_min;
5211 }
5212 for (uint32_t i = 0; i < 7; i++) {
5213 params->avx2.mask_table[i] = -1;
5214 }
5215 for (uint32_t i = 7; i < 14; i++) {
5216 params->avx2.mask_table[i] = 0;
5217 }
5218 }
5219
xnn_init_f32_qu8_cvt_avx512_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5220 XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx512_params(
5221 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5222 float scale,
5223 uint8_t output_zero_point,
5224 uint8_t output_min,
5225 uint8_t output_max)
5226 {
5227 const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5228 for (uint32_t i = 0; i < 16; i++) {
5229 params->avx512.scale[i] = scale;
5230 params->avx512.output_max_less_zero_point[i] = output_max_less_zero_point;
5231 }
5232 for (uint32_t i = 0; i < 32; i++) {
5233 params->avx512.output_zero_point[i] = (int16_t) output_zero_point;
5234 }
5235 for (uint32_t i = 0; i < 64; i++) {
5236 params->avx512.output_min[i] = output_min;
5237 }
5238 params->avx512.shuffle512_mask[0] = 0;
5239 params->avx512.shuffle512_mask[1] = 4;
5240 params->avx512.shuffle512_mask[2] = 8;
5241 params->avx512.shuffle512_mask[3] = 12;
5242 params->avx512.shuffle512_mask[4] = 1;
5243 params->avx512.shuffle512_mask[5] = 5;
5244 params->avx512.shuffle512_mask[6] = 9;
5245 params->avx512.shuffle512_mask[7] = 13;
5246 params->avx512.shuffle512_mask[8] = 2;
5247 params->avx512.shuffle512_mask[9] = 6;
5248 params->avx512.shuffle512_mask[10] = 10;
5249 params->avx512.shuffle512_mask[11] = 14;
5250 params->avx512.shuffle512_mask[12] = 3;
5251 params->avx512.shuffle512_mask[13] = 7;
5252 params->avx512.shuffle512_mask[14] = 11;
5253 params->avx512.shuffle512_mask[15] = 15;
5254 params->avx512.shuffle256_mask[0] = 0;
5255 params->avx512.shuffle256_mask[1] = 4;
5256 params->avx512.shuffle256_mask[2] = 2;
5257 params->avx512.shuffle256_mask[3] = 6;
5258 params->avx512.shuffle256_mask[4] = 1;
5259 params->avx512.shuffle256_mask[5] = 5;
5260 params->avx512.shuffle256_mask[6] = 3;
5261 params->avx512.shuffle256_mask[7] = 7;
5262 }
5263 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5264
5265 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_f32_qu8_cvt_wasmsimd_cvt_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5266 XNN_INTERNAL void xnn_init_f32_qu8_cvt_wasmsimd_cvt_params(
5267 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5268 float scale,
5269 uint8_t output_zero_point,
5270 uint8_t output_min,
5271 uint8_t output_max)
5272 {
5273 for (uint32_t i = 0; i < 2; i++) {
5274 params->wasmsimd_cvt.scale[i] = scale;
5275 }
5276 for (uint32_t i = 0; i < 4; i++) {
5277 params->wasmsimd_cvt.output_zero_point[i] = (int16_t) output_zero_point;
5278 }
5279 for (uint32_t i = 0; i < 8; i++) {
5280 params->wasmsimd_cvt.output_min[i] = output_min;
5281 params->wasmsimd_cvt.output_max[i] = output_max;
5282 }
5283 }
5284
xnn_init_f32_qu8_cvt_wasmsimd_magic_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5285 XNN_INTERNAL void xnn_init_f32_qu8_cvt_wasmsimd_magic_params(
5286 union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5287 float scale,
5288 uint8_t output_zero_point,
5289 uint8_t output_min,
5290 uint8_t output_max)
5291 {
5292 const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5293 const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
5294 const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5295 for (uint32_t i = 0; i < 2; i++) {
5296 params->wasmsimd_magic.scale[i] = scale;
5297 params->wasmsimd_magic.magic_bias[i] = 12582912.0f;
5298 params->wasmsimd_magic.magic_min[i] = magic_min;
5299 params->wasmsimd_magic.magic_bias_less_zero_point[i] = magic_bias_less_zero_point;
5300 }
5301 for (uint32_t i = 0; i < 8; i++) {
5302 params->wasmsimd_magic.output_max[i] = output_max;
5303 }
5304 }
5305 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5306
xnn_init_qs8_f32_cvt_scalar_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5307 XNN_INTERNAL void xnn_init_qs8_f32_cvt_scalar_params(
5308 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5309 float scale,
5310 int8_t zero_point)
5311 {
5312 params->scalar.zero_point = (int32_t) zero_point;
5313 params->scalar.scale = scale;
5314 }
5315
5316 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_qs8_f32_cvt_neon_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5317 XNN_INTERNAL void xnn_init_qs8_f32_cvt_neon_params(
5318 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5319 float scale,
5320 int8_t zero_point)
5321 {
5322 params->neon.minus_zero_point[0] = -(int16_t) zero_point;
5323 params->neon.minus_zero_point[1] = -(int16_t) zero_point;
5324 params->neon.scale = scale;
5325 }
5326 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5327
5328 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_qs8_f32_cvt_sse2_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5329 XNN_INTERNAL void xnn_init_qs8_f32_cvt_sse2_params(
5330 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5331 float scale,
5332 int8_t zero_point)
5333 {
5334 for (uint32_t i = 0; i < 16; i++) {
5335 params->sse2.sign_mask[i] = UINT8_C(0x80);
5336 }
5337 for (uint32_t i = 0; i < 8; i++) {
5338 params->sse2.magic_exp[i] = UINT16_C(0x4B00);
5339 }
5340 const float magic_bias = (float) (INT32_C(0x00800080) + (int32_t) zero_point);
5341 for (uint32_t i = 0; i < 4; i++) {
5342 params->sse2.magic_bias[i] = magic_bias;
5343 params->sse2.scale[i] = scale;
5344 }
5345 }
5346
xnn_init_qs8_f32_cvt_sse4_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5347 XNN_INTERNAL void xnn_init_qs8_f32_cvt_sse4_params(
5348 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5349 float scale,
5350 int8_t zero_point)
5351 {
5352 for (uint32_t i = 0; i < 4; i++) {
5353 params->sse4.minus_zero_point[i] = -(int32_t) zero_point;
5354 params->sse4.scale[i] = scale;
5355 }
5356 }
5357
xnn_init_qs8_f32_cvt_avx_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5358 XNN_INTERNAL void xnn_init_qs8_f32_cvt_avx_params(
5359 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5360 float scale,
5361 int8_t zero_point)
5362 {
5363 for (uint32_t i = 0; i < 8; i++) {
5364 params->avx.minus_zero_point[i] = -(int32_t) zero_point;
5365 params->avx.scale[i] = scale;
5366 }
5367 }
5368
xnn_init_qs8_f32_cvt_avx512_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5369 XNN_INTERNAL void xnn_init_qs8_f32_cvt_avx512_params(
5370 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5371 float scale,
5372 int8_t zero_point)
5373 {
5374 for (uint32_t i = 0; i < 16; i++) {
5375 params->avx512.minus_zero_point[i] = -(int32_t) zero_point;
5376 params->avx512.scale[i] = scale;
5377 }
5378 }
5379 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5380
5381 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_qs8_f32_cvt_wasmsimd_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5382 XNN_INTERNAL void xnn_init_qs8_f32_cvt_wasmsimd_params(
5383 union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5384 float scale,
5385 int8_t zero_point)
5386 {
5387 for (uint32_t i = 0; i < 4; i++) {
5388 params->wasmsimd.minus_zero_point[i] = -(int16_t) zero_point;
5389 }
5390 for (uint32_t i = 0; i < 2; i++) {
5391 params->wasmsimd.scale[i] = scale;
5392 }
5393 }
5394 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5395
xnn_init_qu8_f32_cvt_scalar_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5396 XNN_INTERNAL void xnn_init_qu8_f32_cvt_scalar_params(
5397 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5398 float scale,
5399 uint8_t zero_point)
5400 {
5401 params->scalar.zero_point = (int32_t) zero_point;
5402 params->scalar.scale = scale;
5403 }
5404
5405 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_qu8_f32_cvt_neon_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5406 XNN_INTERNAL void xnn_init_qu8_f32_cvt_neon_params(
5407 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5408 float scale,
5409 uint8_t zero_point)
5410 {
5411 params->neon.minus_zero_point[0] = -(int16_t) zero_point;
5412 params->neon.minus_zero_point[1] = -(int16_t) zero_point;
5413 params->neon.scale = scale;
5414 }
5415 #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
5416
5417 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_qu8_f32_cvt_sse2_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5418 XNN_INTERNAL void xnn_init_qu8_f32_cvt_sse2_params(
5419 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5420 float scale,
5421 uint8_t zero_point)
5422 {
5423 for (uint32_t i = 0; i < 8; i++) {
5424 params->sse2.magic_exp[i] = UINT16_C(0x4B00);
5425 }
5426 const float magic_bias = (float) (INT32_C(0x00800000) + (int32_t) zero_point);
5427 for (uint32_t i = 0; i < 4; i++) {
5428 params->sse2.magic_bias[i] = magic_bias;
5429 params->sse2.scale[i] = scale;
5430 }
5431 }
5432
xnn_init_qu8_f32_cvt_sse4_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5433 XNN_INTERNAL void xnn_init_qu8_f32_cvt_sse4_params(
5434 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5435 float scale,
5436 uint8_t zero_point)
5437 {
5438 for (uint32_t i = 0; i < 4; i++) {
5439 params->sse4.minus_zero_point[i] = -(int32_t) zero_point;
5440 params->sse4.scale[i] = scale;
5441 }
5442 }
5443
xnn_init_qu8_f32_cvt_avx_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5444 XNN_INTERNAL void xnn_init_qu8_f32_cvt_avx_params(
5445 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5446 float scale,
5447 uint8_t zero_point)
5448 {
5449 for (uint32_t i = 0; i < 8; i++) {
5450 params->avx.minus_zero_point[i] = -(int32_t) zero_point;
5451 params->avx.scale[i] = scale;
5452 }
5453 }
5454
xnn_init_qu8_f32_cvt_avx512_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5455 XNN_INTERNAL void xnn_init_qu8_f32_cvt_avx512_params(
5456 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5457 float scale,
5458 uint8_t zero_point)
5459 {
5460 for (uint32_t i = 0; i < 16; i++) {
5461 params->avx512.minus_zero_point[i] = -(int32_t) zero_point;
5462 params->avx512.scale[i] = scale;
5463 }
5464 }
5465 #endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
5466
5467 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_qu8_f32_cvt_wasmsimd_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5468 XNN_INTERNAL void xnn_init_qu8_f32_cvt_wasmsimd_params(
5469 union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5470 float scale,
5471 uint8_t zero_point)
5472 {
5473 for (uint32_t i = 0; i < 4; i++) {
5474 params->wasmsimd.minus_zero_point[i] = -(int16_t) zero_point;
5475 }
5476 for (uint32_t i = 0; i < 2; i++) {
5477 params->wasmsimd.scale[i] = scale;
5478 }
5479 }
5480 #endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5481