• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <stdint.h>
7 #include <stddef.h>
8 #include <assert.h>
9 #include <math.h>
10 
11 #include <fp16.h>
12 
13 #include <xnnpack/math.h>
14 #include <xnnpack/params-init.h>
15 
16 
// Fills the `fp32_scalar_fmagic` member of the QU8 convolution min/max
// parameters.
//
// Stored values: the kernel zero point widened to int32_t, the scale, the
// output bounds expressed relative to the output zero point (as floats), the
// magic bias 12582912.0f (1.5 * 2^23, bit pattern 0x4B400000), and that bit
// pattern minus the output zero point.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.kernel_zero_point = (int32_t) kernel_zero_point;
}
35 
// Fills the `fp32_scalar_imagic` member of the QU8 convolution min/max
// parameters.
//
// The clamping bounds are stored as the integer bit patterns of
// (magic bias + bound - output zero point), obtained through fp32_to_bits().
// The magic bias is 12582912.0f (bit pattern 0x4B400000); the same bit
// pattern minus the output zero point is stored as well.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.kernel_zero_point = (int32_t) kernel_zero_point;
  params->fp32_scalar_imagic.magic_bias = magic_bias;
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_imagic.magic_min = (int32_t) fp32_to_bits(magic_bias + min_offset);
  params->fp32_scalar_imagic.magic_max = (int32_t) fp32_to_bits(magic_bias + max_offset);
}
56 
// Fills the `fp32_scalar_lrintf` member of the QU8 convolution min/max
// parameters.
//
// Stored values: the kernel and output zero points widened to int32_t, the
// scale, and the output bounds relative to the output zero point as floats.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float lower_bound = (float) ((int32_t) output_min - zero_point);
  const float upper_bound = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_min_less_zero_point = lower_bound;
  params->fp32_scalar_lrintf.output_max_less_zero_point = upper_bound;
  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.kernel_zero_point = (int32_t) kernel_zero_point;
}
74 
75 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `fp32_sse2` member of the QU8 convolution min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - scale and (output_max - output_zero_point) as float: elements 0..3
//   - kernel and output zero points as int16_t:            elements 0..7
//   - output_min as given:                                 elements 0..15
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_fp32_sse2_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t kernel_zero_point_s16 = (int16_t) kernel_zero_point;
  const int16_t output_zero_point_s16 = (int16_t) output_zero_point;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 16; k++) {
    params->fp32_sse2.output_min[k] = output_min;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->fp32_sse2.output_zero_point[k] = output_zero_point_s16;
    params->fp32_sse2.kernel_zero_point[k] = kernel_zero_point_s16;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->fp32_sse2.output_max_less_zero_point[k] = upper_bound;
    params->fp32_sse2.scale[k] = scale;
  }
}
100 
// Fills the `fp32_avx2` member of the QU8 convolution min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - scale and (output_max - output_zero_point) as float: elements 0..7
//   - kernel and output zero points as int16_t:            elements 0..15
//   - output_min as given:                                 elements 0..31
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_fp32_avx2_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t kernel_zero_point_s16 = (int16_t) kernel_zero_point;
  const int16_t output_zero_point_s16 = (int16_t) output_zero_point;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 32; k++) {
    params->fp32_avx2.output_min[k] = output_min;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->fp32_avx2.output_zero_point[k] = output_zero_point_s16;
    params->fp32_avx2.kernel_zero_point[k] = kernel_zero_point_s16;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->fp32_avx2.output_max_less_zero_point[k] = upper_bound;
    params->fp32_avx2.scale[k] = scale;
  }
}
125 
// Fills the `fp32_avx512` member of the QU8 convolution min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - scale and (output_max - output_zero_point) as float: elements 0..15
//   - kernel and output zero points as int16_t:            elements 0..31
//   - output_min as given:                                 elements 0..63
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_fp32_avx512_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t kernel_zero_point_s16 = (int16_t) (uint16_t) kernel_zero_point;
  const int16_t output_zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 64; k++) {
    params->fp32_avx512.output_min[k] = output_min;
  }
  for (uint32_t k = 0; k < 32; k++) {
    params->fp32_avx512.output_zero_point[k] = output_zero_point_s16;
    params->fp32_avx512.kernel_zero_point[k] = kernel_zero_point_s16;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->fp32_avx512.output_max_less_zero_point[k] = upper_bound;
    params->fp32_avx512.scale[k] = scale;
  }
}
150 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
151 
152 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `fp32_neon` member of the QU8 convolution min/max parameters.
//
// The kernel zero point is written to elements 0..3 of its array; the scale,
// output bounds, magic bias 12582912.0f (bit pattern 0x4B400000) and that bit
// pattern minus the output zero point are stored as single values.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_fp32_neon_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (uint32_t k = 0; k < 4; k++) {
    params->fp32_neon.kernel_zero_point[k] = kernel_zero_point;
  }
  params->fp32_neon.scale = scale;
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
}
174 
// Fills the `fp32_neonv8` member of the QU8 convolution min/max parameters.
//
// The kernel zero point is written to elements 0..3 of its array; the scale,
// the output zero point (as int16_t) and the output bounds are stored as
// single values.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_fp32_neonv8_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (uint32_t k = 0; k < 4; k++) {
    params->fp32_neonv8.kernel_zero_point[k] = kernel_zero_point;
  }
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  params->fp32_neonv8.scale = scale;
}
195 
// Fills the `rndnu_neon` member of the QU8 convolution min/max parameters.
//
// The floating-point `scale` is decomposed into a fixed-point multiplier and
// a shift taken from its IEEE 754 single-precision bit pattern. The shift is
// then split into a pre-shift and a post-shift; both are stored negated as
// `right_pre_shift` / `right_post_shift`.
//
// The kernel zero point is written to elements 0..3 of its array; the output
// zero point is stored as int16_t and the output bounds as given.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_rndnu_neon_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range: the 23 mantissa bits plus
  // the implicit leading one, shifted left by 7.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 31] range.
  // The biased exponent is converted to a signed value BEFORE subtracting, so
  // negative shifts are computed in signed arithmetic instead of relying on
  // unsigned wrap-around followed by an implementation-defined conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.kernel_zero_point[0] = kernel_zero_point;
  params->rndnu_neon.kernel_zero_point[1] = kernel_zero_point;
  params->rndnu_neon.kernel_zero_point[2] = kernel_zero_point;
  params->rndnu_neon.kernel_zero_point[3] = kernel_zero_point;
  params->rndnu_neon.right_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.right_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
235 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
236 
237 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `fp32_wasmsimd` member of the QU8 convolution min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - kernel zero point as int16_t:                          elements 0..3
//   - scale, magic bias, magic_min and
//     (magic bias bits - output zero point):                 elements 0..1
//   - output_max as given:                                   elements 0..7
//
// magic_min is the bit pattern of (magic bias + output_min - output zero
// point); the magic bias is 12582912.0f (bit pattern 0x4B400000).
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qu8_conv_minmax_fp32_wasmsimd_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t lower_bound_bits = (int32_t) fp32_to_bits(magic_bias + min_offset);
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;
  const int16_t kernel_zero_point_s16 = (int16_t) (uint16_t) kernel_zero_point;

  for (uint32_t k = 0; k < 8; k++) {
    params->fp32_wasmsimd.output_max[k] = output_max;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->fp32_wasmsimd.kernel_zero_point[k] = kernel_zero_point_s16;
  }
  for (uint32_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[k] = bias_bits_less_zero_point;
    params->fp32_wasmsimd.magic_min[k] = lower_bound_bits;
    params->fp32_wasmsimd.magic_bias[k] = magic_bias;
    params->fp32_wasmsimd.scale[k] = scale;
  }
}
265 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
266 
// Fills the `fp32_scalar_fmagic` member of the QS8 convolution min/max
// parameters.
//
// Stored values: the scale, the output bounds relative to the output zero
// point (as floats), the magic bias 12582912.0f (bit pattern 0x4B400000), and
// that bit pattern minus the output zero point.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
  params->fp32_scalar_fmagic.scale = scale;
}
283 
// Fills the `fp32_scalar_imagic` member of the QS8 convolution min/max
// parameters.
//
// The clamping bounds are stored as the integer bit patterns of
// (magic bias + bound - output zero point), obtained through fp32_to_bits().
// The magic bias is 12582912.0f (bit pattern 0x4B400000); the same bit
// pattern minus the output zero point is stored as well.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.magic_bias = magic_bias;
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_imagic.magic_min = (int32_t) fp32_to_bits(magic_bias + min_offset);
  params->fp32_scalar_imagic.magic_max = (int32_t) fp32_to_bits(magic_bias + max_offset);
}
302 
// Fills the `fp32_scalar_lrintf` member of the QS8 convolution min/max
// parameters.
//
// Stored values: the scale, the output zero point widened to int32_t, and the
// output bounds relative to the output zero point as floats.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float lower_bound = (float) ((int32_t) output_min - zero_point);
  const float upper_bound = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_min_less_zero_point = lower_bound;
  params->fp32_scalar_lrintf.output_max_less_zero_point = upper_bound;
  params->fp32_scalar_lrintf.scale = scale;
}
318 
319 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `fp32_sse2` member of the QS8 convolution min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - scale and (output_max - output_zero_point) as float: elements 0..3
//   - output zero point and output_min as int16_t:         elements 0..7
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_sse2_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const int16_t lower_bound_s16 = (int16_t) output_min;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 8; k++) {
    params->fp32_sse2.output_min[k] = lower_bound_s16;
    params->fp32_sse2.output_zero_point[k] = zero_point_s16;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->fp32_sse2.output_max_less_zero_point[k] = upper_bound;
    params->fp32_sse2.scale[k] = scale;
  }
}
340 
// Fills the `fp32_sse4` member of the QS8 convolution min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - scale and (output_max - output_zero_point) as float: elements 0..3
//   - output zero point as int16_t:                        elements 0..7
//   - output_min as given:                                 elements 0..15
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_sse4_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 16; k++) {
    params->fp32_sse4.output_min[k] = output_min;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->fp32_sse4.output_zero_point[k] = zero_point_s16;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->fp32_sse4.output_max_less_zero_point[k] = upper_bound;
    params->fp32_sse4.scale[k] = scale;
  }
}
363 
// Fills the `fp32_avx2` member of the QS8 convolution min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - scale and (output_max - output_zero_point) as float: elements 0..7
//   - output zero point as int16_t:                        elements 0..15
//   - output_min as given:                                 elements 0..31
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_avx2_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 32; k++) {
    params->fp32_avx2.output_min[k] = output_min;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->fp32_avx2.output_zero_point[k] = zero_point_s16;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->fp32_avx2.output_max_less_zero_point[k] = upper_bound;
    params->fp32_avx2.scale[k] = scale;
  }
}
386 
// Fills the `fp32_avx512` member of the QS8 convolution min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - scale and (output_max - output_zero_point) as float: elements 0..15
//   - output zero point as int16_t:                        elements 0..31
//   - output_min as given:                                 elements 0..63
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_avx512_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 64; k++) {
    params->fp32_avx512.output_min[k] = output_min;
  }
  for (uint32_t k = 0; k < 32; k++) {
    params->fp32_avx512.output_zero_point[k] = zero_point_s16;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->fp32_avx512.output_max_less_zero_point[k] = upper_bound;
    params->fp32_avx512.scale[k] = scale;
  }
}
409 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
410 
411 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `fp32_neon` member of the QS8 convolution min/max parameters.
//
// Stored values: the scale, the output bounds as given, the magic bias
// 12582912.0f (bit pattern 0x4B400000), and that bit pattern minus the output
// zero point.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_neon_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;

  params->fp32_neon.output_max = output_max;
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.scale = scale;
}
428 
// Fills the `fp32_neonv8` member of the QS8 convolution min/max parameters.
//
// Stored values: the scale, the output zero point as int16_t, and the output
// bounds as given.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_neonv8_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;

  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = zero_point_s16;
  params->fp32_neonv8.scale = scale;
}
444 
// Fills the `rndnu_neon` member of the QS8 convolution min/max parameters.
//
// The floating-point `scale` is decomposed into a fixed-point multiplier and
// a shift taken from its IEEE 754 single-precision bit pattern. The shift is
// then split into a pre-shift and a post-shift; both are stored negated as
// `right_pre_shift` / `right_post_shift`.
//
// The output zero point is stored as int16_t and the output bounds as given.
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_rndnu_neon_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range: the 23 mantissa bits plus
  // the implicit leading one, shifted left by 7.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 31] range.
  // The biased exponent is converted to a signed value BEFORE subtracting, so
  // negative shifts are computed in signed arithmetic instead of relying on
  // unsigned wrap-around followed by an implementation-defined conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.right_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.right_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
479 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
480 
481 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `fp32_wasmsimd` member of the QS8 convolution min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - scale, magic bias, magic_min and
//     (magic bias bits - output zero point):  elements 0..1
//   - output_max as given:                    elements 0..7
//
// magic_min is the bit pattern of (magic bias + output_min - output zero
// point); the magic bias is 12582912.0f (bit pattern 0x4B400000).
//
// Precondition (assert only): 2^-32 <= scale < 256.
void xnn_init_qs8_conv_minmax_fp32_wasmsimd_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t lower_bound_bits = (int32_t) fp32_to_bits(magic_bias + min_offset);
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (uint32_t k = 0; k < 8; k++) {
    params->fp32_wasmsimd.output_max[k] = output_max;
  }
  for (uint32_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[k] = bias_bits_less_zero_point;
    params->fp32_wasmsimd.magic_min[k] = lower_bound_bits;
    params->fp32_wasmsimd.magic_bias[k] = magic_bias;
    params->fp32_wasmsimd.scale[k] = scale;
  }
}
505 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
506 
// Copies per-channel scales into a packed buffer, one tile at a time.
//
// `scale` is read in runs of up to `channels_tile` consecutive floats. Each
// run is written as floats starting at the current `packed_w` position, after
// which the position advances by `stride` bytes. The last run may be shorter
// when `channels` is not a multiple of `channels_tile`.
//
// `channels_tile` must be non-zero: otherwise the tile start never advances
// and the loop does not terminate for `channels > 0`.
void xnn_init_qc8_scale_fp32_params(
  size_t channels,
  size_t channels_tile,
  size_t stride,
  const float scale[XNN_MIN_ELEMENTS(1)],
  void* packed_w)
{
  size_t tile_start = 0;
  while (tile_start < channels) {
    const size_t remaining = channels - tile_start;
    const size_t tile_size = min(remaining, channels_tile);

    float* tile_out = (float*) packed_w;
    const float* tile_in = scale + tile_start;
    for (size_t k = 0; k < tile_size; k++) {
      tile_out[k] = tile_in[k];
    }

    // Advance the destination by `stride` bytes, not by `tile_size` floats.
    packed_w = (void*) ((uintptr_t) packed_w + stride);
    tile_start += channels_tile;
  }
}
522 
// Fills the `scalar_fmagic` member of the QS8 min/max parameters.
//
// Stored values: the output bounds relative to the output zero point (as
// floats), the magic bias 12582912.0f (bit pattern 0x4B400000), and that bit
// pattern minus the output zero point.
void xnn_init_qs8_minmax_scalar_fmagic_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->scalar_fmagic.magic_bias = 12582912.0f;
  params->scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
  params->scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
}
534 
// Fills the `scalar_imagic` member of the QS8 min/max parameters.
//
// The clamping bounds are stored as the integer bit patterns of
// (magic bias + bound - output zero point), obtained through fp32_to_bits().
// The magic bias is 12582912.0f (bit pattern 0x4B400000); the same bit
// pattern minus the output zero point is stored as well.
void xnn_init_qs8_minmax_scalar_imagic_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->scalar_imagic.magic_bias = magic_bias;
  params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  params->scalar_imagic.magic_min = (int32_t) fp32_to_bits(magic_bias + min_offset);
  params->scalar_imagic.magic_max = (int32_t) fp32_to_bits(magic_bias + max_offset);
}
548 
// Fills the `scalar_lrintf` member of the QS8 min/max parameters.
//
// Stored values: the output zero point widened to int32_t and the output
// bounds relative to the output zero point.
//
// NOTE(review): the bounds are cast to `long` here, whereas the other
// `*_lrintf` initializers in this file cast the same expression to `float`.
// The differences are small integers, so either conversion is exact; confirm
// against the field types in the parameter union which cast is intended.
void xnn_init_qs8_minmax_scalar_lrintf_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->scalar_lrintf.output_zero_point = zero_point;
  params->scalar_lrintf.output_min_less_zero_point = (long) min_less_zero_point;
  params->scalar_lrintf.output_max_less_zero_point = (long) max_less_zero_point;
}
559 
560 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `sse2` member of the QS8 min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - (output_max - output_zero_point) as float:   elements 0..3
//   - output zero point and output_min as int16_t: elements 0..7
void xnn_init_qs8_minmax_sse2_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const int16_t lower_bound_s16 = (int16_t) output_min;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 8; k++) {
    params->sse2.output_min[k] = lower_bound_s16;
    params->sse2.output_zero_point[k] = zero_point_s16;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->sse2.output_max_less_zero_point[k] = upper_bound;
  }
}
576 
// Fills the `sse4` member of the QS8 min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - (output_max - output_zero_point) as float: elements 0..3
//   - output zero point as int16_t:              elements 0..7
//   - output_min as given:                       elements 0..15
void xnn_init_qs8_minmax_sse4_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 16; k++) {
    params->sse4.output_min[k] = output_min;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->sse4.output_zero_point[k] = zero_point_s16;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->sse4.output_max_less_zero_point[k] = upper_bound;
  }
}
594 
// Fills the `avx2` member of the QS8 min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - (output_max - output_zero_point) as float: elements 0..7
//   - output zero point as int16_t:              elements 0..15
//   - output_min as given:                       elements 0..31
void xnn_init_qs8_minmax_avx2_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 32; k++) {
    params->avx2.output_min[k] = output_min;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->avx2.output_zero_point[k] = zero_point_s16;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->avx2.output_max_less_zero_point[k] = upper_bound;
  }
}
612 
// Fills the `avx512` member of the QS8 min/max parameters.
//
// Each value is written to several consecutive array elements:
//   - (output_max - output_zero_point) as float: elements 0..15
//   - output zero point as int16_t:              elements 0..31
//   - output_min as given:                       elements 0..63
void xnn_init_qs8_minmax_avx512_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const float upper_bound = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t k = 0; k < 64; k++) {
    params->avx512.output_min[k] = output_min;
  }
  for (uint32_t k = 0; k < 32; k++) {
    params->avx512.output_zero_point[k] = zero_point_s16;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->avx512.output_max_less_zero_point[k] = upper_bound;
  }
}
630 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
631 
632 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `neon` member of the QS8 min/max parameters.
//
// Stored values: the output bounds as given, the magic bias 12582912.0f (bit
// pattern 0x4B400000), and that bit pattern minus the output zero point.
void xnn_init_qs8_minmax_neon_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;

  params->neon.output_max = output_max;
  params->neon.output_min = output_min;
  params->neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->neon.magic_bias = 12582912.0f;
}
644 
// Fills the `neonv8` member of the QS8 min/max parameters.
//
// Stored values: the output zero point as int16_t and the output bounds as
// given.
void xnn_init_qs8_minmax_neonv8_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point_s16 = (int16_t) output_zero_point;

  params->neonv8.output_max = output_max;
  params->neonv8.output_min = output_min;
  params->neonv8.output_zero_point = zero_point_s16;
}
655 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
656 
657 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd member of the QS8 min/max parameters.
//
// The first 2 elements of magic_bias, magic_min and
// magic_bias_less_output_zero_point are written, and the first 8 elements of
// output_max. magic_min is the bit pattern of
// 12582912.0f + (output_min - output_zero_point); the bias-less-zero-point
// value is 0x4B400000 (the bit pattern of 12582912.0f) minus output_zero_point.
void xnn_init_qs8_minmax_wasmsimd_params(
  union xnn_qs8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t min_bits = (int32_t) fp32_to_bits(12582912.0f + min_offset);
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (size_t k = 0; k < 8; k++) {
    params->wasmsimd.output_max[k] = output_max;
  }
  for (size_t k = 0; k < 2; k++) {
    params->wasmsimd.magic_bias[k] = 12582912.0f;
    params->wasmsimd.magic_min[k] = min_bits;
    params->wasmsimd.magic_bias_less_output_zero_point[k] = bias_bits_less_zero_point;
  }
}
676 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
677 
// Fills the fp32_scalar_fmagic member of the QS8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). Besides init_bias and
// scale, the function stores the clamping bounds relative to the zero point as
// floats, the constant 12582912.0f, and its bit pattern (0x4B400000) minus
// output_zero_point.
void xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_fmagic.init_bias = init_bias;
  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.output_min_less_zero_point = min_offset;
  params->fp32_scalar_fmagic.output_max_less_zero_point = max_offset;
  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
}
696 
// Overwrites only init_bias and scale in the fp32_scalar_fmagic member; every
// other field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qs8_avgpool_minmax_fp32_scalar_fmagic_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
}
708 
// Fills the fp32_scalar_imagic member of the QS8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). magic_min / magic_max
// are the bit patterns of 12582912.0f plus the respective clamping bound taken
// relative to the zero point; magic_bias_less_zero_point is 0x4B400000 (the
// bit pattern of 12582912.0f) minus output_zero_point.
void xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);
  const int32_t min_bits = (int32_t) fp32_to_bits(12582912.0f + min_offset);
  const int32_t max_bits = (int32_t) fp32_to_bits(12582912.0f + max_offset);

  params->fp32_scalar_imagic.init_bias = init_bias;
  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.magic_bias = 12582912.0f;
  params->fp32_scalar_imagic.magic_min = min_bits;
  params->fp32_scalar_imagic.magic_max = max_bits;
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
}
729 
// Overwrites only init_bias and scale in the fp32_scalar_imagic member; every
// other field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
}
741 
// Fills the fp32_scalar_lrintf member of the QS8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). The clamping bounds are
// stored relative to the zero point as floats, and the zero point itself is
// stored widened to int32_t.
void xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_lrintf.init_bias = init_bias;
  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_min_less_zero_point = min_offset;
  params->fp32_scalar_lrintf.output_max_less_zero_point = max_offset;
}
759 
// Overwrites only init_bias and scale in the fp32_scalar_lrintf member; every
// other field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qs8_avgpool_minmax_fp32_scalar_lrintf_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
}
771 
772 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the fp32_sse2 member of the QS8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). init_bias, scale and
// (output_max - output_zero_point) as float are replicated into the first 4
// elements of their arrays; output_zero_point and output_min, both converted
// to int16_t, are replicated into the first 8 elements of theirs.
void xnn_init_qs8_avgpool_minmax_fp32_sse2_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const int16_t min_s16 = (int16_t) output_min;

  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse2.output_zero_point[k] = zero_point_s16;
    params->fp32_sse2.output_min[k] = min_s16;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.init_bias[k] = init_bias;
    params->fp32_sse2.scale[k] = scale;
    params->fp32_sse2.output_max_less_zero_point[k] = max_offset;
  }
}
795 
// Overwrites the first 4 elements of init_bias and scale in the fp32_sse2
// member; every other field is left untouched. scale must lie in
// [2**-32, 256) (checked by assert).
void xnn_update_qs8_avgpool_minmax_fp32_sse2_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.scale[k] = scale;
    params->fp32_sse2.init_bias[k] = init_bias;
  }
}
809 
// Fills the fp32_sse4 member of the QS8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). init_bias, scale and
// (output_max - output_zero_point) as float go into the first 4 elements of
// their arrays, output_zero_point (converted to int16_t) into the first 8
// elements, and output_min into the first 16 elements.
void xnn_init_qs8_avgpool_minmax_fp32_sse4_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);
  const int16_t zero_point_s16 = (int16_t) output_zero_point;

  for (size_t k = 0; k < 16; k++) {
    params->fp32_sse4.output_min[k] = output_min;
  }
  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse4.output_zero_point[k] = zero_point_s16;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.init_bias[k] = init_bias;
    params->fp32_sse4.scale[k] = scale;
    params->fp32_sse4.output_max_less_zero_point[k] = max_offset;
  }
}
834 
// Overwrites the first 4 elements of init_bias and scale in the fp32_sse4
// member; every other field is left untouched. scale must lie in
// [2**-32, 256) (checked by assert).
void xnn_update_qs8_avgpool_minmax_fp32_sse4_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.scale[k] = scale;
    params->fp32_sse4.init_bias[k] = init_bias;
  }
}
848 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
849 
850 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the fp32_neon member of the QS8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). magic_bias is
// 12582912.0f (bit pattern 0x4B400000); magic_bias_less_output_zero_point is
// that bit pattern minus output_zero_point. The clamping bounds are copied
// unchanged.
void xnn_init_qs8_avgpool_minmax_fp32_neon_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;

  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.init_bias = init_bias;
  params->fp32_neon.scale = scale;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
}
869 
// Overwrites only init_bias and scale in the fp32_neon member; every other
// field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qs8_avgpool_minmax_fp32_neon_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
}
881 
// Fills the fp32_neonv8 member of the QS8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). init_bias, scale and
// the clamping bounds are copied unchanged; output_zero_point is stored
// converted to int16_t.
void xnn_init_qs8_avgpool_minmax_fp32_neonv8_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;

  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_zero_point = zero_point_s16;
  params->fp32_neonv8.init_bias = init_bias;
  params->fp32_neonv8.scale = scale;
}
899 
// Overwrites only init_bias and scale in the fp32_neonv8 member; every other
// field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qs8_avgpool_minmax_fp32_neonv8_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
}
911 
// Fills the rndnu_neon member of the QS8 average-pooling parameters.
//
// The floating-point scale, which must lie in [2**-32, 256) (checked by
// assert), is decomposed into a 31-bit integer multiplier plus a shift
// derived from its binary exponent. The shift is split into a pre-shift and a
// post-shift, both stored negated in the left_pre_shift / left_post_shift
// fields. init_bias and the clamping bounds are copied unchanged;
// output_zero_point is stored converted to int16_t.
void xnn_init_qs8_avgpool_minmax_rndnu_neon_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range: the 23 mantissa bits with
  // the implicit leading one restored, moved up to bits 7..30.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Biased exponent of scale. It is converted to a signed type BEFORE the
  // subtraction: computing "int - uint32_t" would be carried out in unsigned
  // arithmetic and negative shifts would then depend on an
  // implementation-defined unsigned-to-signed conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);

  // Shift is in [-8, 31] range.
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.init_bias = init_bias;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
948 
// Recomputes the scale-dependent fields of the rndnu_neon member (multiplier,
// left_pre_shift, left_post_shift) and overwrites init_bias; the zero point
// and clamping bounds are left untouched. scale must lie in [2**-32, 256)
// (checked by assert).
void xnn_update_qs8_avgpool_minmax_rndnu_neon_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range: the 23 mantissa bits with
  // the implicit leading one restored, moved up to bits 7..30.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Biased exponent of scale. It is converted to a signed type BEFORE the
  // subtraction: computing "int - uint32_t" would be carried out in unsigned
  // arithmetic and negative shifts would then depend on an
  // implementation-defined unsigned-to-signed conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);

  // Shift is in [-8, 31] range.
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.init_bias = init_bias;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
}
979 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
980 
981 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the fp32_wasmsimd member of the QS8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). The first 2 elements of
// init_bias, scale, magic_bias, magic_min and
// magic_bias_less_output_zero_point are written, and the first 8 elements of
// output_max. magic_min is the bit pattern of
// 12582912.0f + (output_min - output_zero_point); the bias-less-zero-point
// value is 0x4B400000 (the bit pattern of 12582912.0f) minus output_zero_point.
void xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t min_bits = (int32_t) fp32_to_bits(12582912.0f + min_offset);
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (size_t k = 0; k < 8; k++) {
    params->fp32_wasmsimd.output_max[k] = output_max;
  }
  for (size_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.init_bias[k] = init_bias;
    params->fp32_wasmsimd.scale[k] = scale;
    params->fp32_wasmsimd.magic_bias[k] = 12582912.0f;
    params->fp32_wasmsimd.magic_min[k] = min_bits;
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[k] = bias_bits_less_zero_point;
  }
}
1007 
// Overwrites the first 2 elements of init_bias and scale in the fp32_wasmsimd
// member; every other field is left untouched. scale must lie in
// [2**-32, 256) (checked by assert).
void xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.scale[k] = scale;
    params->fp32_wasmsimd.init_bias[k] = init_bias;
  }
}
1021 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1022 
// Fills the fp32_scalar_fmagic member of the QU8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). Besides init_bias and
// scale, the function stores the clamping bounds relative to the zero point as
// floats, the constant 12582912.0f, and its bit pattern (0x4B400000) minus
// output_zero_point.
void xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Work in int32_t so that differences of uint8_t values may go negative.
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_fmagic.init_bias = init_bias;
  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.output_min_less_zero_point = min_offset;
  params->fp32_scalar_fmagic.output_max_less_zero_point = max_offset;
  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
}
1041 
// Overwrites only init_bias and scale in the fp32_scalar_fmagic member; every
// other field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
}
1053 
// Fills the fp32_scalar_imagic member of the QU8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). magic_min / magic_max
// are the bit patterns of 12582912.0f plus the respective clamping bound taken
// relative to the zero point; magic_bias_less_zero_point is 0x4B400000 (the
// bit pattern of 12582912.0f) minus output_zero_point.
void xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Work in int32_t so that differences of uint8_t values may go negative.
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);
  const int32_t min_bits = (int32_t) fp32_to_bits(12582912.0f + min_offset);
  const int32_t max_bits = (int32_t) fp32_to_bits(12582912.0f + max_offset);

  params->fp32_scalar_imagic.init_bias = init_bias;
  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.magic_bias = 12582912.0f;
  params->fp32_scalar_imagic.magic_min = min_bits;
  params->fp32_scalar_imagic.magic_max = max_bits;
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
}
1074 
// Overwrites only init_bias and scale in the fp32_scalar_imagic member; every
// other field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
}
1086 
// Fills the fp32_scalar_lrintf member of the QU8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). The clamping bounds are
// stored relative to the zero point as floats, and the zero point itself is
// stored widened to int32_t.
void xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Work in int32_t so that differences of uint8_t values may go negative.
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_lrintf.init_bias = init_bias;
  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_min_less_zero_point = min_offset;
  params->fp32_scalar_lrintf.output_max_less_zero_point = max_offset;
}
1104 
// Overwrites only init_bias and scale in the fp32_scalar_lrintf member; every
// other field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
}
1116 
1117 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the fp32_sse2 member of the QU8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). init_bias, scale and
// (output_max - output_zero_point) as float go into the first 4 elements of
// their arrays, output_zero_point (converted to int16_t) into the first 8
// elements, and output_min into the first 16 elements.
void xnn_init_qu8_avgpool_minmax_fp32_sse2_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);
  const int16_t zero_point_s16 = (int16_t) output_zero_point;

  for (size_t k = 0; k < 16; k++) {
    params->fp32_sse2.output_min[k] = output_min;
  }
  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse2.output_zero_point[k] = zero_point_s16;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.init_bias[k] = init_bias;
    params->fp32_sse2.scale[k] = scale;
    params->fp32_sse2.output_max_less_zero_point[k] = max_offset;
  }
}
1142 
// Overwrites the first 4 elements of init_bias and scale in the fp32_sse2
// member; every other field is left untouched. scale must lie in
// [2**-32, 256) (checked by assert).
void xnn_update_qu8_avgpool_minmax_fp32_sse2_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.scale[k] = scale;
    params->fp32_sse2.init_bias[k] = init_bias;
  }
}
1156 
// Fills the fp32_sse4 member of the QU8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). init_bias, scale and
// (output_max - output_zero_point) as float go into the first 4 elements of
// their arrays, output_zero_point (converted to int16_t) into the first 8
// elements, and output_min into the first 16 elements.
void xnn_init_qu8_avgpool_minmax_fp32_sse4_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);
  const int16_t zero_point_s16 = (int16_t) output_zero_point;

  for (size_t k = 0; k < 16; k++) {
    params->fp32_sse4.output_min[k] = output_min;
  }
  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse4.output_zero_point[k] = zero_point_s16;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.init_bias[k] = init_bias;
    params->fp32_sse4.scale[k] = scale;
    params->fp32_sse4.output_max_less_zero_point[k] = max_offset;
  }
}
1181 
// Overwrites the first 4 elements of init_bias and scale in the fp32_sse4
// member; every other field is left untouched. scale must lie in
// [2**-32, 256) (checked by assert).
void xnn_update_qu8_avgpool_minmax_fp32_sse4_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.scale[k] = scale;
    params->fp32_sse4.init_bias[k] = init_bias;
  }
}
1195 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1196 
1197 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the fp32_neon member of the QU8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). magic_bias is
// 12582912.0f (bit pattern 0x4B400000); magic_bias_less_output_zero_point is
// that bit pattern minus output_zero_point. The clamping bounds are copied
// unchanged.
void xnn_init_qu8_avgpool_minmax_fp32_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;

  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.init_bias = init_bias;
  params->fp32_neon.scale = scale;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
}
1216 
// Overwrites only init_bias and scale in the fp32_neon member; every other
// field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qu8_avgpool_minmax_fp32_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
}
1228 
// Fills the fp32_neonv8 member of the QU8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). init_bias, scale and
// the clamping bounds are copied unchanged; output_zero_point is stored
// converted to int16_t.
void xnn_init_qu8_avgpool_minmax_fp32_neonv8_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point_s16 = (int16_t) output_zero_point;

  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_zero_point = zero_point_s16;
  params->fp32_neonv8.init_bias = init_bias;
  params->fp32_neonv8.scale = scale;
}
1246 
// Overwrites only init_bias and scale in the fp32_neonv8 member; every other
// field is left untouched. scale must lie in [2**-32, 256) (checked by
// assert).
void xnn_update_qu8_avgpool_minmax_fp32_neonv8_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
}
1258 
// Fills the rndnu_neon member of the QU8 average-pooling parameters.
//
// The floating-point scale, which must lie in [2**-32, 256) (checked by
// assert), is decomposed into a 31-bit integer multiplier plus a shift
// derived from its binary exponent. The shift is split into a pre-shift and a
// post-shift, both stored negated in the left_pre_shift / left_post_shift
// fields. init_bias and the clamping bounds are copied unchanged;
// output_zero_point is stored converted to int16_t.
void xnn_init_qu8_avgpool_minmax_rndnu_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range: the 23 mantissa bits with
  // the implicit leading one restored, moved up to bits 7..30.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Biased exponent of scale. It is converted to a signed type BEFORE the
  // subtraction: computing "int - uint32_t" would be carried out in unsigned
  // arithmetic and negative shifts would then depend on an
  // implementation-defined unsigned-to-signed conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);

  // Shift is in [-8, 31] range.
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.init_bias = init_bias;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
1295 
// Recomputes the scale-dependent fields of the rndnu_neon member (multiplier,
// left_pre_shift, left_post_shift) and overwrites init_bias; the zero point
// and clamping bounds are left untouched. scale must lie in [2**-32, 256)
// (checked by assert).
void xnn_update_qu8_avgpool_minmax_rndnu_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range: the 23 mantissa bits with
  // the implicit leading one restored, moved up to bits 7..30.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Biased exponent of scale. It is converted to a signed type BEFORE the
  // subtraction: computing "int - uint32_t" would be carried out in unsigned
  // arithmetic and negative shifts would then depend on an
  // implementation-defined unsigned-to-signed conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);

  // Shift is in [-8, 31] range.
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.init_bias = init_bias;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
}
1326 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1327 
1328 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the fp32_wasmsimd member of the QU8 average-pooling parameters.
//
// scale must lie in [2**-32, 256) (checked by assert). The first 2 elements of
// init_bias, scale, magic_bias, magic_min and
// magic_bias_less_output_zero_point are written, and the first 8 elements of
// output_max. magic_min is the bit pattern of
// 12582912.0f + (output_min - output_zero_point); the bias-less-zero-point
// value is 0x4B400000 (the bit pattern of 12582912.0f) minus output_zero_point.
void xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Work in int32_t so that differences of uint8_t values may go negative.
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t min_bits = (int32_t) fp32_to_bits(12582912.0f + min_offset);
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (size_t k = 0; k < 8; k++) {
    params->fp32_wasmsimd.output_max[k] = output_max;
  }
  for (size_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.init_bias[k] = init_bias;
    params->fp32_wasmsimd.scale[k] = scale;
    params->fp32_wasmsimd.magic_bias[k] = 12582912.0f;
    params->fp32_wasmsimd.magic_min[k] = min_bits;
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[k] = bias_bits_less_zero_point;
  }
}
1354 
// Overwrites only the init_bias and scale elements (two of each) of the
// fp32_wasmsimd QU8 average-pooling parameters; other fields are untouched.
// scale must lie in [2**-32, 256); this is enforced with assert() only.
void xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_wasmsimd.init_bias[0] = init_bias;
  params->fp32_wasmsimd.init_bias[1] = init_bias;
  params->fp32_wasmsimd.scale[0] = scale;
  params->fp32_wasmsimd.scale[1] = scale;
}
1368 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1369 
// Fills the scalar variant of the QU8 average-pooling parameters.
//
// The scale is decomposed into a 24-bit integer multiplier and a right shift
// taken from its exponent; a rounding term of 2**(right_shift - 1) is stored
// alongside. Output bounds are stored relative to the output zero point.
// scale must lie in [2**-32, 256); this is enforced with assert() only.
void xnn_init_qu8_avgpool_minmax_scalar_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters from the raw bits of scale.
  const uint32_t scale_bits = fp32_to_bits(scale);
  const uint32_t biased_exponent = scale_bits >> 23;

  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
  const int32_t multiplier = (int32_t) ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000));
  assert(multiplier >= INT32_C(0x00800000));
  assert(multiplier <= INT32_C(0x00FFFFFF));

  // Shift is in [16, 55] range.
  const int32_t shift = (int32_t) (127 + 23 - biased_exponent);
  assert(shift >= 16);
  assert(shift < 64);

  const uint32_t right_shift = (uint32_t) shift;
  const int32_t zero_point = (int32_t) (uint32_t) output_zero_point;

  params->scalar.bias = bias;
  params->scalar.multiplier = multiplier;
  params->scalar.right_shift = right_shift;
  params->scalar.rounding = INT64_C(1) << (right_shift - 1);
  params->scalar.output_zero_point = zero_point;
  params->scalar.output_min_less_zero_point = (int32_t) (uint32_t) output_min - zero_point;
  params->scalar.output_max_less_zero_point = (int32_t) (uint32_t) output_max - zero_point;
}
1405 
1406 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the neon variant of the QU8 average-pooling parameters.
//
// The scale is decomposed into a 24-bit integer multiplier and a shift taken
// from its exponent; the shift is stored negated as a 64-bit left_shift.
// scale must lie in [2**-32, 256); this is enforced with assert() only.
void xnn_init_qu8_avgpool_minmax_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters from the raw bits of scale.
  const uint32_t scale_bits = fp32_to_bits(scale);
  const uint32_t biased_exponent = scale_bits >> 23;

  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
  const int32_t multiplier = (int32_t) ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000));
  assert(multiplier >= INT32_C(0x00800000));
  assert(multiplier <= INT32_C(0x00FFFFFF));

  // Shift is in [16, 55] range.
  const int32_t shift = (int32_t) (127 + 23 - biased_exponent);
  assert(shift >= 16);
  assert(shift < 64);

  // Requantization fields.
  params->neon.bias = bias;
  params->neon.multiplier = multiplier;
  params->neon.left_shift = (int64_t) -shift;

  // Output quantization fields.
  params->neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
}
1437 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1438 
1439 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse2 variant of the QU8 average-pooling parameters.
//
// bias and multiplier are replicated into four elements, rounding and
// right_shift into two, output_zero_point into eight, and output_min /
// output_max into sixteen.
// scale must lie in [2**-32, 256); this is enforced with assert() only.
void xnn_init_qu8_avgpool_minmax_sse2_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters from the raw bits of scale.
  const uint32_t scale_bits = fp32_to_bits(scale);
  const uint32_t biased_exponent = scale_bits >> 23;

  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
  const int32_t multiplier = (int32_t) ((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000));
  assert(multiplier >= INT32_C(0x00800000));
  assert(multiplier <= INT32_C(0x00FFFFFF));

  // Shift is in [16, 55] range.
  const int32_t shift = (int32_t) (127 + 23 - biased_exponent);
  assert(shift >= 16);
  assert(shift < 64);

  const uint32_t right_shift = (uint32_t) shift;
  const uint64_t rounding = UINT64_C(1) << (right_shift - 1);

  for (uint32_t k = 0; k < 4; k++) {
    params->sse2.bias[k] = bias;
    params->sse2.multiplier[k] = (uint32_t) multiplier;
  }
  for (uint32_t k = 0; k < 2; k++) {
    params->sse2.rounding[k] = rounding;
    params->sse2.right_shift[k] = (uint64_t) right_shift;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->sse2.output_zero_point[k] = (int16_t) (uint16_t) output_zero_point;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->sse2.output_min[k] = output_min;
    params->sse2.output_max[k] = output_max;
  }
}
1485 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1486 
xnn_update_qu8_avgpool_minmax_scalar_params(union xnn_qu8_avgpool_minmax_params * params,int32_t bias,float scale)1487 void xnn_update_qu8_avgpool_minmax_scalar_params(
1488   union xnn_qu8_avgpool_minmax_params* params,
1489   int32_t bias,
1490   float scale)
1491 {
1492   // Compute requantization parameters.
1493   assert(scale >= 0x1.0p-32f);
1494   assert(scale < 256.0f);
1495   const uint32_t scale_bits = fp32_to_bits(scale);
1496 
1497   // Multiplier is in [0x00800000, 0x00FFFFFF] range.
1498   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
1499   assert(multiplier >= INT32_C(0x00800000));
1500   assert(multiplier <= INT32_C(0x00FFFFFF));
1501 
1502   // Shift is in [16, 55] range.
1503   const int32_t shift = 127 + 23 - (scale_bits >> 23);
1504   assert(shift >= 16);
1505   assert(shift < 64);
1506 
1507   const int64_t rounding = INT64_C(1) << ((uint32_t) shift - 1);
1508   params->scalar.bias = bias;
1509   params->scalar.multiplier = multiplier;
1510   params->scalar.rounding = rounding;
1511   params->scalar.right_shift = (uint32_t) shift;
1512 }
1513 
1514 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_update_qu8_avgpool_minmax_neon_params(union xnn_qu8_avgpool_minmax_params * params,int32_t bias,float scale)1515 void xnn_update_qu8_avgpool_minmax_neon_params(
1516   union xnn_qu8_avgpool_minmax_params* params,
1517   int32_t bias,
1518   float scale)
1519 {
1520   // Compute requantization parameters.
1521   assert(scale >= 0x1.0p-32f);
1522   assert(scale < 256.0f);
1523   const uint32_t scale_bits = fp32_to_bits(scale);
1524 
1525   // Multiplier is in [0x00800000, 0x00FFFFFF] range.
1526   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
1527   assert(multiplier >= INT32_C(0x00800000));
1528   assert(multiplier <= INT32_C(0x00FFFFFF));
1529 
1530   // Shift is in [16, 55] range.
1531   const int32_t shift = 127 + 23 - (scale_bits >> 23);
1532   assert(shift >= 16);
1533   assert(shift < 64);
1534 
1535   params->neon.bias = bias;
1536   params->neon.multiplier = multiplier;
1537   params->neon.left_shift = (int64_t) -shift;
1538 }
1539 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1540 
1541 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_update_qu8_avgpool_minmax_sse2_params(union xnn_qu8_avgpool_minmax_params * params,int32_t bias,float scale)1542 void xnn_update_qu8_avgpool_minmax_sse2_params(
1543   union xnn_qu8_avgpool_minmax_params* params,
1544   int32_t bias,
1545   float scale)
1546 {
1547   // Compute requantization parameters.
1548   assert(scale >= 0x1.0p-32f);
1549   assert(scale < 256.0f);
1550   const uint32_t scale_bits = fp32_to_bits(scale);
1551 
1552   // Multiplier is in [0x00800000, 0x00FFFFFF] range.
1553   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
1554   assert(multiplier >= INT32_C(0x00800000));
1555   assert(multiplier <= INT32_C(0x00FFFFFF));
1556 
1557   // Shift is in [16, 55] range.
1558   const int32_t shift = 127 + 23 - (scale_bits >> 23);
1559   assert(shift >= 16);
1560   assert(shift < 64);
1561 
1562   const uint64_t rounding = UINT64_C(1) << ((uint32_t) shift - 1);
1563   params->sse2.bias[0] = bias;
1564   params->sse2.bias[1] = bias;
1565   params->sse2.bias[2] = bias;
1566   params->sse2.bias[3] = bias;
1567   params->sse2.multiplier[0] = (uint32_t) multiplier;
1568   params->sse2.multiplier[1] = (uint32_t) multiplier;
1569   params->sse2.multiplier[2] = (uint32_t) multiplier;
1570   params->sse2.multiplier[3] = (uint32_t) multiplier;
1571   params->sse2.rounding[0] = rounding;
1572   params->sse2.rounding[1] = rounding;
1573   params->sse2.right_shift[0] = (uint64_t) (uint32_t) shift;
1574   params->sse2.right_shift[1] = (uint64_t) (uint32_t) shift;
1575 }
1576 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1577 
xnn_update_f32_scaleminmax_scalar_params(union xnn_f32_scaleminmax_params * params,float scale)1578 void xnn_update_f32_scaleminmax_scalar_params(
1579   union xnn_f32_scaleminmax_params* params,
1580   float scale)
1581 {
1582   params->scalar.scale = scale;
1583 }
1584 
1585 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_update_f32_scaleminmax_sse_params(union xnn_f32_scaleminmax_params * params,float scale)1586 void xnn_update_f32_scaleminmax_sse_params(
1587   union xnn_f32_scaleminmax_params* params,
1588   float scale)
1589 {
1590   for (uint32_t i = 0; i < 4; i++) {
1591     params->sse.scale[i] = scale;
1592   }
1593 }
1594 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1595 
1596 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores scale, min and max into the neon variant of the F16 scale+min/max
// parameters. The 16-bit values are copied through unchanged.
void xnn_init_f16_scaleminmax_neon_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale,
  uint16_t min,
  uint16_t max)
{
  // Clamping bounds.
  params->neon.min = min;
  params->neon.max = max;

  // Scaling factor.
  params->neon.scale = scale;
}
1607 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1608 
1609 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the avx variant of the F16 scale+min/max parameters.
//
// Each 16-bit argument is converted with fp16_ieee_to_fp32_value() and the
// resulting float is replicated into eight elements.
void xnn_init_f16_scaleminmax_avx_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale,
  uint16_t min,
  uint16_t max)
{
  const float scale_as_float = fp16_ieee_to_fp32_value(scale);
  const float min_as_float = fp16_ieee_to_fp32_value(min);
  const float max_as_float = fp16_ieee_to_fp32_value(max);

  for (uint32_t k = 0; k < 8; k++) {
    params->avx.scale[k] = scale_as_float;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->avx.min[k] = min_as_float;
    params->avx.max[k] = max_as_float;
  }
}
1625 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1626 
1627 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Replaces the scale stored in the neon variant of the F16 scale+min/max
// parameters. The min and max fields are not modified.
void xnn_update_f16_scaleminmax_neon_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale)
{
  const uint16_t new_scale = scale;
  params->neon.scale = new_scale;
}
1634 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1635 
1636 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Replaces all eight scale elements of the avx variant of the F16
// scale+min/max parameters with the float conversion of scale.
// The min and max fields are not modified.
void xnn_update_f16_scaleminmax_avx_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale)
{
  const float scale_as_float = fp16_ieee_to_fp32_value(scale);
  uint32_t k = 0;
  while (k < 8) {
    params->avx.scale[k] = scale_as_float;
    k++;
  }
}
1646 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1647 
// Stores scale, min and max into the scalar variant of the F32 scale+min/max
// parameters.
void xnn_init_f32_scaleminmax_scalar_params(
  union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  float min,
  float max)
{
  // Clamping bounds.
  params->scalar.min = min;
  params->scalar.max = max;

  // Scaling factor.
  params->scalar.scale = scale;
}
1658 
1659 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse variant of the F32 scale+min/max parameters, replicating
// scale, min and max into four elements each.
void xnn_init_f32_scaleminmax_sse_params(
  union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  float min,
  float max)
{
  for (uint32_t k = 0; k < 4; k++) {
    params->sse.scale[k] = scale;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->sse.min[k] = min;
    params->sse.max[k] = max;
  }
}
1672 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1673 
// Fills the F32 global-average-pooling parameters for the variant selected at
// compile time (sse on x86, neon on ARM, scalar otherwise).
//
// Besides multiplier and the output bounds, a four-element mask is written:
// element 0 is always all-ones, and element i (1..3) is all-ones exactly when
// ((width - 1) & 3) >= i, otherwise zero.
void xnn_init_f32_gavgpool_params(
  union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
  float multiplier,
  float output_min,
  float output_max,
  uint32_t width)
{
  // Index of the last element within a group of four.
  const uint32_t w = (width - 1) & 3;

  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (uint32_t k = 0; k < 4; k++) {
      params->sse.multiplier[k] = multiplier;
      params->sse.output_min[k] = output_min;
      params->sse.output_max[k] = output_max;
    }

    params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->sse.mask[k] = -(uint32_t) (w >= k);
    }
  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
    params->neon.multiplier = multiplier;
    params->neon.output_min = output_min;
    params->neon.output_max = output_max;

    params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->neon.mask[k] = -(uint32_t) (w >= k);
    }
  #else
    params->scalar.multiplier = multiplier;
    params->scalar.output_min = output_min;
    params->scalar.output_max = output_max;

    params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->scalar.mask[k] = -(int32_t) (w >= k);
    }
  #endif
}
1715 
xnn_update_f32_gavgpool_params(union xnn_f32_gavgpool_params * params,float multiplier,uint32_t width)1716 void xnn_update_f32_gavgpool_params(
1717   union xnn_f32_gavgpool_params* params,
1718   float multiplier,
1719   uint32_t width)
1720 {
1721   #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1722     for (uint32_t i = 0; i < 4; i++) {
1723       params->sse.multiplier[i] = multiplier;
1724     }
1725 
1726     const uint32_t w = (width - 1) & 3;
1727     params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
1728     params->sse.mask[1] = -(uint32_t) (w >= 1);
1729     params->sse.mask[2] = -(uint32_t) (w >= 2);
1730     params->sse.mask[3] = -(uint32_t) (w >= 3);
1731   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
1732     params->neon.multiplier = multiplier;
1733 
1734     const uint32_t w = (width - 1) & 3;
1735     params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
1736     params->neon.mask[1] = -(uint32_t) (w >= 1);
1737     params->neon.mask[2] = -(uint32_t) (w >= 2);
1738     params->neon.mask[3] = -(uint32_t) (w >= 3);
1739   #else
1740     params->scalar.multiplier = multiplier;
1741 
1742     const uint32_t w = (width - 1) & 3;
1743     params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
1744     params->scalar.mask[1] = -(int32_t) (w >= 1);
1745     params->scalar.mask[2] = -(int32_t) (w >= 2);
1746     params->scalar.mask[3] = -(int32_t) (w >= 3);
1747   #endif
1748 }
1749 
// Fills the scalar variant of the F32 global-average-pooling parameters,
// regardless of the target architecture.
//
// Mask element 0 is always all-ones; element i (1..3) is -1 exactly when
// ((width - 1) & 3) >= i, otherwise zero.
void xnn_init_scalar_f32_gavgpool_params(
  union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
  float multiplier,
  float output_min,
  float output_max,
  uint32_t width)
{
  // Index of the last element within a group of four.
  const uint32_t w = (width - 1) & 3;

  params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
  for (uint32_t k = 1; k < 4; k++) {
    params->scalar.mask[k] = -(int32_t) (w >= k);
  }

  params->scalar.multiplier = multiplier;
  params->scalar.output_min = output_min;
  params->scalar.output_max = output_max;
}
1767 
1768 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the 16-bit min and max values, unchanged, into the neon variant of
// the F16 min/max parameters.
void xnn_init_f16_minmax_neon_params(
  union xnn_f16_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t min,
  uint16_t max)
{
  const uint16_t lower_bound = min;
  const uint16_t upper_bound = max;
  params->neon.min = lower_bound;
  params->neon.max = upper_bound;
}
1777 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1778 
1779 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the avx variant of the F16 min/max parameters.
//
// min and max are converted with fp16_ieee_to_fp32_value() and each resulting
// float is replicated into eight elements.
void xnn_init_f16_minmax_avx_params(
  union xnn_f16_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t min,
  uint16_t max)
{
  const float lower_bound = fp16_ieee_to_fp32_value(min);
  const float upper_bound = fp16_ieee_to_fp32_value(max);

  for (uint32_t k = 0; k < 8; k++) {
    params->avx.min[k] = lower_bound;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->avx.max[k] = upper_bound;
  }
}
1792 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1793 
1794 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the 14-entry avx mask table: entries 0..6 are set to -1 and entries
// 7..13 are set to 0.
void xnn_init_f32_default_avx_params(
  union xnn_f32_default_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
1805 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1806 
// Stores the output bounds into the F32 min/max parameter variant selected at
// compile time: sse (four elements each) on x86, wasmsimd (two elements each)
// on WebAssembly SIMD targets, and scalar on everything else.
void xnn_init_f32_minmax_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (uint32_t k = 0; k < 4; k++) {
      params->sse.min[k] = output_min;
    }
    for (uint32_t k = 0; k < 4; k++) {
      params->sse.max[k] = output_max;
    }
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
    for (uint32_t k = 0; k < 2; k++) {
      params->wasmsimd.min[k] = output_min;
      params->wasmsimd.max[k] = output_max;
    }
  #else
    params->scalar.max = output_max;
    params->scalar.min = output_min;
  #endif
}
1827 
1828 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Replicates the output bounds into the four-element min and max arrays of
// the sse variant of the F32 min/max parameters.
void xnn_init_f32_minmax_sse_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  for (uint32_t k = 0; k < 4; k++) {
    params->sse.min[k] = output_min;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->sse.max[k] = output_max;
  }
}
1839 
// Fills the avx variant of the F32 min/max parameters.
//
// The output bounds are replicated into eight elements each, and the
// 14-entry mask table is set to -1 for entries 0..6 and 0 for entries 7..13.
void xnn_init_f32_minmax_avx_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  for (uint32_t k = 0; k < 8; k++) {
    params->avx.min[k] = output_min;
    params->avx.max[k] = output_max;
  }
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
1856 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1857 
1858 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Replicates the output bounds into the two-element min and max arrays of the
// wasmsimd variant of the F32 min/max parameters.
void xnn_init_f32_minmax_wasmsimd_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  for (uint32_t k = 0; k < 2; k++) {
    params->wasmsimd.min[k] = output_min;
    params->wasmsimd.max[k] = output_max;
  }
}
1869 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1870 
// Stores the output bounds into the scalar variant of the F32 min/max
// parameters.
void xnn_init_f32_minmax_scalar_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  const float lower_bound = output_min;
  const float upper_bound = output_max;
  params->scalar.min = lower_bound;
  params->scalar.max = upper_bound;
}
1879 
1880 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the three constants of the neon F16 hswish parameters as 16-bit
// patterns. Read as IEEE half-precision values they are approximately 1/6
// (0x3155), exactly 3.0 (0x4200) and exactly 6.0 (0x4600).
void xnn_init_f16_hswish_neon_params(
  union xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neon.three = UINT16_C(0x4200);
  params->neon.six = UINT16_C(0x4600);
  params->neon.sixth = UINT16_C(0x3155);
}
1888 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1889 
1890 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the avx variant of the F16 hswish parameters, eight elements per
// field.
//
// sixth and three are written as float literals (0x1.554000p-3f is roughly
// 1/6), whereas six is written as the 16-bit pattern 0x4600, which is 6.0 when
// read as an IEEE half-precision value.
// NOTE(review): presumably six is declared as a 16-bit array while the other
// two are float arrays -- confirm against the params struct definition.
void xnn_init_f16_hswish_avx_params(
  union xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 8; k++) {
    params->avx.sixth[k] = 0x1.554000p-3f;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->avx.three[k] = 3.0f;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->avx.six[k] = UINT16_C(0x4600);
  }
}
1900 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1901 
// Stores the three constants of the scalar F32 hswish parameters:
// sixth (0x1.555556p-3f, roughly 1/6), three (3.0f) and six (6.0f).
void xnn_init_f32_hswish_scalar_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->scalar.three = 3.0f;
  params->scalar.six = 6.0f;
  params->scalar.sixth = 0x1.555556p-3f;
}
1909 
1910 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse variant of the F32 hswish parameters, four elements per
// field: sixth (0x1.555556p-3f, roughly 1/6), half (0.5f) and one (1.0f).
void xnn_init_f32_hswish_sse_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 4; k++) {
    params->sse.sixth[k] = 0x1.555556p-3f;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->sse.half[k] = 0.5f;
    params->sse.one[k] = 1.0f;
  }
}
1920 
// Fills the avx variant of the F32 hswish parameters.
//
// sixth (0x1.555556p-3f, roughly 1/6), half (0.5f) and one (1.0f) are
// replicated into eight elements each; the 14-entry mask table is set to -1
// for entries 0..6 and 0 for entries 7..13.
void xnn_init_f32_hswish_avx_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 8; k++) {
    params->avx.one[k] = 1.0f;
    params->avx.half[k] = 0.5f;
    params->avx.sixth[k] = 0x1.555556p-3f;
  }
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
1936 
// Stores the three constants of the avx512 F32 hswish parameters:
// sixth (0x1.555556p-3f, roughly 1/6), half (0.5f) and one (1.0f).
// Unlike the sse and avx variants, each field is written as a single value.
void xnn_init_f32_hswish_avx512_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->avx512.one = 1.0f;
  params->avx512.half = 0.5f;
  params->avx512.sixth = 0x1.555556p-3f;
}
1944 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1945 
1946 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd variant of the F32 hswish parameters, two elements per
// field: sixth (0x1.555556p-3f, roughly 1/6), three (3.0f) and six (6.0f).
void xnn_init_f32_hswish_wasmsimd_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->wasmsimd.sixth[0] = 0x1.555556p-3f;
  params->wasmsimd.sixth[1] = 0x1.555556p-3f;
  params->wasmsimd.three[0] = 3.0f;
  params->wasmsimd.three[1] = 3.0f;
  params->wasmsimd.six[0] = 6.0f;
  params->wasmsimd.six[1] = 6.0f;
}
1956 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1957 
// Stores the fixed constants of the scalar_rr2_lut64_p2 F32 sigmoid
// parameters. The function takes no inputs besides the destination.
void xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // 1.5 * 2**17.
  params->scalar_rr2_lut64_p2.magic_bias = 0x1.800000p17f;

  // Roughly -1.442695, i.e. -log2(e).
  params->scalar_rr2_lut64_p2.minus_log2e = -0x1.715476p0f;

  // Two-part ln(2): hi + lo is roughly 0.693147.
  params->scalar_rr2_lut64_p2.ln2_hi = 0x1.630000p-1f;
  params->scalar_rr2_lut64_p2.ln2_lo = -0x1.BD0106p-13f;

  // Single polynomial coefficient (roughly 0.5) and the constant one.
  params->scalar_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->scalar_rr2_lut64_p2.one = 1.0f;

  // Roughly 87.34.
  params->scalar_rr2_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;
}
1969 
// Stores the fixed constants of the scalar_rr2_lut2048_p1 F32 sigmoid
// parameters. The function takes no inputs besides the destination.
void xnn_init_f32_sigmoid_scalar_rr2_lut2048_p1_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // 1.5 * 2**12.
  params->scalar_rr2_lut2048_p1.magic_bias = 0x1.800000p12f;

  // Roughly -1.442695, i.e. -log2(e).
  params->scalar_rr2_lut2048_p1.minus_log2e = -0x1.715476p0f;

  // Two-part ln(2): hi + lo is roughly 0.693147.
  params->scalar_rr2_lut2048_p1.ln2_hi = 0x1.600000p-1f;
  params->scalar_rr2_lut2048_p1.ln2_lo = 0x1.7217F8p-8f;

  // Single polynomial coefficient (roughly -1.0) and the constant one.
  params->scalar_rr2_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
  params->scalar_rr2_lut2048_p1.one = 1.0f;

  // Roughly 87.34.
  params->scalar_rr2_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;
}
1981 
// Stores the fixed constants of the scalar_rr2_p5 F32 sigmoid parameters.
// The function takes no inputs besides the destination.
void xnn_init_f32_sigmoid_scalar_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // 1.5 * 2**23 + 127.
  params->scalar_rr2_p5.magic_bias = 0x1.8000FEp23f;

  // Roughly -1.442695, i.e. -log2(e).
  params->scalar_rr2_p5.minus_log2e = -0x1.715476p0f;

  // Two-part ln(2): hi + lo is roughly 0.693147.
  params->scalar_rr2_p5.ln2_hi = 0x1.62E400p-1f;
  params->scalar_rr2_p5.ln2_lo = 0x1.7F7D1Cp-20f;

  // Polynomial coefficients, listed from c1 up to c5.
  params->scalar_rr2_p5.c1 = -0x1.FFFFF6p-1f;
  params->scalar_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->scalar_rr2_p5.c3 = -0x1.555A80p-3f;
  params->scalar_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->scalar_rr2_p5.c5 = -0x1.0F9F9Cp-7f;

  params->scalar_rr2_p5.one = 1.0f;

  // Roughly 87.34.
  params->scalar_rr2_p5.denorm_cutoff = 0x1.5D589Ep+6f;
}
1997 
1998 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the fixed constants of the neon_rr2_lut64_p2 F32 sigmoid parameters.
// The function takes no inputs besides the destination.
void xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // 1.5 * 2**17.
  params->neon_rr2_lut64_p2.magic_bias = 0x1.800000p17f;

  // Roughly -1.442695, i.e. -log2(e).
  params->neon_rr2_lut64_p2.minus_log2e = -0x1.715476p0f;

  // Two-part ln(2): hi + lo is roughly 0.693147.
  params->neon_rr2_lut64_p2.ln2_hi = 0x1.630000p-1f;
  params->neon_rr2_lut64_p2.ln2_lo = -0x1.BD0106p-13f;

  // Single polynomial coefficient (roughly 0.5).
  params->neon_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;

  // Roughly 87.34.
  params->neon_rr2_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;
}
2009 
// Stores the fixed constants of the neon_rr2_lut2048_p1 F32 sigmoid
// parameters. The function takes no inputs besides the destination.
void xnn_init_f32_sigmoid_neon_rr2_lut2048_p1_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // 1.5 * 2**12.
  params->neon_rr2_lut2048_p1.magic_bias = 0x1.800000p12f;

  // Roughly -1.442695, i.e. -log2(e).
  params->neon_rr2_lut2048_p1.minus_log2e = -0x1.715476p0f;

  // Two-part ln(2): hi + lo is roughly 0.693147.
  params->neon_rr2_lut2048_p1.ln2_hi = 0x1.600000p-1f;
  params->neon_rr2_lut2048_p1.ln2_lo = 0x1.7217F8p-8f;

  // Single polynomial coefficient (roughly -1.0).
  params->neon_rr2_lut2048_p1.c1 = -0x1.FFFFFEp-1f;

  // Roughly 87.34.
  params->neon_rr2_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;
}
2020 
// Stores the fixed constants of the neon_rr2_p5 F32 sigmoid parameters.
// The function takes no inputs besides the destination.
void xnn_init_f32_sigmoid_neon_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // 1.5 * 2**23 + 127.
  params->neon_rr2_p5.magic_bias = 0x1.8000FEp23f;

  // Roughly -1.442695, i.e. -log2(e).
  params->neon_rr2_p5.minus_log2e = -0x1.715476p0f;

  // Two-part ln(2): hi + lo is roughly 0.693147.
  params->neon_rr2_p5.ln2_hi = 0x1.62E400p-1f;
  params->neon_rr2_p5.ln2_lo = 0x1.7F7D1Cp-20f;

  // Polynomial coefficients, listed from c1 up to c5.
  params->neon_rr2_p5.c1 = -0x1.FFFFF6p-1f;
  params->neon_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->neon_rr2_p5.c3 = -0x1.555A80p-3f;
  params->neon_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->neon_rr2_p5.c5 = -0x1.0F9F9Cp-7f;

  // Roughly 87.34.
  params->neon_rr2_p5.denorm_cutoff = 0x1.5D589Ep+6f;
}
2035 
// Stores the fixed constants of the neonfma_rr1_lut2048_p1 F32 sigmoid
// parameters. The function takes no inputs besides the destination.
void xnn_init_f32_sigmoid_neonfma_rr1_lut2048_p1_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // 1.5 * 2**12.
  params->neonfma_rr1_lut2048_p1.magic_bias = 0x1.800000p12f;

  // Roughly -1.442695, i.e. -log2(e).
  params->neonfma_rr1_lut2048_p1.minus_log2e = -0x1.715476p0f;

  // Single-constant ln(2), roughly 0.693147.
  params->neonfma_rr1_lut2048_p1.ln2 = 0x1.62E430p-1f;

  // Single polynomial coefficient (roughly -1.0).
  params->neonfma_rr1_lut2048_p1.c1 = -0x1.FFFFFEp-1f;

  // Roughly 87.34.
  params->neonfma_rr1_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;
}
2045 
// Stores the fixed constants of the neonfma_rr1_lut64_p2 F32 sigmoid
// parameters. The function takes no inputs besides the destination.
void xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // 1.5 * 2**17.
  params->neonfma_rr1_lut64_p2.magic_bias = 0x1.800000p17f;

  // Roughly -1.442695, i.e. -log2(e).
  params->neonfma_rr1_lut64_p2.minus_log2e = -0x1.715476p0f;

  // Single-constant ln(2), roughly 0.693147.
  params->neonfma_rr1_lut64_p2.ln2 = 0x1.62E430p-1f;

  // Single polynomial coefficient (roughly 0.5).
  params->neonfma_rr1_lut64_p2.c2 = 0x1.FFFF0Ap-2f;

  // Roughly 87.34.
  params->neonfma_rr1_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;
}
2055 
// Stores the fixed constants of the neonfma_rr1_p5 F32 sigmoid parameters.
// The function takes no inputs besides the destination.
void xnn_init_f32_sigmoid_neonfma_rr1_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // 1.5 * 2**23 + 127.
  params->neonfma_rr1_p5.magic_bias = 0x1.8000FEp23f;

  // Roughly -1.442695, i.e. -log2(e).
  params->neonfma_rr1_p5.minus_log2e = -0x1.715476p0f;

  // Single-constant ln(2), roughly 0.693147.
  params->neonfma_rr1_p5.ln2 = 0x1.62E430p-1f;

  // Polynomial coefficients, listed from c1 up to c5.
  params->neonfma_rr1_p5.c1 = -0x1.FFFFF6p-1f;
  params->neonfma_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params->neonfma_rr1_p5.c3 = -0x1.555A80p-3f;
  params->neonfma_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params->neonfma_rr1_p5.c5 = -0x1.0F9F9Cp-7f;

  // Roughly 87.34.
  params->neonfma_rr1_p5.denorm_cutoff = 0x1.5D589Ep+6f;
}
2069 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
2070 
2071 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Writes the constants of the `sse2_rr2_lut64_p2` sigmoid variant into params[0].
// Each field is a 4-element array; all 4 elements receive the same value.
void xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params(union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 4; ++lane) {
    params[0].sse2_rr2_lut64_p2.one[lane] = 1.0f;
    params[0].sse2_rr2_lut64_p2.sign_mask[lane] = -0.0f;
    params[0].sse2_rr2_lut64_p2.index_mask[lane] = UINT32_C(0x3F);
    params[0].sse2_rr2_lut64_p2.magic_bias[lane] = 0x1.800000p17f;
    params[0].sse2_rr2_lut64_p2.log2e[lane] = 0x1.715476p0f;
    params[0].sse2_rr2_lut64_p2.minus_ln2_hi[lane] = -0x1.630000p-1f;
    params[0].sse2_rr2_lut64_p2.minus_ln2_lo[lane] = 0x1.BD0106p-13f;
    params[0].sse2_rr2_lut64_p2.c2[lane] = 0x1.FFFF0Ap-2f;
    params[0].sse2_rr2_lut64_p2.denorm_cutoff[lane] = -0x1.5D589Ep+6f;
  }
}
2087 
// Writes the constants of the `sse2_rr2_p5` sigmoid variant into params[0].
// Each field is a 4-element array; all 4 elements receive the same value.
void xnn_init_f32_sigmoid_sse2_rr2_p5_params(union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 4; ++lane) {
    params[0].sse2_rr2_p5.one[lane] = 1.0f;
    params[0].sse2_rr2_p5.sign_mask[lane] = -0.0f;
    params[0].sse2_rr2_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params[0].sse2_rr2_p5.log2e[lane] = 0x1.715476p0f;
    params[0].sse2_rr2_p5.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params[0].sse2_rr2_p5.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    // Coefficients, listed from c1 up to c5.
    params[0].sse2_rr2_p5.c1[lane] = 0x1.FFFFF6p-1f;
    params[0].sse2_rr2_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params[0].sse2_rr2_p5.c3[lane] = 0x1.555A80p-3f;
    params[0].sse2_rr2_p5.c4[lane] = 0x1.573A1Ap-5f;
    params[0].sse2_rr2_p5.c5[lane] = 0x1.0F9F9Cp-7f;
    params[0].sse2_rr2_p5.denorm_cutoff[lane] = -0x1.5D589Ep+6f;
  }
}
2106 
// Writes the constants of the `avx_rr2_p5` sigmoid variant into params[0].
// The per-lane fields are 8-element arrays filled with one repeated value;
// mask_table gets 14 entries.
void xnn_init_f32_sigmoid_avx_rr2_p5_params(union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 8; ++lane) {
    params[0].avx_rr2_p5.one[lane] = 1.0f;
    params[0].avx_rr2_p5.two[lane] = 2.0f;
    params[0].avx_rr2_p5.sign_mask[lane] = -0.0f;
    params[0].avx_rr2_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params[0].avx_rr2_p5.log2e[lane] = 0x1.715476p0f;
    params[0].avx_rr2_p5.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params[0].avx_rr2_p5.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    // Coefficients, listed from c1 up to c5.
    params[0].avx_rr2_p5.c1[lane] = 0x1.FFFFF6p-1f;
    params[0].avx_rr2_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params[0].avx_rr2_p5.c3[lane] = 0x1.555A80p-3f;
    params[0].avx_rr2_p5.c4[lane] = 0x1.573A1Ap-5f;
    params[0].avx_rr2_p5.c5[lane] = 0x1.0F9F9Cp-7f;
    params[0].avx_rr2_p5.denorm_cutoff[lane] = -0x1.5D589Ep+6f;
  }

  // mask_table: entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k != 14; ++k) {
    params[0].avx_rr2_p5.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2132 
// Writes the constants of the `avx2_rr1_p5` sigmoid variant into params[0].
// The per-lane fields are 8-element arrays filled with one repeated value;
// mask_table gets 14 entries.
void xnn_init_f32_sigmoid_avx2_rr1_p5_params(union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 8; ++lane) {
    params[0].avx2_rr1_p5.one[lane] = 1.0f;
    params[0].avx2_rr1_p5.sign_mask[lane] = -0.0f;
    params[0].avx2_rr1_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params[0].avx2_rr1_p5.log2e[lane] = 0x1.715476p0f;
    params[0].avx2_rr1_p5.minus_ln2[lane] = -0x1.62E430p-1f;
    // Coefficients, listed from c1 up to c5.
    params[0].avx2_rr1_p5.c1[lane] = 0x1.FFFFF6p-1f;
    params[0].avx2_rr1_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params[0].avx2_rr1_p5.c3[lane] = 0x1.555A80p-3f;
    params[0].avx2_rr1_p5.c4[lane] = 0x1.573A1Ap-5f;
    params[0].avx2_rr1_p5.c5[lane] = 0x1.0F9F9Cp-7f;
    params[0].avx2_rr1_p5.denorm_cutoff[lane] = -0x1.5D589Ep+6f;
  }

  // mask_table: entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k != 7; ++k) {
    params[0].avx2_rr1_p5.mask_table[k] = -1;
    params[0].avx2_rr1_p5.mask_table[k + 7] = 0;
  }
}
2156 
// Writes the constants of the `avx512_rr1_lut16_p3` sigmoid variant into
// params[0]: seven scalar fields plus a 16-entry lookup table.
void xnn_init_f32_sigmoid_avx512_rr1_lut16_p3_params(union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // Table contents in index order.
  // NOTE(review): values look like 2^(k/16) rounded to float - confirm.
  static const float table_values[16] = {
    0x1.000000p+0f, 0x1.0B5586p+0f, 0x1.172B84p+0f, 0x1.2387A6p+0f,
    0x1.306FE0p+0f, 0x1.3DEA64p+0f, 0x1.4BFDAEp+0f, 0x1.5AB07Ep+0f,
    0x1.6A09E6p+0f, 0x1.7A1148p+0f, 0x1.8ACE54p+0f, 0x1.9C4918p+0f,
    0x1.AE89FAp+0f, 0x1.C199BEp+0f, 0x1.D5818Ep+0f, 0x1.EA4AFAp+0f,
  };

  params[0].avx512_rr1_lut16_p3.one = 1.0f;
  params[0].avx512_rr1_lut16_p3.sign_mask = UINT32_C(0x80000000);
  params[0].avx512_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
  params[0].avx512_rr1_lut16_p3.log2e = 0x1.715476p0f;
  params[0].avx512_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
  params[0].avx512_rr1_lut16_p3.c2 = 0x1.00021Ep-1f;
  params[0].avx512_rr1_lut16_p3.c3 = 0x1.55559Ap-3f;

  for (uint32_t k = 0; k != 16; ++k) {
    params[0].avx512_rr1_lut16_p3.table[k] = table_values[k];
  }
}
2184 
// Writes the constants of the `avx512_rr2_lut32_p2` sigmoid variant into
// params[0]: eight scalar fields plus two 16-entry tables (table_lo, table_hi).
void xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params(union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  // Entries 0..15 go to table_lo, entries 16..31 go to table_hi.
  // NOTE(review): values look like 2^(k/32) rounded to float - confirm.
  static const float table_values[32] = {
    0x1.000000p+0f, 0x1.059B0Ep+0f, 0x1.0B5586p+0f, 0x1.11301Ep+0f,
    0x1.172B84p+0f, 0x1.1D4874p+0f, 0x1.2387A6p+0f, 0x1.29E9E0p+0f,
    0x1.306FE0p+0f, 0x1.371A74p+0f, 0x1.3DEA64p+0f, 0x1.44E086p+0f,
    0x1.4BFDAEp+0f, 0x1.5342B6p+0f, 0x1.5AB07Ep+0f, 0x1.6247ECp+0f,
    0x1.6A09E6p+0f, 0x1.71F75Ep+0f, 0x1.7A1148p+0f, 0x1.82589Ap+0f,
    0x1.8ACE54p+0f, 0x1.93737Cp+0f, 0x1.9C4918p+0f, 0x1.A5503Cp+0f,
    0x1.AE89FAp+0f, 0x1.B7F770p+0f, 0x1.C199BEp+0f, 0x1.CB720Ep+0f,
    0x1.D5818Ep+0f, 0x1.DFC974p+0f, 0x1.EA4AFAp+0f, 0x1.F50766p+0f,
  };

  params[0].avx512_rr2_lut32_p2.one = 1.0f;
  params[0].avx512_rr2_lut32_p2.sign_mask = UINT32_C(0x80000000);
  params[0].avx512_rr2_lut32_p2.magic_bias = 0x1.800000p18f;
  params[0].avx512_rr2_lut32_p2.log2e = 0x1.715476p0f;
  params[0].avx512_rr2_lut32_p2.minus_ln2_hi = -0x1.62E430p-1f;
  params[0].avx512_rr2_lut32_p2.minus_ln2_lo = 0x1.05C61p-29f;
  params[0].avx512_rr2_lut32_p2.c1 = 0x1.0000F6p-0f;
  params[0].avx512_rr2_lut32_p2.c2 = 0x1.000000p-1f;

  for (uint32_t k = 0; k != 16; ++k) {
    params[0].avx512_rr2_lut32_p2.table_lo[k] = table_values[k];
    params[0].avx512_rr2_lut32_p2.table_hi[k] = table_values[k + 16];
  }
}
2231 
// Writes the constants of the `avx512_rr1_p5` sigmoid variant into params[0].
void xnn_init_f32_sigmoid_avx512_rr1_p5_params(union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].avx512_rr1_p5.one = 1.0f;
  params[0].avx512_rr1_p5.sign_mask = UINT32_C(0x80000000);
  params[0].avx512_rr1_p5.log2e = 0x1.715476p0f;
  params[0].avx512_rr1_p5.minus_ln2 = -0x1.62E430p-1f;
  // Coefficients, listed from c1 up to c5.
  params[0].avx512_rr1_p5.c1 = 0x1.FFFFF6p-1f;
  params[0].avx512_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params[0].avx512_rr1_p5.c3 = 0x1.555A80p-3f;
  params[0].avx512_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params[0].avx512_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
}
2245 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2246 
2247 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes the constants of the `wasmsimd_rr2_lut64_p2` sigmoid variant into
// params[0]. Only elements 0 and 1 of each field are written.
void xnn_init_f32_sigmoid_wasmsimd_rr2_lut64_p2_params(union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 2; ++lane) {
    params[0].wasmsimd_rr2_lut64_p2.one[lane] = 1.0f;
    params[0].wasmsimd_rr2_lut64_p2.index_mask[lane] = UINT32_C(0x3F);
    params[0].wasmsimd_rr2_lut64_p2.magic_bias[lane] = 0x1.800000p17f;
    params[0].wasmsimd_rr2_lut64_p2.minus_log2e[lane] = -0x1.715476p0f;
    params[0].wasmsimd_rr2_lut64_p2.ln2_hi[lane] = 0x1.630000p-1f;
    params[0].wasmsimd_rr2_lut64_p2.ln2_lo[lane] = -0x1.BD0106p-13f;
    params[0].wasmsimd_rr2_lut64_p2.c2[lane] = 0x1.FFFF0Ap-2f;
    params[0].wasmsimd_rr2_lut64_p2.denorm_cutoff[lane] = 0x1.5D589Ep+6f;
  }
}
2262 
// Writes the constants of the `wasmsimd_rr2_p5` sigmoid variant into params[0].
// Only elements 0 and 1 of each field are written.
void xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params(union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 2; ++lane) {
    params[0].wasmsimd_rr2_p5.one[lane] = 1.0f;
    params[0].wasmsimd_rr2_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params[0].wasmsimd_rr2_p5.minus_log2e[lane] = -0x1.715476p+0f;
    params[0].wasmsimd_rr2_p5.ln2_hi[lane] = 0x1.62E400p-1f;
    params[0].wasmsimd_rr2_p5.ln2_lo[lane] = 0x1.7F7D1Cp-20f;
    // Coefficients, listed from c1 up to c5.
    params[0].wasmsimd_rr2_p5.c1[lane] = -0x1.FFFFF6p-1f;
    params[0].wasmsimd_rr2_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params[0].wasmsimd_rr2_p5.c3[lane] = -0x1.555A80p-3f;
    params[0].wasmsimd_rr2_p5.c4[lane] = 0x1.573A1Ap-5f;
    params[0].wasmsimd_rr2_p5.c5[lane] = -0x1.0F9F9Cp-7f;
    params[0].wasmsimd_rr2_p5.denorm_cutoff[lane] = 0x1.5D589Ep+6f;
  }
}
2280 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2281 
2282 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the 4 elements of params[0].sse.nonsign_mask with the value returned
// by math_nonsign_mask_f32().
void xnn_init_f32_abs_sse_params(union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 4; ++lane) {
    params[0].sse.nonsign_mask[lane] = math_nonsign_mask_f32();
  }
}
2290 
// Fills the 8 elements of params[0].avx.nonsign_mask with the value returned
// by math_nonsign_mask_f32(), then writes the 14-entry mask_table.
void xnn_init_f32_abs_avx_params(union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 8; ++lane) {
    params[0].avx.nonsign_mask[lane] = math_nonsign_mask_f32();
  }

  // mask_table: entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k != 14; ++k) {
    params[0].avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2304 
// Sets params[0].avx512.nonsign_mask to 0x7FFFFFFF (all bits set except bit 31).
void xnn_init_f32_abs_avx512_params(union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].avx512.nonsign_mask = UINT32_C(0x7FFFFFFF);
}
2310 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2311 
2312 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes the value returned by math_nonsign_mask_f32() into elements 0 and 1
// of params[0].wasmsimd.nonsign_mask.
void xnn_init_f32_abs_wasmsimd_params(union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 2; ++lane) {
    params[0].wasmsimd.nonsign_mask[lane] = math_nonsign_mask_f32();
  }
}
2319 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2320 
2321 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the 4 elements of params[0].sse.sign_mask with -0.0f.
void xnn_init_f32_neg_sse_params(union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 4; ++lane) {
    params[0].sse.sign_mask[lane] = -0.0f;
  }
}
2329 
// Fills the 8 elements of params[0].avx.sign_mask with -0.0f, then writes the
// 14-entry mask_table.
void xnn_init_f32_neg_avx_params(union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 8; ++lane) {
    params[0].avx.sign_mask[lane] = -0.0f;
  }

  // mask_table: entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k != 7; ++k) {
    params[0].avx.mask_table[k] = -1;
    params[0].avx.mask_table[k + 7] = 0;
  }
}
2343 
// Sets params[0].avx512.sign_mask to 0x80000000 (only bit 31 set).
void xnn_init_f32_neg_avx512_params(union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].avx512.sign_mask = UINT32_C(0x80000000);
}
2349 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2350 
2351 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes -0.0f into elements 0 and 1 of params[0].wasmsimd.sign_mask.
void xnn_init_f32_neg_wasmsimd_params(union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 2; ++lane) {
    params[0].wasmsimd.sign_mask[lane] = -0.0f;
  }
}
2358 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2359 
2360 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the 4-element `sign_mask` and `one` arrays of params[0].sse2 with
// -0.0f and 1.0f respectively.
void xnn_init_f32_rnd_sse2_params(union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 4; ++lane) {
    params[0].sse2.one[lane] = 1.0f;
    params[0].sse2.sign_mask[lane] = -0.0f;
  }
}
2369 
// Writes the 14-entry params[0].avx.mask_table: entries 0..6 are -1,
// entries 7..13 are 0.
void xnn_init_f32_rnd_avx_params(union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k != 14; ++k) {
    params[0].avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2380 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2381 
2382 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes elements 0 and 1 of the `sign_mask`, `magic_bias` and `one` arrays
// in params[0].wasmsimd.
void xnn_init_f32_rnd_wasmsimd_params(union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane != 2; ++lane) {
    params[0].wasmsimd.sign_mask[lane] = -0.0f;
    params[0].wasmsimd.magic_bias[lane] = 0x1.000000p+23f;  // 2^23
    params[0].wasmsimd.one[lane] = 1.0f;
  }
}
2393 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2394 
// Writes the `scalar_rr2_lut16_p3` ELU variant into params[0].
// prescale, alpha and beta are stored unchanged; every other field gets a
// fixed literal.
void xnn_init_f32_elu_scalar_rr2_lut16_p3_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  // Fixed constants.
  params[0].scalar_rr2_lut16_p3.one = 1.0f;
  params[0].scalar_rr2_lut16_p3.c2 = 0x1.0001ECp-1f;
  params[0].scalar_rr2_lut16_p3.c3 = 0x1.55561Cp-3f;
  params[0].scalar_rr2_lut16_p3.minus_ln2_lo = -0x1.7F7D1Cp-20f;
  params[0].scalar_rr2_lut16_p3.minus_ln2_hi = -0x1.62E400p-1f;
  params[0].scalar_rr2_lut16_p3.log2e = 0x1.715476p+0f;
  params[0].scalar_rr2_lut16_p3.magic_bias = 0x1.800000p19f;
  params[0].scalar_rr2_lut16_p3.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params[0].scalar_rr2_lut16_p3.beta = beta;
  params[0].scalar_rr2_lut16_p3.alpha = alpha;
  params[0].scalar_rr2_lut16_p3.prescale = prescale;
}
2413 
// Writes the `scalar_rr2_p6` ELU variant into params[0].
// prescale, alpha and beta are stored unchanged; every other field gets a
// fixed literal.
void xnn_init_f32_elu_scalar_rr2_p6_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  // Fixed constants; coefficients listed from c2 up to c6.
  params[0].scalar_rr2_p6.one = 1.0f;
  params[0].scalar_rr2_p6.c2 = 0x1.FFFFFEp-2f;
  params[0].scalar_rr2_p6.c3 = 0x1.5554B0p-3f;
  params[0].scalar_rr2_p6.c4 = 0x1.555716p-5f;
  params[0].scalar_rr2_p6.c5 = 0x1.12278Ep-7f;
  params[0].scalar_rr2_p6.c6 = 0x1.6b7338p-10f;
  params[0].scalar_rr2_p6.minus_ln2_lo = 0x1.0105C6p-21f;
  params[0].scalar_rr2_p6.minus_ln2_hi = -0x1.62E440p-1f;
  params[0].scalar_rr2_p6.log2e = 0x1.715476p+0f;
  params[0].scalar_rr2_p6.magic_bias = 0x1.8000FEp23f;
  params[0].scalar_rr2_p6.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params[0].scalar_rr2_p6.beta = beta;
  params[0].scalar_rr2_p6.alpha = alpha;
  params[0].scalar_rr2_p6.prescale = prescale;
}
2435 
2436 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Writes the `neon_rr2_lut16_p3` ELU variant into params[0].
// prescale, alpha and beta are stored unchanged; every other field gets a
// fixed literal.
void xnn_init_f32_elu_neon_rr2_lut16_p3_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  // Fixed constants.
  params[0].neon_rr2_lut16_p3.c2 = 0x1.0001ECp-1f;
  params[0].neon_rr2_lut16_p3.c3 = 0x1.55561Cp-3f;
  params[0].neon_rr2_lut16_p3.minus_ln2_lo = -0x1.7F7D1Cp-20f;
  params[0].neon_rr2_lut16_p3.minus_ln2_hi = -0x1.62E400p-1f;
  params[0].neon_rr2_lut16_p3.log2e = 0x1.715476p+0f;
  params[0].neon_rr2_lut16_p3.magic_bias = 0x1.800000p19f;
  params[0].neon_rr2_lut16_p3.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params[0].neon_rr2_lut16_p3.beta = beta;
  params[0].neon_rr2_lut16_p3.alpha = alpha;
  params[0].neon_rr2_lut16_p3.prescale = prescale;
}
2454 
// Writes the `neon_rr2_p6` ELU variant into params[0].
// prescale, alpha and beta are stored unchanged; every other field gets a
// fixed literal.
void xnn_init_f32_elu_neon_rr2_p6_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  // Fixed constants; coefficients listed from c2 up to c6.
  params[0].neon_rr2_p6.c2 = 0x1.FFFFFEp-2f;
  params[0].neon_rr2_p6.c3 = 0x1.5554B0p-3f;
  params[0].neon_rr2_p6.c4 = 0x1.555716p-5f;
  params[0].neon_rr2_p6.c5 = 0x1.12278Ep-7f;
  params[0].neon_rr2_p6.c6 = 0x1.6b7338p-10f;
  params[0].neon_rr2_p6.minus_ln2_lo = 0x1.0105C6p-21f;
  params[0].neon_rr2_p6.minus_ln2_hi = -0x1.62E440p-1f;
  params[0].neon_rr2_p6.log2e = 0x1.715476p+0f;
  params[0].neon_rr2_p6.magic_bias = 0x1.8000FEp23f;
  params[0].neon_rr2_p6.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params[0].neon_rr2_p6.beta = beta;
  params[0].neon_rr2_p6.alpha = alpha;
  params[0].neon_rr2_p6.prescale = prescale;
}
2475 
// Writes the `neonfma_rr1_lut16_p3` ELU variant into params[0].
// prescale, alpha and beta are stored unchanged; every other field gets a
// fixed literal.
void xnn_init_f32_elu_neonfma_rr1_lut16_p3_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  // Fixed constants.
  params[0].neonfma_rr1_lut16_p3.c2 = 0x1.0001ECp-1f;
  params[0].neonfma_rr1_lut16_p3.c3 = 0x1.55561Cp-3f;
  params[0].neonfma_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
  params[0].neonfma_rr1_lut16_p3.log2e = 0x1.715476p+0f;
  params[0].neonfma_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
  params[0].neonfma_rr1_lut16_p3.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params[0].neonfma_rr1_lut16_p3.beta = beta;
  params[0].neonfma_rr1_lut16_p3.alpha = alpha;
  params[0].neonfma_rr1_lut16_p3.prescale = prescale;
}
2492 
// Writes the `neonfma_rr1_p6` ELU variant into params[0].
// prescale, alpha and beta are stored unchanged; every other field gets a
// fixed literal.
void xnn_init_f32_elu_neonfma_rr1_p6_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  // Fixed constants; coefficients listed from c2 up to c6.
  params[0].neonfma_rr1_p6.c2 = 0x1.FFFFFEp-2f;
  params[0].neonfma_rr1_p6.c3 = 0x1.5554B0p-3f;
  params[0].neonfma_rr1_p6.c4 = 0x1.555716p-5f;
  params[0].neonfma_rr1_p6.c5 = 0x1.12278Ep-7f;
  params[0].neonfma_rr1_p6.c6 = 0x1.6b7338p-10f;
  params[0].neonfma_rr1_p6.minus_ln2 = -0x1.62E430p-1f;
  params[0].neonfma_rr1_p6.log2e = 0x1.715476p+0f;
  params[0].neonfma_rr1_p6.magic_bias = 0x1.8000FEp23f;
  params[0].neonfma_rr1_p6.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params[0].neonfma_rr1_p6.beta = beta;
  params[0].neonfma_rr1_p6.alpha = alpha;
  params[0].neonfma_rr1_p6.prescale = prescale;
}
2512 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
2513 
2514 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Writes the `sse2_rr2_lut16_p3` ELU variant into params[0].
// Each field is a 4-element array; all 4 elements receive the same value.
// prescale, alpha and beta are stored unchanged.
void xnn_init_f32_elu_sse2_rr2_lut16_p3_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  for (uint32_t lane = 0; lane != 4; ++lane) {
    // Caller-supplied values.
    params[0].sse2_rr2_lut16_p3.beta[lane] = beta;
    params[0].sse2_rr2_lut16_p3.alpha[lane] = alpha;
    params[0].sse2_rr2_lut16_p3.prescale[lane] = prescale;
    // Fixed constants.
    params[0].sse2_rr2_lut16_p3.one[lane] = 1.0f;
    params[0].sse2_rr2_lut16_p3.index_mask[lane] = UINT32_C(0xF);
    params[0].sse2_rr2_lut16_p3.sat_cutoff[lane] = -0x1.154246p+4f;
    params[0].sse2_rr2_lut16_p3.magic_bias[lane] = 0x1.800000p19f;
    params[0].sse2_rr2_lut16_p3.log2e[lane] = 0x1.715476p+0f;
    params[0].sse2_rr2_lut16_p3.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params[0].sse2_rr2_lut16_p3.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    params[0].sse2_rr2_lut16_p3.c2[lane] = 0x1.0001ECp-1f;
    params[0].sse2_rr2_lut16_p3.c3[lane] = 0x1.55561Cp-3f;
  }
}
2536 
// Writes the `sse2_rr2_p6` ELU variant into params[0].
// Each field is a 4-element array; all 4 elements receive the same value.
// prescale, alpha and beta are stored unchanged.
void xnn_init_f32_elu_sse2_rr2_p6_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  for (uint32_t lane = 0; lane != 4; ++lane) {
    // Caller-supplied values.
    params[0].sse2_rr2_p6.beta[lane] = beta;
    params[0].sse2_rr2_p6.alpha[lane] = alpha;
    params[0].sse2_rr2_p6.prescale[lane] = prescale;
    // Fixed constants; coefficients listed from c2 up to c6.
    params[0].sse2_rr2_p6.one[lane] = 1.0f;
    params[0].sse2_rr2_p6.sat_cutoff[lane] = -0x1.154246p+4f;
    params[0].sse2_rr2_p6.magic_bias[lane] = 0x1.8000FEp23f;
    params[0].sse2_rr2_p6.log2e[lane] = 0x1.715476p+0f;
    params[0].sse2_rr2_p6.minus_ln2_hi[lane] = -0x1.62E440p-1f;
    params[0].sse2_rr2_p6.minus_ln2_lo[lane] = 0x1.0105C6p-21f;
    params[0].sse2_rr2_p6.c2[lane] = 0x1.FFFFFEp-2f;
    params[0].sse2_rr2_p6.c3[lane] = 0x1.5554B0p-3f;
    params[0].sse2_rr2_p6.c4[lane] = 0x1.555716p-5f;
    params[0].sse2_rr2_p6.c5[lane] = 0x1.12278Ep-7f;
    params[0].sse2_rr2_p6.c6[lane] = 0x1.6b7338p-10f;
  }
}
2560 
// Writes the `avx_rr2_lut16_p3` ELU variant into params[0].
// The per-lane fields are 8-element arrays filled with one repeated value;
// mask_table gets 14 entries. prescale, alpha and beta are stored unchanged.
void xnn_init_f32_elu_avx_rr2_lut16_p3_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  for (uint32_t lane = 0; lane != 8; ++lane) {
    // Caller-supplied values.
    params[0].avx_rr2_lut16_p3.beta[lane] = beta;
    params[0].avx_rr2_lut16_p3.alpha[lane] = alpha;
    params[0].avx_rr2_lut16_p3.prescale[lane] = prescale;
    // Fixed constants.
    params[0].avx_rr2_lut16_p3.one[lane] = 1.0f;
    params[0].avx_rr2_lut16_p3.index_mask[lane] = UINT32_C(0xF);
    params[0].avx_rr2_lut16_p3.sat_cutoff[lane] = -0x1.154246p+4f;
    params[0].avx_rr2_lut16_p3.magic_bias[lane] = 0x1.800000p19f;
    params[0].avx_rr2_lut16_p3.log2e[lane] = 0x1.715476p+0f;
    params[0].avx_rr2_lut16_p3.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params[0].avx_rr2_lut16_p3.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    params[0].avx_rr2_lut16_p3.c2[lane] = 0x1.0001ECp-1f;
    params[0].avx_rr2_lut16_p3.c3[lane] = 0x1.55561Cp-3f;
  }

  // mask_table: entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k != 14; ++k) {
    params[0].avx_rr2_lut16_p3.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2588 
// Writes the `avx_rr2_lut4_p4` ELU variant into params[0].
// The per-lane fields are 8-element arrays filled with one repeated value;
// `table` holds a 4-value sequence written twice; mask_table gets 14 entries.
// prescale, alpha and beta are stored unchanged.
void xnn_init_f32_elu_avx_rr2_lut4_p4_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  // The 4 distinct table values; table[k] and table[k + 4] are equal.
  static const float table_values[4] = {
    0x1.000000p+0f, 0x1.306FE0p+0f, 0x1.6A09E6p+0f, 0x1.AE89FAp+0f,
  };

  for (uint32_t lane = 0; lane != 8; ++lane) {
    // Caller-supplied values.
    params[0].avx_rr2_lut4_p4.beta[lane] = beta;
    params[0].avx_rr2_lut4_p4.alpha[lane] = alpha;
    params[0].avx_rr2_lut4_p4.prescale[lane] = prescale;
    // Fixed constants.
    params[0].avx_rr2_lut4_p4.one[lane] = 1.0f;
    params[0].avx_rr2_lut4_p4.index_mask[lane] = UINT32_C(0x3);
    params[0].avx_rr2_lut4_p4.sat_cutoff[lane] = -0x1.154246p+4f;
    params[0].avx_rr2_lut4_p4.magic_bias[lane] = 0x1.8003F8p21f;
    params[0].avx_rr2_lut4_p4.log2e[lane] = 0x1.715476p+0f;
    params[0].avx_rr2_lut4_p4.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params[0].avx_rr2_lut4_p4.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    params[0].avx_rr2_lut4_p4.c2[lane] = 0x1.000002p-1f;
    params[0].avx_rr2_lut4_p4.c3[lane] = 0x1.557082p-3f;
    params[0].avx_rr2_lut4_p4.c4[lane] = 0x1.554F9Ap-5f;
    // Table entry for this position: the 4 values repeat with period 4.
    params[0].avx_rr2_lut4_p4.table[lane] = table_values[lane & 3];
  }

  // mask_table: entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k != 14; ++k) {
    params[0].avx_rr2_lut4_p4.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2627 
// Writes the `avx_rr2_p6` ELU variant into params[0].
// The per-lane fields are 8-element arrays filled with one repeated value;
// mask_table gets 14 entries. prescale, alpha and beta are stored unchanged.
void xnn_init_f32_elu_avx_rr2_p6_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  for (uint32_t lane = 0; lane != 8; ++lane) {
    // Caller-supplied values.
    params[0].avx_rr2_p6.beta[lane] = beta;
    params[0].avx_rr2_p6.alpha[lane] = alpha;
    params[0].avx_rr2_p6.prescale[lane] = prescale;
    // Fixed constants; coefficients listed from c2 up to c6.
    params[0].avx_rr2_p6.one[lane] = 1.0f;
    params[0].avx_rr2_p6.sat_cutoff[lane] = -0x1.154246p+4f;
    params[0].avx_rr2_p6.magic_bias[lane] = 0x1.8000FEp23f;
    params[0].avx_rr2_p6.log2e[lane] = 0x1.715476p+0f;
    params[0].avx_rr2_p6.minus_ln2_hi[lane] = -0x1.62E440p-1f;
    params[0].avx_rr2_p6.minus_ln2_lo[lane] = 0x1.0105C6p-21f;
    params[0].avx_rr2_p6.c2[lane] = 0x1.FFFFFEp-2f;
    params[0].avx_rr2_p6.c3[lane] = 0x1.5554B0p-3f;
    params[0].avx_rr2_p6.c4[lane] = 0x1.555716p-5f;
    params[0].avx_rr2_p6.c5[lane] = 0x1.12278Ep-7f;
    params[0].avx_rr2_p6.c6[lane] = 0x1.6b7338p-10f;
  }

  // mask_table: entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k != 7; ++k) {
    params[0].avx_rr2_p6.mask_table[k] = -1;
    params[0].avx_rr2_p6.mask_table[k + 7] = 0;
  }
}
2657 
// Writes the `avx2_rr1_lut16_p3` ELU variant into params[0].
// The per-lane fields are 8-element arrays filled with one repeated value;
// mask_table gets 14 entries. prescale, alpha and beta are stored unchanged.
void xnn_init_f32_elu_avx2_rr1_lut16_p3_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  for (uint32_t lane = 0; lane != 8; ++lane) {
    // Caller-supplied values.
    params[0].avx2_rr1_lut16_p3.beta[lane] = beta;
    params[0].avx2_rr1_lut16_p3.alpha[lane] = alpha;
    params[0].avx2_rr1_lut16_p3.prescale[lane] = prescale;
    // Fixed constants.
    params[0].avx2_rr1_lut16_p3.index_mask[lane] = UINT32_C(0xF);
    params[0].avx2_rr1_lut16_p3.sat_cutoff[lane] = -0x1.154246p+4f;
    params[0].avx2_rr1_lut16_p3.magic_bias[lane] = 0x1.800000p19f;
    params[0].avx2_rr1_lut16_p3.log2e[lane] = 0x1.715476p+0f;
    params[0].avx2_rr1_lut16_p3.minus_ln2[lane] = -0x1.62E430p-1f;
    params[0].avx2_rr1_lut16_p3.c2[lane] = 0x1.0001ECp-1f;
    params[0].avx2_rr1_lut16_p3.c3[lane] = 0x1.55561Cp-3f;
  }

  // mask_table: entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k != 14; ++k) {
    params[0].avx2_rr1_lut16_p3.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2683 
// Writes the `avx2_rr1_lut8_p4` ELU variant into params[0].
// The per-lane fields are 8-element arrays filled with one repeated value;
// `table` receives 8 distinct 32-bit patterns; mask_table gets 14 entries.
// prescale, alpha and beta are stored unchanged.
void xnn_init_f32_elu_avx2_rr1_lut8_p4_params(
    union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
    float prescale, float alpha, float beta)
{
  // Table contents in index order, given as raw 32-bit integers.
  static const uint32_t table_bits[8] = {
    UINT32_C(0x3F800000), UINT32_C(0x3F7B95C2), UINT32_C(0x3F7837F0), UINT32_C(0x3F75FED7),
    UINT32_C(0x3F7504F3), UINT32_C(0x3F75672A), UINT32_C(0x3F7744FD), UINT32_C(0x3F7AC0C7),
  };

  for (uint32_t lane = 0; lane != 8; ++lane) {
    // Caller-supplied values.
    params[0].avx2_rr1_lut8_p4.beta[lane] = beta;
    params[0].avx2_rr1_lut8_p4.alpha[lane] = alpha;
    params[0].avx2_rr1_lut8_p4.prescale[lane] = prescale;
    // Fixed constants.
    params[0].avx2_rr1_lut8_p4.sat_cutoff[lane] = -0x1.154246p+4f;
    params[0].avx2_rr1_lut8_p4.magic_bias[lane] = 0x1.800000p20f;
    params[0].avx2_rr1_lut8_p4.log2e[lane] = 0x1.715476p+0f;
    params[0].avx2_rr1_lut8_p4.minus_ln2[lane] = -0x1.62E430p-1f;
    params[0].avx2_rr1_lut8_p4.c2[lane] = 0x1.000000p-1f;
    params[0].avx2_rr1_lut8_p4.c3[lane] = 0x1.555C20p-3f;
    params[0].avx2_rr1_lut8_p4.c4[lane] = 0x1.5558ECp-5f;
    // One table entry per position.
    params[0].avx2_rr1_lut8_p4.table[lane] = table_bits[lane];
  }

  // mask_table: entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k != 14; ++k) {
    params[0].avx2_rr1_lut8_p4.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2719 
// Fills the avx2_rr1_lut4_p4 member of the ELU parameter union.
//
// prescale, alpha and beta are copied into every element of their 8-element
// arrays together with the fixed constants. The 8-entry table holds the same
// four float values twice (entries 4..7 repeat entries 0..3), and mask_table
// receives seven -1 entries followed by seven zero entries.
void xnn_init_f32_elu_avx2_rr1_lut4_p4_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  static const float table_values[4] = {
    0x1.000000p+0f, 0x1.F06FE0p-1f, 0x1.EA09E6p-1f, 0x1.EE89FAp-1f,
  };

  for (uint32_t k = 0; k < 8; k++) {
    // Caller-supplied values.
    params->avx2_rr1_lut4_p4.prescale[k] = prescale;
    params->avx2_rr1_lut4_p4.alpha[k] = alpha;
    params->avx2_rr1_lut4_p4.beta[k] = beta;
    // Fixed constants.
    params->avx2_rr1_lut4_p4.sat_cutoff[k] = -0x1.154246p+4f;
    params->avx2_rr1_lut4_p4.magic_bias[k] = 0x1.800000p21f;
    params->avx2_rr1_lut4_p4.log2e[k] = 0x1.715476p+0f;
    // The 4-value table is stored twice back to back.
    params->avx2_rr1_lut4_p4.table[k] = table_values[k & 3];
    params->avx2_rr1_lut4_p4.minus_ln2[k] = -0x1.62E430p-1f;
    params->avx2_rr1_lut4_p4.c4[k] = 0x1.554F9Ap-5f;
    params->avx2_rr1_lut4_p4.c3[k] = 0x1.557082p-3f;
    params->avx2_rr1_lut4_p4.c2[k] = 0x1.000002p-1f;
  }

  // First 7 mask entries are -1, the remaining 7 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx2_rr1_lut4_p4.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2755 
// Fills the avx2_rr1_p6 member of the ELU parameter union.
//
// Every 8-element array gets the same value in each slot: the three
// caller-supplied scalars (prescale, alpha, beta) and the fixed constants
// (saturation cutoff, magic bias, log2(e), -ln(2), coefficients c2..c6).
// mask_table receives seven -1 entries followed by seven zero entries.
void xnn_init_f32_elu_avx2_rr1_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 8; k++) {
    // Caller-supplied values.
    params->avx2_rr1_p6.prescale[k] = prescale;
    params->avx2_rr1_p6.alpha[k] = alpha;
    params->avx2_rr1_p6.beta[k] = beta;
    // Range-reduction constants.
    params->avx2_rr1_p6.sat_cutoff[k] = -0x1.154246p+4f;
    params->avx2_rr1_p6.magic_bias[k] = 0x1.8000FEp23f;
    params->avx2_rr1_p6.log2e[k] = 0x1.715476p+0f;
    params->avx2_rr1_p6.minus_ln2[k] = -0x1.62E430p-1f;
    // Polynomial coefficients, lowest degree first.
    params->avx2_rr1_p6.c2[k] = 0x1.FFFFFEp-2f;
    params->avx2_rr1_p6.c3[k] = 0x1.5554B0p-3f;
    params->avx2_rr1_p6.c4[k] = 0x1.555716p-5f;
    params->avx2_rr1_p6.c5[k] = 0x1.12278Ep-7f;
    params->avx2_rr1_p6.c6[k] = 0x1.6B7338p-10f;
  }

  // First 7 mask entries are -1, the remaining 7 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx2_rr1_p6.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
2783 
// Fills the avx512_rr1_lut16_p3 member of the ELU parameter union.
//
// Unlike the 8-wide variants, the scalar fields here are stored once each.
// The 16-entry table is written from 32-bit hex literals (0x3F800000 is the
// bit pattern of 1.0f).
void xnn_init_f32_elu_avx512_rr1_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  static const uint32_t table_bits[16] = {
    UINT32_C(0x3F800000), UINT32_C(0x3F7DAAC3),
    UINT32_C(0x3F7B95C2), UINT32_C(0x3F79C3D3),
    UINT32_C(0x3F7837F0), UINT32_C(0x3F76F532),
    UINT32_C(0x3F75FED7), UINT32_C(0x3F75583F),
    UINT32_C(0x3F7504F3), UINT32_C(0x3F7508A4),
    UINT32_C(0x3F75672A), UINT32_C(0x3F76248C),
    UINT32_C(0x3F7744FD), UINT32_C(0x3F78CCDF),
    UINT32_C(0x3F7AC0C7), UINT32_C(0x3F7D257D),
  };

  // Caller-supplied values.
  params->avx512_rr1_lut16_p3.prescale = prescale;
  params->avx512_rr1_lut16_p3.alpha = alpha;
  params->avx512_rr1_lut16_p3.beta = beta;

  // Fixed constants.
  params->avx512_rr1_lut16_p3.sat_cutoff = -0x1.154246p+4f;
  params->avx512_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
  params->avx512_rr1_lut16_p3.log2e = 0x1.715476p+0f;
  params->avx512_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
  params->avx512_rr1_lut16_p3.c3 = 0x1.55561Cp-3f;
  params->avx512_rr1_lut16_p3.c2 = 0x1.0001ECp-1f;

  for (uint32_t k = 0; k < 16; k++) {
    params->avx512_rr1_lut16_p3.table[k] = table_bits[k];
  }
}
2816 
// Fills the avx512_rr1_p6 member of the ELU parameter union.
//
// Each field is a single scalar: the three caller-supplied values followed by
// the fixed range-reduction constants and polynomial coefficients c2..c6.
void xnn_init_f32_elu_avx512_rr1_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Caller-supplied values.
  params->avx512_rr1_p6.prescale = prescale;
  params->avx512_rr1_p6.alpha = alpha;
  params->avx512_rr1_p6.beta = beta;

  // Range-reduction constants.
  params->avx512_rr1_p6.sat_cutoff = -0x1.154246p+4f;
  params->avx512_rr1_p6.magic_bias = 0x1.8000FEp23f;
  params->avx512_rr1_p6.log2e = 0x1.715476p+0f;
  params->avx512_rr1_p6.minus_ln2 = -0x1.62E430p-1f;

  // Polynomial coefficients, lowest degree first.
  params->avx512_rr1_p6.c2 = 0x1.FFFFFEp-2f;
  params->avx512_rr1_p6.c3 = 0x1.5554B0p-3f;
  params->avx512_rr1_p6.c4 = 0x1.555716p-5f;
  params->avx512_rr1_p6.c5 = 0x1.12278Ep-7f;
  params->avx512_rr1_p6.c6 = 0x1.6B7338p-10f;
}
2836 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2837 
2838 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd_rr2_lut16_p3 member of the ELU parameter union.
//
// Every field is a 2-element array; both elements receive the same value.
// -ln(2) is stored split into a high and a low part (minus_ln2_hi/lo).
void xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 2; k++) {
    // Caller-supplied values.
    params->wasmsimd_rr2_lut16_p3.prescale[k] = prescale;
    params->wasmsimd_rr2_lut16_p3.alpha[k] = alpha;
    params->wasmsimd_rr2_lut16_p3.beta[k] = beta;
    // Range-reduction constants.
    params->wasmsimd_rr2_lut16_p3.sat_cutoff[k] = -0x1.154246p+4f;
    params->wasmsimd_rr2_lut16_p3.magic_bias[k] = 0x1.800000p19f;
    params->wasmsimd_rr2_lut16_p3.log2e[k] = 0x1.715476p+0f;
    params->wasmsimd_rr2_lut16_p3.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->wasmsimd_rr2_lut16_p3.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
    // Low 4 bits select one of 16 table entries.
    params->wasmsimd_rr2_lut16_p3.index_mask[k] = UINT32_C(0xF);
    // Polynomial coefficients and the constant one.
    params->wasmsimd_rr2_lut16_p3.c2[k] = 0x1.0001ECp-1f;
    params->wasmsimd_rr2_lut16_p3.c3[k] = 0x1.55561Cp-3f;
    params->wasmsimd_rr2_lut16_p3.one[k] = 1.0f;
  }
}
2860 
// Fills the wasmsimd_rr2_p6 member of the ELU parameter union.
//
// Every field is a 2-element array; both elements receive the same value.
// -ln(2) is stored split into a high and a low part (minus_ln2_hi/lo).
void xnn_init_f32_elu_wasmsimd_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 2; k++) {
    // Caller-supplied values.
    params->wasmsimd_rr2_p6.prescale[k] = prescale;
    params->wasmsimd_rr2_p6.alpha[k] = alpha;
    params->wasmsimd_rr2_p6.beta[k] = beta;
    // Range-reduction constants.
    params->wasmsimd_rr2_p6.sat_cutoff[k] = -0x1.154246p+4f;
    params->wasmsimd_rr2_p6.magic_bias[k] = 0x1.8000FEp23f;
    params->wasmsimd_rr2_p6.log2e[k] = 0x1.715476p+0f;
    params->wasmsimd_rr2_p6.minus_ln2_hi[k] = -0x1.62E440p-1f;
    params->wasmsimd_rr2_p6.minus_ln2_lo[k] = 0x1.0105C6p-21f;
    // Polynomial coefficients, lowest degree first, and the constant one.
    params->wasmsimd_rr2_p6.c2[k] = 0x1.FFFFFEp-2f;
    params->wasmsimd_rr2_p6.c3[k] = 0x1.5554B0p-3f;
    params->wasmsimd_rr2_p6.c4[k] = 0x1.555716p-5f;
    params->wasmsimd_rr2_p6.c5[k] = 0x1.12278Ep-7f;
    params->wasmsimd_rr2_p6.c6[k] = 0x1.6B7338p-10f;
    params->wasmsimd_rr2_p6.one[k] = 1.0f;
  }
}
2884 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2885 
// Fills the scalar_rr2_p5 member of the expminus parameter union with fixed
// constants: log2(e), a magic bias, -ln(2) split into high/low parts, the
// polynomial coefficients c1..c5, and the denormal cutoff.
void xnn_init_f32_expminus_scalar_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Range-reduction constants.
  params->scalar_rr2_p5.log2e = 0x1.715476p+0f;
  params->scalar_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params->scalar_rr2_p5.minus_ln2_hi = -0x1.62E400p-1f;
  params->scalar_rr2_p5.minus_ln2_lo = -0x1.7F7D1Cp-20f;

  // Polynomial coefficients, lowest degree first.
  params->scalar_rr2_p5.c1 = 0x1.FFFFF6p-1f;
  params->scalar_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->scalar_rr2_p5.c3 = 0x1.555A80p-3f;
  params->scalar_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->scalar_rr2_p5.c5 = 0x1.0F9F9Cp-7f;

  params->scalar_rr2_p5.denorm_cutoff = -0x1.5D589Ep6f;
}
2900 
// Fills the scalar_rr2_lut64_p2 member of the expminus parameter union with
// fixed constants: log2(e), a magic bias, -ln(2) split into high/low parts,
// the coefficient c2, and the denormal cutoff.
void xnn_init_f32_expminus_scalar_rr2_lut64_p2_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Range-reduction constants.
  params->scalar_rr2_lut64_p2.log2e = 0x1.715476p0f;
  params->scalar_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  params->scalar_rr2_lut64_p2.minus_ln2_hi = -0x1.630000p-1f;
  params->scalar_rr2_lut64_p2.minus_ln2_lo = 0x1.BD0106p-13f;

  // Polynomial coefficient and output cutoff.
  params->scalar_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->scalar_rr2_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
}
2911 
2912 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the neon_rr2_p5 member of the expminus parameter union with fixed
// constants: log2(e), a magic bias, -ln(2) split into high/low parts, the
// polynomial coefficients c1..c5, and the denormal cutoff.
void xnn_init_f32_expminus_neon_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Range-reduction constants.
  params->neon_rr2_p5.log2e = 0x1.715476p+0f;
  params->neon_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params->neon_rr2_p5.minus_ln2_hi = -0x1.62E400p-1f;
  params->neon_rr2_p5.minus_ln2_lo = -0x1.7F7D1Cp-20f;

  // Polynomial coefficients, lowest degree first.
  params->neon_rr2_p5.c1 = 0x1.FFFFF6p-1f;
  params->neon_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->neon_rr2_p5.c3 = 0x1.555A80p-3f;
  params->neon_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->neon_rr2_p5.c5 = 0x1.0F9F9Cp-7f;

  params->neon_rr2_p5.denorm_cutoff = -0x1.5D589Ep6f;
}
2927 
// Fills the neon_rr2_lut64_p2 member of the expminus parameter union with
// fixed constants: log2(e), a magic bias, -ln(2) split into high/low parts,
// the coefficient c2, and the denormal cutoff.
void xnn_init_f32_expminus_neon_rr2_lut64_p2_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Range-reduction constants.
  params->neon_rr2_lut64_p2.log2e = 0x1.715476p+0f;
  params->neon_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  params->neon_rr2_lut64_p2.minus_ln2_hi = -0x1.62E400p-1f;
  params->neon_rr2_lut64_p2.minus_ln2_lo = -0x1.7F7D1Cp-20f;

  // Polynomial coefficient and output cutoff.
  params->neon_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->neon_rr2_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
}
2938 
// Fills the neonfma_rr1_p5 member of the expminus parameter union with fixed
// constants: log2(e), a magic bias, a single -ln(2) value, the polynomial
// coefficients c1..c5, and the denormal cutoff.
void xnn_init_f32_expminus_neonfma_rr1_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Range-reduction constants.
  params->neonfma_rr1_p5.log2e = 0x1.715476p+0f;
  params->neonfma_rr1_p5.magic_bias = 0x1.8000FEp23f;
  params->neonfma_rr1_p5.minus_ln2 = -0x1.62E430p-1f;

  // Polynomial coefficients, lowest degree first.
  params->neonfma_rr1_p5.c1 = 0x1.FFFFF6p-1f;
  params->neonfma_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params->neonfma_rr1_p5.c3 = 0x1.555A80p-3f;
  params->neonfma_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params->neonfma_rr1_p5.c5 = 0x1.0F9F9Cp-7f;

  params->neonfma_rr1_p5.denorm_cutoff = -0x1.5D589Ep6f;
}
2952 
// Fills the neonfma_rr1_lut64_p2 member of the expminus parameter union with
// fixed constants: log2(e), a magic bias, a single -ln(2) value, the
// coefficient c2, and the denormal cutoff.
void xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Range-reduction constants.
  params->neonfma_rr1_lut64_p2.log2e = 0x1.715476p+0f;
  params->neonfma_rr1_lut64_p2.magic_bias = 0x1.800000p17f;
  params->neonfma_rr1_lut64_p2.minus_ln2 = -0x1.62E430p-1f;

  // Polynomial coefficient and output cutoff.
  params->neonfma_rr1_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->neonfma_rr1_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
}
2962 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
2963 
2964 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse2_rr2_p5 member of the expminus parameter union.
//
// Every field is a 4-element array whose elements all receive the same fixed
// constant: log2(e), a magic bias, -ln(2) split into high/low parts, the
// polynomial coefficients c1..c5, and the denormal cutoff.
void xnn_init_f32_expminus_sse2_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 4; k++) {
    // Range-reduction constants.
    params->sse2_rr2_p5.log2e[k] = 0x1.715476p+0f;
    params->sse2_rr2_p5.magic_bias[k] = 0x1.8000FEp23f;
    params->sse2_rr2_p5.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->sse2_rr2_p5.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
    // Polynomial coefficients, lowest degree first.
    params->sse2_rr2_p5.c1[k] = 0x1.FFFFF6p-1f;
    params->sse2_rr2_p5.c2[k] = 0x1.FFFDC6p-2f;
    params->sse2_rr2_p5.c3[k] = 0x1.555A80p-3f;
    params->sse2_rr2_p5.c4[k] = 0x1.573A1Ap-5f;
    params->sse2_rr2_p5.c5[k] = 0x1.0F9F9Cp-7f;
    params->sse2_rr2_p5.denorm_cutoff[k] = -0x1.5D589Ep6f;
  }
}
2981 
// Fills the avx2_rr1_p5 member of the expminus parameter union.
//
// Every constant field is an 8-element array whose elements all receive the
// same value. mask_table receives seven -1 entries followed by seven zero
// entries.
void xnn_init_f32_expminus_avx2_rr1_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 8; k++) {
    // Range-reduction constants.
    params->avx2_rr1_p5.log2e[k] = 0x1.715476p+0f;
    params->avx2_rr1_p5.magic_bias[k] = 0x1.8000FEp23f;
    params->avx2_rr1_p5.minus_ln2[k] = -0x1.62E430p-1f;
    // Polynomial coefficients, lowest degree first.
    params->avx2_rr1_p5.c1[k] = 0x1.FFFFF6p-1f;
    params->avx2_rr1_p5.c2[k] = 0x1.FFFDC6p-2f;
    params->avx2_rr1_p5.c3[k] = 0x1.555A80p-3f;
    params->avx2_rr1_p5.c4[k] = 0x1.573A1Ap-5f;
    params->avx2_rr1_p5.c5[k] = 0x1.0F9F9Cp-7f;
    params->avx2_rr1_p5.denorm_cutoff[k] = -0x1.5D589Ep6f;
  }

  // First 7 mask entries are -1, the remaining 7 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx2_rr1_p5.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
3003 
// Fills the avx512_rr1_p5 member of the expminus parameter union with scalar
// constants: log2(e), -ln(2), and the polynomial coefficients c0..c5.
// This variant sets neither a magic bias nor a denormal cutoff.
void xnn_init_f32_expminus_avx512_rr1_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Range-reduction constants.
  params->avx512_rr1_p5.log2e = 0x1.715476p+0f;
  params->avx512_rr1_p5.minus_ln2 = -0x1.62E430p-1f;

  // Polynomial coefficients, lowest degree first.
  params->avx512_rr1_p5.c0 = 1.0f;
  params->avx512_rr1_p5.c1 = 0x1.FFFFF6p-1f;
  params->avx512_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params->avx512_rr1_p5.c3 = 0x1.555A80p-3f;
  params->avx512_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params->avx512_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
}
3016 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3017 
3018 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd_rr2_p5 member of the expminus parameter union.
//
// Every field is a 2-element array; both elements receive the same fixed
// constant: log2(e), a magic bias, -ln(2) split into high/low parts, the
// polynomial coefficients c1..c5, and the denormal cutoff.
void xnn_init_f32_expminus_wasmsimd_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 2; k++) {
    // Range-reduction constants.
    params->wasmsimd_rr2_p5.log2e[k] = 0x1.715476p+0f;
    params->wasmsimd_rr2_p5.magic_bias[k] = 0x1.8000FEp23f;
    params->wasmsimd_rr2_p5.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->wasmsimd_rr2_p5.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
    // Polynomial coefficients, lowest degree first.
    params->wasmsimd_rr2_p5.c1[k] = 0x1.FFFFF6p-1f;
    params->wasmsimd_rr2_p5.c2[k] = 0x1.FFFDC6p-2f;
    params->wasmsimd_rr2_p5.c3[k] = 0x1.555A80p-3f;
    params->wasmsimd_rr2_p5.c4[k] = 0x1.573A1Ap-5f;
    params->wasmsimd_rr2_p5.c5[k] = 0x1.0F9F9Cp-7f;
    params->wasmsimd_rr2_p5.denorm_cutoff[k] = -0x1.5D589Ep6f;
  }
}
3035 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3036 
// Stores the caller-supplied slope in the scalar member of the leaky-ReLU
// parameter union. No other field is written.
void xnn_init_f32_lrelu_scalar_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  params[0].scalar.slope = slope;
}
3043 
3044 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Copies the caller-supplied slope into each of the 4 elements of the sse
// member of the leaky-ReLU parameter union.
void xnn_init_f32_lrelu_sse_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  for (uint32_t k = 0; k < 4; k++) {
    params->sse.slope[k] = slope;
  }
}
3053 
// Fills the avx member of the leaky-ReLU parameter union: the slope is copied
// into each of 8 elements, and mask_table receives seven -1 entries followed
// by seven zero entries.
void xnn_init_f32_lrelu_avx_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  for (uint32_t k = 0; k < 8; k++) {
    params->avx.slope[k] = slope;
  }

  // First 7 mask entries are -1, the remaining 7 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
3068 #endif
3069 
3070 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Copies the caller-supplied slope into both elements of the wasmsimd member
// of the leaky-ReLU parameter union.
void xnn_init_f32_lrelu_wasmsimd_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  for (uint32_t k = 0; k < 2; k++) {
    params->wasmsimd.slope[k] = slope;
  }
}
3078 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3079 
3080 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the avx member of the sqrt parameter union: mask_table receives seven
// -1 entries followed by seven zero entries. No other field is written.
void xnn_init_f32_sqrt_avx_params(
  union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
3091 
// Fills the fma member of the sqrt parameter union: each of the 8 elements of
// `half` is set to 0.5f, and mask_table receives seven -1 entries followed by
// seven zero entries.
void xnn_init_f32_sqrt_fma_params(
  union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 8; k++) {
    params->fma.half[k] = 0.5f;
  }

  // First 7 mask entries are -1, the remaining 7 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->fma.mask_table[k] = (k < 7) ? -1 : 0;
  }
}
3105 
// Sets the single `half` field of the avx512 member of the sqrt parameter
// union to 0.5f.
void xnn_init_f32_sqrt_avx512_params(
  union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
{
  params[0].avx512.half = 0.5f;
}
3111 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3112 
// Initializes the architecture-selected member of the CHW parameter union
// (sse on x86, neon on ARM, scalar otherwise) with the output clamping bounds
// and three 4-element masks derived from `width`.
//
// With rem4 = (width - 1) mod 4 and rem8 = (width - 1) mod 8, element k of
//   mask      is all-ones when rem4 >= k,
//   mask_even is all-ones when rem8 >= 2*k,
//   mask_odd  is all-ones when rem8 >= 2*k + 1,
// and zero otherwise. mask[0] and mask_even[0] are therefore always all-ones.
void xnn_init_f32_chw_params(
  union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
  uint32_t width,
  float output_min,
  float output_max)
{
  const uint32_t rem4 = (width - 1) % 4;
  const uint32_t rem8 = (width - 1) % 8;

  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    // The sse variant keeps min/max as 4-element arrays.
    for (uint32_t k = 0; k < 4; k++) {
      params->sse.min[k] = output_min;
      params->sse.max[k] = output_max;
      params->sse.mask[k] = -(uint32_t) (rem4 >= k);
      params->sse.mask_even[k] = -(uint32_t) (rem8 >= 2 * k);
      params->sse.mask_odd[k] = -(uint32_t) (rem8 >= 2 * k + 1);
    }
  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
    params->neon.min = output_min;
    params->neon.max = output_max;
    for (uint32_t k = 0; k < 4; k++) {
      params->neon.mask[k] = -(uint32_t) (rem4 >= k);
      params->neon.mask_even[k] = -(uint32_t) (rem8 >= 2 * k);
      params->neon.mask_odd[k] = -(uint32_t) (rem8 >= 2 * k + 1);
    }
  #else
    params->scalar.min = output_min;
    params->scalar.max = output_max;
    for (uint32_t k = 0; k < 4; k++) {
      params->scalar.mask[k] = -(uint32_t) (rem4 >= k);
      params->scalar.mask_even[k] = -(uint32_t) (rem8 >= 2 * k);
      params->scalar.mask_odd[k] = -(uint32_t) (rem8 >= 2 * k + 1);
    }
  #endif
}
3180 
xnn_update_f32_chw_params(union xnn_f32_chw_params * params,uint32_t width)3181 void xnn_update_f32_chw_params(
3182   union xnn_f32_chw_params* params,
3183   uint32_t width)
3184 {
3185   #if XNN_ARCH_X86 || XNN_ARCH_X86_64
3186     const uint32_t w4 = (width - 1) & 3;
3187     params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
3188     params->sse.mask[1] = -(uint32_t) (w4 >= 1);
3189     params->sse.mask[2] = -(uint32_t) (w4 >= 2);
3190     params->sse.mask[3] = -(uint32_t) (w4 >= 3);
3191 
3192     const uint32_t w8 = (width - 1) & 7;
3193     params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
3194     params->sse.mask_even[1] = -(uint32_t) (w8 >= 2);
3195     params->sse.mask_even[2] = -(uint32_t) (w8 >= 4);
3196     params->sse.mask_even[3] = -(uint32_t) (w8 >= 6);
3197     params->sse.mask_odd[0] = -(uint32_t) (w8 >= 1);
3198     params->sse.mask_odd[1] = -(uint32_t) (w8 >= 3);
3199     params->sse.mask_odd[2] = -(uint32_t) (w8 >= 5);
3200     params->sse.mask_odd[3] = -(uint32_t) (w8 >= 7);
3201   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
3202     const uint32_t w4 = (width - 1) & 3;
3203     params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
3204     params->neon.mask[1] = -(uint32_t) (w4 >= 1);
3205     params->neon.mask[2] = -(uint32_t) (w4 >= 2);
3206     params->neon.mask[3] = -(uint32_t) (w4 >= 3);
3207 
3208     const uint32_t w8 = (width - 1) & 7;
3209     params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
3210     params->neon.mask_even[1] = -(uint32_t) (w8 >= 2);
3211     params->neon.mask_even[2] = -(uint32_t) (w8 >= 4);
3212     params->neon.mask_even[3] = -(uint32_t) (w8 >= 6);
3213     params->neon.mask_odd[0] = -(uint32_t) (w8 >= 1);
3214     params->neon.mask_odd[1] = -(uint32_t) (w8 >= 3);
3215     params->neon.mask_odd[2] = -(uint32_t) (w8 >= 5);
3216     params->neon.mask_odd[3] = -(uint32_t) (w8 >= 7);
3217   #else
3218     const uint32_t w4 = (width - 1) & 3;
3219     params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
3220     params->scalar.mask[1] = -(uint32_t) (w4 >= 1);
3221     params->scalar.mask[2] = -(uint32_t) (w4 >= 2);
3222     params->scalar.mask[3] = -(uint32_t) (w4 >= 3);
3223 
3224     const uint32_t w8 = (width - 1) & 7;
3225     params->scalar.mask_even[0] = UINT32_C(0xFFFFFFFF);
3226     params->scalar.mask_even[1] = -(uint32_t) (w8 >= 2);
3227     params->scalar.mask_even[2] = -(uint32_t) (w8 >= 4);
3228     params->scalar.mask_even[3] = -(uint32_t) (w8 >= 6);
3229     params->scalar.mask_odd[0] = -(uint32_t) (w8 >= 1);
3230     params->scalar.mask_odd[1] = -(uint32_t) (w8 >= 3);
3231     params->scalar.mask_odd[2] = -(uint32_t) (w8 >= 5);
3232     params->scalar.mask_odd[3] = -(uint32_t) (w8 >= 7);
3233   #endif
3234 }
3235 
// Initializes the scalar member of the CHW parameter union regardless of the
// target architecture: output clamping bounds plus three 4-element masks
// derived from `width`.
//
// With rem4 = (width - 1) mod 4 and rem8 = (width - 1) mod 8, element k of
//   mask      is all-ones when rem4 >= k,
//   mask_even is all-ones when rem8 >= 2*k,
//   mask_odd  is all-ones when rem8 >= 2*k + 1,
// and zero otherwise. mask[0] and mask_even[0] are therefore always all-ones.
void xnn_init_scalar_f32_chw_params(
  union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
  uint32_t width,
  float output_min,
  float output_max)
{
  const uint32_t rem4 = (width - 1) % 4;
  const uint32_t rem8 = (width - 1) % 8;

  params->scalar.min = output_min;
  params->scalar.max = output_max;

  for (uint32_t k = 0; k < 4; k++) {
    params->scalar.mask[k] = -(uint32_t) (rem4 >= k);
    params->scalar.mask_even[k] = -(uint32_t) (rem8 >= 2 * k);
    params->scalar.mask_odd[k] = -(uint32_t) (rem8 >= 2 * k + 1);
  }
}
3261 
3262 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse2 member of the signed 8-bit min/max parameter union.
//
// Each of the 16 elements of `bias` is set to 0x80, and the bounds are stored
// with their top bit flipped (XOR with 0x80) in min_with_bias/max_with_bias.
// Asserts that output_min is strictly below output_max.
void xnn_init_s8_minmax_sse2_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  // Flipping the sign bit maps the signed bounds onto unsigned bytes.
  const uint8_t biased_min = (uint8_t) output_min ^ UINT8_C(0x80);
  const uint8_t biased_max = (uint8_t) output_max ^ UINT8_C(0x80);

  for (uint32_t k = 0; k < 16; k++) {
    params->sse2.bias[k] = UINT8_C(0x80);
    params->sse2.min_with_bias[k] = biased_min;
    params->sse2.max_with_bias[k] = biased_max;
  }
}
3278 
// Fills the sse4 member of the signed 8-bit min/max parameter union by
// copying the bounds into each of the 16 elements of `min` and `max`.
// Asserts that output_min is strictly below output_max.
void xnn_init_s8_minmax_sse4_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  for (uint32_t k = 0; k < 16; k++) {
    params->sse4.min[k] = output_min;
    params->sse4.max[k] = output_max;
  }
}
3291 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3292 
3293 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the signed 8-bit bounds in the neon member of the min/max parameter
// union. Asserts that output_min is strictly below output_max.
void xnn_init_s8_minmax_neon_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  params->neon.max = output_max;
  params->neon.min = output_min;
}
3304 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3305 
3306 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd member of the signed 8-bit min/max parameter union by
// copying the bounds into each of the 8 elements of `min` and `max`.
// Asserts that output_min is strictly below output_max.
void xnn_init_s8_minmax_wasmsimd_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  for (uint32_t k = 0; k < 8; k++) {
    params->wasmsimd.min[k] = output_min;
    params->wasmsimd.max[k] = output_max;
  }
}
3319 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3320 
// Stores the signed 8-bit bounds, widened to int32_t, in the scalar member of
// the min/max parameter union. Asserts that output_min is strictly below
// output_max.
void xnn_init_s8_minmax_scalar_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  params->scalar.max = (int32_t) output_max;
  params->scalar.min = (int32_t) output_min;
}
3331 
// Initializes the architecture-selected member of the unsigned 8-bit min/max
// parameter union: 16-element arrays in sse2 on x86, single values in neon on
// ARM, and uint32_t-widened values in scalar otherwise.
// Asserts that output_min is strictly below output_max.
void xnn_init_u8_minmax_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (uint32_t k = 0; k < 16; k++) {
      params->sse2.min[k] = output_min;
      params->sse2.max[k] = output_max;
    }
  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
    params->neon.max = output_max;
    params->neon.min = output_min;
  #else
    params->scalar.max = (uint32_t) output_max;
    params->scalar.min = (uint32_t) output_min;
  #endif
}
3352 
3353 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse2 member of the unsigned 8-bit min/max parameter union by
// copying the bounds into each of the 16 elements of `min` and `max`.
// Asserts that output_min is strictly below output_max.
void xnn_init_u8_minmax_sse2_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  for (uint32_t k = 0; k < 16; k++) {
    params->sse2.min[k] = output_min;
    params->sse2.max[k] = output_max;
  }
}
3366 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3367 
3368 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd member of the unsigned 8-bit min/max parameter union by
// copying the bounds into each of the 8 elements of `min` and `max`.
// Asserts that output_min is strictly below output_max.
void xnn_init_u8_minmax_wasmsimd_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  for (uint32_t k = 0; k < 8; k++) {
    params->wasmsimd.min[k] = output_min;
    params->wasmsimd.max[k] = output_max;
  }
}
3381 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3382 
3383 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the unsigned 8-bit bounds in the neon member of the min/max
// parameter union. Asserts that output_min is strictly below output_max.
void xnn_init_u8_minmax_neon_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  params->neon.max = output_max;
  params->neon.min = output_min;
}
3394 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3395 
// Stores the unsigned 8-bit bounds, widened to uint32_t, in the scalar member
// of the min/max parameter union. Asserts that output_min is strictly below
// output_max.
void xnn_init_u8_minmax_scalar_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  params->scalar.max = (uint32_t) output_max;
  params->scalar.min = (uint32_t) output_min;
}
3406 
3407 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `sse2` member of the QU8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. The multipliers are stored as separate low/high 16-bit halves,
// the bias combines the rounding term with both input zero points, and the
// output zero point and clamping bounds are replicated across their arrays.
void xnn_init_qu8_add_minmax_sse2_params(
  union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2.bias[lane] = folded_bias;
  }

  // Split each 32-bit multiplier into its low and high 16-bit halves.
  const uint32_t a_bits = (uint32_t) signed_a_multiplier;
  const uint32_t b_bits = (uint32_t) signed_b_multiplier;
  const uint16_t a_low_half = (uint16_t) a_bits;
  const uint16_t a_high_half = (uint16_t) (a_bits >> 16);
  const uint16_t b_low_half = (uint16_t) b_bits;
  const uint16_t b_high_half = (uint16_t) (b_bits >> 16);
  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse2.a_multiplier_lo[lane] = a_low_half;
    params->sse2.a_multiplier_hi[lane] = a_high_half;
    params->sse2.b_multiplier_lo[lane] = b_low_half;
    params->sse2.b_multiplier_hi[lane] = b_high_half;
    params->sse2.output_zero_point[lane] = zero_point_s16;
  }

  // The shift and the full 32-bit b multiplier are stored once, not per lane.
  params->sse2.shift = shift;
  params->sse2.b_multiplier = b_bits;

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse2.output_min[lane] = output_min;
    params->sse2.output_max[lane] = output_max;
  }
}
3472 
// Fills the `sse4` member of the QU8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. Bias, multipliers and shift are replicated over 4 elements,
// the output zero point over 8, and the clamping bounds over 16.
void xnn_init_qu8_add_minmax_sse4_params(
  union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse4.bias[lane] = folded_bias;
    params->sse4.a_multiplier[lane] = signed_a_multiplier;
    params->sse4.b_multiplier[lane] = signed_b_multiplier;
    params->sse4.shift[lane] = shift;
  }

  // The output zero point is zero-extended to 16 bits before being replicated.
  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse4.output_zero_point[lane] = zero_point_s16;
  }

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse4.output_min[lane] = output_min;
    params->sse4.output_max[lane] = output_max;
  }
}
3528 
// Fills the `avx2` member of the QU8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. Bias, multipliers and shift are replicated over 8 elements;
// the output zero point and clamping bounds over 16.
void xnn_init_qu8_add_minmax_avx2_params(
  union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx2.bias[lane] = folded_bias;
    params->avx2.a_multiplier[lane] = signed_a_multiplier;
    params->avx2.b_multiplier[lane] = signed_b_multiplier;
    params->avx2.shift[lane] = shift;
  }

  // The output zero point is zero-extended to 16 bits before being replicated.
  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx2.output_zero_point[lane] = zero_point_s16;
    params->avx2.output_min[lane] = output_min;
    params->avx2.output_max[lane] = output_max;
  }
}
3582 
// Fills the `avx512` member of the QU8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. Bias, multipliers and shift are replicated over 16 elements;
// the output zero point and clamping bounds over 32.
void xnn_init_qu8_add_minmax_avx512_params(
  union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx512.bias[lane] = folded_bias;
    params->avx512.a_multiplier[lane] = signed_a_multiplier;
    params->avx512.b_multiplier[lane] = signed_b_multiplier;
    params->avx512.shift[lane] = shift;
  }

  // The output zero point is zero-extended to 16 bits before being replicated.
  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 32; lane++) {
    params->avx512.output_zero_point[lane] = zero_point_s16;
    params->avx512.output_min[lane] = output_min;
    params->avx512.output_max[lane] = output_max;
  }
}
3636 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3637 
3638 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `neon` member of the QU8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. Unlike the variants that precompute a bias, this one stores
// the input zero points as-is and stores the shift negated in `right_shift`.
void xnn_init_qu8_add_minmax_neon_params(
  union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // Values that need a conversion are prepared up front; the rest are stored directly.
  const int32_t negated_shift = (int32_t) -shift;
  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;

  params->neon.a_zero_point = a_zero_point;
  params->neon.b_zero_point = b_zero_point;
  params->neon.a_multiplier = signed_a_multiplier;
  params->neon.b_multiplier = signed_b_multiplier;
  params->neon.right_shift = negated_shift;
  params->neon.output_zero_point = zero_point_s16;
  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
}
3687 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3688 
3689 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `wasmsimd` member of the QU8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. Bias and multipliers are replicated over 2 elements, the shift
// is stored once, the output zero point is replicated over 4 elements and the
// clamping bounds over 8.
void xnn_init_qu8_add_minmax_wasmsimd_params(
  union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.bias[lane] = folded_bias;
    params->wasmsimd.a_multiplier[lane] = signed_a_multiplier;
    params->wasmsimd.b_multiplier[lane] = signed_b_multiplier;
  }

  // The shift is a single value here, not a per-lane array.
  params->wasmsimd.shift = shift;

  // The output zero point is zero-extended to 16 bits before being replicated.
  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 4; lane++) {
    params->wasmsimd.output_zero_point[lane] = zero_point_s16;
  }

  for (size_t lane = 0; lane < 8; lane++) {
    params->wasmsimd.output_min[lane] = output_min;
    params->wasmsimd.output_max[lane] = output_max;
  }
}
3745 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3746 
// Fills the `scalar` member of the QU8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. The clamping bounds are stored relative to the output zero
// point, which is also stored on its own.
void xnn_init_qu8_add_minmax_scalar_params(
  union xnn_qu8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;

  // Widen the 8-bit output quantities once, then express the bounds relative to the zero point.
  const int32_t zero_point_s32 = (int32_t) (uint32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) (uint32_t) output_min - zero_point_s32;
  const int32_t max_less_zero_point = (int32_t) (uint32_t) output_max - zero_point_s32;

  params->scalar.bias = folded_bias;
  params->scalar.a_multiplier = signed_a_multiplier;
  params->scalar.b_multiplier = signed_b_multiplier;
  params->scalar.shift = shift;
  params->scalar.output_min_less_zero_point = min_less_zero_point;
  params->scalar.output_max_less_zero_point = max_less_zero_point;
  params->scalar.output_zero_point = zero_point_s32;
}
3795 
3796 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `sse2` member of the QS8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. The multipliers are stored as separate low/high 16-bit halves,
// the bias combines the rounding term with both input zero points, and the
// output zero point and clamping bounds are sign-extended to 16 bits and
// replicated over 8 elements.
void xnn_init_qs8_add_minmax_sse2_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2.bias[lane] = folded_bias;
  }

  // Split each 32-bit multiplier into its low and high 16-bit halves.
  const uint32_t a_bits = (uint32_t) signed_a_multiplier;
  const uint32_t b_bits = (uint32_t) signed_b_multiplier;
  const uint16_t a_low_half = (uint16_t) a_bits;
  const uint16_t a_high_half = (uint16_t) (a_bits >> 16);
  const uint16_t b_low_half = (uint16_t) b_bits;
  const uint16_t b_high_half = (uint16_t) (b_bits >> 16);

  // Signed 8-bit output quantities are sign-extended to 16 bits.
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  const int16_t min_s16 = (int16_t) output_min;
  const int16_t max_s16 = (int16_t) output_max;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse2.a_multiplier_lo[lane] = a_low_half;
    params->sse2.a_multiplier_hi[lane] = a_high_half;
    params->sse2.b_multiplier_lo[lane] = b_low_half;
    params->sse2.b_multiplier_hi[lane] = b_high_half;
    params->sse2.output_zero_point[lane] = zero_point_s16;
    params->sse2.output_min[lane] = min_s16;
    params->sse2.output_max[lane] = max_s16;
  }

  // The shift and the full 32-bit b multiplier are stored once, not per lane.
  params->sse2.shift = shift;
  params->sse2.b_multiplier = b_bits;
}
3859 
// Fills the `sse4_mul16` member of the QS8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. The multipliers are stored as separate low/high 16-bit halves,
// the bias combines the rounding term with both input zero points, the output
// zero point is replicated over 8 elements and the clamping bounds over 16.
void xnn_init_qs8_add_minmax_sse4_mul16_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse4_mul16.bias[lane] = folded_bias;
  }

  // Split each 32-bit multiplier into its low and high 16-bit halves.
  const uint32_t a_bits = (uint32_t) signed_a_multiplier;
  const uint32_t b_bits = (uint32_t) signed_b_multiplier;
  const uint16_t a_low_half = (uint16_t) a_bits;
  const uint16_t a_high_half = (uint16_t) (a_bits >> 16);
  const uint16_t b_low_half = (uint16_t) b_bits;
  const uint16_t b_high_half = (uint16_t) (b_bits >> 16);

  // The signed output zero point is sign-extended to 16 bits.
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse4_mul16.a_multiplier_lo[lane] = a_low_half;
    params->sse4_mul16.a_multiplier_hi[lane] = a_high_half;
    params->sse4_mul16.b_multiplier_lo[lane] = b_low_half;
    params->sse4_mul16.b_multiplier_hi[lane] = b_high_half;
    params->sse4_mul16.output_zero_point[lane] = zero_point_s16;
  }

  // The shift and the full 32-bit b multiplier are stored once, not per lane.
  params->sse4_mul16.shift = shift;
  params->sse4_mul16.b_multiplier = b_bits;

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse4_mul16.output_min[lane] = output_min;
    params->sse4_mul16.output_max[lane] = output_max;
  }
}
3924 
// Fills the `sse4_mul32` member of the QS8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. Bias, multipliers and shift are replicated over 4 elements,
// the output zero point over 8, and the clamping bounds over 16.
void xnn_init_qs8_add_minmax_sse4_mul32_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse4_mul32.bias[lane] = folded_bias;
    params->sse4_mul32.a_multiplier[lane] = signed_a_multiplier;
    params->sse4_mul32.b_multiplier[lane] = signed_b_multiplier;
    params->sse4_mul32.shift[lane] = shift;
  }

  // The signed output zero point is sign-extended to 16 bits.
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse4_mul32.output_zero_point[lane] = zero_point_s16;
  }

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse4_mul32.output_min[lane] = output_min;
    params->sse4_mul32.output_max[lane] = output_max;
  }
}
3980 
// Fills the `avx2` member of the QS8 add/sub parameter union.
//
// Both output scales are turned into signed integer multipliers that share a
// single shift. Bias, multipliers and shift are replicated over 8 elements;
// the output zero point and clamping bounds over 16.
void xnn_init_qs8_add_minmax_avx2_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  // Operate on magnitudes; the sign of each scale is restored on its integer multiplier.
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // The larger magnitude selects the shift shared by both multipliers.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);

  // Exponent of the larger magnitude, read from the bits returned by fp32_to_bits:
  // >> 23 discards the mantissa bits and 127 is the single-precision exponent bias.
  const int32_t unbiased_exponent = (int32_t) (fp32_to_bits(max_abs_output_scale) >> 23) - 127;

  // The shift is asserted to lie in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - unbiased_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding `shift` to the exponent field scales each magnitude by 2**shift before it
  // is rounded to an integer. Both results are asserted to be at most 2**21, and the
  // larger of the two at least 2**20.
  const uint32_t exponent_bump = shift << 23;
  const float scaled_a = fp32_from_bits(fp32_to_bits(abs_a_output_scale) + exponent_bump);
  const float scaled_b = fp32_from_bits(fp32_to_bits(abs_b_output_scale) + exponent_bump);
  const int32_t abs_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t abs_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // A scale with its sign bit set yields a negated multiplier.
  int32_t signed_a_multiplier = abs_a_multiplier;
  if (signbit(a_output_scale)) {
    signed_a_multiplier = -abs_a_multiplier;
  }
  int32_t signed_b_multiplier = abs_b_multiplier;
  if (signbit(b_output_scale)) {
    signed_b_multiplier = -abs_b_multiplier;
  }

  // bias = 2**(shift - 1) rounding term, minus each multiplier times its input zero point.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t a_offset = signed_a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = signed_b_multiplier * (int32_t) b_zero_point;
  const int32_t folded_bias = rounding_term - a_offset - b_offset;
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx2.bias[lane] = folded_bias;
    params->avx2.a_multiplier[lane] = signed_a_multiplier;
    params->avx2.b_multiplier[lane] = signed_b_multiplier;
    params->avx2.shift[lane] = shift;
  }

  // The signed output zero point is sign-extended to 16 bits.
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx2.output_zero_point[lane] = zero_point_s16;
    params->avx2.output_min[lane] = output_min;
    params->avx2.output_max[lane] = output_max;
  }
}
4034 
// Fills the `avx512` layout of the QS8 add/sub min-max parameters.
//
// The scales are turned into signed fixed-point multipliers with a shared
// right shift derived from the larger scale magnitude; rounding and the
// zero-point corrections are merged into one bias. Bias, multipliers and
// shift are replicated 16 times; output zero point and clamping bounds are
// replicated 32 times.
//
//   params            - destination; only params->avx512 is written
//   a_zero_point      - zero point of the first input
//   b_zero_point      - zero point of the second input
//   output_zero_point - zero point of the output
//   a_output_scale    - scale for the first input, magnitude in [2**-10, 2**8)
//   b_output_scale    - scale for the second input, magnitude in [2**-10, 2**8)
//   output_min        - lower clamping bound
//   output_max        - upper clamping bound
void xnn_init_qs8_add_minmax_avx512_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float mag_a = fabsf(a_output_scale);
  const float mag_b = fabsf(b_output_scale);
  assert(mag_a >= 0x1.0p-10f);
  assert(mag_b >= 0x1.0p-10f);
  assert(mag_a < 0x1.0p+8f);
  assert(mag_b < 0x1.0p+8f);

  // Derive the shared shift from the exponent of the larger magnitude.
  const float mag_max = math_max_f32(mag_a, mag_b);
  assert(mag_max >= 0x1.0p-10f);
  assert(mag_max < 0x1.0p+8f);
  const uint32_t mag_max_bits = fp32_to_bits(mag_max);
  const int32_t mag_max_exponent = (int32_t) (mag_max_bits >> 23) - 127;

  // With 20 multiplier bits the shift falls in [12, 30].
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - mag_max_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Scale each magnitude by 2**shift through its exponent field, then round.
  const float scaled_a = fp32_from_bits(fp32_to_bits(mag_a) + (shift << 23));
  const float scaled_b = fp32_from_bits(fp32_to_bits(mag_b) + (shift << 23));
  const int32_t unsigned_a_multiplier = (int32_t) lrintf(scaled_a);
  const int32_t unsigned_b_multiplier = (int32_t) lrintf(scaled_b);
  assert(math_max_s32(unsigned_a_multiplier, unsigned_b_multiplier) >= INT32_C(0x00100000));
  assert(unsigned_a_multiplier <= INT32_C(0x00200000));
  assert(unsigned_b_multiplier <= INT32_C(0x00200000));

  // Multipliers inherit the sign of the scale they came from.
  const int32_t a_multiplier = !signbit(a_output_scale) ? unsigned_a_multiplier : -unsigned_a_multiplier;
  const int32_t b_multiplier = !signbit(b_output_scale) ? unsigned_b_multiplier : -unsigned_b_multiplier;

  // Rounding term plus the pre-multiplied zero-point corrections.
  const int32_t rounding_term = INT32_C(1) << (shift - 1);
  const int32_t bias = rounding_term - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;

  for (uint32_t k = 0; k < 32; k++) {
    params->avx512.output_zero_point[k] = (int16_t) output_zero_point;
    params->avx512.output_min[k] = output_min;
    params->avx512.output_max[k] = output_max;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->avx512.shift[k] = shift;
    params->avx512.b_multiplier[k] = b_multiplier;
    params->avx512.a_multiplier[k] = a_multiplier;
    params->avx512.bias[k] = bias;
  }
}
4088 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
4089 
4090 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `neon` layout of the QS8 add/sub min-max parameters.
//
// Both output scales are converted into signed fixed-point multipliers that
// share one shift, chosen from the larger scale magnitude so that the larger
// multiplier lands in [2**20, 2**21). Unlike the x86 variants visible in this
// file, no bias is precomputed here: the zero points are stored as-is and the
// shift is stored negated in `right_shift`.
//
//   params            - destination; only params->neon is written
//   a_zero_point      - zero point of the first input
//   b_zero_point      - zero point of the second input
//   output_zero_point - zero point of the output
//   a_output_scale    - scale for the first input, magnitude in [2**-10, 2**8)
//   b_output_scale    - scale for the second input, magnitude in [2**-10, 2**8)
//   output_min        - lower clamping bound
//   output_max        - upper clamping bound
void xnn_init_qs8_add_minmax_neon_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // Compute requantization parameters.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);
  const uint32_t max_scale_bits = fp32_to_bits(max_abs_output_scale);
  // Unbiased binary exponent of the larger magnitude.
  const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;

  // Shift is in [12, 30] range.
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Multipliers are in [0, 2**21) range, largest multiplier is in [2**20, 2**21) range.
  // Adding `shift` to the exponent field scales each magnitude by 2**shift before rounding.
  const int32_t abs_a_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(abs_a_output_scale) + (shift << 23)));
  const int32_t abs_b_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(abs_b_output_scale) + (shift << 23)));
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
  const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;

  params->neon.a_zero_point = a_zero_point;
  params->neon.b_zero_point = b_zero_point;
  params->neon.a_multiplier = a_multiplier;
  params->neon.b_multiplier = b_multiplier;
  // Convert to signed BEFORE negating: negating the uint32_t first would wrap
  // to a huge unsigned value and rely on an implementation-defined
  // out-of-range conversion to obtain the negative result.
  params->neon.right_shift = -(int32_t) shift;
  params->neon.output_zero_point = (int16_t) output_zero_point;
  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
}
4139 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
4140 
4141 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `wasmsimd` layout of the QS8 add/sub min-max parameters.
//
// The scales become signed fixed-point multipliers with a shared right shift
// derived from the larger scale magnitude; rounding and the zero-point
// corrections are merged into one bias. Bias and multipliers are replicated
// twice, the output zero point 4 times, the clamping bounds 8 times, and the
// shift is stored once.
//
//   params            - destination; only params->wasmsimd is written
//   a_zero_point      - zero point of the first input
//   b_zero_point      - zero point of the second input
//   output_zero_point - zero point of the output
//   a_output_scale    - scale for the first input, magnitude in [2**-10, 2**8)
//   b_output_scale    - scale for the second input, magnitude in [2**-10, 2**8)
//   output_min        - lower clamping bound
//   output_max        - upper clamping bound
void xnn_init_qs8_add_minmax_wasmsimd_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float a_magnitude = fabsf(a_output_scale);
  const float b_magnitude = fabsf(b_output_scale);
  assert(a_magnitude >= 0x1.0p-10f);
  assert(b_magnitude >= 0x1.0p-10f);
  assert(a_magnitude < 0x1.0p+8f);
  assert(b_magnitude < 0x1.0p+8f);

  // The dominant (larger) magnitude fixes the common shift.
  const float dominant_magnitude = math_max_f32(a_magnitude, b_magnitude);
  assert(dominant_magnitude >= 0x1.0p-10f);
  assert(dominant_magnitude < 0x1.0p+8f);
  const int32_t dominant_exponent = (int32_t) (fp32_to_bits(dominant_magnitude) >> 23) - 127;

  // 20 multiplier bits; shift is expected in [12, 30].
  const uint32_t shift = (uint32_t) (20 - dominant_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Multiply each magnitude by 2**shift via the exponent field and round.
  const uint32_t a_scaled_bits = fp32_to_bits(a_magnitude) + (shift << 23);
  const uint32_t b_scaled_bits = fp32_to_bits(b_magnitude) + (shift << 23);
  const int32_t a_rounded = (int32_t) lrintf(fp32_from_bits(a_scaled_bits));
  const int32_t b_rounded = (int32_t) lrintf(fp32_from_bits(b_scaled_bits));
  assert(math_max_s32(a_rounded, b_rounded) >= INT32_C(0x00100000));
  assert(a_rounded <= INT32_C(0x00200000));
  assert(b_rounded <= INT32_C(0x00200000));

  // Carry over the sign of each scale.
  int32_t a_multiplier = a_rounded;
  int32_t b_multiplier = b_rounded;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_rounded;
  }
  if (signbit(b_output_scale)) {
    b_multiplier = -b_rounded;
  }

  // Rounding constant with the zero-point corrections subtracted.
  const int32_t half = INT32_C(1) << (shift - 1);
  const int32_t bias = half - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;

  params->wasmsimd.shift = shift;
  for (uint32_t lane = 0; lane < 8; lane++) {
    if (lane < 2) {
      params->wasmsimd.bias[lane] = bias;
      params->wasmsimd.a_multiplier[lane] = a_multiplier;
      params->wasmsimd.b_multiplier[lane] = b_multiplier;
    }
    if (lane < 4) {
      params->wasmsimd.output_zero_point[lane] = (int16_t) output_zero_point;
    }
    params->wasmsimd.output_min[lane] = output_min;
    params->wasmsimd.output_max[lane] = output_max;
  }
}
4197 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4198 
// Fills the `scalar` layout of the QS8 add/sub min-max parameters.
//
// The scales become signed fixed-point multipliers with a shared right shift
// derived from the larger scale magnitude. The stored bias already contains
// the rounding constant and both zero-point corrections. The clamping bounds
// are stored relative to the output zero point.
//
//   params            - destination; only params->scalar is written
//   a_zero_point      - zero point of the first input
//   b_zero_point      - zero point of the second input
//   output_zero_point - zero point of the output
//   a_output_scale    - scale for the first input, magnitude in [2**-10, 2**8)
//   b_output_scale    - scale for the second input, magnitude in [2**-10, 2**8)
//   output_min        - lower clamping bound
//   output_max        - upper clamping bound
void xnn_init_qs8_add_minmax_scalar_params(
  union xnn_qs8_addsub_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float a_abs = fabsf(a_output_scale);
  const float b_abs = fabsf(b_output_scale);
  assert(a_abs >= 0x1.0p-10f);
  assert(b_abs >= 0x1.0p-10f);
  assert(a_abs < 0x1.0p+8f);
  assert(b_abs < 0x1.0p+8f);

  // Exponent of the larger magnitude selects the shift.
  const float max_abs = math_max_f32(a_abs, b_abs);
  assert(max_abs >= 0x1.0p-10f);
  assert(max_abs < 0x1.0p+8f);
  const uint32_t max_abs_bits = fp32_to_bits(max_abs);
  const int32_t max_abs_exponent = (int32_t) (max_abs_bits >> 23) - 127;

  // 20 multiplier bits give a shift in [12, 30].
  const uint32_t shift = (uint32_t) (20 - max_abs_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Bump the exponent field by `shift` (i.e. multiply by 2**shift), then round.
  const uint32_t exponent_delta = shift << 23;
  const int32_t a_fixed = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(a_abs) + exponent_delta));
  const int32_t b_fixed = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(b_abs) + exponent_delta));
  assert(math_max_s32(a_fixed, b_fixed) >= INT32_C(0x00100000));
  assert(a_fixed <= INT32_C(0x00200000));
  assert(b_fixed <= INT32_C(0x00200000));

  // Signed multipliers follow the sign of the respective scale.
  const int32_t a_multiplier = signbit(a_output_scale) ? -a_fixed : a_fixed;
  const int32_t b_multiplier = signbit(b_output_scale) ? -b_fixed : b_fixed;

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t half = INT32_C(1) << (shift - 1);

  params->scalar.shift = shift;
  params->scalar.a_multiplier = a_multiplier;
  params->scalar.b_multiplier = b_multiplier;
  // Rounding constant minus the zero-point contributions of both inputs.
  params->scalar.bias = half - a_multiplier * (int32_t) a_zero_point - b_multiplier * (int32_t) b_zero_point;
  params->scalar.output_zero_point = zero_point;
  params->scalar.output_min_less_zero_point = (int32_t) output_min - zero_point;
  params->scalar.output_max_less_zero_point = (int32_t) output_max - zero_point;
}
4247 
// Fills the `fp32_scalar` layout of the QU8 multiplication min-max parameters.
//
// Stores the input zero points, the requantization scale, the clamping bounds
// expressed relative to the output zero point (as floats), and the
// "magic bias" constants. 12582912.0f equals 1.5 * 2**23 and 0x4B400000 is its
// IEEE-754 binary32 bit pattern; the stored integer constant is that bit
// pattern minus the output zero point.
//
//   params               - destination; only params->fp32_scalar is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qu8_mul_minmax_fp32_scalar_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  params->fp32_scalar.a_zero_point = (int16_t) (uint16_t) a_zero_point;
  params->fp32_scalar.b_zero_point = (int16_t) (uint16_t) b_zero_point;
  params->fp32_scalar.scale = product_output_scale;
  // Subtract in signed arithmetic: the difference is negative whenever the
  // bound is below the zero point, and an unsigned subtraction would wrap and
  // then depend on an implementation-defined conversion back to int32_t.
  params->fp32_scalar.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
  params->fp32_scalar.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
  params->fp32_scalar.magic_bias = 12582912.0f;
  params->fp32_scalar.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
}
4268 
4269 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `fp32_neon` layout of the QU8 multiplication min-max parameters.
//
// Each input zero point is stored twice. The magic bias 12582912.0f equals
// 1.5 * 2**23, and 0x4B400000 is its IEEE-754 binary32 bit pattern; the
// integer constant stored is that pattern minus the output zero point.
//
//   params               - destination; only params->fp32_neon is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qu8_mul_minmax_fp32_neon_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t magic_bias_bits = INT32_C(0x4B400000);

  for (uint32_t slot = 0; slot < 2; slot++) {
    params->fp32_neon.a_zero_point[slot] = a_zero_point;
    params->fp32_neon.b_zero_point[slot] = b_zero_point;
  }
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.magic_bias_less_output_zero_point = magic_bias_bits - (int32_t) output_zero_point;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.scale = product_output_scale;
}
4292 
// Fills the `fp32_neonv8` layout of the QU8 multiplication min-max parameters.
//
// Each input zero point is stored twice; the scale, the output zero point
// (widened to int16_t) and the clamping bounds are stored once. No magic-bias
// constants are written by this variant.
//
//   params               - destination; only params->fp32_neonv8 is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qu8_mul_minmax_fp32_neonv8_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int16_t widened_output_zero_point = (int16_t) output_zero_point;

  for (uint32_t slot = 0; slot < 2; slot++) {
    params->fp32_neonv8.b_zero_point[slot] = b_zero_point;
    params->fp32_neonv8.a_zero_point[slot] = a_zero_point;
  }
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_zero_point = widened_output_zero_point;
  params->fp32_neonv8.scale = product_output_scale;
}
4314 
// Fills the `rndnu_neon` layout of the QU8 multiplication min-max parameters.
//
// The floating-point scale is decomposed into a 31-bit integer multiplier
// (the 24-bit significand shifted left by 7) and a shift derived from the
// exponent field. The shift is split into a pre-shift and a post-shift, where
// the post-shift is at least 1; both are stored negated.
//
//   params               - destination; only params->rndnu_neon is written
//   a_zero_point         - zero point of the first input (stored twice)
//   b_zero_point         - zero point of the second input (stored twice)
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qu8_mul_minmax_rndnu_neon_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(product_output_scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
  // It is the mantissa with the implicit leading one restored, shifted left by 7.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 15] range.
  // The exponent field is converted to signed before subtracting: for scales
  // >= 1.0 the result is negative, and an unsigned subtraction would wrap and
  // depend on an implementation-defined conversion back to int32_t.
  const int32_t shift = 127 + 31 - 32 - (int32_t) (scale_bits >> 23);
  assert(shift >= -8);
  assert(shift < 16);

  // Split shift into pre_shift + post_shift, post_shift in [1, 15] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.a_zero_point[0] = a_zero_point;
  params->rndnu_neon.a_zero_point[1] = a_zero_point;
  params->rndnu_neon.b_zero_point[0] = b_zero_point;
  params->rndnu_neon.b_zero_point[1] = b_zero_point;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
4355 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
4356 
4357 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `fp32_sse2` layout of the QU8 multiplication min-max parameters.
//
// Every value is replicated across its array: 8 entries for the three zero
// points (zero-extended to 16 bits), 4 entries for the scale, and 16 entries
// for each clamping bound.
//
//   params               - destination; only params->fp32_sse2 is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qu8_mul_minmax_fp32_sse2_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // Zero-extend through uint16_t before reinterpreting as int16_t.
  const int16_t a_zero_point_s16 = (int16_t) (uint16_t) a_zero_point;
  const int16_t b_zero_point_s16 = (int16_t) (uint16_t) b_zero_point;
  const int16_t output_zero_point_s16 = (int16_t) (uint16_t) output_zero_point;

  for (uint32_t lane = 0; lane < 16; lane++) {
    if (lane < 4) {
      params->fp32_sse2.scale[lane] = product_output_scale;
    }
    if (lane < 8) {
      params->fp32_sse2.a_zero_point[lane] = a_zero_point_s16;
      params->fp32_sse2.b_zero_point[lane] = b_zero_point_s16;
      params->fp32_sse2.output_zero_point[lane] = output_zero_point_s16;
    }
    params->fp32_sse2.output_min[lane] = output_min;
    params->fp32_sse2.output_max[lane] = output_max;
  }
}
4385 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
4386 
4387 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `fp32_wasmsimd` layout of the QU8 multiplication min-max parameters.
//
// The lower clamping bound is not stored directly. Instead `magic_min` holds
// the bit pattern of (magic bias + (output_min - output_zero_point)), where
// the magic bias 12582912.0f equals 1.5 * 2**23 (bit pattern 0x4B400000).
// Zero points are replicated 4 times, the scale and magic constants twice,
// and the upper bound 8 times.
//
//   params               - destination; only params->fp32_wasmsimd is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qu8_mul_minmax_fp32_wasmsimd_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const float magic_bias = 12582912.0f;
  const float min_offset = (float) ((int32_t) output_min - (int32_t) output_zero_point);
  const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + min_offset);
  const int32_t magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;

  for (uint32_t lane = 0; lane < 8; lane++) {
    if (lane < 2) {
      params->fp32_wasmsimd.scale[lane] = product_output_scale;
      params->fp32_wasmsimd.magic_bias[lane] = magic_bias;
      params->fp32_wasmsimd.magic_min[lane] = magic_min;
      params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = magic_bias_less_output_zero_point;
    }
    if (lane < 4) {
      params->fp32_wasmsimd.a_zero_point[lane] = (int16_t) a_zero_point;
      params->fp32_wasmsimd.b_zero_point[lane] = (int16_t) b_zero_point;
    }
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
}
4417 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4418 
// Fills the `fp32_scalar` layout of the QS8 multiplication min-max parameters.
//
// Stores the input zero points, the requantization scale, the clamping bounds
// relative to the output zero point (as floats), and the magic-bias
// constants. 12582912.0f equals 1.5 * 2**23 and 0x4B400000 is its IEEE-754
// binary32 bit pattern.
//
//   params               - destination; only params->fp32_scalar is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qs8_mul_minmax_fp32_scalar_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_offset = (int32_t) output_min - zero_point;
  const int32_t max_offset = (int32_t) output_max - zero_point;

  params->fp32_scalar.scale = product_output_scale;
  params->fp32_scalar.a_zero_point = (int16_t) a_zero_point;
  params->fp32_scalar.b_zero_point = (int16_t) b_zero_point;
  params->fp32_scalar.output_min_less_zero_point = (float) min_offset;
  params->fp32_scalar.output_max_less_zero_point = (float) max_offset;
  params->fp32_scalar.magic_bias = 12582912.0f;
  params->fp32_scalar.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
}
4439 
4440 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `fp32_neon` layout of the QS8 multiplication min-max parameters.
//
// Each input zero point is stored twice. The magic bias 12582912.0f equals
// 1.5 * 2**23, and 0x4B400000 is its IEEE-754 binary32 bit pattern; the
// integer constant stored is that pattern minus the output zero point.
//
//   params               - destination; only params->fp32_neon is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qs8_mul_minmax_fp32_neon_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t biased_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;

  params->fp32_neon.scale = product_output_scale;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = biased_zero_point;
  for (uint32_t slot = 0; slot < 2; slot++) {
    params->fp32_neon.a_zero_point[slot] = a_zero_point;
    params->fp32_neon.b_zero_point[slot] = b_zero_point;
  }
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
}
4463 
// Fills the `fp32_neonv8` layout of the QS8 multiplication min-max parameters.
//
// Each input zero point is stored twice; the scale, the output zero point
// (widened to int16_t) and the clamping bounds are stored once. No magic-bias
// constants are written by this variant.
//
//   params               - destination; only params->fp32_neonv8 is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qs8_mul_minmax_fp32_neonv8_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  params->fp32_neonv8.scale = product_output_scale;
  params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  for (uint32_t slot = 0; slot < 2; slot++) {
    params->fp32_neonv8.a_zero_point[slot] = a_zero_point;
    params->fp32_neonv8.b_zero_point[slot] = b_zero_point;
  }
}
4485 
// Fills the `rndnu_neon` layout of the QS8 multiplication min-max parameters.
//
// The floating-point scale is decomposed into a 31-bit integer multiplier
// (the 24-bit significand shifted left by 7) and a shift derived from the
// exponent field. The shift is split into a pre-shift and a post-shift, where
// the post-shift is at least 1; both are stored negated.
//
//   params               - destination; only params->rndnu_neon is written
//   a_zero_point         - zero point of the first input (stored twice)
//   b_zero_point         - zero point of the second input (stored twice)
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qs8_mul_minmax_rndnu_neon_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // Compute requantization parameters.
  const uint32_t scale_bits = fp32_to_bits(product_output_scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
  // It is the mantissa with the implicit leading one restored, shifted left by 7.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 15] range.
  // The exponent field is converted to signed before subtracting: for scales
  // >= 1.0 the result is negative, and an unsigned subtraction would wrap and
  // depend on an implementation-defined conversion back to int32_t.
  const int32_t shift = 127 + 31 - 32 - (int32_t) (scale_bits >> 23);
  assert(shift >= -8);
  assert(shift < 16);

  // Split shift into pre_shift + post_shift, post_shift in [1, 15] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.a_zero_point[0] = a_zero_point;
  params->rndnu_neon.a_zero_point[1] = a_zero_point;
  params->rndnu_neon.b_zero_point[0] = b_zero_point;
  params->rndnu_neon.b_zero_point[1] = b_zero_point;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
}
4526 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
4527 
4528 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `fp32_sse2` layout of the QS8 multiplication min-max parameters.
//
// All integer values are widened to int16_t and replicated 8 times
// (zero points and both clamping bounds); the scale is replicated 4 times.
//
//   params               - destination; only params->fp32_sse2 is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qs8_mul_minmax_fp32_sse2_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  for (uint32_t lane = 0; lane < 8; lane++) {
    if (lane < 4) {
      params->fp32_sse2.scale[lane] = product_output_scale;
    }
    params->fp32_sse2.a_zero_point[lane] = (int16_t) a_zero_point;
    params->fp32_sse2.b_zero_point[lane] = (int16_t) b_zero_point;
    params->fp32_sse2.output_zero_point[lane] = (int16_t) output_zero_point;
    params->fp32_sse2.output_min[lane] = (int16_t) output_min;
    params->fp32_sse2.output_max[lane] = (int16_t) output_max;
  }
}
4556 
// Fills the `fp32_sse4` layout of the QS8 multiplication min-max parameters.
//
// Zero points are widened to int16_t and replicated 8 times, the scale is
// replicated 4 times, and the clamping bounds are stored unchanged in 16
// entries each.
//
//   params               - destination; only params->fp32_sse4 is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qs8_mul_minmax_fp32_sse4_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int16_t a_zero_point_s16 = (int16_t) a_zero_point;
  const int16_t b_zero_point_s16 = (int16_t) b_zero_point;
  const int16_t output_zero_point_s16 = (int16_t) output_zero_point;

  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_sse4.output_min[lane] = output_min;
    params->fp32_sse4.output_max[lane] = output_max;
    if (lane >= 8) {
      continue;
    }
    params->fp32_sse4.a_zero_point[lane] = a_zero_point_s16;
    params->fp32_sse4.b_zero_point[lane] = b_zero_point_s16;
    params->fp32_sse4.output_zero_point[lane] = output_zero_point_s16;
    if (lane < 4) {
      params->fp32_sse4.scale[lane] = product_output_scale;
    }
  }
}
4584 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
4585 
4586 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `fp32_wasmsimd` layout of the QS8 multiplication min-max parameters.
//
// The lower clamping bound is not stored directly. Instead `magic_min` holds
// the bit pattern of (magic bias + (output_min - output_zero_point)), where
// the magic bias 12582912.0f equals 1.5 * 2**23 (bit pattern 0x4B400000).
// Zero points are replicated 4 times, the scale and magic constants twice,
// and the upper bound 8 times.
//
//   params               - destination; only params->fp32_wasmsimd is written
//   a_zero_point         - zero point of the first input
//   b_zero_point         - zero point of the second input
//   output_zero_point    - zero point of the output
//   product_output_scale - requantization scale, must be in [2**-16, 2**8)
//   output_min           - lower clamping bound
//   output_max           - upper clamping bound
void xnn_init_qs8_mul_minmax_fp32_wasmsimd_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const float lower_bound_offset = (float) ((int32_t) output_min - zero_point);
  // Bit pattern of the lower bound once the magic bias has been added.
  const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + lower_bound_offset);
  const int32_t biased_zero_point = INT32_C(0x4B400000) - zero_point;

  for (uint32_t k = 0; k < 8; k++) {
    params->fp32_wasmsimd.output_max[k] = output_max;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->fp32_wasmsimd.b_zero_point[k] = (int16_t) b_zero_point;
    params->fp32_wasmsimd.a_zero_point[k] = (int16_t) a_zero_point;
  }
  for (uint32_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[k] = biased_zero_point;
    params->fp32_wasmsimd.magic_min[k] = magic_min;
    params->fp32_wasmsimd.magic_bias[k] = 12582912.0f;
    params->fp32_wasmsimd.scale[k] = product_output_scale;
  }
}
4616 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4617 
xnn_init_f16_f32_cvt_scalar_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4618 XNN_INTERNAL void xnn_init_f16_f32_cvt_scalar_params(
4619   union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4620 {
4621   params->scalar.sign_mask = UINT32_C(0x80000000);
4622   params->scalar.exp_offset = UINT32_C(0x70000000);
4623   params->scalar.exp_scale = 0x1.0p-112f;
4624   params->scalar.magic_mask = UINT32_C(0x3F000000);
4625   params->scalar.magic_bias = 0.5f;
4626   params->scalar.denorm_cutoff = UINT32_C(0x08000000);
4627 }
4628 
4629 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_f16_f32_cvt_neon_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4630 XNN_INTERNAL void xnn_init_f16_f32_cvt_neon_params(
4631   union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4632 {
4633   params->neon.exp_scale = 0x1.0p-112f;
4634 }
4635 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
4636 
4637 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_f16_f32_cvt_sse_int16_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4638 XNN_INTERNAL void xnn_init_f16_f32_cvt_sse_int16_params(
4639   union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4640 {
4641   for (uint32_t i = 0; i < 8; i++) {
4642     params->sse_int16.sign_mask[i] = UINT16_C(0x8000);
4643     params->sse_int16.exp_offset[i] = UINT16_C(0x7000);
4644   }
4645   for (uint32_t i = 0; i < 4; i++) {
4646     params->sse_int16.exp_scale[i] = 0x1.0p-112f;
4647   }
4648   for (uint32_t i = 0; i < 8; i++) {
4649     params->sse_int16.magic_mask[i] = UINT16_C(0x3F00);
4650   }
4651   for (uint32_t i = 0; i < 4; i++) {
4652     params->sse_int16.magic_bias[i] = 0.5f;
4653   }
4654   for (uint32_t i = 0; i < 8; i++) {
4655     params->sse_int16.denorm_cutoff[i] = INT16_C(0x0400);
4656   }
4657 }
4658 
xnn_init_f16_f32_cvt_sse_int32_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4659 XNN_INTERNAL void xnn_init_f16_f32_cvt_sse_int32_params(
4660   union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4661 {
4662   for (uint32_t i = 0; i < 4; i++) {
4663     params->sse_int32.sign_mask[i] = UINT32_C(0x80000000);
4664     params->sse_int32.exp_offset[i] = UINT32_C(0x70000000);
4665     params->sse_int32.exp_scale[i] = 0x1.0p-112f;
4666     params->sse_int32.magic_bias[i] = UINT32_C(0x3F000000);
4667     params->sse_int32.denorm_cutoff[i] = INT32_C(0x04000000);
4668   }
4669 }
4670 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
4671 
4672 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_f16_f32_cvt_wasmsimd_int16_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4673 XNN_INTERNAL void xnn_init_f16_f32_cvt_wasmsimd_int16_params(
4674   union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4675 {
4676   for (uint32_t i = 0; i < 4; i++) {
4677     params->wasmsimd_int16.sign_mask[i] = UINT16_C(0x8000);
4678     params->wasmsimd_int16.exp_offset[i] = UINT16_C(0x7000);
4679   }
4680   for (uint32_t i = 0; i < 2; i++) {
4681     params->wasmsimd_int16.exp_scale[i] = 0x1.0p-112f;
4682   }
4683   for (uint32_t i = 0; i < 4; i++) {
4684     params->wasmsimd_int16.magic_mask[i] = UINT16_C(0x3F00);
4685   }
4686   for (uint32_t i = 0; i < 2; i++) {
4687     params->wasmsimd_int16.magic_bias[i] = 0.5f;
4688   }
4689   for (uint32_t i = 0; i < 4; i++) {
4690     params->wasmsimd_int16.denorm_cutoff[i] = INT16_C(0x0400);
4691   }
4692 }
4693 
xnn_init_f16_f32_cvt_wasmsimd_int32_params(union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS (1)])4694 XNN_INTERNAL void xnn_init_f16_f32_cvt_wasmsimd_int32_params(
4695   union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
4696 {
4697   for (uint32_t i = 0; i < 2; i++) {
4698     params->wasmsimd_int32.sign_mask[i] = UINT32_C(0x80000000);
4699     params->wasmsimd_int32.exp_offset[i] = UINT32_C(0x70000000);
4700     params->wasmsimd_int32.exp_scale[i] = 0x1.0p-112f;
4701     params->wasmsimd_int32.magic_bias[i] = UINT32_C(0x3F000000);
4702     params->wasmsimd_int32.denorm_cutoff[i] = INT32_C(0x04000000);
4703   }
4704 }
4705 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4706 
xnn_init_f32_f16_cvt_scalar_bitcast_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4707 XNN_INTERNAL void xnn_init_f32_f16_cvt_scalar_bitcast_params(
4708   union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4709 {
4710   params->scalar_bitcast.nonsign_mask = UINT32_C(0x7FFFFFFF);
4711   params->scalar_bitcast.exp_bias = UINT32_C(0x07800000);
4712   params->scalar_bitcast.scale_to_inf = 0x1.0p+112f;
4713   params->scalar_bitcast.expw_max = UINT32_C(0x7F800000);
4714   params->scalar_bitcast.scale_to_zero = 0x1.0p-110f;
4715   params->scalar_bitcast.bias_min = UINT32_C(0x40000000);
4716   params->scalar_bitcast.exph_mask = UINT16_C(0x7C00);
4717   params->scalar_bitcast.manth_mask = UINT16_C(0x0FFF);
4718   params->scalar_bitcast.nanh = UINT16_C(0x7E00);
4719 }
4720 
xnn_init_f32_f16_cvt_scalar_fabsf_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4721 XNN_INTERNAL void xnn_init_f32_f16_cvt_scalar_fabsf_params(
4722   union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4723 {
4724   params->scalar_fabsf.scale_to_inf = 0x1.0p+112f;
4725   params->scalar_fabsf.exp_bias = UINT32_C(0x07800000);
4726   params->scalar_fabsf.scale_to_zero = 0x1.0p-110f;
4727   params->scalar_fabsf.expw_max = UINT32_C(0x7F800000);
4728   params->scalar_fabsf.bias_min = UINT32_C(0x40000000);
4729   params->scalar_fabsf.exph_mask = UINT16_C(0x7C00);
4730   params->scalar_fabsf.manth_mask = UINT16_C(0x0FFF);
4731   params->scalar_fabsf.nanh = UINT16_C(0x7E00);
4732 }
4733 
4734 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_f32_f16_cvt_neon_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4735 XNN_INTERNAL void xnn_init_f32_f16_cvt_neon_params(
4736   union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4737 {
4738   params->neon.exp_bias = UINT32_C(0x07800000);
4739   params->neon.scale_to_inf = 0x1.0p+112f;
4740   params->neon.expw_max = UINT32_C(0x7F800000);
4741   params->neon.scale_to_zero = 0x1.0p-110f;
4742 }
4743 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
4744 
4745 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_f32_f16_cvt_sse2_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4746 XNN_INTERNAL void xnn_init_f32_f16_cvt_sse2_params(
4747   union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4748 {
4749   for (uint32_t i = 0; i < 4; i++) {
4750     params->sse2.nonsign_mask[i] = UINT32_C(0x7FFFFFFF);
4751     params->sse2.exp_bias[i] = UINT32_C(0x07800000);
4752     params->sse2.scale_to_inf[i] = 0x1.0p+112f;
4753     params->sse2.expw_max[i] = UINT32_C(0x7F800000);
4754     params->sse2.scale_to_zero[i] = 0x1.0p-110f;
4755   }
4756   params->sse2.bias_min[0] = INT16_C(0x8000);
4757   params->sse2.bias_min[1] = INT16_C(0x4000);
4758   params->sse2.bias_min[2] = INT16_C(0x8000);
4759   params->sse2.bias_min[3] = INT16_C(0x4000);
4760   params->sse2.bias_min[4] = INT16_C(0x8000);
4761   params->sse2.bias_min[5] = INT16_C(0x4000);
4762   params->sse2.bias_min[6] = INT16_C(0x8000);
4763   params->sse2.bias_min[7] = INT16_C(0x4000);
4764   for (uint32_t i = 0; i < 4; i++) {
4765     params->sse2.manth_mask[i] = UINT32_C(0x00000FFF);
4766     params->sse2.exph_mask[i] = UINT32_C(0x00007C00);
4767   }
4768   for (uint32_t i = 0; i < 8; i++) {
4769     params->sse2.nanh[i] = UINT16_C(0x7E00);
4770   }
4771 }
4772 
xnn_init_f32_f16_cvt_f16c_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4773 XNN_INTERNAL void xnn_init_f32_f16_cvt_f16c_params(
4774   union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4775 {
4776   for (uint32_t i = 0; i < 7; i++) {
4777     params->f16c.mask_table[i] = -1;
4778   }
4779   for (uint32_t i = 7; i < 14; i++) {
4780     params->f16c.mask_table[i] = 0;
4781   }
4782 }
4783 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
4784 
4785 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_f32_f16_cvt_wasmsimd_params(union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS (1)])4786 XNN_INTERNAL void xnn_init_f32_f16_cvt_wasmsimd_params(
4787   union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
4788 {
4789   for (uint32_t i = 0; i < 2; i++) {
4790     params->wasmsimd.exp_bias[i] = UINT32_C(0x07800000);
4791     params->wasmsimd.scale_to_inf[i] = 0x1.0p+112f;
4792     params->wasmsimd.expw_max[i] = UINT32_C(0x7F800000);
4793     params->wasmsimd.scale_to_zero[i] = 0x1.0p-110f;
4794   }
4795   params->wasmsimd.bias_min[0] = INT16_C(0x8000);
4796   params->wasmsimd.bias_min[1] = INT16_C(0x4000);
4797   params->wasmsimd.bias_min[2] = INT16_C(0x8000);
4798   params->wasmsimd.bias_min[3] = INT16_C(0x4000);
4799   for (uint32_t i = 0; i < 2; i++) {
4800     params->wasmsimd.manth_mask[i] = UINT32_C(0x00000FFF);
4801     params->wasmsimd.exph_mask[i] = UINT32_C(0x00007C00);
4802   }
4803   for (uint32_t i = 0; i < 4; i++) {
4804     params->wasmsimd.nanh[i] = UINT16_C(0x7E00);
4805   }
4806 }
4807 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4808 
xnn_init_f32_qs8_cvt_scalar_fmagic_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4809 XNN_INTERNAL void xnn_init_f32_qs8_cvt_scalar_fmagic_params(
4810   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4811   float scale,
4812   int8_t output_zero_point,
4813   int8_t output_min,
4814   int8_t output_max)
4815 {
4816   params->scalar_fmagic.scale = scale;
4817   params->scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
4818   params->scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4819   params->scalar_fmagic.magic_bias = 12582912.0f;
4820   params->scalar_fmagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
4821 }
4822 
xnn_init_f32_qs8_cvt_scalar_imagic_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4823 XNN_INTERNAL void xnn_init_f32_qs8_cvt_scalar_imagic_params(
4824   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4825   float scale,
4826   int8_t output_zero_point,
4827   int8_t output_min,
4828   int8_t output_max)
4829 {
4830   const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
4831   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4832   params->scalar_imagic.scale = scale;
4833   params->scalar_imagic.magic_bias = 12582912.0f;
4834   params->scalar_imagic.magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
4835   params->scalar_imagic.magic_max = (int32_t) fp32_to_bits(12582912.0f + output_max_less_zero_point);
4836   params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
4837 }
4838 
xnn_init_f32_qs8_cvt_scalar_lrintf_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4839 XNN_INTERNAL void xnn_init_f32_qs8_cvt_scalar_lrintf_params(
4840   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4841   float scale,
4842   int8_t output_zero_point,
4843   int8_t output_min,
4844   int8_t output_max)
4845 {
4846   params->scalar_lrintf.scale = scale;
4847   params->scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
4848   params->scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4849   params->scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
4850 }
4851 
4852 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_f32_qs8_cvt_neon_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4853 XNN_INTERNAL void xnn_init_f32_qs8_cvt_neon_params(
4854   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4855   float scale,
4856   int8_t output_zero_point,
4857   int8_t output_min,
4858   int8_t output_max)
4859 {
4860   params->neon.scale = scale;
4861   params->neon.magic_bias = 12582912.0f;
4862   params->neon.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
4863   params->neon.output_min = output_min;
4864   params->neon.output_max = output_max;
4865 }
4866 
xnn_init_f32_qs8_cvt_neonv8_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4867 XNN_INTERNAL void xnn_init_f32_qs8_cvt_neonv8_params(
4868   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4869   float scale,
4870   int8_t output_zero_point,
4871   int8_t output_min,
4872   int8_t output_max)
4873 {
4874   params->neonv8.scale = scale;
4875   params->neonv8.output_zero_point = (int16_t) output_zero_point;
4876   params->neonv8.output_min = output_min;
4877   params->neonv8.output_max = output_max;
4878 }
4879 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
4880 
4881 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_f32_qs8_cvt_sse2_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4882 XNN_INTERNAL void xnn_init_f32_qs8_cvt_sse2_params(
4883   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4884   float scale,
4885   int8_t output_zero_point,
4886   int8_t output_min,
4887   int8_t output_max)
4888 {
4889   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4890   for (uint32_t i = 0; i < 4; i++) {
4891     params->sse2.scale[i] = scale;
4892     params->sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
4893   }
4894   for (uint32_t i = 0; i < 8; i++) {
4895     params->sse2.output_zero_point[i] = (int16_t) output_zero_point;
4896     params->sse2.output_min[i] = (int16_t) output_min;
4897   }
4898 }
4899 
xnn_init_f32_qs8_cvt_sse4_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4900 XNN_INTERNAL void xnn_init_f32_qs8_cvt_sse4_params(
4901   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4902   float scale,
4903   int8_t output_zero_point,
4904   int8_t output_min,
4905   int8_t output_max)
4906 {
4907   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4908   for (uint32_t i = 0; i < 4; i++) {
4909     params->sse4.scale[i] = scale;
4910     params->sse4.output_max_less_zero_point[i] = output_max_less_zero_point;
4911   }
4912   for (uint32_t i = 0; i < 8; i++) {
4913     params->sse4.output_zero_point[i] = (int16_t) output_zero_point;
4914   }
4915   for (uint32_t i = 0; i < 16; i++) {
4916     params->sse4.output_min[i] = output_min;
4917   }
4918 }
4919 
xnn_init_f32_qs8_cvt_avx_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4920 XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx_params(
4921   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4922   float scale,
4923   int8_t output_zero_point,
4924   int8_t output_min,
4925   int8_t output_max)
4926 {
4927   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4928   for (uint32_t i = 0; i < 8; i++) {
4929     params->avx.scale[i] = scale;
4930     params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
4931   }
4932   for (uint32_t i = 0; i < 8; i++) {
4933     params->avx.output_zero_point[i] = (int16_t) output_zero_point;
4934   }
4935   for (uint32_t i = 0; i < 16; i++) {
4936     params->avx.output_min[i] = output_min;
4937   }
4938   for (uint32_t i = 0; i < 7; i++) {
4939     params->avx.mask_table[i] = -1;
4940   }
4941   for (uint32_t i = 7; i < 14; i++) {
4942     params->avx.mask_table[i] = 0;
4943   }
4944 }
4945 
xnn_init_f32_qs8_cvt_avx2_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4946 XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx2_params(
4947   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4948   float scale,
4949   int8_t output_zero_point,
4950   int8_t output_min,
4951   int8_t output_max)
4952 {
4953   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4954   for (uint32_t i = 0; i < 8; i++) {
4955     params->avx2.scale[i] = scale;
4956     params->avx2.output_max_less_zero_point[i] = output_max_less_zero_point;
4957   }
4958   for (uint32_t i = 0; i < 16; i++) {
4959     params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
4960   }
4961   params->avx2.shuffle_mask[0] = 0;
4962   params->avx2.shuffle_mask[1] = 4;
4963   params->avx2.shuffle_mask[2] = 1;
4964   params->avx2.shuffle_mask[3] = 5;
4965   params->avx2.shuffle_mask[4] = 2;
4966   params->avx2.shuffle_mask[5] = 6;
4967   params->avx2.shuffle_mask[6] = 3;
4968   params->avx2.shuffle_mask[7] = 7;
4969   for (uint32_t i = 0; i < 32; i++) {
4970     params->avx2.output_min[i] = output_min;
4971   }
4972   for (uint32_t i = 0; i < 7; i++) {
4973     params->avx2.mask_table[i] = -1;
4974   }
4975   for (uint32_t i = 7; i < 14; i++) {
4976     params->avx2.mask_table[i] = 0;
4977   }
4978 }
4979 
xnn_init_f32_qs8_cvt_avx512_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)4980 XNN_INTERNAL void xnn_init_f32_qs8_cvt_avx512_params(
4981   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
4982   float scale,
4983   int8_t output_zero_point,
4984   int8_t output_min,
4985   int8_t output_max)
4986 {
4987   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
4988   for (uint32_t i = 0; i < 16; i++) {
4989     params->avx512.scale[i] = scale;
4990     params->avx512.output_max_less_zero_point[i] = output_max_less_zero_point;
4991   }
4992   for (uint32_t i = 0; i < 32; i++) {
4993     params->avx512.output_zero_point[i] = (int16_t) output_zero_point;
4994   }
4995   for (uint32_t i = 0; i < 64; i++) {
4996     params->avx512.output_min[i] = output_min;
4997   }
4998   params->avx512.shuffle512_mask[0] = 0;
4999   params->avx512.shuffle512_mask[1] = 4;
5000   params->avx512.shuffle512_mask[2] = 8;
5001   params->avx512.shuffle512_mask[3] = 12;
5002   params->avx512.shuffle512_mask[4] = 1;
5003   params->avx512.shuffle512_mask[5] = 5;
5004   params->avx512.shuffle512_mask[6] = 9;
5005   params->avx512.shuffle512_mask[7] = 13;
5006   params->avx512.shuffle512_mask[8] = 2;
5007   params->avx512.shuffle512_mask[9] = 6;
5008   params->avx512.shuffle512_mask[10] = 10;
5009   params->avx512.shuffle512_mask[11] = 14;
5010   params->avx512.shuffle512_mask[12] = 3;
5011   params->avx512.shuffle512_mask[13] = 7;
5012   params->avx512.shuffle512_mask[14] = 11;
5013   params->avx512.shuffle512_mask[15] = 15;
5014   params->avx512.shuffle256_mask[0] = 0;
5015   params->avx512.shuffle256_mask[1] = 4;
5016   params->avx512.shuffle256_mask[2] = 2;
5017   params->avx512.shuffle256_mask[3] = 6;
5018   params->avx512.shuffle256_mask[4] = 1;
5019   params->avx512.shuffle256_mask[5] = 5;
5020   params->avx512.shuffle256_mask[6] = 3;
5021   params->avx512.shuffle256_mask[7] = 7;
5022 }
5023 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
5024 
5025 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_f32_qs8_cvt_wasmsimd_cvt_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)5026 XNN_INTERNAL void xnn_init_f32_qs8_cvt_wasmsimd_cvt_params(
5027   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5028   float scale,
5029   int8_t output_zero_point,
5030   int8_t output_min,
5031   int8_t output_max)
5032 {
5033   for (uint32_t i = 0; i < 2; i++) {
5034     params->wasmsimd_cvt.scale[i] = scale;
5035   }
5036   for (uint32_t i = 0; i < 4; i++) {
5037     params->wasmsimd_cvt.output_zero_point[i] = (int16_t) output_zero_point;
5038   }
5039   for (uint32_t i = 0; i < 8; i++) {
5040     params->wasmsimd_cvt.output_min[i] = output_min;
5041     params->wasmsimd_cvt.output_max[i] = output_max;
5042   }
5043 }
5044 
xnn_init_f32_qs8_cvt_wasmsimd_magic_params(union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t output_zero_point,int8_t output_min,int8_t output_max)5045 XNN_INTERNAL void xnn_init_f32_qs8_cvt_wasmsimd_magic_params(
5046   union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5047   float scale,
5048   int8_t output_zero_point,
5049   int8_t output_min,
5050   int8_t output_max)
5051 {
5052   const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5053   const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
5054   const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5055   for (uint32_t i = 0; i < 2; i++) {
5056     params->wasmsimd_magic.scale[i] = scale;
5057     params->wasmsimd_magic.magic_bias[i] = 12582912.0f;
5058     params->wasmsimd_magic.magic_min[i] = magic_min;
5059     params->wasmsimd_magic.magic_bias_less_zero_point[i] = magic_bias_less_zero_point;
5060   }
5061   for (uint32_t i = 0; i < 8; i++) {
5062     params->wasmsimd_magic.output_max[i] = output_max;
5063   }
5064 }
5065 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5066 
xnn_init_f32_qu8_cvt_scalar_fmagic_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5067 XNN_INTERNAL void xnn_init_f32_qu8_cvt_scalar_fmagic_params(
5068   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5069   float scale,
5070   uint8_t output_zero_point,
5071   uint8_t output_min,
5072   uint8_t output_max)
5073 {
5074   params->scalar_fmagic.scale = scale;
5075   params->scalar_fmagic.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5076   params->scalar_fmagic.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5077   params->scalar_fmagic.magic_bias = 12582912.0f;
5078   params->scalar_fmagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5079 }
5080 
xnn_init_f32_qu8_cvt_scalar_imagic_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5081 XNN_INTERNAL void xnn_init_f32_qu8_cvt_scalar_imagic_params(
5082   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5083   float scale,
5084   uint8_t output_zero_point,
5085   uint8_t output_min,
5086   uint8_t output_max)
5087 {
5088   const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5089   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5090   params->scalar_imagic.scale = scale;
5091   params->scalar_imagic.magic_bias = 12582912.0f;
5092   params->scalar_imagic.magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
5093   params->scalar_imagic.magic_max = (int32_t) fp32_to_bits(12582912.0f + output_max_less_zero_point);
5094   params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5095 }
5096 
xnn_init_f32_qu8_cvt_scalar_lrintf_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5097 XNN_INTERNAL void xnn_init_f32_qu8_cvt_scalar_lrintf_params(
5098   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5099   float scale,
5100   uint8_t output_zero_point,
5101   uint8_t output_min,
5102   uint8_t output_max)
5103 {
5104   params->scalar_lrintf.scale = scale;
5105   params->scalar_lrintf.output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5106   params->scalar_lrintf.output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5107   params->scalar_lrintf.output_zero_point = (int32_t) output_zero_point;
5108 }
5109 
5110 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_f32_qu8_cvt_neon_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5111 XNN_INTERNAL void xnn_init_f32_qu8_cvt_neon_params(
5112   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5113   float scale,
5114   uint8_t output_zero_point,
5115   uint8_t output_min,
5116   uint8_t output_max)
5117 {
5118   params->neon.scale = scale;
5119   params->neon.magic_bias = 12582912.0f;
5120   params->neon.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5121   params->neon.output_min = output_min;
5122   params->neon.output_max = output_max;
5123 }
5124 
xnn_init_f32_qu8_cvt_neonv8_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5125 XNN_INTERNAL void xnn_init_f32_qu8_cvt_neonv8_params(
5126   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5127   float scale,
5128   uint8_t output_zero_point,
5129   uint8_t output_min,
5130   uint8_t output_max)
5131 {
5132   params->neonv8.scale = scale;
5133   params->neonv8.output_zero_point = (int16_t) output_zero_point;
5134   params->neonv8.output_min = output_min;
5135   params->neonv8.output_max = output_max;
5136 }
5137 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
5138 
5139 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_f32_qu8_cvt_sse2_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5140 XNN_INTERNAL void xnn_init_f32_qu8_cvt_sse2_params(
5141   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5142   float scale,
5143   uint8_t output_zero_point,
5144   uint8_t output_min,
5145   uint8_t output_max)
5146 {
5147   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5148   for (uint32_t i = 0; i < 4; i++) {
5149     params->sse2.scale[i] = scale;
5150     params->sse2.output_max_less_zero_point[i] = output_max_less_zero_point;
5151   }
5152   for (uint32_t i = 0; i < 8; i++) {
5153     params->sse2.output_zero_point[i] = (int16_t) output_zero_point;
5154   }
5155   for (uint32_t i = 0; i < 16; i++) {
5156     params->sse2.output_min[i] = output_min;
5157   }
5158 }
5159 
xnn_init_f32_qu8_cvt_avx_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5160 XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx_params(
5161   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5162   float scale,
5163   uint8_t output_zero_point,
5164   uint8_t output_min,
5165   uint8_t output_max)
5166 {
5167   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5168   for (uint32_t i = 0; i < 8; i++) {
5169     params->avx.scale[i] = scale;
5170     params->avx.output_max_less_zero_point[i] = output_max_less_zero_point;
5171   }
5172   for (uint32_t i = 0; i < 8; i++) {
5173     params->avx.output_zero_point[i] = (int16_t) output_zero_point;
5174   }
5175   for (uint32_t i = 0; i < 16; i++) {
5176     params->avx.output_min[i] = output_min;
5177   }
5178   for (uint32_t i = 0; i < 7; i++) {
5179     params->avx.mask_table[i] = -1;
5180   }
5181   for (uint32_t i = 7; i < 14; i++) {
5182     params->avx.mask_table[i] = 0;
5183   }
5184 }
5185 
xnn_init_f32_qu8_cvt_avx2_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5186 XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx2_params(
5187   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5188   float scale,
5189   uint8_t output_zero_point,
5190   uint8_t output_min,
5191   uint8_t output_max)
5192 {
5193   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5194   for (uint32_t i = 0; i < 8; i++) {
5195     params->avx2.scale[i] = scale;
5196     params->avx2.output_max_less_zero_point[i] = output_max_less_zero_point;
5197   }
5198   for (uint32_t i = 0; i < 16; i++) {
5199     params->avx2.output_zero_point[i] = (int16_t) output_zero_point;
5200   }
5201   params->avx2.shuffle_mask[0] = 0;
5202   params->avx2.shuffle_mask[1] = 4;
5203   params->avx2.shuffle_mask[2] = 1;
5204   params->avx2.shuffle_mask[3] = 5;
5205   params->avx2.shuffle_mask[4] = 2;
5206   params->avx2.shuffle_mask[5] = 6;
5207   params->avx2.shuffle_mask[6] = 3;
5208   params->avx2.shuffle_mask[7] = 7;
5209   for (uint32_t i = 0; i < 32; i++) {
5210     params->avx2.output_min[i] = output_min;
5211   }
5212   for (uint32_t i = 0; i < 7; i++) {
5213     params->avx2.mask_table[i] = -1;
5214   }
5215   for (uint32_t i = 7; i < 14; i++) {
5216     params->avx2.mask_table[i] = 0;
5217   }
5218 }
5219 
xnn_init_f32_qu8_cvt_avx512_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5220 XNN_INTERNAL void xnn_init_f32_qu8_cvt_avx512_params(
5221   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5222   float scale,
5223   uint8_t output_zero_point,
5224   uint8_t output_min,
5225   uint8_t output_max)
5226 {
5227   const float output_max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);
5228   for (uint32_t i = 0; i < 16; i++) {
5229     params->avx512.scale[i] = scale;
5230     params->avx512.output_max_less_zero_point[i] = output_max_less_zero_point;
5231   }
5232   for (uint32_t i = 0; i < 32; i++) {
5233     params->avx512.output_zero_point[i] = (int16_t) output_zero_point;
5234   }
5235   for (uint32_t i = 0; i < 64; i++) {
5236     params->avx512.output_min[i] = output_min;
5237   }
5238   params->avx512.shuffle512_mask[0] = 0;
5239   params->avx512.shuffle512_mask[1] = 4;
5240   params->avx512.shuffle512_mask[2] = 8;
5241   params->avx512.shuffle512_mask[3] = 12;
5242   params->avx512.shuffle512_mask[4] = 1;
5243   params->avx512.shuffle512_mask[5] = 5;
5244   params->avx512.shuffle512_mask[6] = 9;
5245   params->avx512.shuffle512_mask[7] = 13;
5246   params->avx512.shuffle512_mask[8] = 2;
5247   params->avx512.shuffle512_mask[9] = 6;
5248   params->avx512.shuffle512_mask[10] = 10;
5249   params->avx512.shuffle512_mask[11] = 14;
5250   params->avx512.shuffle512_mask[12] = 3;
5251   params->avx512.shuffle512_mask[13] = 7;
5252   params->avx512.shuffle512_mask[14] = 11;
5253   params->avx512.shuffle512_mask[15] = 15;
5254   params->avx512.shuffle256_mask[0] = 0;
5255   params->avx512.shuffle256_mask[1] = 4;
5256   params->avx512.shuffle256_mask[2] = 2;
5257   params->avx512.shuffle256_mask[3] = 6;
5258   params->avx512.shuffle256_mask[4] = 1;
5259   params->avx512.shuffle256_mask[5] = 5;
5260   params->avx512.shuffle256_mask[6] = 3;
5261   params->avx512.shuffle256_mask[7] = 7;
5262 }
5263 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
5264 
5265 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_f32_qu8_cvt_wasmsimd_cvt_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5266 XNN_INTERNAL void xnn_init_f32_qu8_cvt_wasmsimd_cvt_params(
5267   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5268   float scale,
5269   uint8_t output_zero_point,
5270   uint8_t output_min,
5271   uint8_t output_max)
5272 {
5273   for (uint32_t i = 0; i < 2; i++) {
5274     params->wasmsimd_cvt.scale[i] = scale;
5275   }
5276   for (uint32_t i = 0; i < 4; i++) {
5277     params->wasmsimd_cvt.output_zero_point[i] = (int16_t) output_zero_point;
5278   }
5279   for (uint32_t i = 0; i < 8; i++) {
5280     params->wasmsimd_cvt.output_min[i] = output_min;
5281     params->wasmsimd_cvt.output_max[i] = output_max;
5282   }
5283 }
5284 
xnn_init_f32_qu8_cvt_wasmsimd_magic_params(union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t output_zero_point,uint8_t output_min,uint8_t output_max)5285 XNN_INTERNAL void xnn_init_f32_qu8_cvt_wasmsimd_magic_params(
5286   union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
5287   float scale,
5288   uint8_t output_zero_point,
5289   uint8_t output_min,
5290   uint8_t output_max)
5291 {
5292   const float output_min_less_zero_point = (float) ((int32_t) output_min - (int32_t) output_zero_point);
5293   const int32_t magic_min = (int32_t) fp32_to_bits(12582912.0f + output_min_less_zero_point);
5294   const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
5295   for (uint32_t i = 0; i < 2; i++) {
5296     params->wasmsimd_magic.scale[i] = scale;
5297     params->wasmsimd_magic.magic_bias[i] = 12582912.0f;
5298     params->wasmsimd_magic.magic_min[i] = magic_min;
5299     params->wasmsimd_magic.magic_bias_less_zero_point[i] = magic_bias_less_zero_point;
5300   }
5301   for (uint32_t i = 0; i < 8; i++) {
5302     params->wasmsimd_magic.output_max[i] = output_max;
5303   }
5304 }
5305 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5306 
xnn_init_qs8_f32_cvt_scalar_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5307 XNN_INTERNAL void xnn_init_qs8_f32_cvt_scalar_params(
5308   union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5309   float scale,
5310   int8_t zero_point)
5311 {
5312   params->scalar.zero_point = (int32_t) zero_point;
5313   params->scalar.scale = scale;
5314 }
5315 
5316 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_qs8_f32_cvt_neon_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5317 XNN_INTERNAL void xnn_init_qs8_f32_cvt_neon_params(
5318   union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5319   float scale,
5320   int8_t zero_point)
5321 {
5322   params->neon.minus_zero_point[0] = -(int16_t) zero_point;
5323   params->neon.minus_zero_point[1] = -(int16_t) zero_point;
5324   params->neon.scale = scale;
5325 }
5326 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
5327 
5328 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_qs8_f32_cvt_sse2_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5329 XNN_INTERNAL void xnn_init_qs8_f32_cvt_sse2_params(
5330   union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5331   float scale,
5332   int8_t zero_point)
5333 {
5334   for (uint32_t i = 0; i < 16; i++) {
5335     params->sse2.sign_mask[i] = UINT8_C(0x80);
5336   }
5337   for (uint32_t i = 0; i < 8; i++) {
5338     params->sse2.magic_exp[i] = UINT16_C(0x4B00);
5339   }
5340   const float magic_bias = (float) (INT32_C(0x00800080) + (int32_t) zero_point);
5341   for (uint32_t i = 0; i < 4; i++) {
5342     params->sse2.magic_bias[i] = magic_bias;
5343     params->sse2.scale[i] = scale;
5344   }
5345 }
5346 
xnn_init_qs8_f32_cvt_sse4_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5347 XNN_INTERNAL void xnn_init_qs8_f32_cvt_sse4_params(
5348   union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5349   float scale,
5350   int8_t zero_point)
5351 {
5352   for (uint32_t i = 0; i < 4; i++) {
5353     params->sse4.minus_zero_point[i] = -(int32_t) zero_point;
5354     params->sse4.scale[i] = scale;
5355   }
5356 }
5357 
xnn_init_qs8_f32_cvt_avx_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5358 XNN_INTERNAL void xnn_init_qs8_f32_cvt_avx_params(
5359   union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5360   float scale,
5361   int8_t zero_point)
5362 {
5363   for (uint32_t i = 0; i < 8; i++) {
5364     params->avx.minus_zero_point[i] = -(int32_t) zero_point;
5365     params->avx.scale[i] = scale;
5366   }
5367 }
5368 
xnn_init_qs8_f32_cvt_avx512_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5369 XNN_INTERNAL void xnn_init_qs8_f32_cvt_avx512_params(
5370   union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5371   float scale,
5372   int8_t zero_point)
5373 {
5374   for (uint32_t i = 0; i < 16; i++) {
5375     params->avx512.minus_zero_point[i] = -(int32_t) zero_point;
5376     params->avx512.scale[i] = scale;
5377   }
5378 }
5379 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
5380 
5381 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_qs8_f32_cvt_wasmsimd_params(union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,int8_t zero_point)5382 XNN_INTERNAL void xnn_init_qs8_f32_cvt_wasmsimd_params(
5383   union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5384   float scale,
5385   int8_t zero_point)
5386 {
5387   for (uint32_t i = 0; i < 4; i++) {
5388     params->wasmsimd.minus_zero_point[i] = -(int16_t) zero_point;
5389   }
5390   for (uint32_t i = 0; i < 2; i++) {
5391     params->wasmsimd.scale[i] = scale;
5392   }
5393 }
5394 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5395 
xnn_init_qu8_f32_cvt_scalar_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5396 XNN_INTERNAL void xnn_init_qu8_f32_cvt_scalar_params(
5397   union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5398   float scale,
5399   uint8_t zero_point)
5400 {
5401   params->scalar.zero_point = (int32_t) zero_point;
5402   params->scalar.scale = scale;
5403 }
5404 
5405 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_init_qu8_f32_cvt_neon_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5406 XNN_INTERNAL void xnn_init_qu8_f32_cvt_neon_params(
5407   union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5408   float scale,
5409   uint8_t zero_point)
5410 {
5411   params->neon.minus_zero_point[0] = -(int16_t) zero_point;
5412   params->neon.minus_zero_point[1] = -(int16_t) zero_point;
5413   params->neon.scale = scale;
5414 }
5415 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
5416 
5417 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_init_qu8_f32_cvt_sse2_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5418 XNN_INTERNAL void xnn_init_qu8_f32_cvt_sse2_params(
5419   union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5420   float scale,
5421   uint8_t zero_point)
5422 {
5423   for (uint32_t i = 0; i < 8; i++) {
5424     params->sse2.magic_exp[i] = UINT16_C(0x4B00);
5425   }
5426   const float magic_bias = (float) (INT32_C(0x00800000) + (int32_t) zero_point);
5427   for (uint32_t i = 0; i < 4; i++) {
5428     params->sse2.magic_bias[i] = magic_bias;
5429     params->sse2.scale[i] = scale;
5430   }
5431 }
5432 
xnn_init_qu8_f32_cvt_sse4_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5433 XNN_INTERNAL void xnn_init_qu8_f32_cvt_sse4_params(
5434   union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5435   float scale,
5436   uint8_t zero_point)
5437 {
5438   for (uint32_t i = 0; i < 4; i++) {
5439     params->sse4.minus_zero_point[i] = -(int32_t) zero_point;
5440     params->sse4.scale[i] = scale;
5441   }
5442 }
5443 
xnn_init_qu8_f32_cvt_avx_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5444 XNN_INTERNAL void xnn_init_qu8_f32_cvt_avx_params(
5445   union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5446   float scale,
5447   uint8_t zero_point)
5448 {
5449   for (uint32_t i = 0; i < 8; i++) {
5450     params->avx.minus_zero_point[i] = -(int32_t) zero_point;
5451     params->avx.scale[i] = scale;
5452   }
5453 }
5454 
xnn_init_qu8_f32_cvt_avx512_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5455 XNN_INTERNAL void xnn_init_qu8_f32_cvt_avx512_params(
5456   union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5457   float scale,
5458   uint8_t zero_point)
5459 {
5460   for (uint32_t i = 0; i < 16; i++) {
5461     params->avx512.minus_zero_point[i] = -(int32_t) zero_point;
5462     params->avx512.scale[i] = scale;
5463   }
5464 }
5465 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
5466 
5467 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
xnn_init_qu8_f32_cvt_wasmsimd_params(union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS (1)],float scale,uint8_t zero_point)5468 XNN_INTERNAL void xnn_init_qu8_f32_cvt_wasmsimd_params(
5469   union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
5470   float scale,
5471   uint8_t zero_point)
5472 {
5473   for (uint32_t i = 0; i < 4; i++) {
5474     params->wasmsimd.minus_zero_point[i] = -(int16_t) zero_point;
5475   }
5476   for (uint32_t i = 0; i < 2; i++) {
5477     params->wasmsimd.scale[i] = scale;
5478   }
5479 }
5480 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5481