• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <stdint.h>
7 #include <stddef.h>
8 #include <assert.h>
9 #include <math.h>
10 
11 #include <fp16.h>
12 
13 #include <xnnpack/math.h>
14 #include <xnnpack/microparams-init.h>
15 #include <xnnpack/unaligned.h>
16 
17 
// Fills params->fp32_scalar_fmagic from the output quantization bounds.
//
// The min/max bounds are stored as floats relative to the output zero point.
// The zero point itself is subtracted from 0x4B400000, which is the IEEE-754
// single-precision bit pattern of the magic bias 12582912.0f.
//
// Returns the size in bytes of the fp32_scalar_fmagic member.
size_t xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->fp32_scalar_fmagic);
}
30 
// Fills params->fp32_scalar_imagic from the output quantization bounds.
//
// magic_min / magic_max hold the raw float bits of (magic bias + bound
// relative to the zero point), reinterpreted as int32. 0x4B400000 is the
// IEEE-754 single-precision bit pattern of the magic bias 12582912.0f.
//
// Returns the size in bytes of the fp32_scalar_imagic member.
size_t xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_imagic.magic_bias = magic_bias;
  params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(magic_bias + min_offset);
  params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(magic_bias + max_offset);
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  return sizeof(params->fp32_scalar_imagic);
}
45 
// Fills params->fp32_scalar_lrintf from the output quantization bounds.
//
// The min/max bounds are stored as floats relative to the output zero point;
// the zero point itself is stored widened to int32.
//
// Returns the size in bytes of the fp32_scalar_lrintf member.
size_t xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_lrintf.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->fp32_scalar_lrintf);
}
57 
58 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills params->fp32_sse2 by replicating each scalar across array elements:
//   - output_max_less_zero_point: 4 copies of (output_max - zero point) as float
//   - output_zero_point, output_min: 8 copies each, converted to int16
//
// Returns the size in bytes of the fp32_sse2 member.
size_t xnn_init_qc8_conv_minmax_fp32_sse2_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point = (int16_t) output_zero_point;
  const int16_t min_bound = (int16_t) output_min;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_sse2.output_zero_point[lane] = zero_point;
    params->fp32_sse2.output_min[lane] = min_bound;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_sse2.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_sse2);
}
75 
// Fills params->fp32_sse4 by replicating each scalar across array elements:
//   - output_max_less_zero_point: 4 copies of (output_max - zero point) as float
//   - output_zero_point: 8 copies, converted to int16
//   - output_min: 16 copies
//
// Returns the size in bytes of the fp32_sse4 member.
size_t xnn_init_qc8_conv_minmax_fp32_sse4_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_sse4.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_sse4.output_zero_point[lane] = zero_point;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_sse4.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_sse4);
}
94 
// Fills params->fp32_avx2 by replicating each scalar across array elements:
//   - output_max_less_zero_point: 8 copies of (output_max - zero point) as float
//   - output_zero_point: 16 copies, converted to int16
//   - output_min: 32 copies
//
// Returns the size in bytes of the fp32_avx2 member.
size_t xnn_init_qc8_conv_minmax_fp32_avx2_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx2.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx2.output_zero_point[lane] = zero_point;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_avx2.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_avx2);
}
113 
// Fills params->fp32_avx512 by replicating each scalar across array elements:
//   - output_max_less_zero_point: 16 copies of (output_max - zero point) as float
//   - output_zero_point: 32 copies, converted to int16
//   - output_min: 64 copies
//
// Returns the size in bytes of the fp32_avx512 member.
size_t xnn_init_qc8_conv_minmax_fp32_avx512_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 64; lane++) {
    params->fp32_avx512.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx512.output_zero_point[lane] = zero_point;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx512.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_avx512);
}
132 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
133 
134 #if XNN_ARCH_ARM
// Fills params->fp32_armsimd32 from the output quantization bounds.
//
// output_min / output_max are stored as 32-bit words with the bound's byte
// value repeated in all four bytes (multiplication by 0x01010101).
// 0x4B400000 is the IEEE-754 single-precision bit pattern of the magic bias
// 12582912.0f.
//
// Returns the size in bytes of the fp32_armsimd32 member.
size_t xnn_init_qc8_conv_minmax_fp32_armsimd32_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  // Reinterpret the signed bounds as raw bytes before widening, so that
  // negative values do not sign-extend into the upper bytes.
  const uint32_t min_byte = (uint32_t) (uint8_t) output_min;
  const uint32_t max_byte = (uint32_t) (uint8_t) output_max;

  params->fp32_armsimd32.output_min = min_byte * UINT32_C(0x01010101);
  params->fp32_armsimd32.output_max = max_byte * UINT32_C(0x01010101);
  params->fp32_armsimd32.magic_bias = 12582912.0f;
  params->fp32_armsimd32.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
  return sizeof(params->fp32_armsimd32);
}
147 #endif  // XNN_ARCH_ARM
148 
149 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills params->fp32_neon from the output quantization bounds.
//
// output_min / output_max are stored unchanged. The zero point is subtracted
// from 0x4B400000, the IEEE-754 single-precision bit pattern of the magic
// bias 12582912.0f.
//
// Returns the size in bytes of the fp32_neon member.
size_t xnn_init_qc8_conv_minmax_fp32_neon_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;

  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  return sizeof(params->fp32_neon);
}
162 
// Fills params->fp32_neonv8: the zero point is stored converted to int16,
// and output_min / output_max are stored unchanged.
//
// Returns the size in bytes of the fp32_neonv8 member.
size_t xnn_init_qc8_conv_minmax_fp32_neonv8_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point = (int16_t) output_zero_point;

  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_zero_point = zero_point;
  return sizeof(params->fp32_neonv8);
}
174 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
175 
176 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills params->fp32_wasmsimd by replicating each scalar across array elements:
//   - magic_bias, magic_min, magic_bias_less_output_zero_point: 2 copies each
//   - output_max: 8 copies
//
// magic_min holds the raw float bits of (magic bias + output_min relative to
// the zero point), reinterpreted as int32. 0x4B400000 is the IEEE-754
// single-precision bit pattern of the magic bias 12582912.0f.
//
// Returns the size in bytes of the fp32_wasmsimd member.
size_t xnn_init_qc8_conv_minmax_fp32_wasmsimd_params(
  union xnn_qc8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t magic_min = (int32_t) float_as_uint32(magic_bias + min_offset);
  const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
  for (uint32_t lane = 0; lane < 2; lane++) {
    params->fp32_wasmsimd.magic_bias[lane] = magic_bias;
    params->fp32_wasmsimd.magic_min[lane] = magic_min;
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = magic_bias_less_zero_point;
  }
  return sizeof(params->fp32_wasmsimd);
}
196 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
197 
// Fills params->fp32_scalar_fmagic from the scale and output quantization
// bounds.
//
// scale is asserted to lie in [2^-32, 256). The min/max bounds are stored as
// floats relative to the output zero point. The zero point is subtracted from
// 0x4B400000, the IEEE-754 single-precision bit pattern of the magic bias
// 12582912.0f.
//
// Returns the size in bytes of the fp32_scalar_fmagic member.
size_t xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->fp32_scalar_fmagic);
}
215 
// Fills params->fp32_scalar_imagic from the scale and output quantization
// bounds.
//
// scale is asserted to lie in [2^-32, 256). magic_min / magic_max hold the raw
// float bits of (magic bias + bound relative to the zero point), reinterpreted
// as int32. 0x4B400000 is the IEEE-754 single-precision bit pattern of the
// magic bias 12582912.0f.
//
// Returns the size in bytes of the fp32_scalar_imagic member.
size_t xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.magic_bias = magic_bias;
  params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(magic_bias + min_offset);
  params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(magic_bias + max_offset);
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  return sizeof(params->fp32_scalar_imagic);
}
235 
// Fills params->fp32_scalar_lrintf from the scale and output quantization
// bounds.
//
// scale is asserted to lie in [2^-32, 256). The min/max bounds are stored as
// floats relative to the output zero point; the zero point itself is stored
// widened to int32.
//
// Returns the size in bytes of the fp32_scalar_lrintf member.
size_t xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_lrintf.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->fp32_scalar_lrintf);
}
252 
253 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills params->fp32_sse2 by replicating each scalar across array elements:
//   - scale, output_max_less_zero_point: 4 copies each
//   - output_zero_point, output_min: 8 copies each, converted to int16
//
// scale is asserted to lie in [2^-32, 256).
//
// Returns the size in bytes of the fp32_sse2 member.
size_t xnn_init_qs8_conv_minmax_fp32_sse2_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point = (int16_t) output_zero_point;
  const int16_t min_bound = (int16_t) output_min;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_sse2.output_zero_point[lane] = zero_point;
    params->fp32_sse2.output_min[lane] = min_bound;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_sse2.scale[lane] = scale;
    params->fp32_sse2.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_sse2);
}
275 
// Fills params->fp32_sse4 by replicating each scalar across array elements:
//   - scale, output_max_less_zero_point: 4 copies each
//   - output_zero_point: 8 copies, converted to int16
//   - output_min: 16 copies
//
// scale is asserted to lie in [2^-32, 256).
//
// Returns the size in bytes of the fp32_sse4 member.
size_t xnn_init_qs8_conv_minmax_fp32_sse4_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_sse4.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_sse4.output_zero_point[lane] = zero_point;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_sse4.scale[lane] = scale;
    params->fp32_sse4.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_sse4);
}
299 
// Fills params->fp32_avx2 by replicating each scalar across array elements:
//   - scale, output_max_less_zero_point: 8 copies each
//   - output_zero_point: 16 copies, converted to int16
//   - output_min: 32 copies
//
// scale is asserted to lie in [2^-32, 256).
//
// Returns the size in bytes of the fp32_avx2 member.
size_t xnn_init_qs8_conv_minmax_fp32_avx2_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx2.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx2.output_zero_point[lane] = zero_point;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_avx2.scale[lane] = scale;
    params->fp32_avx2.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_avx2);
}
323 
// Fills params->fp32_avx512 by replicating each scalar across array elements:
//   - scale, output_max_less_zero_point: 16 copies each
//   - output_zero_point: 32 copies, converted to int16
//   - output_min: 64 copies
//
// scale is asserted to lie in [2^-32, 256).
//
// Returns the size in bytes of the fp32_avx512 member.
size_t xnn_init_qs8_conv_minmax_fp32_avx512_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 64; lane++) {
    params->fp32_avx512.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx512.output_zero_point[lane] = zero_point;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx512.scale[lane] = scale;
    params->fp32_avx512.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_avx512);
}
347 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
348 
349 #if XNN_ARCH_ARM
// Fills params->fp32_armsimd32 from the scale and output quantization bounds.
//
// scale is asserted to lie in [2^-32, 256). output_min / output_max are stored
// as 32-bit words with the bound's byte value repeated in all four bytes
// (multiplication by 0x01010101). 0x4B400000 is the IEEE-754 single-precision
// bit pattern of the magic bias 12582912.0f.
//
// Returns the size in bytes of the fp32_armsimd32 member.
size_t xnn_init_qs8_conv_minmax_fp32_armsimd32_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Reinterpret the signed bounds as raw bytes before widening, so that
  // negative values do not sign-extend into the upper bytes.
  const uint32_t min_byte = (uint32_t) (uint8_t) output_min;
  const uint32_t max_byte = (uint32_t) (uint8_t) output_max;

  params->fp32_armsimd32.scale = scale;
  params->fp32_armsimd32.output_min = min_byte * UINT32_C(0x01010101);
  params->fp32_armsimd32.output_max = max_byte * UINT32_C(0x01010101);
  params->fp32_armsimd32.magic_bias = 12582912.0f;
  params->fp32_armsimd32.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
  return sizeof(params->fp32_armsimd32);
}
367 #endif  // XNN_ARCH_ARM
368 
369 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills params->fp32_neon from the scale and output quantization bounds.
//
// scale is asserted to lie in [2^-32, 256). output_min / output_max are stored
// unchanged. The zero point is subtracted from 0x4B400000, the IEEE-754
// single-precision bit pattern of the magic bias 12582912.0f.
//
// Returns the size in bytes of the fp32_neon member.
size_t xnn_init_qs8_conv_minmax_fp32_neon_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;

  params->fp32_neon.scale = scale;
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  return sizeof(params->fp32_neon);
}
387 
// Fills params->fp32_neonv8: scale, output_min and output_max are stored
// unchanged, and the zero point is stored converted to int16.
//
// scale is asserted to lie in [2^-32, 256).
//
// Returns the size in bytes of the fp32_neonv8 member.
size_t xnn_init_qs8_conv_minmax_fp32_neonv8_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point = (int16_t) output_zero_point;

  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_zero_point = zero_point;
  return sizeof(params->fp32_neonv8);
}
404 
// Fills params->rndnu_neon with fixed-point requantization parameters derived
// from the float scale, plus the output zero point and clamping bounds.
//
// scale is asserted to lie in [2^-32, 256). It is decomposed from its
// IEEE-754 bit pattern into:
//   - multiplier: the 24-bit significand (with implicit leading one) shifted
//     left by 7, i.e. a value in [0x40000000, 0x7FFFFF80];
//   - shift: derived from the biased exponent, in [-8, 31], then split into
//     pre_shift + post_shift with post_shift >= 1.
// The pre/post shifts are stored negated.
//
// Returns the size in bytes of the rndnu_neon member.
size_t xnn_init_qs8_conv_minmax_rndnu_neon_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = float_as_uint32(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 31] range.
  // The biased exponent is converted to a signed type before subtracting:
  // done in uint32_t, a negative result (scale >= 1) would wrap around and
  // rely on an implementation-defined unsigned-to-signed conversion.
  const int32_t biased_exponent = (int32_t) (scale_bits >> 23);
  const int32_t shift = 127 + 31 - 32 - biased_exponent;
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.right_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.right_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
  return sizeof(params->rndnu_neon);
}
440 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
441 
442 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills params->fp32_wasmsimd by replicating each scalar across array elements:
//   - scale, magic_bias, magic_min, magic_bias_less_output_zero_point:
//     2 copies each
//   - output_max: 8 copies
//
// scale is asserted to lie in [2^-32, 256). magic_min holds the raw float bits
// of (magic bias + output_min relative to the zero point), reinterpreted as
// int32. 0x4B400000 is the IEEE-754 single-precision bit pattern of the magic
// bias 12582912.0f.
//
// Returns the size in bytes of the fp32_wasmsimd member.
size_t xnn_init_qs8_conv_minmax_fp32_wasmsimd_params(
  union xnn_qs8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t magic_min = (int32_t) float_as_uint32(magic_bias + min_offset);
  const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
  for (uint32_t lane = 0; lane < 2; lane++) {
    params->fp32_wasmsimd.scale[lane] = scale;
    params->fp32_wasmsimd.magic_bias[lane] = magic_bias;
    params->fp32_wasmsimd.magic_min[lane] = magic_min;
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = magic_bias_less_zero_point;
  }
  return sizeof(params->fp32_wasmsimd);
}
467 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
468 
// Fills params->fp32_scalar_fmagic from the kernel zero point, scale and
// output quantization bounds.
//
// scale is asserted to lie in [2^-32, 256). The kernel zero point is stored
// widened to int32. The min/max bounds are stored as floats relative to the
// output zero point, and the output zero point is subtracted from 0x4B400000,
// the IEEE-754 single-precision bit pattern of the magic bias 12582912.0f.
//
// Returns the size in bytes of the fp32_scalar_fmagic member.
size_t xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.kernel_zero_point = (int32_t) kernel_zero_point;
  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->fp32_scalar_fmagic);
}
488 
// Fills params->fp32_scalar_imagic from the kernel zero point, scale and
// output quantization bounds.
//
// scale is asserted to lie in [2^-32, 256). The kernel zero point is stored
// widened to int32. magic_min / magic_max hold the raw float bits of
// (magic bias + bound relative to the output zero point), reinterpreted as
// int32. 0x4B400000 is the IEEE-754 single-precision bit pattern of the magic
// bias 12582912.0f.
//
// Returns the size in bytes of the fp32_scalar_imagic member.
size_t xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.kernel_zero_point = (int32_t) kernel_zero_point;
  params->fp32_scalar_imagic.magic_bias = magic_bias;
  params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(magic_bias + min_offset);
  params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(magic_bias + max_offset);
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  return sizeof(params->fp32_scalar_imagic);
}
510 
// Fills params->fp32_scalar_lrintf from the kernel zero point, scale and
// output quantization bounds.
//
// scale is asserted to lie in [2^-32, 256). Both zero points are stored
// widened to int32; the min/max bounds are stored as floats relative to the
// output zero point.
//
// Returns the size in bytes of the fp32_scalar_lrintf member.
size_t xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point;

  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.kernel_zero_point = (int32_t) kernel_zero_point;
  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar_lrintf.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->fp32_scalar_lrintf);
}
529 
530 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills params->fp32_sse2 by replicating each scalar across array elements:
//   - scale, output_max_less_zero_point: 4 copies each
//   - kernel_zero_point, output_zero_point: 8 copies each, converted to int16
//   - output_min: 16 copies
//
// scale is asserted to lie in [2^-32, 256).
//
// Returns the size in bytes of the fp32_sse2 member.
size_t xnn_init_qu8_conv_minmax_fp32_sse2_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t kernel_zp = (int16_t) kernel_zero_point;
  const int16_t output_zp = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_sse2.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_sse2.kernel_zero_point[lane] = kernel_zp;
    params->fp32_sse2.output_zero_point[lane] = output_zp;
  }
  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_sse2.scale[lane] = scale;
    params->fp32_sse2.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_sse2);
}
556 
// Fills params->fp32_avx2 by replicating each scalar across array elements:
//   - scale, output_max_less_zero_point: 8 copies each
//   - kernel_zero_point, output_zero_point: 16 copies each, converted to int16
//   - output_min: 32 copies
//
// scale is asserted to lie in [2^-32, 256).
//
// Returns the size in bytes of the fp32_avx2 member.
size_t xnn_init_qu8_conv_minmax_fp32_avx2_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t kernel_zp = (int16_t) kernel_zero_point;
  const int16_t output_zp = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx2.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx2.kernel_zero_point[lane] = kernel_zp;
    params->fp32_avx2.output_zero_point[lane] = output_zp;
  }
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fp32_avx2.scale[lane] = scale;
    params->fp32_avx2.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_avx2);
}
582 
// Fills params->fp32_avx512 by replicating each scalar across array elements:
//   - scale, output_max_less_zero_point: 16 copies each
//   - kernel_zero_point, output_zero_point: 32 copies each, converted to int16
//     by way of uint16
//   - output_min: 64 copies
//
// scale is asserted to lie in [2^-32, 256).
//
// Returns the size in bytes of the fp32_avx512 member.
size_t xnn_init_qu8_conv_minmax_fp32_avx512_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t kernel_zp = (int16_t) (uint16_t) kernel_zero_point;
  const int16_t output_zp = (int16_t) (uint16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (uint32_t lane = 0; lane < 64; lane++) {
    params->fp32_avx512.output_min[lane] = output_min;
  }
  for (uint32_t lane = 0; lane < 32; lane++) {
    params->fp32_avx512.kernel_zero_point[lane] = kernel_zp;
    params->fp32_avx512.output_zero_point[lane] = output_zp;
  }
  for (uint32_t lane = 0; lane < 16; lane++) {
    params->fp32_avx512.scale[lane] = scale;
    params->fp32_avx512.output_max_less_zero_point[lane] = max_less_zero_point;
  }
  return sizeof(params->fp32_avx512);
}
608 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
609 
610 #if XNN_ARCH_ARM
// Fills params->fp32_armsimd32 from the kernel zero point, scale and output
// quantization bounds.
//
// scale is asserted to lie in [2^-32, 256).
//   - minus_kernel_zero_point: the negated kernel zero point, truncated to
//     16 bits and repeated in both halves of a 32-bit word (x 0x00010001).
//   - output_min / output_max: the bound's byte value repeated in all four
//     bytes of a 32-bit word (x 0x01010101).
//   - 0x4B400000 is the IEEE-754 single-precision bit pattern of the magic
//     bias 12582912.0f.
//
// Returns the size in bytes of the fp32_armsimd32 member.
size_t xnn_init_qu8_conv_minmax_fp32_armsimd32_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t negated_kernel_zp = -(int32_t) kernel_zero_point;
  const uint32_t negated_kernel_zp_half = (uint32_t) (uint16_t) negated_kernel_zp;
  const uint32_t min_byte = (uint32_t) output_min;
  const uint32_t max_byte = (uint32_t) output_max;

  params->fp32_armsimd32.scale = scale;
  params->fp32_armsimd32.minus_kernel_zero_point = negated_kernel_zp_half * UINT32_C(0x00010001);
  params->fp32_armsimd32.output_min = min_byte * UINT32_C(0x01010101);
  params->fp32_armsimd32.output_max = max_byte * UINT32_C(0x01010101);
  params->fp32_armsimd32.magic_bias = 12582912.0f;
  params->fp32_armsimd32.magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
  return sizeof(params->fp32_armsimd32);
}
631 #endif  // XNN_ARCH_ARM
632 
633 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills params->fp32_neon from the kernel zero point, scale and output
// quantization bounds.
//
// scale is asserted to lie in [2^-32, 256). The kernel zero point is written
// to elements 0..3 of kernel_zero_point; output_min / output_max are stored
// unchanged. The output zero point is subtracted from 0x4B400000, the
// IEEE-754 single-precision bit pattern of the magic bias 12582912.0f.
//
// Returns the size in bytes of the fp32_neon member.
size_t xnn_init_qu8_conv_minmax_fp32_neon_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (uint32_t lane = 0; lane < 4; lane++) {
    params->fp32_neon.kernel_zero_point[lane] = kernel_zero_point;
  }
  params->fp32_neon.scale = scale;
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
  return sizeof(params->fp32_neon);
}
656 
// Fills in the fp32_neonv8 variant of the QU8 convolution min/max parameters
// and returns the size of that variant in bytes.
//
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qu8_conv_minmax_fp32_neonv8_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // The unsigned zero point is widened to 16 bits before being stored as int16_t.
  const int16_t zero_point16 = (int16_t) (uint16_t) output_zero_point;

  // Entries 0..3 of kernel_zero_point all receive the same value.
  for (size_t lane = 0; lane < 4; lane++) {
    params->fp32_neonv8.kernel_zero_point[lane] = kernel_zero_point;
  }
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = zero_point16;
  params->fp32_neonv8.scale = scale;
  return sizeof(params->fp32_neonv8);
}
678 
// Fills in the rndnu_neon variant of the QU8 convolution min/max parameters
// and returns the size of that variant in bytes.
//
// The floating-point scale is converted into an integer multiplier plus a
// shift that is split into a pre-shift and a post-shift; the negated shift
// amounts are what get stored. scale must lie in [2**-32, 256), which is
// checked with assert().
size_t xnn_init_qu8_conv_minmax_rndnu_neon_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Pull the raw bit fields out of the scale.
  const uint32_t bits = float_as_uint32(scale);
  const uint32_t fraction_field = bits & UINT32_C(0x007FFFFF);
  const uint32_t exponent_field = bits >> 23;

  // Low 23 bits of the scale with bit 23 set, moved up by 7 bits:
  // the result lies in [0x40000000, 0x7FFFFF80].
  const int32_t multiplier = (int32_t) ((fraction_field | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // The total shift lies in [-8, 31].
  const int32_t shift = 127 + 31 - 32 - exponent_field;
  assert(shift >= -8);
  assert(shift < 32);

  // The post-shift is the total shift clamped from below at 1 (so it lies in
  // [1, 31]); the pre-shift is whatever remains of the total.
  const int32_t clamped_shift = math_max_s32(shift, 1);
  const int32_t remaining_shift = shift - clamped_shift;

  for (size_t lane = 0; lane < 4; lane++) {
    params->rndnu_neon.kernel_zero_point[lane] = kernel_zero_point;
  }
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.right_pre_shift = -remaining_shift;
  params->rndnu_neon.right_post_shift = -clamped_shift;
  params->rndnu_neon.output_max = output_max;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  return sizeof(params->rndnu_neon);
}
719 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
720 
721 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills in the fp32_wasmsimd variant of the QU8 convolution min/max parameters
// and returns the size of that variant in bytes.
//
// Each field is an array whose written entries all hold the same value:
// 4 entries for kernel_zero_point, 2 for scale, magic_bias, magic_min and
// magic_bias_less_output_zero_point, and 8 for output_max.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qu8_conv_minmax_fp32_wasmsimd_params(
  union xnn_qu8_conv_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t kernel_zero_point,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  // Bit pattern of (magic bias + lower bound relative to the zero point).
  const int32_t magic_min = (int32_t) float_as_uint32(magic_bias + min_less_zero_point);
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  const int16_t kernel_zero_point16 = (int16_t) (uint16_t) kernel_zero_point;

  for (size_t lane = 0; lane < 8; lane++) {
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
  for (size_t lane = 0; lane < 4; lane++) {
    params->fp32_wasmsimd.kernel_zero_point[lane] = kernel_zero_point16;
  }
  for (size_t lane = 0; lane < 2; lane++) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = magic_bias_less_zero_point;
    params->fp32_wasmsimd.magic_min[lane] = magic_min;
    params->fp32_wasmsimd.magic_bias[lane] = magic_bias;
    params->fp32_wasmsimd.scale[lane] = scale;
  }
  return sizeof(params->fp32_wasmsimd);
}
750 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
751 
// Copies per-channel scale values into a packed buffer, one tile at a time.
//
// Channels are processed in groups of channels_tile (the last group may be
// smaller). For each group the scale values are stored at indices
// 0..group_size-1 relative to packed_w via unaligned_indexed_store_f32, after
// which packed_w is advanced by stride bytes.
// NOTE(review): channels_tile is presumably non-zero; a zero value would never
// advance the loop - confirm against callers.
void xnn_init_qc8_scale_fp32_params(
  size_t channels,
  size_t channels_tile,
  size_t stride,
  const float scale[XNN_MIN_ELEMENTS(1)],
  void* packed_w)
{
  size_t first_channel = 0;
  while (first_channel < channels) {
    // Number of channels in this tile: a full tile, or whatever is left.
    const size_t count = min(channels - first_channel, channels_tile);
    const float* tile_scale = &scale[first_channel];
    for (size_t k = 0; k < count; k++) {
      unaligned_indexed_store_f32(packed_w, k, tile_scale[k]);
    }
    // Step to the start of the next tile in the packed buffer.
    packed_w = (void*) ((uintptr_t) packed_w + stride);
    first_channel += channels_tile;
  }
}
767 
// Fills in the fp32_scalar_fmagic variant of the QS8 average-pooling min/max
// parameters and returns the size of that variant in bytes.
//
// The output bounds are stored as floats relative to the output zero point.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qs8_avgpool_minmax_fp32_scalar_fmagic_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t lower_bound = (int32_t) output_min - zero_point;
  const int32_t upper_bound = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) upper_bound;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) lower_bound;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  return sizeof(params->fp32_scalar_fmagic);
}
787 
// Overwrites only the init_bias and scale fields of the fp32_scalar_fmagic
// variant of the QS8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qs8_avgpool_minmax_fp32_scalar_fmagic_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
}
799 
// Fills in the fp32_scalar_imagic variant of the QS8 average-pooling min/max
// parameters and returns the size of that variant in bytes.
//
// magic_min / magic_max hold the bit patterns of the magic bias plus the
// lower / upper output bound taken relative to the output zero point.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const float max_less_zero_point = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
  params->fp32_scalar_imagic.magic_bias = magic_bias;
  params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(magic_bias + max_less_zero_point);
  params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(magic_bias + min_less_zero_point);
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  return sizeof(params->fp32_scalar_imagic);
}
821 
// Overwrites only the init_bias and scale fields of the fp32_scalar_imagic
// variant of the QS8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
}
833 
// Fills in the fp32_scalar_lrintf variant of the QS8 average-pooling min/max
// parameters and returns the size of that variant in bytes.
//
// The output bounds are stored as floats relative to the output zero point;
// the zero point itself is stored widened to 32 bits.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qs8_avgpool_minmax_fp32_scalar_lrintf_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t lower_bound = (int32_t) output_min - zero_point;
  const int32_t upper_bound = (int32_t) output_max - zero_point;

  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_max_less_zero_point = (float) upper_bound;
  params->fp32_scalar_lrintf.output_min_less_zero_point = (float) lower_bound;
  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
  return sizeof(params->fp32_scalar_lrintf);
}
852 
// Overwrites only the init_bias and scale fields of the fp32_scalar_lrintf
// variant of the QS8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qs8_avgpool_minmax_fp32_scalar_lrintf_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
}
864 
865 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills in the fp32_sse2 variant of the QS8 average-pooling min/max parameters
// and returns the size of that variant in bytes.
//
// init_bias, scale and the upper bound (relative to the zero point) are each
// written to 4 entries; the zero point and lower bound, widened to int16_t,
// are each written to 8 entries.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qs8_avgpool_minmax_fp32_sse2_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point16 = (int16_t) output_zero_point;
  const int16_t min16 = (int16_t) output_min;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse2.output_min[k] = min16;
    params->fp32_sse2.output_zero_point[k] = zero_point16;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.output_max_less_zero_point[k] = max_less_zero_point;
    params->fp32_sse2.scale[k] = scale;
    params->fp32_sse2.init_bias[k] = init_bias;
  }
  return sizeof(params->fp32_sse2);
}
889 
// Overwrites only the 4 init_bias and 4 scale entries of the fp32_sse2 variant
// of the QS8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qs8_avgpool_minmax_fp32_sse2_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.scale[k] = scale;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.init_bias[k] = init_bias;
  }
}
903 
// Fills in the fp32_sse4 variant of the QS8 average-pooling min/max parameters
// and returns the size of that variant in bytes.
//
// init_bias, scale and the upper bound (relative to the zero point) are each
// written to 4 entries, the zero point (as int16_t) to 8 entries, and
// output_min to 16 entries.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qs8_avgpool_minmax_fp32_sse4_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t k = 0; k < 16; k++) {
    params->fp32_sse4.output_min[k] = output_min;
  }
  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse4.output_zero_point[k] = zero_point16;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.output_max_less_zero_point[k] = max_less_zero_point;
    params->fp32_sse4.scale[k] = scale;
    params->fp32_sse4.init_bias[k] = init_bias;
  }
  return sizeof(params->fp32_sse4);
}
929 
// Overwrites only the 4 init_bias and 4 scale entries of the fp32_sse4 variant
// of the QS8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qs8_avgpool_minmax_fp32_sse4_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.scale[k] = scale;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.init_bias[k] = init_bias;
  }
}
943 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
944 
945 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills in the fp32_neon variant of the QS8 average-pooling min/max parameters
// and returns the size of that variant in bytes.
//
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qs8_avgpool_minmax_fp32_neon_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;

  params->fp32_neon.output_max = output_max;
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.magic_bias_less_output_zero_point = magic_bias_less_zero_point;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
  return sizeof(params->fp32_neon);
}
965 
// Overwrites only the init_bias and scale fields of the fp32_neon variant of
// the QS8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qs8_avgpool_minmax_fp32_neon_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
}
977 
// Fills in the fp32_neonv8 variant of the QS8 average-pooling min/max
// parameters and returns the size of that variant in bytes.
//
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qs8_avgpool_minmax_fp32_neonv8_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // The zero point is stored widened to 16 bits.
  const int16_t zero_point16 = (int16_t) output_zero_point;

  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = zero_point16;
  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
  return sizeof(params->fp32_neonv8);
}
996 
// Overwrites only the init_bias and scale fields of the fp32_neonv8 variant of
// the QS8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qs8_avgpool_minmax_fp32_neonv8_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
}
1008 
// Fills in the rndnu_neon variant of the QS8 average-pooling min/max
// parameters and returns the size of that variant in bytes.
//
// The floating-point scale is converted into an integer multiplier plus a
// shift that is split into a pre-shift and a post-shift; the negated shift
// amounts are what get stored. scale must lie in [2**-32, 256), which is
// checked with assert().
size_t xnn_init_qs8_avgpool_minmax_rndnu_neon_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Pull the raw bit fields out of the scale.
  const uint32_t bits = float_as_uint32(scale);
  const uint32_t fraction_field = bits & UINT32_C(0x007FFFFF);
  const uint32_t exponent_field = bits >> 23;

  // Low 23 bits of the scale with bit 23 set, moved up by 7 bits:
  // the result lies in [0x40000000, 0x7FFFFF80].
  const int32_t multiplier = (int32_t) ((fraction_field | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // The total shift lies in [-8, 31].
  const int32_t shift = 127 + 31 - 32 - exponent_field;
  assert(shift >= -8);
  assert(shift < 32);

  // The post-shift is the total shift clamped from below at 1 (so it lies in
  // [1, 31]); the pre-shift is whatever remains of the total.
  const int32_t clamped_shift = math_max_s32(shift, 1);
  const int32_t remaining_shift = shift - clamped_shift;

  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_pre_shift = -remaining_shift;
  params->rndnu_neon.left_post_shift = -clamped_shift;
  params->rndnu_neon.init_bias = init_bias;
  params->rndnu_neon.output_max = output_max;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  return sizeof(params->rndnu_neon);
}
1046 
// Recomputes the scale-derived fields (multiplier, left_pre_shift,
// left_post_shift) and init_bias of the rndnu_neon variant of the QS8
// average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qs8_avgpool_minmax_rndnu_neon_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Pull the raw bit fields out of the scale.
  const uint32_t bits = float_as_uint32(scale);
  const uint32_t fraction_field = bits & UINT32_C(0x007FFFFF);
  const uint32_t exponent_field = bits >> 23;

  // Low 23 bits of the scale with bit 23 set, moved up by 7 bits:
  // the result lies in [0x40000000, 0x7FFFFF80].
  const int32_t multiplier = (int32_t) ((fraction_field | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // The total shift lies in [-8, 31].
  const int32_t shift = 127 + 31 - 32 - exponent_field;
  assert(shift >= -8);
  assert(shift < 32);

  // The post-shift is the total shift clamped from below at 1 (so it lies in
  // [1, 31]); the pre-shift is whatever remains of the total.
  const int32_t clamped_shift = math_max_s32(shift, 1);
  const int32_t remaining_shift = shift - clamped_shift;

  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_pre_shift = -remaining_shift;
  params->rndnu_neon.left_post_shift = -clamped_shift;
  params->rndnu_neon.init_bias = init_bias;
}
1077 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1078 
1079 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills in the fp32_wasmsimd variant of the QS8 average-pooling min/max
// parameters and returns the size of that variant in bytes.
//
// init_bias, scale, magic_bias, magic_min and
// magic_bias_less_output_zero_point are each written to 2 entries; output_max
// is written to 8 entries.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  // Bit pattern of (magic bias + lower bound relative to the zero point).
  const int32_t magic_min = (int32_t) float_as_uint32(magic_bias + min_less_zero_point);
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (size_t lane = 0; lane < 8; lane++) {
    params->fp32_wasmsimd.output_max[lane] = output_max;
  }
  for (size_t lane = 0; lane < 2; lane++) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[lane] = magic_bias_less_zero_point;
    params->fp32_wasmsimd.magic_min[lane] = magic_min;
    params->fp32_wasmsimd.magic_bias[lane] = magic_bias;
    params->fp32_wasmsimd.scale[lane] = scale;
    params->fp32_wasmsimd.init_bias[lane] = init_bias;
  }
  return sizeof(params->fp32_wasmsimd);
}
1106 
// Overwrites only the 2 init_bias and 2 scale entries of the fp32_wasmsimd
// variant of the QS8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params(
  union xnn_qs8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_wasmsimd.init_bias[0] = init_bias;
  params->fp32_wasmsimd.init_bias[1] = init_bias;
  params->fp32_wasmsimd.scale[0] = scale;
  params->fp32_wasmsimd.scale[1] = scale;
}
1120 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1121 
// Fills in the fp32_scalar_fmagic variant of the QU8 average-pooling min/max
// parameters and returns the size of that variant in bytes.
//
// The output bounds are stored as floats relative to the output zero point.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t lower_bound = (int32_t) output_min - zero_point;
  const int32_t upper_bound = (int32_t) output_max - zero_point;

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
  params->fp32_scalar_fmagic.output_max_less_zero_point = (float) upper_bound;
  params->fp32_scalar_fmagic.output_min_less_zero_point = (float) lower_bound;
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_scalar_fmagic.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point;
  params->fp32_scalar_fmagic.magic_bias = 12582912.0f;
  return sizeof(params->fp32_scalar_fmagic);
}
1141 
// Overwrites only the init_bias and scale fields of the fp32_scalar_fmagic
// variant of the QU8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qu8_avgpool_minmax_fp32_scalar_fmagic_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_fmagic.scale = scale;
  params->fp32_scalar_fmagic.init_bias = init_bias;
}
1153 
// Fills in the fp32_scalar_imagic variant of the QU8 average-pooling min/max
// parameters and returns the size of that variant in bytes.
//
// magic_min / magic_max hold the bit patterns of the magic bias plus the
// lower / upper output bound taken relative to the output zero point.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_less_zero_point = (float) ((int32_t) output_min - zero_point);
  const float max_less_zero_point = (float) ((int32_t) output_max - zero_point);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
  params->fp32_scalar_imagic.magic_bias = magic_bias;
  params->fp32_scalar_imagic.magic_max = (int32_t) float_as_uint32(magic_bias + max_less_zero_point);
  params->fp32_scalar_imagic.magic_min = (int32_t) float_as_uint32(magic_bias + min_less_zero_point);
  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  params->fp32_scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  return sizeof(params->fp32_scalar_imagic);
}
1175 
// Overwrites only the init_bias and scale fields of the fp32_scalar_imagic
// variant of the QU8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_imagic.scale = scale;
  params->fp32_scalar_imagic.init_bias = init_bias;
}
1187 
// Fills in the fp32_scalar_lrintf variant of the QU8 average-pooling min/max
// parameters and returns the size of that variant in bytes.
//
// The output bounds are stored as floats relative to the output zero point;
// the zero point itself is stored widened to 32 bits.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t lower_bound = (int32_t) output_min - zero_point;
  const int32_t upper_bound = (int32_t) output_max - zero_point;

  params->fp32_scalar_lrintf.output_zero_point = zero_point;
  params->fp32_scalar_lrintf.output_max_less_zero_point = (float) upper_bound;
  params->fp32_scalar_lrintf.output_min_less_zero_point = (float) lower_bound;
  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
  return sizeof(params->fp32_scalar_lrintf);
}
1206 
// Overwrites only the init_bias and scale fields of the fp32_scalar_lrintf
// variant of the QU8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qu8_avgpool_minmax_fp32_scalar_lrintf_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_scalar_lrintf.scale = scale;
  params->fp32_scalar_lrintf.init_bias = init_bias;
}
1218 
1219 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills in the fp32_sse2 variant of the QU8 average-pooling min/max parameters
// and returns the size of that variant in bytes.
//
// init_bias, scale and the upper bound (relative to the zero point) are each
// written to 4 entries, the zero point (as int16_t) to 8 entries, and
// output_min to 16 entries.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qu8_avgpool_minmax_fp32_sse2_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t k = 0; k < 16; k++) {
    params->fp32_sse2.output_min[k] = output_min;
  }
  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse2.output_zero_point[k] = zero_point16;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.output_max_less_zero_point[k] = max_less_zero_point;
    params->fp32_sse2.scale[k] = scale;
    params->fp32_sse2.init_bias[k] = init_bias;
  }
  return sizeof(params->fp32_sse2);
}
1245 
// Overwrites only the 4 init_bias and 4 scale entries of the fp32_sse2 variant
// of the QU8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qu8_avgpool_minmax_fp32_sse2_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.scale[k] = scale;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.init_bias[k] = init_bias;
  }
}
1259 
// Fills in the fp32_sse4 variant of the QU8 average-pooling min/max parameters
// and returns the size of that variant in bytes.
//
// init_bias, scale and the upper bound (relative to the zero point) are each
// written to 4 entries, the zero point (as int16_t) to 8 entries, and
// output_min to 16 entries.
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qu8_avgpool_minmax_fp32_sse4_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_less_zero_point = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t k = 0; k < 16; k++) {
    params->fp32_sse4.output_min[k] = output_min;
  }
  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse4.output_zero_point[k] = zero_point16;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.output_max_less_zero_point[k] = max_less_zero_point;
    params->fp32_sse4.scale[k] = scale;
    params->fp32_sse4.init_bias[k] = init_bias;
  }
  return sizeof(params->fp32_sse4);
}
1285 
// Overwrites only the 4 init_bias and 4 scale entries of the fp32_sse4 variant
// of the QU8 average-pooling parameters; other fields are untouched.
//
// scale must lie in [2**-32, 256); this is checked with assert().
void xnn_update_qu8_avgpool_minmax_fp32_sse4_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.scale[k] = scale;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.init_bias[k] = init_bias;
  }
}
1299 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1300 
1301 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills in the fp32_neon variant of the QU8 average-pooling min/max parameters
// and returns the size of that variant in bytes.
//
// scale must lie in [2**-32, 256); this is checked with assert().
size_t xnn_init_qu8_avgpool_minmax_fp32_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // 0x4B400000 is the IEEE-754 single-precision bit pattern of 12582912.0f.
  const int32_t magic_bias_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;

  params->fp32_neon.output_max = output_max;
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.magic_bias_less_output_zero_point = magic_bias_less_zero_point;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
  return sizeof(params->fp32_neon);
}
1321 
// Rewrites only init_bias and scale of the fp32_neon QU8 average-pooling
// parameters; no other field is written.
// scale must lie in [2**-32, 256); this is checked by assert only.
void xnn_update_qu8_avgpool_minmax_fp32_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neon.scale = scale;
  params->fp32_neon.init_bias = init_bias;
}
1333 
// Initializes the fp32_neonv8 variant of the QU8 average-pooling parameters.
//
// Stores init_bias, scale, output_min and output_max as given, and
// output_zero_point converted to int16_t.
// scale must lie in [2**-32, 256); this is checked by assert only.
// Returns sizeof(params->fp32_neonv8).
size_t xnn_init_qu8_avgpool_minmax_fp32_neonv8_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const int16_t zero_point = (int16_t) output_zero_point;

  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_zero_point = zero_point;
  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
  return sizeof(params->fp32_neonv8);
}
1352 
// Rewrites only init_bias and scale of the fp32_neonv8 QU8 average-pooling
// parameters; no other field is written.
// scale must lie in [2**-32, 256); this is checked by assert only.
void xnn_update_qu8_avgpool_minmax_fp32_neonv8_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  params->fp32_neonv8.scale = scale;
  params->fp32_neonv8.init_bias = init_bias;
}
1364 
// Initializes the rndnu_neon variant of the QU8 average-pooling parameters.
//
// The float scale is decomposed into an integer multiplier (the 24-bit
// significand, implicit leading one restored, shifted left by 7) and a shift
// derived from the biased exponent. The shift is split into a pre-shift and a
// post-shift, both stored negated as left shifts.
// scale must lie in [2**-32, 256); this is checked by assert only.
// Returns sizeof(params->rndnu_neon).
size_t xnn_init_qu8_avgpool_minmax_rndnu_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = float_as_uint32(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 31] range.
  // The exponent field is cast to a signed type before subtracting: the shift
  // is negative for scale >= 1, and computing it in unsigned arithmetic would
  // wrap and then rely on an implementation-defined conversion to int32_t.
  const int32_t shift = 127 + 31 - 32 - (int32_t) (scale_bits >> 23);
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.init_bias = init_bias;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
  return sizeof(params->rndnu_neon);
}
1402 
// Rewrites init_bias and the scale-derived fields (multiplier, left_pre_shift,
// left_post_shift) of the rndnu_neon QU8 average-pooling parameters; no other
// field is written.
//
// The float scale is decomposed into an integer multiplier (the 24-bit
// significand, implicit leading one restored, shifted left by 7) and a shift
// derived from the biased exponent.
// scale must lie in [2**-32, 256); this is checked by assert only.
void xnn_update_qu8_avgpool_minmax_rndnu_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Compute requantization parameters.
  const uint32_t scale_bits = float_as_uint32(scale);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
  const int32_t multiplier = (int32_t) (((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 31] range.
  // The exponent field is cast to a signed type before subtracting: the shift
  // is negative for scale >= 1, and computing it in unsigned arithmetic would
  // wrap and then rely on an implementation-defined conversion to int32_t.
  const int32_t shift = 127 + 31 - 32 - (int32_t) (scale_bits >> 23);
  assert(shift >= -8);
  assert(shift < 32);

  // Split shift into pre_shift + post_shift, post_shift in [1, 31] range.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  params->rndnu_neon.init_bias = init_bias;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_post_shift = -post_shift;
}
1433 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1434 
1435 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Initializes the fp32_wasmsimd variant of the QU8 average-pooling parameters.
//
// init_bias, scale, magic_bias (12582912.0f), magic_min and
// magic_bias_less_output_zero_point are each written to 2 elements;
// output_max is written to 8 elements.
//   magic_min = bit pattern of (12582912.0f + (output_min - output_zero_point))
//   magic_bias_less_output_zero_point = 0x4B400000 - output_zero_point
// 0x4B400000 is the single-precision bit pattern of 12582912.0f.
// scale must lie in [2**-32, 256); this is checked by assert only.
// Returns sizeof(params->fp32_wasmsimd).
size_t xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  const float magic_bias = 12582912.0f;
  const int32_t min_offset = (int32_t) output_min - (int32_t) output_zero_point;
  const float min_offset_f32 = (float) min_offset;
  const int32_t magic_min_bits = (int32_t) float_as_uint32(magic_bias + min_offset_f32);
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;

  for (size_t k = 0; k < 8; k++) {
    params->fp32_wasmsimd.output_max[k] = output_max;
  }
  for (size_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[k] = bias_bits_less_zero_point;
    params->fp32_wasmsimd.magic_min[k] = magic_min_bits;
    params->fp32_wasmsimd.magic_bias[k] = magic_bias;
    params->fp32_wasmsimd.scale[k] = scale;
    params->fp32_wasmsimd.init_bias[k] = init_bias;
  }
  return sizeof(params->fp32_wasmsimd);
}
1462 
// Rewrites only the init_bias and scale elements (2 of each) of the
// fp32_wasmsimd QU8 average-pooling parameters; no other field is written.
// scale must lie in [2**-32, 256); this is checked by assert only.
void xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t init_bias,
  float scale)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  for (size_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.scale[k] = scale;
  }
  for (size_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.init_bias[k] = init_bias;
  }
}
1476 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1477 
// Initializes the scalar variant of the QU8 average-pooling parameters.
//
// The float scale is decomposed into a 24-bit integer multiplier (significand
// with the implicit leading one restored) and a right shift derived from the
// biased exponent; rounding is 1 << (right_shift - 1). output_min and
// output_max are stored relative to output_zero_point.
// scale must lie in [2**-32, 256); this is checked by assert only.
// Returns sizeof(params->scalar).
size_t xnn_init_qu8_avgpool_minmax_scalar_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Requantization parameters are taken from the bit representation of scale.
  const uint32_t scale_bits = float_as_uint32(scale);
  const uint32_t biased_exponent = scale_bits >> 23;

  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
  assert(multiplier >= INT32_C(0x00800000));
  assert(multiplier <= INT32_C(0x00FFFFFF));

  // Shift is in [16, 55] range.
  const int32_t shift = 127 + 23 - biased_exponent;
  assert(shift >= 16);
  assert(shift < 64);

  const uint32_t right_shift = (uint32_t) shift;
  const int64_t rounding = INT64_C(1) << (right_shift - 1);

  const int32_t zero_point = (int32_t) (uint32_t) output_zero_point;
  const int32_t min_value = (int32_t) (uint32_t) output_min;
  const int32_t max_value = (int32_t) (uint32_t) output_max;

  params->scalar.output_zero_point = zero_point;
  params->scalar.output_max_less_zero_point = max_value - zero_point;
  params->scalar.output_min_less_zero_point = min_value - zero_point;
  params->scalar.right_shift = right_shift;
  params->scalar.multiplier = multiplier;
  params->scalar.rounding = rounding;
  params->scalar.bias = bias;
  return sizeof(params->scalar);
}
1514 
1515 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the neon variant of the QU8 average-pooling parameters.
//
// The float scale is decomposed into a 24-bit integer multiplier (significand
// with the implicit leading one restored) and a shift derived from the biased
// exponent; the shift is stored negated, as a 64-bit left shift.
// scale must lie in [2**-32, 256); this is checked by assert only.
// Returns sizeof(params->neon).
size_t xnn_init_qu8_avgpool_minmax_neon_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Requantization parameters are taken from the bit representation of scale.
  const uint32_t scale_bits = float_as_uint32(scale);
  const uint32_t biased_exponent = scale_bits >> 23;

  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
  assert(multiplier >= INT32_C(0x00800000));
  assert(multiplier <= INT32_C(0x00FFFFFF));

  // Shift is in [16, 55] range.
  const int32_t shift = 127 + 23 - biased_exponent;
  assert(shift >= 16);
  assert(shift < 64);

  params->neon.output_max = output_max;
  params->neon.output_min = output_min;
  params->neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  params->neon.left_shift = (int64_t) -shift;
  params->neon.multiplier = multiplier;
  params->neon.bias = bias;
  return sizeof(params->neon);
}
1547 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1548 
1549 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the sse2 variant of the QU8 average-pooling parameters.
//
// The float scale is decomposed into a 24-bit integer multiplier (significand
// with the implicit leading one restored) and a right shift derived from the
// biased exponent; rounding is 1 << (right_shift - 1).
// bias and multiplier are written to 4 elements each, rounding and right_shift
// to 2 elements each, output_zero_point to 8 elements, and output_min and
// output_max to 16 elements each.
// scale must lie in [2**-32, 256); this is checked by assert only.
// Returns sizeof(params->sse2).
size_t xnn_init_qu8_avgpool_minmax_sse2_params(
  union xnn_qu8_avgpool_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int32_t bias,
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(scale >= 0x1.0p-32f);
  assert(scale < 256.0f);

  // Requantization parameters are taken from the bit representation of scale.
  const uint32_t scale_bits = float_as_uint32(scale);
  const uint32_t biased_exponent = scale_bits >> 23;

  // Multiplier is in [0x00800000, 0x00FFFFFF] range.
  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
  assert(multiplier >= INT32_C(0x00800000));
  assert(multiplier <= INT32_C(0x00FFFFFF));

  // Shift is in [16, 55] range.
  const int32_t shift = 127 + 23 - biased_exponent;
  assert(shift >= 16);
  assert(shift < 64);

  const uint32_t right_shift = (uint32_t) shift;
  const uint64_t rounding = UINT64_C(1) << (right_shift - 1);

  for (uint32_t k = 0; k < 4; k++) {
    params->sse2.bias[k] = bias;
  }
  for (uint32_t k = 0; k < 4; k++) {
    params->sse2.multiplier[k] = (uint32_t) multiplier;
  }
  for (uint32_t k = 0; k < 2; k++) {
    params->sse2.rounding[k] = rounding;
  }
  for (uint32_t k = 0; k < 2; k++) {
    params->sse2.right_shift[k] = (uint64_t) right_shift;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->sse2.output_zero_point[k] = (int16_t) (uint16_t) output_zero_point;
  }
  for (uint32_t k = 0; k < 16; k++) {
    params->sse2.output_max[k] = output_max;
    params->sse2.output_min[k] = output_min;
  }
  return sizeof(params->sse2);
}
1596 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1597 
xnn_update_qu8_avgpool_minmax_scalar_params(union xnn_qu8_avgpool_minmax_params * params,int32_t bias,float scale)1598 void xnn_update_qu8_avgpool_minmax_scalar_params(
1599   union xnn_qu8_avgpool_minmax_params* params,
1600   int32_t bias,
1601   float scale)
1602 {
1603   // Compute requantization parameters.
1604   assert(scale >= 0x1.0p-32f);
1605   assert(scale < 256.0f);
1606   const uint32_t scale_bits = float_as_uint32(scale);
1607 
1608   // Multiplier is in [0x00800000, 0x00FFFFFF] range.
1609   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
1610   assert(multiplier >= INT32_C(0x00800000));
1611   assert(multiplier <= INT32_C(0x00FFFFFF));
1612 
1613   // Shift is in [16, 55] range.
1614   const int32_t shift = 127 + 23 - (scale_bits >> 23);
1615   assert(shift >= 16);
1616   assert(shift < 64);
1617 
1618   const int64_t rounding = INT64_C(1) << ((uint32_t) shift - 1);
1619   params->scalar.bias = bias;
1620   params->scalar.multiplier = multiplier;
1621   params->scalar.rounding = rounding;
1622   params->scalar.right_shift = (uint32_t) shift;
1623 }
1624 
1625 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
xnn_update_qu8_avgpool_minmax_neon_params(union xnn_qu8_avgpool_minmax_params * params,int32_t bias,float scale)1626 void xnn_update_qu8_avgpool_minmax_neon_params(
1627   union xnn_qu8_avgpool_minmax_params* params,
1628   int32_t bias,
1629   float scale)
1630 {
1631   // Compute requantization parameters.
1632   assert(scale >= 0x1.0p-32f);
1633   assert(scale < 256.0f);
1634   const uint32_t scale_bits = float_as_uint32(scale);
1635 
1636   // Multiplier is in [0x00800000, 0x00FFFFFF] range.
1637   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
1638   assert(multiplier >= INT32_C(0x00800000));
1639   assert(multiplier <= INT32_C(0x00FFFFFF));
1640 
1641   // Shift is in [16, 55] range.
1642   const int32_t shift = 127 + 23 - (scale_bits >> 23);
1643   assert(shift >= 16);
1644   assert(shift < 64);
1645 
1646   params->neon.bias = bias;
1647   params->neon.multiplier = multiplier;
1648   params->neon.left_shift = (int64_t) -shift;
1649 }
1650 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1651 
1652 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_update_qu8_avgpool_minmax_sse2_params(union xnn_qu8_avgpool_minmax_params * params,int32_t bias,float scale)1653 void xnn_update_qu8_avgpool_minmax_sse2_params(
1654   union xnn_qu8_avgpool_minmax_params* params,
1655   int32_t bias,
1656   float scale)
1657 {
1658   // Compute requantization parameters.
1659   assert(scale >= 0x1.0p-32f);
1660   assert(scale < 256.0f);
1661   const uint32_t scale_bits = float_as_uint32(scale);
1662 
1663   // Multiplier is in [0x00800000, 0x00FFFFFF] range.
1664   const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
1665   assert(multiplier >= INT32_C(0x00800000));
1666   assert(multiplier <= INT32_C(0x00FFFFFF));
1667 
1668   // Shift is in [16, 55] range.
1669   const int32_t shift = 127 + 23 - (scale_bits >> 23);
1670   assert(shift >= 16);
1671   assert(shift < 64);
1672 
1673   const uint64_t rounding = UINT64_C(1) << ((uint32_t) shift - 1);
1674   params->sse2.bias[0] = bias;
1675   params->sse2.bias[1] = bias;
1676   params->sse2.bias[2] = bias;
1677   params->sse2.bias[3] = bias;
1678   params->sse2.multiplier[0] = (uint32_t) multiplier;
1679   params->sse2.multiplier[1] = (uint32_t) multiplier;
1680   params->sse2.multiplier[2] = (uint32_t) multiplier;
1681   params->sse2.multiplier[3] = (uint32_t) multiplier;
1682   params->sse2.rounding[0] = rounding;
1683   params->sse2.rounding[1] = rounding;
1684   params->sse2.right_shift[0] = (uint64_t) (uint32_t) shift;
1685   params->sse2.right_shift[1] = (uint64_t) (uint32_t) shift;
1686 }
1687 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1688 
xnn_update_f32_scaleminmax_scalar_params(union xnn_f32_scaleminmax_params * params,float scale)1689 void xnn_update_f32_scaleminmax_scalar_params(
1690   union xnn_f32_scaleminmax_params* params,
1691   float scale)
1692 {
1693   params->scalar.scale = scale;
1694 }
1695 
1696 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
xnn_update_f32_scaleminmax_sse_params(union xnn_f32_scaleminmax_params * params,float scale)1697 void xnn_update_f32_scaleminmax_sse_params(
1698   union xnn_f32_scaleminmax_params* params,
1699   float scale)
1700 {
1701   for (uint32_t i = 0; i < 4; i++) {
1702     params->sse.scale[i] = scale;
1703   }
1704 }
1705 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1706 
1707 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the neon variant of the F16 scale/min/max parameters by storing
// the three 16-bit arguments unchanged. Returns sizeof(params->neon).
size_t xnn_init_f16_scaleminmax_neon_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale,
  uint16_t min,
  uint16_t max)
{
  params->neon.max = max;
  params->neon.min = min;
  params->neon.scale = scale;
  return sizeof(params->neon);
}
1719 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1720 
1721 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the avx variant of the F16 scale/min/max parameters.
//
// Each 16-bit argument is converted with fp16_ieee_to_fp32_value() and the
// resulting float is written to 8 elements. Returns sizeof(params->avx).
size_t xnn_init_f16_scaleminmax_avx_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale,
  uint16_t min,
  uint16_t max)
{
  const float scale_value = fp16_ieee_to_fp32_value(scale);
  const float min_value = fp16_ieee_to_fp32_value(min);
  const float max_value = fp16_ieee_to_fp32_value(max);

  for (size_t k = 0; k < 8; k++) {
    params->avx.scale[k] = scale_value;
  }
  for (size_t k = 0; k < 8; k++) {
    params->avx.min[k] = min_value;
  }
  for (size_t k = 0; k < 8; k++) {
    params->avx.max[k] = max_value;
  }
  return sizeof(params->avx);
}
1738 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1739 
1740 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Overwrites the scale field of the neon F16 scale/min/max parameters with the
// given 16-bit value. No other field is written.
void xnn_update_f16_scaleminmax_neon_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale)
{
  const uint16_t new_scale = scale;
  params->neon.scale = new_scale;
}
1747 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1748 
1749 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Overwrites all 8 scale elements of the avx F16 scale/min/max parameters with
// fp16_ieee_to_fp32_value(scale). No other field is written.
void xnn_update_f16_scaleminmax_avx_params(
  union xnn_f16_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t scale)
{
  const float scale_value = fp16_ieee_to_fp32_value(scale);
  size_t k = 0;
  while (k < 8) {
    params->avx.scale[k] = scale_value;
    k++;
  }
}
1759 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1760 
// Initializes the scalar variant of the F32 scale/min/max parameters by
// storing the three arguments unchanged. Returns sizeof(params->scalar).
size_t xnn_init_f32_scaleminmax_scalar_params(
  union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  float min,
  float max)
{
  params->scalar.max = max;
  params->scalar.min = min;
  params->scalar.scale = scale;
  return sizeof(params->scalar);
}
1772 
1773 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the sse variant of the F32 scale/min/max parameters by writing
// each argument to 4 elements. Returns sizeof(params->sse).
size_t xnn_init_f32_scaleminmax_sse_params(
  union xnn_f32_scaleminmax_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  float min,
  float max)
{
  for (size_t k = 0; k < 4; k++) {
    params->sse.scale[k] = scale;
  }
  for (size_t k = 0; k < 4; k++) {
    params->sse.min[k] = min;
  }
  for (size_t k = 0; k < 4; k++) {
    params->sse.max[k] = max;
  }
  return sizeof(params->sse);
}
1787 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1788 
// Initializes the F32 global-average-pooling parameters for the variant
// selected at compile time (sse on x86/x86-64, neon on ARM/ARM64, scalar
// otherwise).
//
// multiplier, output_min and output_max are stored (4 elements each for sse).
// The 4-entry mask is derived from w = (width - 1) & 3: mask[0] is always
// all-ones and mask[k], k = 1..3, is all-ones when w >= k and zero otherwise.
// Returns the size of the variant that was written.
size_t xnn_init_f32_gavgpool_params(
  union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
  float multiplier,
  float output_min,
  float output_max,
  uint32_t width)
{
  const uint32_t w = (width - 1) & 3;

  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (uint32_t k = 0; k < 4; k++) {
      params->sse.output_max[k] = output_max;
      params->sse.output_min[k] = output_min;
      params->sse.multiplier[k] = multiplier;
    }

    params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->sse.mask[k] = -(uint32_t) (w >= k);
    }
    return sizeof(params->sse);
  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
    params->neon.output_max = output_max;
    params->neon.output_min = output_min;
    params->neon.multiplier = multiplier;

    params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->neon.mask[k] = -(uint32_t) (w >= k);
    }
    return sizeof(params->neon);
  #else
    params->scalar.output_max = output_max;
    params->scalar.output_min = output_min;
    params->scalar.multiplier = multiplier;

    params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->scalar.mask[k] = -(int32_t) (w >= k);
    }
    return sizeof(params->scalar);
  #endif
}
1833 
// Initializes the neonfp16arith variant of the F16 global-average-pooling
// parameters with a 4-entry mask.
//
// On ARM/ARM64 the three 16-bit arguments are stored unchanged and the mask is
// derived from w = (width - 1) & 3: mask[0] is 0xFFFF and mask[k], k = 1..3,
// is 0xFFFF when w >= k and zero otherwise; the function then returns
// sizeof(params->neonfp16arith). On every other architecture nothing is
// written and 0 is returned.
size_t xnn_init_f16_gavgpool_neonfp16arith_x4_params(
  union xnn_f16_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t multiplier,
  uint16_t output_min,
  uint16_t output_max,
  uint32_t width)
{
  #if XNN_ARCH_ARM || XNN_ARCH_ARM64
    const uint32_t w = (width - 1) & 3;

    params->neonfp16arith.mask[0] = UINT16_C(0xFFFF);
    for (uint32_t k = 1; k < 4; k++) {
      params->neonfp16arith.mask[k] = -(uint16_t) (w >= k);
    }

    params->neonfp16arith.output_max = output_max;
    params->neonfp16arith.output_min = output_min;
    params->neonfp16arith.multiplier = multiplier;
    return sizeof(params->neonfp16arith);
  #else
    return 0;
  #endif
}
1856 
// Initializes the neonfp16arith variant of the F16 global-average-pooling
// parameters with an 8-entry mask.
//
// On ARM/ARM64 the three 16-bit arguments are stored unchanged and the mask is
// derived from w = (width - 1) & 7: mask[0] is 0xFFFF and mask[k], k = 1..7,
// is 0xFFFF when w >= k and zero otherwise; the function then returns
// sizeof(params->neonfp16arith). On every other architecture nothing is
// written and 0 is returned.
size_t xnn_init_f16_gavgpool_neonfp16arith_x8_params(
  union xnn_f16_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t multiplier,
  uint16_t output_min,
  uint16_t output_max,
  uint32_t width)
{
  #if XNN_ARCH_ARM || XNN_ARCH_ARM64
    const uint32_t w = (width - 1) & 7;

    params->neonfp16arith.mask[0] = UINT16_C(0xFFFF);
    for (uint32_t k = 1; k < 8; k++) {
      params->neonfp16arith.mask[k] = -(uint16_t) (w >= k);
    }

    params->neonfp16arith.output_max = output_max;
    params->neonfp16arith.output_min = output_min;
    params->neonfp16arith.multiplier = multiplier;
    return sizeof(params->neonfp16arith);
  #else
    return 0;
  #endif
}
1883 
xnn_update_f32_gavgpool_params(union xnn_f32_gavgpool_params * params,float multiplier,uint32_t width)1884 void xnn_update_f32_gavgpool_params(
1885   union xnn_f32_gavgpool_params* params,
1886   float multiplier,
1887   uint32_t width)
1888 {
1889   #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1890     for (uint32_t i = 0; i < 4; i++) {
1891       params->sse.multiplier[i] = multiplier;
1892     }
1893 
1894     const uint32_t w = (width - 1) & 3;
1895     params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
1896     params->sse.mask[1] = -(uint32_t) (w >= 1);
1897     params->sse.mask[2] = -(uint32_t) (w >= 2);
1898     params->sse.mask[3] = -(uint32_t) (w >= 3);
1899   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
1900     params->neon.multiplier = multiplier;
1901 
1902     const uint32_t w = (width - 1) & 3;
1903     params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
1904     params->neon.mask[1] = -(uint32_t) (w >= 1);
1905     params->neon.mask[2] = -(uint32_t) (w >= 2);
1906     params->neon.mask[3] = -(uint32_t) (w >= 3);
1907   #else
1908     params->scalar.multiplier = multiplier;
1909 
1910     const uint32_t w = (width - 1) & 3;
1911     params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
1912     params->scalar.mask[1] = -(int32_t) (w >= 1);
1913     params->scalar.mask[2] = -(int32_t) (w >= 2);
1914     params->scalar.mask[3] = -(int32_t) (w >= 3);
1915   #endif
1916 }
1917 
// Initializes the scalar variant of the F32 global-average-pooling parameters
// regardless of the target architecture.
//
// multiplier, output_min and output_max are stored unchanged. The 4-entry mask
// is derived from w = (width - 1) & 3: mask[0] is always all-ones and mask[k],
// k = 1..3, is all-ones when w >= k and zero otherwise.
// Returns sizeof(params->scalar).
size_t xnn_init_scalar_f32_gavgpool_params(
  union xnn_f32_gavgpool_params params[XNN_MIN_ELEMENTS(1)],
  float multiplier,
  float output_min,
  float output_max,
  uint32_t width)
{
  const uint32_t w = (width - 1) & 3;

  params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
  for (uint32_t k = 1; k < 4; k++) {
    params->scalar.mask[k] = -(int32_t) (w >= k);
  }

  params->scalar.output_max = output_max;
  params->scalar.output_min = output_min;
  params->scalar.multiplier = multiplier;
  return sizeof(params->scalar);
}
1936 
// Initializes the scalar variant of the BF16 min/max parameters.
//
// Each 16-bit argument is placed in the upper half of a 32-bit word (lower
// half zero) and that word is reinterpreted as a float via uint32_as_float().
// Returns sizeof(params->scalar).
size_t xnn_init_bf16_minmax_scalar_params(
  union xnn_bf16_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t output_min,
  uint16_t output_max)
{
  const uint32_t min_bits = (uint32_t) output_min << 16;
  const uint32_t max_bits = (uint32_t) output_max << 16;

  params->scalar.min = uint32_as_float(min_bits);
  params->scalar.max = uint32_as_float(max_bits);
  return sizeof(params->scalar);
}
1946 
1947 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the neon variant of the F16 min/max parameters by storing both
// 16-bit arguments unchanged. Returns sizeof(params->neon).
size_t xnn_init_f16_minmax_neon_params(
  union xnn_f16_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t min,
  uint16_t max)
{
  params->neon.max = max;
  params->neon.min = min;
  return sizeof(params->neon);
}
1957 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1958 
1959 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the avx variant of the F16 min/max parameters.
//
// Each 16-bit argument is converted with fp16_ieee_to_fp32_value() and the
// resulting float is written to 8 elements. Returns sizeof(params->avx).
size_t xnn_init_f16_minmax_avx_params(
  union xnn_f16_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t min,
  uint16_t max)
{
  const float min_value = fp16_ieee_to_fp32_value(min);
  const float max_value = fp16_ieee_to_fp32_value(max);

  for (size_t k = 0; k < 8; k++) {
    params->avx.min[k] = min_value;
  }
  for (size_t k = 0; k < 8; k++) {
    params->avx.max[k] = max_value;
  }
  return sizeof(params->avx);
}
1973 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1974 
1975 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the avx variant of the F32 default parameters.
//
// Fills the 14-entry mask_table: entries 0..6 are set to -1 and entries 7..13
// are set to 0. Returns sizeof(params->avx).
size_t xnn_init_f32_default_avx_params(
  union xnn_f32_default_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof(params->avx);
}
1987 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1988 
// Initializes the F32 min/max parameters for the variant selected at compile
// time: sse (4 elements each) on x86/x86-64, wasmsimd (2 elements each) when
// building for WAsm SIMD / relaxed SIMD, scalar otherwise.
// Returns the size of the variant that was written.
size_t xnn_init_f32_minmax_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (size_t k = 0; k < 4; k++) {
      params->sse.max[k] = output_max;
      params->sse.min[k] = output_min;
    }
    return sizeof(params->sse);
  #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
    for (size_t k = 0; k < 2; k++) {
      params->wasmsimd.min[k] = output_min;
      params->wasmsimd.max[k] = output_max;
    }
    return sizeof(params->wasmsimd);
  #else
    params->scalar.max = output_max;
    params->scalar.min = output_min;
    return sizeof(params->scalar);
  #endif
}
2012 
2013 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the sse variant of the F32 min/max parameters by writing each
// bound to 4 elements. Returns sizeof(params->sse).
size_t xnn_init_f32_minmax_sse_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  for (size_t k = 0; k < 4; k++) {
    params->sse.min[k] = output_min;
  }
  for (size_t k = 0; k < 4; k++) {
    params->sse.max[k] = output_max;
  }
  return sizeof(params->sse);
}
2025 
// Initializes the avx variant of the F32 min/max parameters.
//
// Each bound is written to 8 elements. The 14-entry mask_table is filled with
// -1 in entries 0..6 and 0 in entries 7..13. Returns sizeof(params->avx).
size_t xnn_init_f32_minmax_avx_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  for (uint32_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
  for (uint32_t k = 0; k < 8; k++) {
    params->avx.max[k] = output_max;
    params->avx.min[k] = output_min;
  }
  return sizeof(params->avx);
}
2043 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2044 
2045 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Initializes the wasmsimd variant of the F32 min/max parameters by writing
// each bound to 2 elements. Returns sizeof(params->wasmsimd).
size_t xnn_init_f32_minmax_wasmsimd_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  for (size_t k = 0; k < 2; k++) {
    params->wasmsimd.min[k] = output_min;
    params->wasmsimd.max[k] = output_max;
  }
  return sizeof(params->wasmsimd);
}
2057 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2058 
// Initializes the `scalar` variant of the f32 min/max parameters.
//
// Stores `output_min` in `min` and `output_max` in `max`.
//
// Returns the size, in bytes, of the `scalar` variant.
size_t xnn_init_f32_minmax_scalar_params(
  union xnn_f32_minmax_params params[XNN_MIN_ELEMENTS(1)],
  float output_min,
  float output_max)
{
  params->scalar.max = output_max;
  params->scalar.min = output_min;
  return sizeof(params->scalar);
}
2068 
2069 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the `neon` variant of the f16 hswish parameters.
//
// Each field receives a fixed 16-bit pattern. Decoded as IEEE half-precision
// values, the patterns are approximately 1/6, exactly 3.0 and exactly 6.0.
//
// Returns the size, in bytes, of the `neon` variant.
size_t xnn_init_f16_hswish_neon_params(
  union xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neon.six = UINT16_C(0x4600);
  params->neon.three = UINT16_C(0x4200);
  params->neon.sixth = UINT16_C(0x3155);
  return sizeof(params->neon);
}
2078 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
2079 
2080 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `avx` variant of the f16 hswish parameters.
//
// Fills all 8 elements of `sixth`, `three` and `six`. Note that `sixth` and
// `three` are assigned float literals, while `six` is assigned the 16-bit
// pattern 0x4600.
//
// Returns the size, in bytes, of the `avx` variant.
size_t xnn_init_f16_hswish_avx_params(
  union xnn_f16_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.six[lane] = UINT16_C(0x4600);
    params->avx.three[lane] = 3.0f;
    params->avx.sixth[lane] = 0x1.554000p-3f;
  }
  return sizeof(params->avx);
}
2091 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2092 
// Initializes the `scalar` variant of the f32 hswish parameters.
//
// Stores the fixed constants `sixth`, `three` (3.0) and `six` (6.0).
//
// Returns the size, in bytes, of the `scalar` variant.
size_t xnn_init_f32_hswish_scalar_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  const float sixth = 0x1.555556p-3f;
  const float three = 3.0f;
  const float six = 6.0f;

  params->scalar.sixth = sixth;
  params->scalar.three = three;
  params->scalar.six = six;
  return sizeof(params->scalar);
}
2101 
2102 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `sse` variant of the f32 hswish parameters.
//
// Fills all 4 elements of `sixth`, `half` (0.5) and `one` (1.0).
//
// Returns the size, in bytes, of the `sse` variant.
size_t xnn_init_f32_hswish_sse_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse.one[lane] = 1.0f;
    params->sse.half[lane] = 0.5f;
    params->sse.sixth[lane] = 0x1.555556p-3f;
  }
  return sizeof(params->sse);
}
2113 
// Initializes the `avx` variant of the f32 hswish parameters.
//
// Fills all 8 elements of `sixth`, `half` (0.5) and `one` (1.0), then fills
// the 14-entry `mask_table`: entries 0..6 are -1 and entries 7..13 are 0.
//
// Returns the size, in bytes, of the `avx` variant.
size_t xnn_init_f32_hswish_avx_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.one[lane] = 1.0f;
    params->avx.half[lane] = 0.5f;
    params->avx.sixth[lane] = 0x1.555556p-3f;
  }
  for (size_t entry = 0; entry < 7; entry++) {
    params->avx.mask_table[entry] = -1;
    params->avx.mask_table[entry + 7] = 0;
  }
  return sizeof(params->avx);
}
2130 
// Initializes the `avx512` variant of the f32 hswish parameters.
//
// Stores the fixed constants `sixth`, `half` (0.5) and `one` (1.0); unlike
// the `sse`/`avx` variants, each field is assigned once rather than per
// element.
//
// Returns the size, in bytes, of the `avx512` variant.
size_t xnn_init_f32_hswish_avx512_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  params->avx512.one = 1.0f;
  params->avx512.half = 0.5f;
  params->avx512.sixth = 0x1.555556p-3f;
  return sizeof(params->avx512);
}
2139 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2140 
2141 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Initializes the `wasmsimd` variant of the f32 hswish parameters.
//
// Fills both elements of `sixth`, `three` (3.0) and `six` (6.0).
//
// Returns the size, in bytes, of the `wasmsimd` variant.
size_t xnn_init_f32_hswish_wasmsimd_params(
  union xnn_f32_hswish_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.six[lane] = 6.0f;
    params->wasmsimd.three[lane] = 3.0f;
    params->wasmsimd.sixth[lane] = 0x1.555556p-3f;
  }
  return sizeof(params->wasmsimd);
}
2152 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2153 
2154 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the `neonfp16arith_rr2_p2` variant of the f16 sigmoid
// parameters.
//
// Every field receives a fixed 16-bit pattern; the trailing comment on each
// line gives the corresponding half-precision value.
//
// Returns the size, in bytes, of the `neonfp16arith_rr2_p2` variant.
size_t xnn_init_f16_sigmoid_neonfp16arith_rr2_p2_params(
  union xnn_f16_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  const uint16_t magic_bias = UINT16_C(0x660F);  // 0x1.83Cp+10h
  const uint16_t minus_log2e = UINT16_C(0xBDC5);  // -0x1.714p+0h
  const uint16_t ln2_hi = UINT16_C(0x398C);  // 0x1.630p-1h
  const uint16_t ln2_lo = UINT16_C(0x8AF4);  // -0x1.BD0p-13h
  const uint16_t c2 = UINT16_C(0x37F9);  // 0x1.FE4p-2h
  const uint16_t c1 = UINT16_C(0xBC0E);  // -0x1.038p+0h
  const uint16_t denorm_cutoff = UINT16_C(0xC8DA);  // -0x1.368p+3h

  params->neonfp16arith_rr2_p2.magic_bias = magic_bias;
  params->neonfp16arith_rr2_p2.minus_log2e = minus_log2e;
  params->neonfp16arith_rr2_p2.ln2_hi = ln2_hi;
  params->neonfp16arith_rr2_p2.ln2_lo = ln2_lo;
  params->neonfp16arith_rr2_p2.c2 = c2;
  params->neonfp16arith_rr2_p2.c1 = c1;
  params->neonfp16arith_rr2_p2.denorm_cutoff = denorm_cutoff;
  return sizeof(params->neonfp16arith_rr2_p2);
}
2167 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
2168 
2169 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `avx2_rr1_p2` variant of the f16 sigmoid parameters.
//
// Each field is an 8-element array; every element of a given field receives
// the same fixed float constant.
//
// Returns the size, in bytes, of the `avx2_rr1_p2` variant.
size_t xnn_init_f16_sigmoid_avx2_rr1_p2_params(
  union xnn_f16_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx2_rr1_p2.sign_mask[lane] = -0.0f;
    params->avx2_rr1_p2.one[lane] = 1.0f;
    params->avx2_rr1_p2.magic_bias[lane] = 0x1.8000FEp23f;
    params->avx2_rr1_p2.log2e[lane] = 0x1.715476p0f;
    params->avx2_rr1_p2.minus_ln2[lane] = -0x1.62E43p-1f;
    params->avx2_rr1_p2.c1[lane] = 0x1.039E10p+0f;
    params->avx2_rr1_p2.c2[lane] = 0x1.FF3A32p-2f;
    params->avx2_rr1_p2.denorm_cutoff[lane] = -0x1.368000p+3f;
  }
  return sizeof(params->avx2_rr1_p2);
}
2185 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2186 
// Initializes the `scalar_rr2_lut64_p2` variant of the f32 sigmoid
// parameters with fixed float constants.
//
// Returns the size, in bytes, of the `scalar_rr2_lut64_p2` variant.
size_t xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->scalar_rr2_lut64_p2.one = 1.0f;
  params->scalar_rr2_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;
  params->scalar_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->scalar_rr2_lut64_p2.ln2_lo = -0x1.BD0106p-13f;
  params->scalar_rr2_lut64_p2.ln2_hi = 0x1.630000p-1f;
  params->scalar_rr2_lut64_p2.minus_log2e = -0x1.715476p0f;
  params->scalar_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  return sizeof(params->scalar_rr2_lut64_p2);
}
2199 
// Initializes the `scalar_rr2_lut2048_p1` variant of the f32 sigmoid
// parameters with fixed float constants.
//
// Returns the size, in bytes, of the `scalar_rr2_lut2048_p1` variant.
size_t xnn_init_f32_sigmoid_scalar_rr2_lut2048_p1_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->scalar_rr2_lut2048_p1.one = 1.0f;
  params->scalar_rr2_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;
  params->scalar_rr2_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
  params->scalar_rr2_lut2048_p1.ln2_lo = 0x1.7217F8p-8f;
  params->scalar_rr2_lut2048_p1.ln2_hi = 0x1.600000p-1f;
  params->scalar_rr2_lut2048_p1.minus_log2e = -0x1.715476p0f;
  params->scalar_rr2_lut2048_p1.magic_bias = 0x1.800000p12f;
  return sizeof(params->scalar_rr2_lut2048_p1);
}
2212 
// Initializes the `scalar_rr2_p5` variant of the f32 sigmoid parameters with
// fixed float constants (`magic_bias`, `minus_log2e`, `ln2_hi`/`ln2_lo`,
// coefficients `c1`..`c5`, `one` and `denorm_cutoff`).
//
// Returns the size, in bytes, of the `scalar_rr2_p5` variant.
size_t xnn_init_f32_sigmoid_scalar_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->scalar_rr2_p5.one = 1.0f;
  params->scalar_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params->scalar_rr2_p5.minus_log2e = -0x1.715476p0f;
  params->scalar_rr2_p5.ln2_hi = 0x1.62E400p-1f;
  params->scalar_rr2_p5.ln2_lo = 0x1.7F7D1Cp-20f;
  params->scalar_rr2_p5.c1 = -0x1.FFFFF6p-1f;
  params->scalar_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->scalar_rr2_p5.c3 = -0x1.555A80p-3f;
  params->scalar_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->scalar_rr2_p5.c5 = -0x1.0F9F9Cp-7f;
  params->scalar_rr2_p5.denorm_cutoff = 0x1.5D589Ep+6f;
  return sizeof(params->scalar_rr2_p5);
}
2229 
2230 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the `neon_rr2_lut64_p2` variant of the f32 sigmoid parameters
// with fixed float constants.
//
// Returns the size, in bytes, of the `neon_rr2_lut64_p2` variant.
size_t xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neon_rr2_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;
  params->neon_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->neon_rr2_lut64_p2.ln2_lo = -0x1.BD0106p-13f;
  params->neon_rr2_lut64_p2.ln2_hi = 0x1.630000p-1f;
  params->neon_rr2_lut64_p2.minus_log2e = -0x1.715476p0f;
  params->neon_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  return sizeof(params->neon_rr2_lut64_p2);
}
2242 
// Initializes the `neon_rr2_lut2048_p1` variant of the f32 sigmoid parameters
// with fixed float constants.
//
// Returns the size, in bytes, of the `neon_rr2_lut2048_p1` variant.
size_t xnn_init_f32_sigmoid_neon_rr2_lut2048_p1_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neon_rr2_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;
  params->neon_rr2_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
  params->neon_rr2_lut2048_p1.ln2_lo = 0x1.7217F8p-8f;
  params->neon_rr2_lut2048_p1.ln2_hi = 0x1.600000p-1f;
  params->neon_rr2_lut2048_p1.minus_log2e = -0x1.715476p0f;
  params->neon_rr2_lut2048_p1.magic_bias = 0x1.800000p12f;
  return sizeof(params->neon_rr2_lut2048_p1);
}
2254 
// Initializes the `neon_rr2_p5` variant of the f32 sigmoid parameters with
// fixed float constants (`magic_bias`, `minus_log2e`, `ln2_hi`/`ln2_lo`,
// coefficients `c1`..`c5` and `denorm_cutoff`).
//
// Returns the size, in bytes, of the `neon_rr2_p5` variant.
size_t xnn_init_f32_sigmoid_neon_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neon_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params->neon_rr2_p5.minus_log2e = -0x1.715476p0f;
  params->neon_rr2_p5.ln2_hi = 0x1.62E400p-1f;
  params->neon_rr2_p5.ln2_lo = 0x1.7F7D1Cp-20f;
  params->neon_rr2_p5.c1 = -0x1.FFFFF6p-1f;
  params->neon_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->neon_rr2_p5.c3 = -0x1.555A80p-3f;
  params->neon_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->neon_rr2_p5.c5 = -0x1.0F9F9Cp-7f;
  params->neon_rr2_p5.denorm_cutoff = 0x1.5D589Ep+6f;
  return sizeof(params->neon_rr2_p5);
}
2270 
// Initializes the `neonfma_rr1_lut2048_p1` variant of the f32 sigmoid
// parameters with fixed float constants.
//
// Returns the size, in bytes, of the `neonfma_rr1_lut2048_p1` variant.
size_t xnn_init_f32_sigmoid_neonfma_rr1_lut2048_p1_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neonfma_rr1_lut2048_p1.denorm_cutoff = 0x1.5D589Ep+6f;
  params->neonfma_rr1_lut2048_p1.c1 = -0x1.FFFFFEp-1f;
  params->neonfma_rr1_lut2048_p1.ln2 = 0x1.62E430p-1f;
  params->neonfma_rr1_lut2048_p1.minus_log2e = -0x1.715476p0f;
  params->neonfma_rr1_lut2048_p1.magic_bias = 0x1.800000p12f;
  return sizeof(params->neonfma_rr1_lut2048_p1);
}
2281 
// Initializes the `neonfma_rr1_lut64_p2` variant of the f32 sigmoid
// parameters with fixed float constants.
//
// Returns the size, in bytes, of the `neonfma_rr1_lut64_p2` variant.
size_t xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neonfma_rr1_lut64_p2.denorm_cutoff = 0x1.5D589Ep+6f;
  params->neonfma_rr1_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->neonfma_rr1_lut64_p2.ln2 = 0x1.62E430p-1f;
  params->neonfma_rr1_lut64_p2.minus_log2e = -0x1.715476p0f;
  params->neonfma_rr1_lut64_p2.magic_bias = 0x1.800000p17f;
  return sizeof(params->neonfma_rr1_lut64_p2);
}
2292 
// Initializes the `neonfma_rr1_p5` variant of the f32 sigmoid parameters with
// fixed float constants (`magic_bias`, `minus_log2e`, `ln2`, coefficients
// `c1`..`c5` and `denorm_cutoff`).
//
// Returns the size, in bytes, of the `neonfma_rr1_p5` variant.
size_t xnn_init_f32_sigmoid_neonfma_rr1_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neonfma_rr1_p5.magic_bias = 0x1.8000FEp23f;
  params->neonfma_rr1_p5.minus_log2e = -0x1.715476p0f;
  params->neonfma_rr1_p5.ln2 = 0x1.62E430p-1f;
  params->neonfma_rr1_p5.c1 = -0x1.FFFFF6p-1f;
  params->neonfma_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params->neonfma_rr1_p5.c3 = -0x1.555A80p-3f;
  params->neonfma_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params->neonfma_rr1_p5.c5 = -0x1.0F9F9Cp-7f;
  params->neonfma_rr1_p5.denorm_cutoff = 0x1.5D589Ep+6f;
  return sizeof(params->neonfma_rr1_p5);
}
2307 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
2308 
2309 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `sse2_rr2_lut64_p2` variant of the f32 sigmoid parameters.
//
// Each field is a 4-element array; every element of a given field receives
// the same fixed constant. `index_mask` is set to the integer 0x3F, all other
// fields are set from float literals.
//
// Returns the size, in bytes, of the `sse2_rr2_lut64_p2` variant.
size_t xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2_rr2_lut64_p2.index_mask[lane] = UINT32_C(0x3F);
    params->sse2_rr2_lut64_p2.sign_mask[lane] = -0.0f;
    params->sse2_rr2_lut64_p2.one[lane] = 1.0f;
    params->sse2_rr2_lut64_p2.magic_bias[lane] = 0x1.800000p17f;
    params->sse2_rr2_lut64_p2.log2e[lane] = 0x1.715476p0f;
    params->sse2_rr2_lut64_p2.minus_ln2_hi[lane] = -0x1.630000p-1f;
    params->sse2_rr2_lut64_p2.minus_ln2_lo[lane] = 0x1.BD0106p-13f;
    params->sse2_rr2_lut64_p2.c2[lane] = 0x1.FFFF0Ap-2f;
    params->sse2_rr2_lut64_p2.denorm_cutoff[lane] = -0x1.5D589Ep+6f;
  }
  return sizeof(params->sse2_rr2_lut64_p2);
}
2326 
// Initializes the `sse2_rr2_p5` variant of the f32 sigmoid parameters.
//
// Each field is a 4-element array; every element of a given field receives
// the same fixed float constant.
//
// Returns the size, in bytes, of the `sse2_rr2_p5` variant.
size_t xnn_init_f32_sigmoid_sse2_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2_rr2_p5.sign_mask[lane] = -0.0f;
    params->sse2_rr2_p5.one[lane] = 1.0f;
    params->sse2_rr2_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params->sse2_rr2_p5.log2e[lane] = 0x1.715476p0f;
    params->sse2_rr2_p5.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params->sse2_rr2_p5.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    params->sse2_rr2_p5.c1[lane] = 0x1.FFFFF6p-1f;
    params->sse2_rr2_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params->sse2_rr2_p5.c3[lane] = 0x1.555A80p-3f;
    params->sse2_rr2_p5.c4[lane] = 0x1.573A1Ap-5f;
    params->sse2_rr2_p5.c5[lane] = 0x1.0F9F9Cp-7f;
    params->sse2_rr2_p5.denorm_cutoff[lane] = -0x1.5D589Ep+6f;
  }
  return sizeof(params->sse2_rr2_p5);
}
2346 
// Initializes the `avx_rr2_p5` variant of the f32 sigmoid parameters.
//
// Each constant field is an 8-element array; every element of a given field
// receives the same fixed float constant. The 14-entry `mask_table` is then
// filled with -1 in entries 0..6 and 0 in entries 7..13.
//
// Returns the size, in bytes, of the `avx_rr2_p5` variant.
size_t xnn_init_f32_sigmoid_avx_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx_rr2_p5.sign_mask[lane] = -0.0f;
    params->avx_rr2_p5.one[lane] = 1.0f;
    params->avx_rr2_p5.two[lane] = 2.0f;
    params->avx_rr2_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params->avx_rr2_p5.log2e[lane] = 0x1.715476p0f;
    params->avx_rr2_p5.minus_ln2_hi[lane] = -0x1.62E400p-1f;
    params->avx_rr2_p5.minus_ln2_lo[lane] = -0x1.7F7D1Cp-20f;
    params->avx_rr2_p5.c1[lane] = 0x1.FFFFF6p-1f;
    params->avx_rr2_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params->avx_rr2_p5.c3[lane] = 0x1.555A80p-3f;
    params->avx_rr2_p5.c4[lane] = 0x1.573A1Ap-5f;
    params->avx_rr2_p5.c5[lane] = 0x1.0F9F9Cp-7f;
    params->avx_rr2_p5.denorm_cutoff[lane] = -0x1.5D589Ep+6f;
  }
  for (size_t entry = 0; entry < 14; entry++) {
    params->avx_rr2_p5.mask_table[entry] = (entry < 7) ? -1 : 0;
  }
  return sizeof(params->avx_rr2_p5);
}
2373 
// Initializes the `avx2_rr1_p5` variant of the f32 sigmoid parameters.
//
// Each constant field is an 8-element array; every element of a given field
// receives the same fixed float constant. The 14-entry `mask_table` is then
// filled with -1 in entries 0..6 and 0 in entries 7..13.
//
// Returns the size, in bytes, of the `avx2_rr1_p5` variant.
size_t xnn_init_f32_sigmoid_avx2_rr1_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx2_rr1_p5.sign_mask[lane] = -0.0f;
    params->avx2_rr1_p5.one[lane] = 1.0f;
    params->avx2_rr1_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params->avx2_rr1_p5.log2e[lane] = 0x1.715476p0f;
    params->avx2_rr1_p5.minus_ln2[lane] = -0x1.62E430p-1f;
    params->avx2_rr1_p5.c1[lane] = 0x1.FFFFF6p-1f;
    params->avx2_rr1_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params->avx2_rr1_p5.c3[lane] = 0x1.555A80p-3f;
    params->avx2_rr1_p5.c4[lane] = 0x1.573A1Ap-5f;
    params->avx2_rr1_p5.c5[lane] = 0x1.0F9F9Cp-7f;
    params->avx2_rr1_p5.denorm_cutoff[lane] = -0x1.5D589Ep+6f;
  }
  for (size_t entry = 0; entry < 7; entry++) {
    params->avx2_rr1_p5.mask_table[entry] = -1;
    params->avx2_rr1_p5.mask_table[entry + 7] = 0;
  }
  return sizeof(params->avx2_rr1_p5);
}
2398 
// Initializes the `avx512_rr1_lut16_p3` variant of the f32 sigmoid
// parameters.
//
// Stores the fixed constants (`sign_mask` as the integer 0x80000000, the rest
// as float literals) and copies 16 fixed values into `table`. The table
// values increase monotonically from 1.0 and stay below 2.0.
//
// Returns the size, in bytes, of the `avx512_rr1_lut16_p3` variant.
size_t xnn_init_f32_sigmoid_avx512_rr1_lut16_p3_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  static const float table_values[16] = {
    0x1.000000p+0f, 0x1.0B5586p+0f, 0x1.172B84p+0f, 0x1.2387A6p+0f,
    0x1.306FE0p+0f, 0x1.3DEA64p+0f, 0x1.4BFDAEp+0f, 0x1.5AB07Ep+0f,
    0x1.6A09E6p+0f, 0x1.7A1148p+0f, 0x1.8ACE54p+0f, 0x1.9C4918p+0f,
    0x1.AE89FAp+0f, 0x1.C199BEp+0f, 0x1.D5818Ep+0f, 0x1.EA4AFAp+0f,
  };

  params->avx512_rr1_lut16_p3.sign_mask = UINT32_C(0x80000000);
  params->avx512_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
  params->avx512_rr1_lut16_p3.log2e = 0x1.715476p0f;
  params->avx512_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
  params->avx512_rr1_lut16_p3.c3 = 0x1.55559Ap-3f;
  params->avx512_rr1_lut16_p3.c2 = 0x1.00021Ep-1f;
  params->avx512_rr1_lut16_p3.one = 1.0f;
  for (size_t entry = 0; entry < 16; entry++) {
    params->avx512_rr1_lut16_p3.table[entry] = table_values[entry];
  }
  return sizeof(params->avx512_rr1_lut16_p3);
}
2427 
// Initializes the `avx512_rr2_lut32_p2` variant of the f32 sigmoid
// parameters.
//
// Stores the fixed constants (`sign_mask` as the integer 0x80000000, the rest
// as float literals) and fills two 16-entry tables: `table_lo` receives the
// first 16 values of the 32-value list below and `table_hi` the last 16.
//
// Returns the size, in bytes, of the `avx512_rr2_lut32_p2` variant.
size_t xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  static const float table_values[32] = {
    // Entries 0..15 -> table_lo
    0x1.000000p+0f, 0x1.059B0Ep+0f, 0x1.0B5586p+0f, 0x1.11301Ep+0f,
    0x1.172B84p+0f, 0x1.1D4874p+0f, 0x1.2387A6p+0f, 0x1.29E9E0p+0f,
    0x1.306FE0p+0f, 0x1.371A74p+0f, 0x1.3DEA64p+0f, 0x1.44E086p+0f,
    0x1.4BFDAEp+0f, 0x1.5342B6p+0f, 0x1.5AB07Ep+0f, 0x1.6247ECp+0f,
    // Entries 16..31 -> table_hi
    0x1.6A09E6p+0f, 0x1.71F75Ep+0f, 0x1.7A1148p+0f, 0x1.82589Ap+0f,
    0x1.8ACE54p+0f, 0x1.93737Cp+0f, 0x1.9C4918p+0f, 0x1.A5503Cp+0f,
    0x1.AE89FAp+0f, 0x1.B7F770p+0f, 0x1.C199BEp+0f, 0x1.CB720Ep+0f,
    0x1.D5818Ep+0f, 0x1.DFC974p+0f, 0x1.EA4AFAp+0f, 0x1.F50766p+0f,
  };

  params->avx512_rr2_lut32_p2.sign_mask = UINT32_C(0x80000000);
  params->avx512_rr2_lut32_p2.magic_bias = 0x1.800000p18f;
  params->avx512_rr2_lut32_p2.log2e = 0x1.715476p0f;
  params->avx512_rr2_lut32_p2.minus_ln2_hi = -0x1.62E430p-1f;
  params->avx512_rr2_lut32_p2.minus_ln2_lo = 0x1.05C61p-29f;
  params->avx512_rr2_lut32_p2.c2 = 0x1.000000p-1f;
  params->avx512_rr2_lut32_p2.c1 = 0x1.0000F6p-0f;
  params->avx512_rr2_lut32_p2.one = 1.0f;

  for (size_t entry = 0; entry < 16; entry++) {
    params->avx512_rr2_lut32_p2.table_lo[entry] = table_values[entry];
    params->avx512_rr2_lut32_p2.table_hi[entry] = table_values[entry + 16];
  }
  return sizeof(params->avx512_rr2_lut32_p2);
}
2475 
// Initializes the `avx512_rr1_p5` variant of the f32 sigmoid parameters.
//
// `sign_mask` is set to the integer 0x80000000; the remaining fields (`log2e`,
// `minus_ln2`, coefficients `c1`..`c5` and `one`) are set from float literals.
//
// Returns the size, in bytes, of the `avx512_rr1_p5` variant.
size_t xnn_init_f32_sigmoid_avx512_rr1_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  params->avx512_rr1_p5.sign_mask = UINT32_C(0x80000000);
  params->avx512_rr1_p5.one = 1.0f;
  params->avx512_rr1_p5.log2e = 0x1.715476p0f;
  params->avx512_rr1_p5.minus_ln2 = -0x1.62E430p-1f;
  params->avx512_rr1_p5.c1 = 0x1.FFFFF6p-1f;
  params->avx512_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params->avx512_rr1_p5.c3 = 0x1.555A80p-3f;
  params->avx512_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params->avx512_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
  return sizeof(params->avx512_rr1_p5);
}
2490 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2491 
2492 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Initializes the `wasmsimd_rr2_lut64_p2` variant of the f32 sigmoid
// parameters.
//
// Each field is a 2-element array; both elements of a given field receive the
// same fixed constant. `index_mask` is set to the integer 0x3F, all other
// fields are set from float literals.
//
// Returns the size, in bytes, of the `wasmsimd_rr2_lut64_p2` variant.
size_t xnn_init_f32_sigmoid_wasmsimd_rr2_lut64_p2_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd_rr2_lut64_p2.index_mask[lane] = UINT32_C(0x3F);
    params->wasmsimd_rr2_lut64_p2.one[lane] = 1.0f;
    params->wasmsimd_rr2_lut64_p2.magic_bias[lane] = 0x1.800000p17f;
    params->wasmsimd_rr2_lut64_p2.minus_log2e[lane] = -0x1.715476p0f;
    params->wasmsimd_rr2_lut64_p2.ln2_hi[lane] = 0x1.630000p-1f;
    params->wasmsimd_rr2_lut64_p2.ln2_lo[lane] = -0x1.BD0106p-13f;
    params->wasmsimd_rr2_lut64_p2.c2[lane] = 0x1.FFFF0Ap-2f;
    params->wasmsimd_rr2_lut64_p2.denorm_cutoff[lane] = 0x1.5D589Ep+6f;
  }
  return sizeof(params->wasmsimd_rr2_lut64_p2);
}
2508 
// Initializes the `wasmsimd_rr2_p5` variant of the f32 sigmoid parameters.
//
// Each field is a 2-element array; both elements of a given field receive the
// same fixed float constant.
//
// Returns the size, in bytes, of the `wasmsimd_rr2_p5` variant.
size_t xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params(
  union xnn_f32_sigmoid_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd_rr2_p5.one[lane] = 1.0f;
    params->wasmsimd_rr2_p5.magic_bias[lane] = 0x1.8000FEp23f;
    params->wasmsimd_rr2_p5.minus_log2e[lane] = -0x1.715476p+0f;
    params->wasmsimd_rr2_p5.ln2_hi[lane] = 0x1.62E400p-1f;
    params->wasmsimd_rr2_p5.ln2_lo[lane] = 0x1.7F7D1Cp-20f;
    params->wasmsimd_rr2_p5.c1[lane] = -0x1.FFFFF6p-1f;
    params->wasmsimd_rr2_p5.c2[lane] = 0x1.FFFDC6p-2f;
    params->wasmsimd_rr2_p5.c3[lane] = -0x1.555A80p-3f;
    params->wasmsimd_rr2_p5.c4[lane] = 0x1.573A1Ap-5f;
    params->wasmsimd_rr2_p5.c5[lane] = -0x1.0F9F9Cp-7f;
    params->wasmsimd_rr2_p5.denorm_cutoff[lane] = 0x1.5D589Ep+6f;
  }
  return sizeof(params->wasmsimd_rr2_p5);
}
2527 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2528 
2529 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `sse` variant of the f16 abs parameters.
//
// Sets each of the 8 elements of `nonsign_mask` to 0x7FFF, i.e. a 16-bit
// value with every bit set except the most significant one.
//
// Returns the size, in bytes, of the `sse` variant.
size_t xnn_init_f16_abs_sse_params(
  union xnn_f16_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  size_t lane = 0;
  while (lane < 8) {
    params->sse.nonsign_mask[lane] = UINT16_C(0x7FFF);
    lane++;
  }
  return sizeof(params->sse);
}
2538 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2539 
2540 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `sse` variant of the f32 abs parameters.
//
// Sets each of the 4 elements of `nonsign_mask` to the value returned by
// math_nonsign_mask_f32().
//
// Returns the size, in bytes, of the `sse` variant.
size_t xnn_init_f32_abs_sse_params(
  union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  size_t lane = 0;
  while (lane < 4) {
    params->sse.nonsign_mask[lane] = math_nonsign_mask_f32();
    lane++;
  }
  return sizeof(params->sse);
}
2549 
// Initializes the `avx` variant of the f32 abs parameters.
//
// Sets each of the 8 elements of `nonsign_mask` to the value returned by
// math_nonsign_mask_f32(), then fills the 14-entry `mask_table` with -1 in
// entries 0..6 and 0 in entries 7..13.
//
// Returns the size, in bytes, of the `avx` variant.
size_t xnn_init_f32_abs_avx_params(
  union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.nonsign_mask[lane] = math_nonsign_mask_f32();
  }
  for (size_t entry = 0; entry < 14; entry++) {
    params->avx.mask_table[entry] = (entry < 7) ? -1 : 0;
  }
  return sizeof(params->avx);
}
2564 
// Initializes the `avx512` variant of the f32 abs parameters.
//
// Sets `nonsign_mask` to 0x7FFFFFFF, i.e. a 32-bit value with every bit set
// except the most significant one.
//
// Returns the size, in bytes, of the `avx512` variant.
size_t xnn_init_f32_abs_avx512_params(
  union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  const size_t variant_size = sizeof(params->avx512);
  params->avx512.nonsign_mask = UINT32_C(0x7FFFFFFF);
  return variant_size;
}
2571 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2572 
2573 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Initializes the `wasmsimd` variant of the f32 abs parameters.
//
// Sets both elements of `nonsign_mask` to the value returned by
// math_nonsign_mask_f32().
//
// Returns the size, in bytes, of the `wasmsimd` variant.
size_t xnn_init_f32_abs_wasmsimd_params(
  union xnn_f32_abs_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.nonsign_mask[lane] = math_nonsign_mask_f32();
  }
  return sizeof(params->wasmsimd);
}
2581 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2582 
2583 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `sse` variant of the f16 neg parameters.
//
// Sets each of the 8 elements of `sign_mask` to 0x8000, i.e. a 16-bit value
// with only the most significant bit set.
//
// Returns the size, in bytes, of the `sse` variant.
size_t xnn_init_f16_neg_sse_params(
  union xnn_f16_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  size_t lane = 0;
  while (lane < 8) {
    params->sse.sign_mask[lane] = UINT16_C(0x8000);
    lane++;
  }
  return sizeof(params->sse);
}
2592 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2593 
2594 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `sse` variant of the f32 neg parameters.
//
// Sets each of the 4 elements of `sign_mask` to -0.0f.
//
// Returns the size, in bytes, of the `sse` variant.
size_t xnn_init_f32_neg_sse_params(
  union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  size_t lane = 0;
  while (lane < 4) {
    params->sse.sign_mask[lane] = -0.0f;
    lane++;
  }
  return sizeof(params->sse);
}
2603 
// Initializes the `avx` variant of the f32 neg parameters.
//
// Sets each of the 8 elements of `sign_mask` to -0.0f, then fills the
// 14-entry `mask_table` with -1 in entries 0..6 and 0 in entries 7..13.
//
// Returns the size, in bytes, of the `avx` variant.
size_t xnn_init_f32_neg_avx_params(
  union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.sign_mask[lane] = -0.0f;
  }
  for (size_t entry = 0; entry < 7; entry++) {
    params->avx.mask_table[entry] = -1;
    params->avx.mask_table[entry + 7] = 0;
  }
  return sizeof(params->avx);
}
2618 
// Initializes the `avx512` variant of the f32 neg parameters.
//
// Sets `sign_mask` to 0x80000000, i.e. a 32-bit value with only the most
// significant bit set.
//
// Returns the size, in bytes, of the `avx512` variant.
size_t xnn_init_f32_neg_avx512_params(
  union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  const size_t variant_size = sizeof(params->avx512);
  params->avx512.sign_mask = UINT32_C(0x80000000);
  return variant_size;
}
2625 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2626 
2627 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Initializes the `wasmsimd` variant of the f32 neg parameters.
//
// Sets both elements of `sign_mask` to -0.0f.
//
// Returns the size, in bytes, of the `wasmsimd` variant.
size_t xnn_init_f32_neg_wasmsimd_params(
  union xnn_f32_neg_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd.sign_mask[lane] = -0.0f;
  }
  return sizeof(params->wasmsimd);
}
2635 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
2636 
2637 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `sse2` variant of the f32 rnd parameters.
//
// Sets each of the 4 elements of `sign_mask` to -0.0f and each of the 4
// elements of `one` to 1.0f.
//
// Returns the size, in bytes, of the `sse2` variant.
size_t xnn_init_f32_rnd_sse2_params(
  union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2.one[lane] = 1.0f;
    params->sse2.sign_mask[lane] = -0.0f;
  }
  return sizeof(params->sse2);
}
2647 
// Initializes the `avx` variant of the f32 rnd parameters.
//
// Only the 14-entry `mask_table` is written: entries 0..6 are set to -1 and
// entries 7..13 are set to 0.
//
// Returns the size, in bytes, of the `avx` variant.
size_t xnn_init_f32_rnd_avx_params(
  union xnn_f32_rnd_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t entry = 0; entry < 14; entry++) {
    params->avx.mask_table[entry] = (entry < 7) ? -1 : 0;
  }
  return sizeof(params->avx);
}
2659 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2660 
2661 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the `neonfp16arith_rr1_p3` variant of the f16 ELU parameters.
//
// `prescale` and `beta` are stored unchanged. `alpha` is stored in
// `minus_alpha` with bit 15 flipped (XOR with 0x8000). The remaining fields
// receive fixed 16-bit patterns; the trailing comment on each line gives the
// corresponding half-precision value.
//
// Returns the size, in bytes, of the `neonfp16arith_rr1_p3` variant.
size_t xnn_init_f16_elu_neonfp16arith_rr1_p3_params(
  union xnn_f16_elu_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t prescale,
  uint16_t alpha,
  uint16_t beta)
{
  // Caller-supplied values.
  params->neonfp16arith_rr1_p3.prescale = prescale;
  params->neonfp16arith_rr1_p3.minus_alpha = alpha ^ UINT16_C(0x8000);
  params->neonfp16arith_rr1_p3.beta = beta;

  // Fixed constants.
  params->neonfp16arith_rr1_p3.sat_cutoff = UINT16_C(0xC829);  // -0x1.0A4p+3h
  params->neonfp16arith_rr1_p3.magic_bias = UINT16_C(0x660F);  // 0x1.83Cp+10h
  params->neonfp16arith_rr1_p3.log2e = UINT16_C(0x3DC5);  // 0x1.714p+0h
  params->neonfp16arith_rr1_p3.minus_ln2 = UINT16_C(0xB98C);  // -0x1.630p-1h
  params->neonfp16arith_rr1_p3.c3 = UINT16_C(0x315B);  // 0x1.56Cp-3h
  params->neonfp16arith_rr1_p3.c2 = UINT16_C(0x3808);  // 0x1.020p-1h
  return sizeof(params->neonfp16arith_rr1_p3);
}
2679 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
2680 
2681 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the `avx2_rr1_p3` variant of the f16 ELU parameters.
//
// `prescale`, `alpha` and `beta` are passed through
// fp16_ieee_to_fp32_value() and the results are stored in every element of
// the corresponding 8-element arrays. The remaining 8-element arrays are
// filled with fixed float constants.
//
// Returns the size, in bytes, of the `avx2_rr1_p3` variant.
size_t xnn_init_f16_elu_avx2_rr1_p3_params(
  union xnn_f16_elu_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t prescale,
  uint16_t alpha,
  uint16_t beta)
{
  // The conversions do not depend on the lane, so do them once up front.
  const float prescale_f32 = fp16_ieee_to_fp32_value(prescale);
  const float alpha_f32 = fp16_ieee_to_fp32_value(alpha);
  const float beta_f32 = fp16_ieee_to_fp32_value(beta);

  for (size_t lane = 0; lane < 8; lane++) {
    params->avx2_rr1_p3.prescale[lane] = prescale_f32;
    params->avx2_rr1_p3.alpha[lane] = alpha_f32;
    params->avx2_rr1_p3.beta[lane] = beta_f32;
    params->avx2_rr1_p3.sat_cutoff[lane] = -0x1.0A4000p+3f;
    params->avx2_rr1_p3.magic_bias[lane] = 0x1.8000FEp23f;
    params->avx2_rr1_p3.log2e[lane] = 0x1.715476p+0f;
    params->avx2_rr1_p3.minus_ln2[lane] = -0x1.62E430p-1f;
    params->avx2_rr1_p3.c1[lane] = 0x1.0002F2p+0f;
    params->avx2_rr1_p3.c2[lane] = 0x1.01EBB2p-1f;
    params->avx2_rr1_p3.c3[lane] = 0x1.5554DCp-3f;
  }
  return sizeof(params->avx2_rr1_p3);
}
2702 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
2703 
// Fills the scalar_rr2_lut16_p3 member of the F32 ELU parameter union.
//
// prescale, alpha and beta are stored unchanged; every other field is set to
// a fixed constant. Returns sizeof(params->scalar_rr2_lut16_p3).
size_t xnn_init_f32_elu_scalar_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants of this variant.
  params->scalar_rr2_lut16_p3.one = 1.0f;
  params->scalar_rr2_lut16_p3.c2 = 0x1.0001ECp-1f;
  params->scalar_rr2_lut16_p3.c3 = 0x1.55561Cp-3f;
  params->scalar_rr2_lut16_p3.minus_ln2_lo = -0x1.7F7D1Cp-20f;
  params->scalar_rr2_lut16_p3.minus_ln2_hi = -0x1.62E400p-1f;
  params->scalar_rr2_lut16_p3.log2e = 0x1.715476p+0f;
  params->scalar_rr2_lut16_p3.magic_bias = 0x1.800000p19f;
  params->scalar_rr2_lut16_p3.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params->scalar_rr2_lut16_p3.beta = beta;
  params->scalar_rr2_lut16_p3.alpha = alpha;
  params->scalar_rr2_lut16_p3.prescale = prescale;

  return sizeof(params->scalar_rr2_lut16_p3);
}
2723 
// Fills the scalar_rr2_p6 member of the F32 ELU parameter union.
//
// prescale, alpha and beta are stored unchanged; every other field is set to
// a fixed constant. Returns sizeof(params->scalar_rr2_p6).
size_t xnn_init_f32_elu_scalar_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed coefficients, lowest index first.
  params->scalar_rr2_p6.one = 1.0f;
  params->scalar_rr2_p6.c2 = 0x1.FFFFFEp-2f;
  params->scalar_rr2_p6.c3 = 0x1.5554B0p-3f;
  params->scalar_rr2_p6.c4 = 0x1.555716p-5f;
  params->scalar_rr2_p6.c5 = 0x1.12278Ep-7f;
  params->scalar_rr2_p6.c6 = 0x1.6b7338p-10f;

  // Remaining fixed constants.
  params->scalar_rr2_p6.minus_ln2_lo = 0x1.0105C6p-21f;
  params->scalar_rr2_p6.minus_ln2_hi = -0x1.62E440p-1f;
  params->scalar_rr2_p6.log2e = 0x1.715476p+0f;
  params->scalar_rr2_p6.magic_bias = 0x1.8000FEp23f;
  params->scalar_rr2_p6.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params->scalar_rr2_p6.beta = beta;
  params->scalar_rr2_p6.alpha = alpha;
  params->scalar_rr2_p6.prescale = prescale;

  return sizeof(params->scalar_rr2_p6);
}
2746 
2747 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the neon_rr2_lut16_p3 member of the F32 ELU parameter union.
//
// prescale, alpha and beta are stored unchanged; every other field is set to
// a fixed constant. Returns sizeof(params->neon_rr2_lut16_p3).
size_t xnn_init_f32_elu_neon_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants of this variant.
  params->neon_rr2_lut16_p3.c2 = 0x1.0001ECp-1f;
  params->neon_rr2_lut16_p3.c3 = 0x1.55561Cp-3f;
  params->neon_rr2_lut16_p3.minus_ln2_lo = -0x1.7F7D1Cp-20f;
  params->neon_rr2_lut16_p3.minus_ln2_hi = -0x1.62E400p-1f;
  params->neon_rr2_lut16_p3.log2e = 0x1.715476p+0f;
  params->neon_rr2_lut16_p3.magic_bias = 0x1.800000p19f;
  params->neon_rr2_lut16_p3.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params->neon_rr2_lut16_p3.beta = beta;
  params->neon_rr2_lut16_p3.alpha = alpha;
  params->neon_rr2_lut16_p3.prescale = prescale;

  return sizeof(params->neon_rr2_lut16_p3);
}
2766 
// Fills the neon_rr2_p6 member of the F32 ELU parameter union.
//
// prescale, alpha and beta are stored unchanged; every other field is set to
// a fixed constant. Returns sizeof(params->neon_rr2_p6).
size_t xnn_init_f32_elu_neon_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed coefficients, lowest index first.
  params->neon_rr2_p6.c2 = 0x1.FFFFFEp-2f;
  params->neon_rr2_p6.c3 = 0x1.5554B0p-3f;
  params->neon_rr2_p6.c4 = 0x1.555716p-5f;
  params->neon_rr2_p6.c5 = 0x1.12278Ep-7f;
  params->neon_rr2_p6.c6 = 0x1.6b7338p-10f;

  // Remaining fixed constants.
  params->neon_rr2_p6.minus_ln2_lo = 0x1.0105C6p-21f;
  params->neon_rr2_p6.minus_ln2_hi = -0x1.62E440p-1f;
  params->neon_rr2_p6.log2e = 0x1.715476p+0f;
  params->neon_rr2_p6.magic_bias = 0x1.8000FEp23f;
  params->neon_rr2_p6.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params->neon_rr2_p6.beta = beta;
  params->neon_rr2_p6.alpha = alpha;
  params->neon_rr2_p6.prescale = prescale;

  return sizeof(params->neon_rr2_p6);
}
2788 
// Fills the neonfma_rr1_lut16_p3 member of the F32 ELU parameter union.
//
// prescale, alpha and beta are stored unchanged; every other field is set to
// a fixed constant. Returns sizeof(params->neonfma_rr1_lut16_p3).
size_t xnn_init_f32_elu_neonfma_rr1_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed constants of this variant.
  params->neonfma_rr1_lut16_p3.c2 = 0x1.0001ECp-1f;
  params->neonfma_rr1_lut16_p3.c3 = 0x1.55561Cp-3f;
  params->neonfma_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
  params->neonfma_rr1_lut16_p3.log2e = 0x1.715476p+0f;
  params->neonfma_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
  params->neonfma_rr1_lut16_p3.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params->neonfma_rr1_lut16_p3.beta = beta;
  params->neonfma_rr1_lut16_p3.alpha = alpha;
  params->neonfma_rr1_lut16_p3.prescale = prescale;

  return sizeof(params->neonfma_rr1_lut16_p3);
}
2806 
// Fills the neonfma_rr1_p6 member of the F32 ELU parameter union.
//
// prescale, alpha and beta are stored unchanged; every other field is set to
// a fixed constant. Returns sizeof(params->neonfma_rr1_p6).
size_t xnn_init_f32_elu_neonfma_rr1_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed coefficients, lowest index first.
  params->neonfma_rr1_p6.c2 = 0x1.FFFFFEp-2f;
  params->neonfma_rr1_p6.c3 = 0x1.5554B0p-3f;
  params->neonfma_rr1_p6.c4 = 0x1.555716p-5f;
  params->neonfma_rr1_p6.c5 = 0x1.12278Ep-7f;
  params->neonfma_rr1_p6.c6 = 0x1.6b7338p-10f;

  // Remaining fixed constants.
  params->neonfma_rr1_p6.minus_ln2 = -0x1.62E430p-1f;
  params->neonfma_rr1_p6.log2e = 0x1.715476p+0f;
  params->neonfma_rr1_p6.magic_bias = 0x1.8000FEp23f;
  params->neonfma_rr1_p6.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params->neonfma_rr1_p6.beta = beta;
  params->neonfma_rr1_p6.alpha = alpha;
  params->neonfma_rr1_p6.prescale = prescale;

  return sizeof(params->neonfma_rr1_p6);
}
2827 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
2828 
2829 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse2_rr2_lut16_p3 member of the F32 ELU parameter union.
//
// Every field written here is an array; its first 4 entries all receive the
// same value. prescale, alpha and beta are stored unchanged, the rest are
// fixed constants. Returns sizeof(params->sse2_rr2_lut16_p3).
size_t xnn_init_f32_elu_sse2_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 4; k++) {
    // Fixed constants of this variant.
    params->sse2_rr2_lut16_p3.one[k] = 1.0f;
    params->sse2_rr2_lut16_p3.c2[k] = 0x1.0001ECp-1f;
    params->sse2_rr2_lut16_p3.c3[k] = 0x1.55561Cp-3f;
    params->sse2_rr2_lut16_p3.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
    params->sse2_rr2_lut16_p3.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->sse2_rr2_lut16_p3.index_mask[k] = UINT32_C(0xF);
    params->sse2_rr2_lut16_p3.log2e[k] = 0x1.715476p+0f;
    params->sse2_rr2_lut16_p3.magic_bias[k] = 0x1.800000p19f;
    params->sse2_rr2_lut16_p3.sat_cutoff[k] = -0x1.154246p+4f;
    // Caller-supplied values.
    params->sse2_rr2_lut16_p3.beta[k] = beta;
    params->sse2_rr2_lut16_p3.alpha[k] = alpha;
    params->sse2_rr2_lut16_p3.prescale[k] = prescale;
  }
  return sizeof(params->sse2_rr2_lut16_p3);
}
2852 
// Fills the sse2_rr2_p6 member of the F32 ELU parameter union.
//
// Every field written here is an array; its first 4 entries all receive the
// same value. prescale, alpha and beta are stored unchanged, the rest are
// fixed constants. Returns sizeof(params->sse2_rr2_p6).
size_t xnn_init_f32_elu_sse2_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 4; k++) {
    // Fixed coefficients, lowest index first.
    params->sse2_rr2_p6.one[k] = 1.0f;
    params->sse2_rr2_p6.c2[k] = 0x1.FFFFFEp-2f;
    params->sse2_rr2_p6.c3[k] = 0x1.5554B0p-3f;
    params->sse2_rr2_p6.c4[k] = 0x1.555716p-5f;
    params->sse2_rr2_p6.c5[k] = 0x1.12278Ep-7f;
    params->sse2_rr2_p6.c6[k] = 0x1.6b7338p-10f;
    // Remaining fixed constants.
    params->sse2_rr2_p6.minus_ln2_lo[k] = 0x1.0105C6p-21f;
    params->sse2_rr2_p6.minus_ln2_hi[k] = -0x1.62E440p-1f;
    params->sse2_rr2_p6.log2e[k] = 0x1.715476p+0f;
    params->sse2_rr2_p6.magic_bias[k] = 0x1.8000FEp23f;
    params->sse2_rr2_p6.sat_cutoff[k] = -0x1.154246p+4f;
    // Caller-supplied values.
    params->sse2_rr2_p6.beta[k] = beta;
    params->sse2_rr2_p6.alpha[k] = alpha;
    params->sse2_rr2_p6.prescale[k] = prescale;
  }
  return sizeof(params->sse2_rr2_p6);
}
2877 
// Fills the avx_rr2_lut16_p3 member of the F32 ELU parameter union.
//
// The per-entry fields each get the same value in their first 8 entries:
// prescale, alpha and beta unchanged, the rest fixed constants. mask_table
// receives 7 entries of -1 followed by 7 entries of 0.
// Returns sizeof(params->avx_rr2_lut16_p3).
size_t xnn_init_f32_elu_avx_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 8; k++) {
    // Fixed constants of this variant.
    params->avx_rr2_lut16_p3.one[k] = 1.0f;
    params->avx_rr2_lut16_p3.c2[k] = 0x1.0001ECp-1f;
    params->avx_rr2_lut16_p3.c3[k] = 0x1.55561Cp-3f;
    params->avx_rr2_lut16_p3.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
    params->avx_rr2_lut16_p3.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->avx_rr2_lut16_p3.index_mask[k] = UINT32_C(0xF);
    params->avx_rr2_lut16_p3.log2e[k] = 0x1.715476p+0f;
    params->avx_rr2_lut16_p3.magic_bias[k] = 0x1.800000p19f;
    params->avx_rr2_lut16_p3.sat_cutoff[k] = -0x1.154246p+4f;
    // Caller-supplied values.
    params->avx_rr2_lut16_p3.beta[k] = beta;
    params->avx_rr2_lut16_p3.alpha[k] = alpha;
    params->avx_rr2_lut16_p3.prescale[k] = prescale;
  }

  // Entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx_rr2_lut16_p3.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof(params->avx_rr2_lut16_p3);
}
2906 
// Fills the avx_rr2_lut4_p4 member of the F32 ELU parameter union.
//
// The per-entry fields each get the same value in their first 8 entries:
// prescale, alpha and beta unchanged, the rest fixed constants. table holds
// the same 4 values twice (entries 0..3 and 4..7). mask_table receives
// 7 entries of -1 followed by 7 entries of 0.
// Returns sizeof(params->avx_rr2_lut4_p4).
size_t xnn_init_f32_elu_avx_rr2_lut4_p4_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // The 4 distinct table values; each is written at index k and k + 4.
  static const float lut[4] = {
    0x1.000000p+0f, 0x1.306FE0p+0f, 0x1.6A09E6p+0f, 0x1.AE89FAp+0f,
  };
  for (uint32_t k = 0; k < 4; k++) {
    params->avx_rr2_lut4_p4.table[k] = lut[k];
    params->avx_rr2_lut4_p4.table[k + 4] = lut[k];
  }

  for (uint32_t k = 0; k < 8; k++) {
    // Caller-supplied values.
    params->avx_rr2_lut4_p4.prescale[k] = prescale;
    params->avx_rr2_lut4_p4.alpha[k] = alpha;
    params->avx_rr2_lut4_p4.beta[k] = beta;
    // Fixed constants of this variant.
    params->avx_rr2_lut4_p4.sat_cutoff[k] = -0x1.154246p+4f;
    params->avx_rr2_lut4_p4.magic_bias[k] = 0x1.8003F8p21f;
    params->avx_rr2_lut4_p4.log2e[k] = 0x1.715476p+0f;
    params->avx_rr2_lut4_p4.index_mask[k] = UINT32_C(0x3);
    params->avx_rr2_lut4_p4.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->avx_rr2_lut4_p4.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
    params->avx_rr2_lut4_p4.one[k] = 1.0f;
    params->avx_rr2_lut4_p4.c2[k] = 0x1.000002p-1f;
    params->avx_rr2_lut4_p4.c3[k] = 0x1.557082p-3f;
    params->avx_rr2_lut4_p4.c4[k] = 0x1.554F9Ap-5f;
  }

  // Entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx_rr2_lut4_p4.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof(params->avx_rr2_lut4_p4);
}
2946 
// Fills the avx_rr2_p6 member of the F32 ELU parameter union.
//
// The per-entry fields each get the same value in their first 8 entries:
// prescale, alpha and beta unchanged, the rest fixed constants. mask_table
// receives 7 entries of -1 followed by 7 entries of 0.
// Returns sizeof(params->avx_rr2_p6).
size_t xnn_init_f32_elu_avx_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 8; k++) {
    // Fixed coefficients, lowest index first.
    params->avx_rr2_p6.one[k] = 1.0f;
    params->avx_rr2_p6.c2[k] = 0x1.FFFFFEp-2f;
    params->avx_rr2_p6.c3[k] = 0x1.5554B0p-3f;
    params->avx_rr2_p6.c4[k] = 0x1.555716p-5f;
    params->avx_rr2_p6.c5[k] = 0x1.12278Ep-7f;
    params->avx_rr2_p6.c6[k] = 0x1.6b7338p-10f;
    // Remaining fixed constants.
    params->avx_rr2_p6.minus_ln2_lo[k] = 0x1.0105C6p-21f;
    params->avx_rr2_p6.minus_ln2_hi[k] = -0x1.62E440p-1f;
    params->avx_rr2_p6.log2e[k] = 0x1.715476p+0f;
    params->avx_rr2_p6.magic_bias[k] = 0x1.8000FEp23f;
    params->avx_rr2_p6.sat_cutoff[k] = -0x1.154246p+4f;
    // Caller-supplied values.
    params->avx_rr2_p6.beta[k] = beta;
    params->avx_rr2_p6.alpha[k] = alpha;
    params->avx_rr2_p6.prescale[k] = prescale;
  }

  // Entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx_rr2_p6.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof(params->avx_rr2_p6);
}
2977 
// Fills the avx2_rr1_lut16_p3 member of the F32 ELU parameter union.
//
// The per-entry fields each get the same value in their first 8 entries:
// prescale, alpha and beta unchanged, the rest fixed constants. mask_table
// receives 7 entries of -1 followed by 7 entries of 0.
// Returns sizeof(params->avx2_rr1_lut16_p3).
size_t xnn_init_f32_elu_avx2_rr1_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 8; k++) {
    // Fixed constants of this variant.
    params->avx2_rr1_lut16_p3.c2[k] = 0x1.0001ECp-1f;
    params->avx2_rr1_lut16_p3.c3[k] = 0x1.55561Cp-3f;
    params->avx2_rr1_lut16_p3.minus_ln2[k] = -0x1.62E430p-1f;
    params->avx2_rr1_lut16_p3.index_mask[k] = UINT32_C(0xF);
    params->avx2_rr1_lut16_p3.log2e[k] = 0x1.715476p+0f;
    params->avx2_rr1_lut16_p3.magic_bias[k] = 0x1.800000p19f;
    params->avx2_rr1_lut16_p3.sat_cutoff[k] = -0x1.154246p+4f;
    // Caller-supplied values.
    params->avx2_rr1_lut16_p3.beta[k] = beta;
    params->avx2_rr1_lut16_p3.alpha[k] = alpha;
    params->avx2_rr1_lut16_p3.prescale[k] = prescale;
  }

  // Entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx2_rr1_lut16_p3.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof(params->avx2_rr1_lut16_p3);
}
3004 
// Fills the avx2_rr1_lut8_p4 member of the F32 ELU parameter union.
//
// The per-entry fields each get the same value in their first 8 entries:
// prescale, alpha and beta unchanged, the rest fixed constants. table gets
// 8 fixed 32-bit patterns, one per entry. mask_table receives 7 entries of
// -1 followed by 7 entries of 0.
// Returns sizeof(params->avx2_rr1_lut8_p4).
size_t xnn_init_f32_elu_avx2_rr1_lut8_p4_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Table contents, in index order.
  static const uint32_t lut[8] = {
    UINT32_C(0x3F800000), UINT32_C(0x3F7B95C2),
    UINT32_C(0x3F7837F0), UINT32_C(0x3F75FED7),
    UINT32_C(0x3F7504F3), UINT32_C(0x3F75672A),
    UINT32_C(0x3F7744FD), UINT32_C(0x3F7AC0C7),
  };

  for (uint32_t k = 0; k < 8; k++) {
    // Caller-supplied values.
    params->avx2_rr1_lut8_p4.prescale[k] = prescale;
    params->avx2_rr1_lut8_p4.alpha[k] = alpha;
    params->avx2_rr1_lut8_p4.beta[k] = beta;
    // Fixed constants of this variant.
    params->avx2_rr1_lut8_p4.sat_cutoff[k] = -0x1.154246p+4f;
    params->avx2_rr1_lut8_p4.magic_bias[k] = 0x1.800000p20f;
    params->avx2_rr1_lut8_p4.log2e[k] = 0x1.715476p+0f;
    params->avx2_rr1_lut8_p4.table[k] = lut[k];
    params->avx2_rr1_lut8_p4.minus_ln2[k] = -0x1.62E430p-1f;
    params->avx2_rr1_lut8_p4.c2[k] = 0x1.000000p-1f;
    params->avx2_rr1_lut8_p4.c3[k] = 0x1.555C20p-3f;
    params->avx2_rr1_lut8_p4.c4[k] = 0x1.5558ECp-5f;
  }

  // Entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx2_rr1_lut8_p4.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof(params->avx2_rr1_lut8_p4);
}
3041 
// Fills the avx2_rr1_lut4_p4 member of the F32 ELU parameter union.
//
// The per-entry fields each get the same value in their first 8 entries:
// prescale, alpha and beta unchanged, the rest fixed constants. table holds
// the same 4 values twice (entries 0..3 and 4..7). mask_table receives
// 7 entries of -1 followed by 7 entries of 0.
// Returns sizeof(params->avx2_rr1_lut4_p4).
size_t xnn_init_f32_elu_avx2_rr1_lut4_p4_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // The 4 distinct table values; each is written at index k and k + 4.
  static const float lut[4] = {
    0x1.000000p+0f, 0x1.F06FE0p-1f, 0x1.EA09E6p-1f, 0x1.EE89FAp-1f,
  };
  for (uint32_t k = 0; k < 4; k++) {
    params->avx2_rr1_lut4_p4.table[k] = lut[k];
    params->avx2_rr1_lut4_p4.table[k + 4] = lut[k];
  }

  for (uint32_t k = 0; k < 8; k++) {
    // Caller-supplied values.
    params->avx2_rr1_lut4_p4.prescale[k] = prescale;
    params->avx2_rr1_lut4_p4.alpha[k] = alpha;
    params->avx2_rr1_lut4_p4.beta[k] = beta;
    // Fixed constants of this variant.
    params->avx2_rr1_lut4_p4.sat_cutoff[k] = -0x1.154246p+4f;
    params->avx2_rr1_lut4_p4.magic_bias[k] = 0x1.800000p21f;
    params->avx2_rr1_lut4_p4.log2e[k] = 0x1.715476p+0f;
    params->avx2_rr1_lut4_p4.minus_ln2[k] = -0x1.62E430p-1f;
    params->avx2_rr1_lut4_p4.c2[k] = 0x1.000002p-1f;
    params->avx2_rr1_lut4_p4.c3[k] = 0x1.557082p-3f;
    params->avx2_rr1_lut4_p4.c4[k] = 0x1.554F9Ap-5f;
  }

  // Entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx2_rr1_lut4_p4.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof(params->avx2_rr1_lut4_p4);
}
3078 
// Fills the avx2_rr1_p6 member of the F32 ELU parameter union.
//
// The per-entry fields each get the same value in their first 8 entries:
// prescale, alpha and beta unchanged, the rest fixed constants. mask_table
// receives 7 entries of -1 followed by 7 entries of 0.
// Returns sizeof(params->avx2_rr1_p6).
size_t xnn_init_f32_elu_avx2_rr1_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 8; k++) {
    // Fixed coefficients, lowest index first.
    params->avx2_rr1_p6.c2[k] = 0x1.FFFFFEp-2f;
    params->avx2_rr1_p6.c3[k] = 0x1.5554B0p-3f;
    params->avx2_rr1_p6.c4[k] = 0x1.555716p-5f;
    params->avx2_rr1_p6.c5[k] = 0x1.12278Ep-7f;
    params->avx2_rr1_p6.c6[k] = 0x1.6B7338p-10f;
    // Remaining fixed constants.
    params->avx2_rr1_p6.minus_ln2[k] = -0x1.62E430p-1f;
    params->avx2_rr1_p6.log2e[k] = 0x1.715476p+0f;
    params->avx2_rr1_p6.magic_bias[k] = 0x1.8000FEp23f;
    params->avx2_rr1_p6.sat_cutoff[k] = -0x1.154246p+4f;
    // Caller-supplied values.
    params->avx2_rr1_p6.beta[k] = beta;
    params->avx2_rr1_p6.alpha[k] = alpha;
    params->avx2_rr1_p6.prescale[k] = prescale;
  }

  // Entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx2_rr1_p6.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof(params->avx2_rr1_p6);
}
3107 
// Fills the avx512_rr1_lut16_p3 member of the F32 ELU parameter union.
//
// prescale, alpha and beta are stored unchanged; the remaining scalar fields
// are fixed constants, and table receives 16 fixed 32-bit patterns.
// Returns sizeof(params->avx512_rr1_lut16_p3).
size_t xnn_init_f32_elu_avx512_rr1_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Table contents, in index order.
  static const uint32_t lut[16] = {
    UINT32_C(0x3F800000), UINT32_C(0x3F7DAAC3),
    UINT32_C(0x3F7B95C2), UINT32_C(0x3F79C3D3),
    UINT32_C(0x3F7837F0), UINT32_C(0x3F76F532),
    UINT32_C(0x3F75FED7), UINT32_C(0x3F75583F),
    UINT32_C(0x3F7504F3), UINT32_C(0x3F7508A4),
    UINT32_C(0x3F75672A), UINT32_C(0x3F76248C),
    UINT32_C(0x3F7744FD), UINT32_C(0x3F78CCDF),
    UINT32_C(0x3F7AC0C7), UINT32_C(0x3F7D257D),
  };
  for (uint32_t k = 0; k < 16; k++) {
    params->avx512_rr1_lut16_p3.table[k] = lut[k];
  }

  // Fixed constants of this variant.
  params->avx512_rr1_lut16_p3.c2 = 0x1.0001ECp-1f;
  params->avx512_rr1_lut16_p3.c3 = 0x1.55561Cp-3f;
  params->avx512_rr1_lut16_p3.minus_ln2 = -0x1.62E430p-1f;
  params->avx512_rr1_lut16_p3.log2e = 0x1.715476p+0f;
  params->avx512_rr1_lut16_p3.magic_bias = 0x1.800000p19f;
  params->avx512_rr1_lut16_p3.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params->avx512_rr1_lut16_p3.beta = beta;
  params->avx512_rr1_lut16_p3.alpha = alpha;
  params->avx512_rr1_lut16_p3.prescale = prescale;

  return sizeof(params->avx512_rr1_lut16_p3);
}
3141 
// Fills the avx512_rr1_p6 member of the F32 ELU parameter union.
//
// prescale, alpha and beta are stored unchanged; every other field is set to
// a fixed constant. Returns sizeof(params->avx512_rr1_p6).
size_t xnn_init_f32_elu_avx512_rr1_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  // Fixed coefficients, lowest index first.
  params->avx512_rr1_p6.c2 = 0x1.FFFFFEp-2f;
  params->avx512_rr1_p6.c3 = 0x1.5554B0p-3f;
  params->avx512_rr1_p6.c4 = 0x1.555716p-5f;
  params->avx512_rr1_p6.c5 = 0x1.12278Ep-7f;
  params->avx512_rr1_p6.c6 = 0x1.6B7338p-10f;

  // Remaining fixed constants.
  params->avx512_rr1_p6.minus_ln2 = -0x1.62E430p-1f;
  params->avx512_rr1_p6.log2e = 0x1.715476p+0f;
  params->avx512_rr1_p6.magic_bias = 0x1.8000FEp23f;
  params->avx512_rr1_p6.sat_cutoff = -0x1.154246p+4f;

  // Caller-supplied values.
  params->avx512_rr1_p6.beta = beta;
  params->avx512_rr1_p6.alpha = alpha;
  params->avx512_rr1_p6.prescale = prescale;

  return sizeof(params->avx512_rr1_p6);
}
3162 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3163 
3164 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the wasmsimd_rr2_lut16_p3 member of the F32 ELU parameter union.
//
// Every field written here is an array; its first 2 entries both receive the
// same value. prescale, alpha and beta are stored unchanged, the rest are
// fixed constants. Returns sizeof(params->wasmsimd_rr2_lut16_p3).
size_t xnn_init_f32_elu_wasmsimd_rr2_lut16_p3_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 2; k++) {
    // Fixed constants of this variant.
    params->wasmsimd_rr2_lut16_p3.one[k] = 1.0f;
    params->wasmsimd_rr2_lut16_p3.c2[k] = 0x1.0001ECp-1f;
    params->wasmsimd_rr2_lut16_p3.c3[k] = 0x1.55561Cp-3f;
    params->wasmsimd_rr2_lut16_p3.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
    params->wasmsimd_rr2_lut16_p3.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->wasmsimd_rr2_lut16_p3.index_mask[k] = UINT32_C(0xF);
    params->wasmsimd_rr2_lut16_p3.log2e[k] = 0x1.715476p+0f;
    params->wasmsimd_rr2_lut16_p3.magic_bias[k] = 0x1.800000p19f;
    params->wasmsimd_rr2_lut16_p3.sat_cutoff[k] = -0x1.154246p+4f;
    // Caller-supplied values.
    params->wasmsimd_rr2_lut16_p3.beta[k] = beta;
    params->wasmsimd_rr2_lut16_p3.alpha[k] = alpha;
    params->wasmsimd_rr2_lut16_p3.prescale[k] = prescale;
  }
  return sizeof(params->wasmsimd_rr2_lut16_p3);
}
3187 
// Fills the wasmsimd_rr2_p6 member of the F32 ELU parameter union.
//
// Every field written here is an array; its first 2 entries both receive the
// same value. prescale, alpha and beta are stored unchanged, the rest are
// fixed constants. Returns sizeof(params->wasmsimd_rr2_p6).
size_t xnn_init_f32_elu_wasmsimd_rr2_p6_params(
  union xnn_f32_elu_params params[XNN_MIN_ELEMENTS(1)],
  float prescale,
  float alpha,
  float beta)
{
  for (uint32_t k = 0; k < 2; k++) {
    // Fixed coefficients, lowest index first.
    params->wasmsimd_rr2_p6.one[k] = 1.0f;
    params->wasmsimd_rr2_p6.c2[k] = 0x1.FFFFFEp-2f;
    params->wasmsimd_rr2_p6.c3[k] = 0x1.5554B0p-3f;
    params->wasmsimd_rr2_p6.c4[k] = 0x1.555716p-5f;
    params->wasmsimd_rr2_p6.c5[k] = 0x1.12278Ep-7f;
    params->wasmsimd_rr2_p6.c6[k] = 0x1.6b7338p-10f;
    // Remaining fixed constants.
    params->wasmsimd_rr2_p6.minus_ln2_lo[k] = 0x1.0105C6p-21f;
    params->wasmsimd_rr2_p6.minus_ln2_hi[k] = -0x1.62E440p-1f;
    params->wasmsimd_rr2_p6.log2e[k] = 0x1.715476p+0f;
    params->wasmsimd_rr2_p6.magic_bias[k] = 0x1.8000FEp23f;
    params->wasmsimd_rr2_p6.sat_cutoff[k] = -0x1.154246p+4f;
    // Caller-supplied values.
    params->wasmsimd_rr2_p6.beta[k] = beta;
    params->wasmsimd_rr2_p6.alpha[k] = alpha;
    params->wasmsimd_rr2_p6.prescale[k] = prescale;
  }
  return sizeof(params->wasmsimd_rr2_p6);
}
3212 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3213 
3214 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the neonfp16arith_rr2_p2 member of the F16 expminus parameter union.
//
// Takes no tunable inputs: every field is set to a fixed 16-bit pattern. The
// trailing comment on each line gives the value that pattern is meant to
// encode. Returns sizeof(params->neonfp16arith_rr2_p2).
size_t xnn_init_f16_expminus_neonfp16arith_rr2_p2_params(
  union xnn_f16_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neonfp16arith_rr2_p2.denorm_cutoff = UINT16_C(0xC8DA);  // -0x1.368p+3h
  params->neonfp16arith_rr2_p2.c1 = UINT16_C(0x3C0E);  // 0x1.038p+0h
  params->neonfp16arith_rr2_p2.c2 = UINT16_C(0x37F9);  // 0x1.FE4p-2h
  params->neonfp16arith_rr2_p2.minus_ln2_lo = UINT16_C(0x0AF4);  // 0x1.BD0p-13h
  params->neonfp16arith_rr2_p2.minus_ln2_hi = UINT16_C(0xB98C);  // -0x1.630p-1h
  params->neonfp16arith_rr2_p2.log2e = UINT16_C(0x3DC5);  // 0x1.714p+0h
  params->neonfp16arith_rr2_p2.magic_bias = UINT16_C(0x660F);  // 0x1.83Cp+10h
  return sizeof(params->neonfp16arith_rr2_p2);
}
3227 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3228 
3229 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the avx2_rr1_p2 member of the F16 expminus parameter union.
//
// Takes no tunable inputs. Every field written here is an array; its first
// 8 entries all receive the same fixed constant.
// Returns sizeof(params->avx2_rr1_p2).
size_t xnn_init_f16_expminus_avx2_rr1_p2_params(
  union xnn_f16_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 8; k++) {
    params->avx2_rr1_p2.denorm_cutoff[k] = -0x1.368000p+3f;
    params->avx2_rr1_p2.c1[k] = 0x1.039E10p+0f;
    params->avx2_rr1_p2.c2[k] = 0x1.FF3A32p-2f;
    params->avx2_rr1_p2.minus_ln2[k] = -0x1.62E43p-1f;
    params->avx2_rr1_p2.log2e[k] = 0x1.715476p0f;
    params->avx2_rr1_p2.magic_bias[k] = 0x1.8000FEp23f;
  }
  return sizeof(params->avx2_rr1_p2);
}
3243 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3244 
// Fills the scalar_rr2_p5 member of the F32 expminus parameter union.
//
// Takes no tunable inputs: every field is set to a fixed constant.
// Returns sizeof(params->scalar_rr2_p5).
size_t xnn_init_f32_expminus_scalar_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Fixed coefficients, lowest index first.
  params->scalar_rr2_p5.c1 = 0x1.FFFFF6p-1f;
  params->scalar_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->scalar_rr2_p5.c3 = 0x1.555A80p-3f;
  params->scalar_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->scalar_rr2_p5.c5 = 0x1.0F9F9Cp-7f;

  // Remaining fixed constants.
  params->scalar_rr2_p5.denorm_cutoff = -0x1.5D589Ep6f;
  params->scalar_rr2_p5.minus_ln2_lo = -0x1.7F7D1Cp-20f;
  params->scalar_rr2_p5.minus_ln2_hi = -0x1.62E400p-1f;
  params->scalar_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params->scalar_rr2_p5.log2e = 0x1.715476p+0f;

  return sizeof(params->scalar_rr2_p5);
}
3260 
// Fills the scalar_rr2_lut64_p2 member of the F32 expminus parameter union.
//
// Takes no tunable inputs: every field is set to a fixed constant.
// Returns sizeof(params->scalar_rr2_lut64_p2).
size_t xnn_init_f32_expminus_scalar_rr2_lut64_p2_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params->scalar_rr2_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
  params->scalar_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->scalar_rr2_lut64_p2.minus_ln2_lo = 0x1.BD0106p-13f;
  params->scalar_rr2_lut64_p2.minus_ln2_hi = -0x1.630000p-1f;
  params->scalar_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  params->scalar_rr2_lut64_p2.log2e = 0x1.715476p0f;
  return sizeof(params->scalar_rr2_lut64_p2);
}
3272 
3273 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the neon_rr2_p5 member of the F32 expminus parameter union.
//
// Takes no tunable inputs: every field is set to a fixed constant.
// Returns sizeof(params->neon_rr2_p5).
size_t xnn_init_f32_expminus_neon_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Fixed coefficients, lowest index first.
  params->neon_rr2_p5.c1 = 0x1.FFFFF6p-1f;
  params->neon_rr2_p5.c2 = 0x1.FFFDC6p-2f;
  params->neon_rr2_p5.c3 = 0x1.555A80p-3f;
  params->neon_rr2_p5.c4 = 0x1.573A1Ap-5f;
  params->neon_rr2_p5.c5 = 0x1.0F9F9Cp-7f;

  // Remaining fixed constants.
  params->neon_rr2_p5.denorm_cutoff = -0x1.5D589Ep6f;
  params->neon_rr2_p5.minus_ln2_lo = -0x1.7F7D1Cp-20f;
  params->neon_rr2_p5.minus_ln2_hi = -0x1.62E400p-1f;
  params->neon_rr2_p5.magic_bias = 0x1.8000FEp23f;
  params->neon_rr2_p5.log2e = 0x1.715476p+0f;

  return sizeof(params->neon_rr2_p5);
}
3289 
// Fills the neon_rr2_lut64_p2 member of the F32 expminus parameter union.
//
// Takes no tunable inputs: every field is set to a fixed constant.
// Returns sizeof(params->neon_rr2_lut64_p2).
size_t xnn_init_f32_expminus_neon_rr2_lut64_p2_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neon_rr2_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
  params->neon_rr2_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->neon_rr2_lut64_p2.minus_ln2_lo = -0x1.7F7D1Cp-20f;
  params->neon_rr2_lut64_p2.minus_ln2_hi = -0x1.62E400p-1f;
  params->neon_rr2_lut64_p2.magic_bias = 0x1.800000p17f;
  params->neon_rr2_lut64_p2.log2e = 0x1.715476p+0f;
  return sizeof(params->neon_rr2_lut64_p2);
}
3301 
// Fills the neonfma_rr1_p5 member of the F32 expminus parameter union.
//
// Takes no tunable inputs: every field is set to a fixed constant.
// Returns sizeof(params->neonfma_rr1_p5).
size_t xnn_init_f32_expminus_neonfma_rr1_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  // Fixed coefficients, lowest index first.
  params->neonfma_rr1_p5.c1 = 0x1.FFFFF6p-1f;
  params->neonfma_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params->neonfma_rr1_p5.c3 = 0x1.555A80p-3f;
  params->neonfma_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params->neonfma_rr1_p5.c5 = 0x1.0F9F9Cp-7f;

  // Remaining fixed constants.
  params->neonfma_rr1_p5.denorm_cutoff = -0x1.5D589Ep6f;
  params->neonfma_rr1_p5.minus_ln2 = -0x1.62E430p-1f;
  params->neonfma_rr1_p5.magic_bias = 0x1.8000FEp23f;
  params->neonfma_rr1_p5.log2e = 0x1.715476p+0f;

  return sizeof(params->neonfma_rr1_p5);
}
3316 
// Fills the neonfma_rr1_lut64_p2 member of the F32 expminus parameter union.
//
// Takes no tunable inputs: every field is set to a fixed constant.
// Returns sizeof(params->neonfma_rr1_lut64_p2).
size_t xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  params->neonfma_rr1_lut64_p2.denorm_cutoff = -0x1.5D589Ep6f;
  params->neonfma_rr1_lut64_p2.c2 = 0x1.FFFF0Ap-2f;
  params->neonfma_rr1_lut64_p2.minus_ln2 = -0x1.62E430p-1f;
  params->neonfma_rr1_lut64_p2.magic_bias = 0x1.800000p17f;
  params->neonfma_rr1_lut64_p2.log2e = 0x1.715476p+0f;
  return sizeof(params->neonfma_rr1_lut64_p2);
}
3327 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3328 
3329 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the sse2_rr2_p5 member of the F32 expminus parameter union.
//
// Takes no tunable inputs. Every field written here is an array; its first
// 4 entries all receive the same fixed constant.
// Returns sizeof(params->sse2_rr2_p5).
size_t xnn_init_f32_expminus_sse2_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 4; k++) {
    // Fixed coefficients, lowest index first.
    params->sse2_rr2_p5.c1[k] = 0x1.FFFFF6p-1f;
    params->sse2_rr2_p5.c2[k] = 0x1.FFFDC6p-2f;
    params->sse2_rr2_p5.c3[k] = 0x1.555A80p-3f;
    params->sse2_rr2_p5.c4[k] = 0x1.573A1Ap-5f;
    params->sse2_rr2_p5.c5[k] = 0x1.0F9F9Cp-7f;
    // Remaining fixed constants.
    params->sse2_rr2_p5.denorm_cutoff[k] = -0x1.5D589Ep6f;
    params->sse2_rr2_p5.minus_ln2_lo[k] = -0x1.7F7D1Cp-20f;
    params->sse2_rr2_p5.minus_ln2_hi[k] = -0x1.62E400p-1f;
    params->sse2_rr2_p5.magic_bias[k] = 0x1.8000FEp23f;
    params->sse2_rr2_p5.log2e[k] = 0x1.715476p+0f;
  }
  return sizeof(params->sse2_rr2_p5);
}
3347 
// Fills the avx2_rr1_p5 member of the F32 expminus parameter union.
//
// Takes no tunable inputs. The per-entry fields each get the same fixed
// constant in their first 8 entries; mask_table receives 7 entries of -1
// followed by 7 entries of 0. Returns sizeof(params->avx2_rr1_p5).
size_t xnn_init_f32_expminus_avx2_rr1_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t k = 0; k < 8; k++) {
    // Fixed coefficients, lowest index first.
    params->avx2_rr1_p5.c1[k] = 0x1.FFFFF6p-1f;
    params->avx2_rr1_p5.c2[k] = 0x1.FFFDC6p-2f;
    params->avx2_rr1_p5.c3[k] = 0x1.555A80p-3f;
    params->avx2_rr1_p5.c4[k] = 0x1.573A1Ap-5f;
    params->avx2_rr1_p5.c5[k] = 0x1.0F9F9Cp-7f;
    // Remaining fixed constants.
    params->avx2_rr1_p5.denorm_cutoff[k] = -0x1.5D589Ep6f;
    params->avx2_rr1_p5.minus_ln2[k] = -0x1.62E430p-1f;
    params->avx2_rr1_p5.magic_bias[k] = 0x1.8000FEp23f;
    params->avx2_rr1_p5.log2e[k] = 0x1.715476p+0f;
  }

  // Entries 0..6 are -1, entries 7..13 are 0.
  for (uint32_t k = 0; k < 14; k++) {
    params->avx2_rr1_p5.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof(params->avx2_rr1_p5);
}
3370 
// Populates params->avx512_rr1_p5 with scalar constants: log2e, minus_ln2 and
// the six fields c0..c5 (c0 is exactly 1.0f).
// Returns the size in bytes of params->avx512_rr1_p5.
size_t xnn_init_f32_expminus_avx512_rr1_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  const float log2e = 0x1.715476p+0f;
  const float minus_ln2 = -0x1.62E430p-1f;

  params->avx512_rr1_p5.log2e = log2e;
  params->avx512_rr1_p5.minus_ln2 = minus_ln2;

  // c0..c5, written lowest index first.
  params->avx512_rr1_p5.c0 = 1.0f;
  params->avx512_rr1_p5.c1 = 0x1.FFFFF6p-1f;
  params->avx512_rr1_p5.c2 = 0x1.FFFDC6p-2f;
  params->avx512_rr1_p5.c3 = 0x1.555A80p-3f;
  params->avx512_rr1_p5.c4 = 0x1.573A1Ap-5f;
  params->avx512_rr1_p5.c5 = 0x1.0F9F9Cp-7f;
  return sizeof params->avx512_rr1_p5;
}
3384 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3385 
3386 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates params->wasmsimd_rr2_p5: each constant is written to elements 0
// and 1 of the array field of the same name.
// Returns the size in bytes of params->wasmsimd_rr2_p5.
size_t xnn_init_f32_expminus_wasmsimd_rr2_p5_params(
  union xnn_f32_expminus_params params[XNN_MIN_ELEMENTS(1)])
{
  const float log2e = 0x1.715476p+0f;
  const float magic_bias = 0x1.8000FEp23f;
  const float minus_ln2_hi = -0x1.62E400p-1f;
  const float minus_ln2_lo = -0x1.7F7D1Cp-20f;
  const float c5 = 0x1.0F9F9Cp-7f;
  const float c4 = 0x1.573A1Ap-5f;
  const float c3 = 0x1.555A80p-3f;
  const float c2 = 0x1.FFFDC6p-2f;
  const float c1 = 0x1.FFFFF6p-1f;
  const float denorm_cutoff = -0x1.5D589Ep6f;

  for (size_t slot = 0; slot != 2; ++slot) {
    params->wasmsimd_rr2_p5.log2e[slot] = log2e;
    params->wasmsimd_rr2_p5.magic_bias[slot] = magic_bias;
    params->wasmsimd_rr2_p5.minus_ln2_hi[slot] = minus_ln2_hi;
    params->wasmsimd_rr2_p5.minus_ln2_lo[slot] = minus_ln2_lo;
    params->wasmsimd_rr2_p5.c5[slot] = c5;
    params->wasmsimd_rr2_p5.c4[slot] = c4;
    params->wasmsimd_rr2_p5.c3[slot] = c3;
    params->wasmsimd_rr2_p5.c2[slot] = c2;
    params->wasmsimd_rr2_p5.c1[slot] = c1;
    params->wasmsimd_rr2_p5.denorm_cutoff[slot] = denorm_cutoff;
  }
  return sizeof params->wasmsimd_rr2_p5;
}
3404 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3405 
3406 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Stores the 16-bit slope value unchanged in params->neon.slope.
// Returns the size in bytes of params->neon.
size_t xnn_init_f16_lrelu_neon_params(
  union xnn_f16_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t slope)
{
  const size_t params_size = sizeof params->neon;
  params->neon.slope = slope;
  return params_size;
}
3414 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3415 
3416 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Converts the 16-bit slope with fp16_ieee_to_fp32_value() and writes the
// result to elements 0..7 of params->avx.slope.
// Returns the size in bytes of params->avx.
size_t xnn_init_f16_lrelu_avx_params(
  union xnn_f16_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  uint16_t slope)
{
  // The converted value is the same for every element, so compute it once.
  const float slope_as_float = fp16_ieee_to_fp32_value(slope);
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.slope[lane] = slope_as_float;
  }
  return sizeof params->avx;
}
3426 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3427 
// Stores slope unchanged in params->scalar.slope.
// Returns the size in bytes of params->scalar.
size_t xnn_init_f32_lrelu_scalar_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  const size_t params_size = sizeof params->scalar;
  params->scalar.slope = slope;
  return params_size;
}
3435 
3436 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Writes slope to elements 0..3 of params->sse.slope.
// Returns the size in bytes of params->sse.
size_t xnn_init_f32_lrelu_sse_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  params->sse.slope[0] = slope;
  params->sse.slope[1] = slope;
  params->sse.slope[2] = slope;
  params->sse.slope[3] = slope;
  return sizeof params->sse;
}
3446 
// Writes slope to elements 0..7 of params->avx.slope and sets
// params->avx.mask_table to seven -1 entries (indices 0..6) followed by
// seven 0 entries (indices 7..13).
// Returns the size in bytes of params->avx.
size_t xnn_init_f32_lrelu_avx_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.slope[lane] = slope;
  }
  for (size_t k = 0; k < 14; k++) {
    params->avx.mask_table[k] = (k < 7) ? -1 : 0;
  }
  return sizeof params->avx;
}
3462 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3463 
3464 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes slope to elements 0 and 1 of params->wasmsimd.slope.
// Returns the size in bytes of params->wasmsimd.
size_t xnn_init_f32_lrelu_wasmsimd_params(
  union xnn_f32_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float slope)
{
  for (size_t slot = 0; slot < 2; slot++) {
    params->wasmsimd.slope[slot] = slope;
  }
  return sizeof params->wasmsimd;
}
3473 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3474 
// Populates params->scalar_select for the QS8 lrelu kernels.
//
// Each scale is multiplied by 256 and rounded to an integer with lrintf();
// the zero points are widened to int32_t. Valid scale ranges are enforced
// with assert() only, so they are unchecked when NDEBUG is defined:
//   positive_scale in [2^-8, 2^7]
//   negative_scale in [-0x1.FFFC00p+6, 2^7] with |negative_scale| >= 2^-8
//
// Returns the size in bytes of params->scalar_select.
size_t xnn_init_qs8_lrelu_scalar_select_params(
  union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long positive_multiplier = lrintf(256.0f * positive_scale);
  assert(positive_multiplier >= 1L);
  assert(positive_multiplier <= 32768L);
  const long negative_multiplier = lrintf(256.0f * negative_scale);
  assert(negative_multiplier <= 32768L);
  assert(negative_multiplier >= -32767L);
  assert(negative_multiplier != 0L);
  params->scalar_select.input_zero_point = (int32_t) input_zero_point;
  params->scalar_select.positive_multiplier = (int32_t) positive_multiplier;
  params->scalar_select.negative_multiplier = (int32_t) negative_multiplier;
  // output_zero_point may be negative, and left-shifting a negative signed
  // value is undefined behavior in C, so scale by 256 with a multiplication.
  // The added 0x80 is half of 256 (presumably a rounding term for the kernel's
  // later shift by 8 - confirm against the kernel).
  params->scalar_select.bias = (int32_t) output_zero_point * INT32_C(256) + INT32_C(0x80);
  return sizeof(params->scalar_select);
}
3501 
// Populates params->scalar_andxor for the QS8 lrelu kernels.
//
// Each scale is multiplied by 256 and rounded to an integer with lrintf().
// multiplier_base holds the positive multiplier and multiplier_diff holds the
// XOR of the negative and positive multipliers. Valid scale ranges are
// enforced with assert() only, so they are unchecked when NDEBUG is defined:
//   positive_scale in [2^-8, 2^7]
//   negative_scale in [-0x1.FFFC00p+6, 2^7] with |negative_scale| >= 2^-8
//
// Returns the size in bytes of params->scalar_andxor.
size_t xnn_init_qs8_lrelu_scalar_andxor_params(
  union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long positive_multiplier = lrintf(256.0f * positive_scale);
  assert(positive_multiplier >= 1L);
  assert(positive_multiplier <= 32768L);
  const long negative_multiplier = lrintf(256.0f * negative_scale);
  assert(negative_multiplier <= 32768L);
  assert(negative_multiplier >= -32767L);
  assert(negative_multiplier != 0L);
  params->scalar_andxor.input_zero_point = (int32_t) input_zero_point;
  params->scalar_andxor.multiplier_base = (int32_t) positive_multiplier;
  params->scalar_andxor.multiplier_diff = (int32_t) negative_multiplier ^ (int32_t) positive_multiplier;
  // output_zero_point may be negative, and left-shifting a negative signed
  // value is undefined behavior in C, so scale by 256 with a multiplication.
  params->scalar_andxor.bias = (int32_t) output_zero_point * INT32_C(256) + INT32_C(0x80);
  return sizeof(params->scalar_andxor);
}
3528 
3529 #if XNN_ARCH_ARM
// Populates params->armsimd32 for the QS8 lrelu kernels.
//
// Each scale is multiplied by -256 and rounded to an integer with lrintf(),
// so each multiplier has the opposite sign of its scale. The input zero point
// and both multipliers are truncated to 16 bits and duplicated into both
// 16-bit halves of a 32-bit word (multiplication by 0x00010001). Valid scale
// ranges are enforced with assert() only:
//   positive_scale in [2^-8, 2^7]
//   negative_scale in [-0x1.FFFC00p+6, 2^7] with |negative_scale| >= 2^-8
//
// Returns the size in bytes of params->armsimd32.
size_t xnn_init_qs8_lrelu_armsimd32_params(
  union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long positive_multiplier = lrintf(-256.0f * positive_scale);
  assert(positive_multiplier <= -1L);
  assert(positive_multiplier >= -32768L);
  const long negative_multiplier = lrintf(-256.0f * negative_scale);
  assert(negative_multiplier >= -32768L);
  assert(negative_multiplier <= 32767L);
  assert(negative_multiplier != 0L);
  params->armsimd32.input_zero_point = (uint32_t) (uint16_t) (int16_t) input_zero_point * UINT32_C(0x00010001);
  params->armsimd32.positive_multiplier = (uint32_t) (uint16_t) (int16_t) positive_multiplier * UINT32_C(0x00010001);
  params->armsimd32.negative_multiplier = (uint32_t) (uint16_t) (int16_t) negative_multiplier * UINT32_C(0x00010001);
  // output_zero_point may be negative, and left-shifting a negative signed
  // value is undefined behavior in C, so scale by 256 with a multiplication.
  params->armsimd32.bias = (int32_t) output_zero_point * INT32_C(256) + INT32_C(0x80);
  return sizeof(params->armsimd32);
}
3556 #endif  // XNN_ARCH_ARM
3557 
3558 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates params->neon for the QS8 lrelu kernels: both zero points cast to
// int16_t, and both scales multiplied by -256, rounded with lrintf() and cast
// to int16_t. Scale ranges are checked with assert() only.
// Returns the size in bytes of params->neon.
size_t xnn_init_qs8_lrelu_neon_params(
  union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  // Each multiplier carries the opposite sign of its scale.
  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  params->neon.output_zero_point = (int16_t) output_zero_point;
  params->neon.negative_multiplier = (int16_t) neg_mul;
  params->neon.positive_multiplier = (int16_t) pos_mul;
  params->neon.input_zero_point = (int16_t) input_zero_point;
  return sizeof params->neon;
}
3585 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3586 
3587 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates params->sse2 for the QS8 lrelu kernels, writing elements 0..7 of
// each array field. Scales are multiplied by -256 and rounded with lrintf();
// multiplier_base is the negative multiplier and multiplier_diff is the XOR
// of the positive and negative multipliers. Scale ranges are checked with
// assert() only.
// Returns the size in bytes of params->sse2.
size_t xnn_init_qs8_lrelu_sse2_params(
  union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t base = (int16_t) neg_mul;
  const int16_t diff = (int16_t) pos_mul ^ (int16_t) neg_mul;
  const int16_t in_zp = (int16_t) input_zero_point;
  const int16_t out_zp = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse2.input_zero_point[lane] = in_zp;
    params->sse2.multiplier_diff[lane] = diff;
    params->sse2.multiplier_base[lane] = base;
    params->sse2.output_zero_point[lane] = out_zp;
  }
  return sizeof params->sse2;
}
3618 
// Populates params->avx for the QS8 lrelu kernels, writing elements 0..7 of
// each array field: zero points cast to int16_t, and scales multiplied by
// -256, rounded with lrintf() and cast to int16_t. Scale ranges are checked
// with assert() only.
// Returns the size in bytes of params->avx.
size_t xnn_init_qs8_lrelu_avx_params(
  union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t in_zp = (int16_t) input_zero_point;
  const int16_t out_zp = (int16_t) output_zero_point;
  const int16_t pos_mul16 = (int16_t) pos_mul;
  const int16_t neg_mul16 = (int16_t) neg_mul;
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.input_zero_point[lane] = in_zp;
    params->avx.positive_multiplier[lane] = pos_mul16;
    params->avx.negative_multiplier[lane] = neg_mul16;
    params->avx.output_zero_point[lane] = out_zp;
  }
  return sizeof params->avx;
}
3647 
// Populates params->avx2 for the QS8 lrelu kernels, writing elements 0..15 of
// each array field: zero points cast to int16_t, and scales multiplied by
// -256, rounded with lrintf() and cast to int16_t. Scale ranges are checked
// with assert() only.
// Returns the size in bytes of params->avx2.
size_t xnn_init_qs8_lrelu_avx2_params(
  union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t in_zp = (int16_t) input_zero_point;
  const int16_t out_zp = (int16_t) output_zero_point;
  const int16_t pos_mul16 = (int16_t) pos_mul;
  const int16_t neg_mul16 = (int16_t) neg_mul;
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx2.input_zero_point[lane] = in_zp;
    params->avx2.positive_multiplier[lane] = pos_mul16;
    params->avx2.negative_multiplier[lane] = neg_mul16;
    params->avx2.output_zero_point[lane] = out_zp;
  }
  return sizeof params->avx2;
}
3676 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3677 
3678 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates params->wasmsimd_arm for the QS8 lrelu kernels, writing elements
// 0..3 of each array field: zero points cast to int16_t, and scales
// multiplied by -256, rounded with lrintf() and cast to int16_t. Scale ranges
// are checked with assert() only.
// Returns the size in bytes of params->wasmsimd_arm.
size_t xnn_init_qs8_lrelu_wasmsimd_arm_params(
  union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t in_zp = (int16_t) input_zero_point;
  const int16_t out_zp = (int16_t) output_zero_point;
  const int16_t pos_mul16 = (int16_t) pos_mul;
  const int16_t neg_mul16 = (int16_t) neg_mul;
  for (size_t lane = 0; lane < 4; lane++) {
    params->wasmsimd_arm.input_zero_point[lane] = in_zp;
    params->wasmsimd_arm.positive_multiplier[lane] = pos_mul16;
    params->wasmsimd_arm.negative_multiplier[lane] = neg_mul16;
    params->wasmsimd_arm.output_zero_point[lane] = out_zp;
  }
  return sizeof params->wasmsimd_arm;
}
3707 
// Populates params->wasmsimd_x86 for the QS8 lrelu kernels, writing elements
// 0..3 of each array field. Scales are multiplied by -256 and rounded with
// lrintf(); multiplier_base is the negative multiplier and multiplier_diff is
// the XOR of the positive and negative multipliers. Scale ranges are checked
// with assert() only.
// Returns the size in bytes of params->wasmsimd_x86.
size_t xnn_init_qs8_lrelu_wasmsimd_x86_params(
  union xnn_qs8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t base = (int16_t) neg_mul;
  const int16_t diff = (int16_t) pos_mul ^ (int16_t) neg_mul;
  const int16_t in_zp = (int16_t) input_zero_point;
  const int16_t out_zp = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 4; lane++) {
    params->wasmsimd_x86.input_zero_point[lane] = in_zp;
    params->wasmsimd_x86.multiplier_diff[lane] = diff;
    params->wasmsimd_x86.multiplier_base[lane] = base;
    params->wasmsimd_x86.output_zero_point[lane] = out_zp;
  }
  return sizeof params->wasmsimd_x86;
}
3738 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
3739 
// Populates params->scalar_select for the QU8 lrelu kernels: the input zero
// point widened to int32_t, both scales multiplied by 256 and rounded with
// lrintf(), and bias = (output_zero_point << 8) + 0x80. Scale ranges are
// checked with assert() only.
// Returns the size in bytes of params->scalar_select.
size_t xnn_init_qu8_lrelu_scalar_select_params(
  union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * 256.0f);
  assert(pos_mul >= 1L);
  assert(pos_mul <= 32768L);
  const long neg_mul = lrintf(negative_scale * 256.0f);
  assert(neg_mul <= 32768L);
  assert(neg_mul >= -32767L);
  assert(neg_mul != 0L);

  const int32_t shifted_output_zp = (int32_t) output_zero_point << 8;
  params->scalar_select.bias = shifted_output_zp + INT32_C(0x80);
  params->scalar_select.negative_multiplier = (int32_t) neg_mul;
  params->scalar_select.positive_multiplier = (int32_t) pos_mul;
  params->scalar_select.input_zero_point = (int32_t) input_zero_point;
  return sizeof params->scalar_select;
}
3766 
// Populates params->scalar_andxor for the QU8 lrelu kernels. Scales are
// multiplied by 256 and rounded with lrintf(); multiplier_base is the
// positive multiplier, multiplier_diff is the XOR of the two multipliers, and
// bias = (output_zero_point << 8) + 0x80. Scale ranges are checked with
// assert() only.
// Returns the size in bytes of params->scalar_andxor.
size_t xnn_init_qu8_lrelu_scalar_andxor_params(
  union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * 256.0f);
  assert(pos_mul >= 1L);
  assert(pos_mul <= 32768L);
  const long neg_mul = lrintf(negative_scale * 256.0f);
  assert(neg_mul <= 32768L);
  assert(neg_mul >= -32767L);
  assert(neg_mul != 0L);

  const int32_t base = (int32_t) pos_mul;
  const int32_t diff = base ^ (int32_t) neg_mul;
  const int32_t shifted_output_zp = (int32_t) output_zero_point << 8;
  params->scalar_andxor.input_zero_point = (int32_t) input_zero_point;
  params->scalar_andxor.multiplier_base = base;
  params->scalar_andxor.multiplier_diff = diff;
  params->scalar_andxor.bias = shifted_output_zp + INT32_C(0x80);
  return sizeof params->scalar_andxor;
}
3793 
3794 #if XNN_ARCH_ARM
// Populates params->armsimd32 for the QU8 lrelu kernels. Scales are
// multiplied by -256 and rounded with lrintf(). The input zero point and the
// 16-bit truncation of each multiplier are duplicated into both 16-bit halves
// of a 32-bit word; bias = (output_zero_point << 8) + 0x80. Scale ranges are
// checked with assert() only.
// Returns the size in bytes of params->armsimd32.
size_t xnn_init_qu8_lrelu_armsimd32_params(
  union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  // Each value below fits in 16 bits, so OR-ing it with itself shifted left
  // by 16 equals multiplying it by 0x00010001.
  const uint32_t in_zp = (uint32_t) input_zero_point;
  const uint32_t pos_mul16 = (uint32_t) (uint16_t) (int16_t) pos_mul;
  const uint32_t neg_mul16 = (uint32_t) (uint16_t) (int16_t) neg_mul;
  params->armsimd32.input_zero_point = in_zp | (in_zp << 16);
  params->armsimd32.positive_multiplier = pos_mul16 | (pos_mul16 << 16);
  params->armsimd32.negative_multiplier = neg_mul16 | (neg_mul16 << 16);

  const int32_t shifted_output_zp = (int32_t) output_zero_point << 8;
  params->armsimd32.bias = shifted_output_zp + INT32_C(0x80);
  return sizeof params->armsimd32;
}
3821 #endif  // XNN_ARCH_ARM
3822 
3823 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates params->neon for the QU8 lrelu kernels: the input zero point cast
// to uint16_t, the output zero point cast to int16_t, and both scales
// multiplied by -256, rounded with lrintf() and cast to int16_t. Scale ranges
// are checked with assert() only.
// Returns the size in bytes of params->neon.
size_t xnn_init_qu8_lrelu_neon_params(
  union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  params->neon.output_zero_point = (int16_t) output_zero_point;
  params->neon.negative_multiplier = (int16_t) neg_mul;
  params->neon.positive_multiplier = (int16_t) pos_mul;
  params->neon.input_zero_point = (uint16_t) input_zero_point;
  return sizeof params->neon;
}
3850 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
3851 
3852 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates params->sse2 for the QU8 lrelu kernels, writing elements 0..7 of
// each array field. Zero points are zero-extended to 16 bits and cast to
// int16_t. Scales are multiplied by -256 and rounded with lrintf();
// multiplier_base is the negative multiplier and multiplier_diff is the XOR
// of the positive and negative multipliers. Scale ranges are checked with
// assert() only.
// Returns the size in bytes of params->sse2.
size_t xnn_init_qu8_lrelu_sse2_params(
  union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t base = (int16_t) neg_mul;
  const int16_t diff = (int16_t) pos_mul ^ (int16_t) neg_mul;
  const int16_t in_zp = (int16_t) (uint16_t) input_zero_point;
  const int16_t out_zp = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse2.input_zero_point[lane] = in_zp;
    params->sse2.multiplier_diff[lane] = diff;
    params->sse2.multiplier_base[lane] = base;
    params->sse2.output_zero_point[lane] = out_zp;
  }
  return sizeof params->sse2;
}
3883 
// Populates params->avx for the QU8 lrelu kernels, writing elements 0..7 of
// each array field: zero points zero-extended to 16 bits and cast to int16_t,
// and scales multiplied by -256, rounded with lrintf() and cast to int16_t.
// Scale ranges are checked with assert() only.
// Returns the size in bytes of params->avx.
size_t xnn_init_qu8_lrelu_avx_params(
  union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t in_zp = (int16_t) (uint16_t) input_zero_point;
  const int16_t out_zp = (int16_t) (uint16_t) output_zero_point;
  const int16_t pos_mul16 = (int16_t) pos_mul;
  const int16_t neg_mul16 = (int16_t) neg_mul;
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.input_zero_point[lane] = in_zp;
    params->avx.positive_multiplier[lane] = pos_mul16;
    params->avx.negative_multiplier[lane] = neg_mul16;
    params->avx.output_zero_point[lane] = out_zp;
  }
  return sizeof params->avx;
}
3912 
// Populates params->avx2 for the QU8 lrelu kernels, writing elements 0..15 of
// each array field: zero points zero-extended to 16 bits and cast to int16_t,
// and scales multiplied by -256, rounded with lrintf() and cast to int16_t.
// Scale ranges are checked with assert() only.
// Returns the size in bytes of params->avx2.
size_t xnn_init_qu8_lrelu_avx2_params(
  union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t in_zp = (int16_t) (uint16_t) input_zero_point;
  const int16_t out_zp = (int16_t) (uint16_t) output_zero_point;
  const int16_t pos_mul16 = (int16_t) pos_mul;
  const int16_t neg_mul16 = (int16_t) neg_mul;
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx2.input_zero_point[lane] = in_zp;
    params->avx2.positive_multiplier[lane] = pos_mul16;
    params->avx2.negative_multiplier[lane] = neg_mul16;
    params->avx2.output_zero_point[lane] = out_zp;
  }
  return sizeof params->avx2;
}
3941 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
3942 
3943 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates params->wasmsimd_arm for the QU8 lrelu kernels, writing elements
// 0..3 of each array field: zero points zero-extended to 16 bits and cast to
// int16_t, and scales multiplied by -256, rounded with lrintf() and cast to
// int16_t. Scale ranges are checked with assert() only.
// Returns the size in bytes of params->wasmsimd_arm.
size_t xnn_init_qu8_lrelu_wasmsimd_arm_params(
  union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t in_zp = (int16_t) (uint16_t) input_zero_point;
  const int16_t out_zp = (int16_t) (uint16_t) output_zero_point;
  const int16_t pos_mul16 = (int16_t) pos_mul;
  const int16_t neg_mul16 = (int16_t) neg_mul;
  for (size_t lane = 0; lane < 4; lane++) {
    params->wasmsimd_arm.input_zero_point[lane] = in_zp;
    params->wasmsimd_arm.positive_multiplier[lane] = pos_mul16;
    params->wasmsimd_arm.negative_multiplier[lane] = neg_mul16;
    params->wasmsimd_arm.output_zero_point[lane] = out_zp;
  }
  return sizeof params->wasmsimd_arm;
}
3972 
// Populates params->wasmsimd_x86 for the QU8 lrelu kernels, writing elements
// 0..3 of each array field. Zero points are zero-extended to 16 bits and cast
// to int16_t. Scales are multiplied by -256 and rounded with lrintf();
// multiplier_base is the negative multiplier and multiplier_diff is the XOR
// of the positive and negative multipliers. Scale ranges are checked with
// assert() only.
// Returns the size in bytes of params->wasmsimd_x86.
size_t xnn_init_qu8_lrelu_wasmsimd_x86_params(
  union xnn_qu8_lrelu_params params[XNN_MIN_ELEMENTS(1)],
  float positive_scale,
  float negative_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(positive_scale >= 0x1.0p-8f);
  assert(positive_scale <= 0x1.0p+7f);
  assert(negative_scale <= 0x1.0p+7f);
  assert(negative_scale >= -0x1.FFFC00p+6f);
  assert(fabsf(negative_scale) >= 0x1.0p-8f);

  const long pos_mul = lrintf(positive_scale * -256.0f);
  assert(pos_mul <= -1L);
  assert(pos_mul >= -32768L);
  const long neg_mul = lrintf(negative_scale * -256.0f);
  assert(neg_mul >= -32768L);
  assert(neg_mul <= 32767L);
  assert(neg_mul != 0L);

  const int16_t base = (int16_t) neg_mul;
  const int16_t diff = (int16_t) pos_mul ^ (int16_t) neg_mul;
  const int16_t in_zp = (int16_t) (uint16_t) input_zero_point;
  const int16_t out_zp = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 4; lane++) {
    params->wasmsimd_x86.input_zero_point[lane] = in_zp;
    params->wasmsimd_x86.multiplier_diff[lane] = diff;
    params->wasmsimd_x86.multiplier_base[lane] = base;
    params->wasmsimd_x86.output_zero_point[lane] = out_zp;
  }
  return sizeof params->wasmsimd_x86;
}
4003 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4004 
4005 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the avx parameters for the F32 square root microkernels.
//
// Writes the first 14 entries of mask_table: entries 0-6 are set to -1, entries 7-13 are set to 0.
// Returns the size, in bytes, of the avx parameters structure.
size_t xnn_init_f32_sqrt_avx_params(
  union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t entry = 0; entry < 14; entry++) {
    params->avx.mask_table[entry] = (entry < 7) ? -1 : 0;
  }
  return sizeof(params->avx);
}
4017 
// Initializes the fma parameters for the F32 square root microkernels.
//
// Writes 0.5f into the first 8 entries of half, and fills the first 14 entries of mask_table:
// entries 0-6 are set to -1, entries 7-13 are set to 0.
// Returns the size, in bytes, of the fma parameters structure.
size_t xnn_init_f32_sqrt_fma_params(
  union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (uint32_t lane = 0; lane < 8; lane++) {
    params->fma.half[lane] = 0.5f;
  }
  for (uint32_t entry = 0; entry < 14; entry++) {
    params->fma.mask_table[entry] = (entry < 7) ? -1 : 0;
  }
  return sizeof(params->fma);
}
4032 
// Initializes the avx512 parameters for the F32 square root microkernels.
//
// The only field written is the constant one half.
// Returns the size, in bytes, of the avx512 parameters structure.
size_t xnn_init_f32_sqrt_avx512_params(
  union xnn_f32_sqrt_params params[XNN_MIN_ELEMENTS(1)])
{
  const float one_half = 0.5f;
  params->avx512.half = one_half;
  return sizeof(params->avx512);
}
4039 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
4040 
// Initializes the F32 CHW parameters for the variant selected at build time:
// sse on x86/x86-64, neon on ARM/ARM64, and scalar on everything else.
//
// Stores the output clamping bounds and three 4-entry masks derived from width:
//   - mask[k]      is all-ones when ((width - 1) & 3) >= k
//   - mask_even[k] is all-ones when ((width - 1) & 7) >= 2 * k
//   - mask_odd[k]  is all-ones when ((width - 1) & 7) >= 2 * k + 1
// and all-zeros otherwise. Entry 0 of mask and of mask_even is therefore always all-ones.
//
// Returns the size, in bytes, of the parameters structure that was filled in.
size_t xnn_init_f32_chw_params(
  union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
  uint32_t width,
  float output_min,
  float output_max)
{
  // Position of the last column within a group of 4 and within a group of 8 columns.
  const uint32_t tail4 = (width - 1) & 3;
  const uint32_t tail8 = (width - 1) & 7;

  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (uint32_t k = 0; k < 4; k++) {
      params->sse.min[k] = output_min;
      params->sse.max[k] = output_max;
      // Negating an unsigned 0/1 yields 0 or 0xFFFFFFFF.
      params->sse.mask[k] = -(uint32_t) (tail4 >= k);
      params->sse.mask_even[k] = -(uint32_t) (tail8 >= 2 * k);
      params->sse.mask_odd[k] = -(uint32_t) (tail8 >= 2 * k + 1);
    }
    return sizeof(params->sse);
  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
    params->neon.min = output_min;
    params->neon.max = output_max;
    for (uint32_t k = 0; k < 4; k++) {
      // Negating an unsigned 0/1 yields 0 or 0xFFFFFFFF.
      params->neon.mask[k] = -(uint32_t) (tail4 >= k);
      params->neon.mask_even[k] = -(uint32_t) (tail8 >= 2 * k);
      params->neon.mask_odd[k] = -(uint32_t) (tail8 >= 2 * k + 1);
    }
    return sizeof(params->neon);
  #else
    params->scalar.min = output_min;
    params->scalar.max = output_max;
    for (uint32_t k = 0; k < 4; k++) {
      // Negating an unsigned 0/1 yields 0 or 0xFFFFFFFF.
      params->scalar.mask[k] = -(uint32_t) (tail4 >= k);
      params->scalar.mask_even[k] = -(uint32_t) (tail8 >= 2 * k);
      params->scalar.mask_odd[k] = -(uint32_t) (tail8 >= 2 * k + 1);
    }
    return sizeof(params->scalar);
  #endif
}
4111 
// Initializes the neonfp16arith parameters for the F16 CHW microkernels.
//
// On ARM/ARM64 builds, stores output_min/output_max unchanged and fills four masks derived from width:
//   - mask[k]      (k = 0..3) is 0xFFFF when ((width - 1) & 3) >= k
//   - maskx8[k]    (k = 0..7) is 0xFFFF when ((width - 1) & 7) >= k
//   - mask_even[k] (k = 0..3) is 0xFFFF when ((width - 1) & 7) >= 2 * k
//   - mask_odd[k]  (k = 0..3) is 0xFFFF when ((width - 1) & 7) >= 2 * k + 1
// and 0 otherwise. Returns the size, in bytes, of the neonfp16arith parameters structure.
//
// On all other builds nothing is written and 0 is returned.
size_t xnn_init_f16_chw_params(
  union xnn_f16_chw_params params[XNN_MIN_ELEMENTS(1)],
  uint32_t width,
  uint16_t output_min,
  uint16_t output_max)
{
  #if XNN_ARCH_ARM || XNN_ARCH_ARM64
    params->neonfp16arith.min = output_min;
    params->neonfp16arith.max = output_max;

    // Position of the last column within a group of 4 and within a group of 8 columns.
    const uint32_t tail4 = (width - 1) & 3;
    const uint32_t tail8 = (width - 1) & 7;

    for (uint32_t k = 0; k < 4; k++) {
      params->neonfp16arith.mask[k] = -(uint16_t) (tail4 >= k);
      params->neonfp16arith.mask_even[k] = -(uint16_t) (tail8 >= 2 * k);
      params->neonfp16arith.mask_odd[k] = -(uint16_t) (tail8 >= 2 * k + 1);
    }
    for (uint32_t k = 0; k < 8; k++) {
      params->neonfp16arith.maskx8[k] = -(uint16_t) (tail8 >= k);
    }
    return sizeof(params->neonfp16arith);
  #else
    return 0;
  #endif
}
4151 
xnn_update_f32_chw_params(union xnn_f32_chw_params * params,uint32_t width)4152 void xnn_update_f32_chw_params(
4153   union xnn_f32_chw_params* params,
4154   uint32_t width)
4155 {
4156   #if XNN_ARCH_X86 || XNN_ARCH_X86_64
4157     const uint32_t w4 = (width - 1) & 3;
4158     params->sse.mask[0] = UINT32_C(0xFFFFFFFF);
4159     params->sse.mask[1] = -(uint32_t) (w4 >= 1);
4160     params->sse.mask[2] = -(uint32_t) (w4 >= 2);
4161     params->sse.mask[3] = -(uint32_t) (w4 >= 3);
4162 
4163     const uint32_t w8 = (width - 1) & 7;
4164     params->sse.mask_even[0] = UINT32_C(0xFFFFFFFF);
4165     params->sse.mask_even[1] = -(uint32_t) (w8 >= 2);
4166     params->sse.mask_even[2] = -(uint32_t) (w8 >= 4);
4167     params->sse.mask_even[3] = -(uint32_t) (w8 >= 6);
4168     params->sse.mask_odd[0] = -(uint32_t) (w8 >= 1);
4169     params->sse.mask_odd[1] = -(uint32_t) (w8 >= 3);
4170     params->sse.mask_odd[2] = -(uint32_t) (w8 >= 5);
4171     params->sse.mask_odd[3] = -(uint32_t) (w8 >= 7);
4172   #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
4173     const uint32_t w4 = (width - 1) & 3;
4174     params->neon.mask[0] = UINT32_C(0xFFFFFFFF);
4175     params->neon.mask[1] = -(uint32_t) (w4 >= 1);
4176     params->neon.mask[2] = -(uint32_t) (w4 >= 2);
4177     params->neon.mask[3] = -(uint32_t) (w4 >= 3);
4178 
4179     const uint32_t w8 = (width - 1) & 7;
4180     params->neon.mask_even[0] = UINT32_C(0xFFFFFFFF);
4181     params->neon.mask_even[1] = -(uint32_t) (w8 >= 2);
4182     params->neon.mask_even[2] = -(uint32_t) (w8 >= 4);
4183     params->neon.mask_even[3] = -(uint32_t) (w8 >= 6);
4184     params->neon.mask_odd[0] = -(uint32_t) (w8 >= 1);
4185     params->neon.mask_odd[1] = -(uint32_t) (w8 >= 3);
4186     params->neon.mask_odd[2] = -(uint32_t) (w8 >= 5);
4187     params->neon.mask_odd[3] = -(uint32_t) (w8 >= 7);
4188   #else
4189     const uint32_t w4 = (width - 1) & 3;
4190     params->scalar.mask[0] = UINT32_C(0xFFFFFFFF);
4191     params->scalar.mask[1] = -(uint32_t) (w4 >= 1);
4192     params->scalar.mask[2] = -(uint32_t) (w4 >= 2);
4193     params->scalar.mask[3] = -(uint32_t) (w4 >= 3);
4194 
4195     const uint32_t w8 = (width - 1) & 7;
4196     params->scalar.mask_even[0] = UINT32_C(0xFFFFFFFF);
4197     params->scalar.mask_even[1] = -(uint32_t) (w8 >= 2);
4198     params->scalar.mask_even[2] = -(uint32_t) (w8 >= 4);
4199     params->scalar.mask_even[3] = -(uint32_t) (w8 >= 6);
4200     params->scalar.mask_odd[0] = -(uint32_t) (w8 >= 1);
4201     params->scalar.mask_odd[1] = -(uint32_t) (w8 >= 3);
4202     params->scalar.mask_odd[2] = -(uint32_t) (w8 >= 5);
4203     params->scalar.mask_odd[3] = -(uint32_t) (w8 >= 7);
4204   #endif
4205 }
4206 
// Initializes the scalar variant of the F32 CHW parameters, regardless of the build architecture.
//
// Stores the output clamping bounds and, for k = 0..3, the width-derived masks:
//   - mask[k]      is all-ones when ((width - 1) & 3) >= k
//   - mask_even[k] is all-ones when ((width - 1) & 7) >= 2 * k
//   - mask_odd[k]  is all-ones when ((width - 1) & 7) >= 2 * k + 1
// and all-zeros otherwise.
//
// Returns the size, in bytes, of the scalar parameters structure.
size_t xnn_init_scalar_f32_chw_params(
  union xnn_f32_chw_params params[XNN_MIN_ELEMENTS(1)],
  uint32_t width,
  float output_min,
  float output_max)
{
  params->scalar.min = output_min;
  params->scalar.max = output_max;

  // Position of the last column within a group of 4 and within a group of 8 columns.
  const uint32_t tail4 = (width - 1) & 3;
  const uint32_t tail8 = (width - 1) & 7;
  for (uint32_t k = 0; k < 4; k++) {
    // Negating an unsigned 0/1 yields 0 or 0xFFFFFFFF.
    params->scalar.mask[k] = -(uint32_t) (tail4 >= k);
    params->scalar.mask_even[k] = -(uint32_t) (tail8 >= 2 * k);
    params->scalar.mask_odd[k] = -(uint32_t) (tail8 >= 2 * k + 1);
  }
  return sizeof(params->scalar);
}
4233 
4234 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the sse2 parameters for the S8 min/max (clamp) microkernels.
//
// Stores the constant 0x80 together with both bounds XOR-ed with 0x80, each replicated 16 times.
// XOR with 0x80 flips the top bit of a byte, which maps the signed range [-128, 127] onto [0, 255]
// while preserving order.
//
// Requires output_min < output_max. Returns the size, in bytes, of the sse2 parameters structure.
size_t xnn_init_s8_minmax_sse2_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  const uint8_t sign_flip = UINT8_C(0x80);
  const uint8_t biased_min = (uint8_t) (sign_flip ^ (uint8_t) output_min);
  const uint8_t biased_max = (uint8_t) (sign_flip ^ (uint8_t) output_max);
  for (size_t lane = 0; lane < 16; lane++) {
    params->sse2.bias[lane] = sign_flip;
    params->sse2.min_with_bias[lane] = biased_min;
    params->sse2.max_with_bias[lane] = biased_max;
  }
  return sizeof(params->sse2);
}
4251 
// Initializes the sse4 parameters for the S8 min/max (clamp) microkernels.
//
// Both bounds are stored unchanged, each replicated 16 times.
// Requires output_min < output_max. Returns the size, in bytes, of the sse4 parameters structure.
size_t xnn_init_s8_minmax_sse4_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse4.max[lane] = output_max;
    params->sse4.min[lane] = output_min;
  }
  return sizeof(params->sse4);
}
4265 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
4266 
4267 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the neon parameters for the S8 min/max (clamp) microkernels.
//
// Both bounds are stored unchanged as single values.
// Requires output_min < output_max. Returns the size, in bytes, of the neon parameters structure.
size_t xnn_init_s8_minmax_neon_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  params->neon.max = output_max;
  params->neon.min = output_min;
  return sizeof(params->neon);
}
4279 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
4280 
4281 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Initializes the wasmsimd parameters for the S8 min/max (clamp) microkernels.
//
// Both bounds are stored unchanged, each replicated 8 times.
// Requires output_min < output_max. Returns the size, in bytes, of the wasmsimd parameters structure.
size_t xnn_init_s8_minmax_wasmsimd_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  for (size_t lane = 0; lane < 8; lane++) {
    params->wasmsimd.max[lane] = output_max;
    params->wasmsimd.min[lane] = output_min;
  }
  return sizeof(params->wasmsimd);
}
4295 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4296 
// Initializes the scalar parameters for the S8 min/max (clamp) microkernels.
//
// Both bounds are sign-extended to int32_t before being stored.
// Requires output_min < output_max. Returns the size, in bytes, of the scalar parameters structure.
size_t xnn_init_s8_minmax_scalar_params(
  union xnn_s8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t output_min,
  int8_t output_max)
{
  assert(output_min < output_max);

  const int32_t widened_min = (int32_t) output_min;
  const int32_t widened_max = (int32_t) output_max;
  params->scalar.min = widened_min;
  params->scalar.max = widened_max;
  return sizeof(params->scalar);
}
4308 
// Initializes the U8 min/max (clamp) parameters for the variant selected at build time:
//   - x86/x86-64: sse2, both bounds replicated 16 times
//   - ARM/ARM64:  neon, both bounds stored as single values
//   - otherwise:  scalar, both bounds zero-extended to uint32_t
//
// Requires output_min < output_max.
// Returns the size, in bytes, of the parameters structure that was filled in.
size_t xnn_init_u8_minmax_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  #if XNN_ARCH_X86 || XNN_ARCH_X86_64
    for (size_t lane = 0; lane < 16; lane++) {
      params->sse2.max[lane] = output_max;
      params->sse2.min[lane] = output_min;
    }
    return sizeof(params->sse2);
  #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
    params->neon.max = output_max;
    params->neon.min = output_min;
    return sizeof(params->neon);
  #else
    params->scalar.max = (uint32_t) output_max;
    params->scalar.min = (uint32_t) output_min;
    return sizeof(params->scalar);
  #endif
}
4332 
4333 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the sse2 parameters for the U8 min/max (clamp) microkernels.
//
// Both bounds are stored unchanged, each replicated 16 times.
// Requires output_min < output_max. Returns the size, in bytes, of the sse2 parameters structure.
size_t xnn_init_u8_minmax_sse2_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse2.max[lane] = output_max;
    params->sse2.min[lane] = output_min;
  }
  return sizeof(params->sse2);
}
4347 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
4348 
4349 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Initializes the wasmsimd parameters for the U8 min/max (clamp) microkernels.
//
// Both bounds are stored unchanged, each replicated 8 times.
// Requires output_min < output_max. Returns the size, in bytes, of the wasmsimd parameters structure.
size_t xnn_init_u8_minmax_wasmsimd_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  for (size_t lane = 0; lane < 8; lane++) {
    params->wasmsimd.max[lane] = output_max;
    params->wasmsimd.min[lane] = output_min;
  }
  return sizeof(params->wasmsimd);
}
4363 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4364 
4365 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the neon parameters for the U8 min/max (clamp) microkernels.
//
// Both bounds are stored unchanged as single values.
// Requires output_min < output_max. Returns the size, in bytes, of the neon parameters structure.
size_t xnn_init_u8_minmax_neon_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  params->neon.max = output_max;
  params->neon.min = output_min;
  return sizeof(params->neon);
}
4377 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
4378 
// Initializes the scalar parameters for the U8 min/max (clamp) microkernels.
//
// Both bounds are zero-extended to uint32_t before being stored.
// Requires output_min < output_max. Returns the size, in bytes, of the scalar parameters structure.
size_t xnn_init_u8_minmax_scalar_params(
  union xnn_u8_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t output_min,
  uint8_t output_max)
{
  assert(output_min < output_max);

  const uint32_t widened_min = (uint32_t) output_min;
  const uint32_t widened_max = (uint32_t) output_max;
  params->scalar.min = widened_min;
  params->scalar.max = widened_max;
  return sizeof(params->scalar);
}
4390 
4391 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Initializes the sse2 parameters for the QU8 addition microkernels with output clamping.
//
// The two input-to-output scale ratios are converted into signed fixed-point multipliers that share one
// right shift, chosen from the binary exponent of the larger scale magnitude so that the larger multiplier
// has magnitude of at least 2**20. The bias folds in the rounding term (1 << (shift - 1)) and the
// contribution of both input zero points.
//
// Both scale magnitudes must lie in [2**-10, 2**8).
// Returns the size, in bytes, of the sse2 parameters structure.
size_t xnn_init_qu8_add_minmax_sse2_params(
  union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger magnitude decides the shared shift.
  const float largest_magnitude = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(largest_magnitude >= 0x1.0p-10f);
  assert(largest_magnitude < 0x1.0p+8f);
  // Exponent field (bits 23 and up) minus the bias of 127.
  // NOTE(review): assumes float_as_uint32 returns the IEEE 754 bit pattern - it is declared outside this file.
  const int32_t largest_exponent = (int32_t) (float_as_uint32(largest_magnitude) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - largest_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding (shift << 23) to the bit pattern raises the exponent field by shift, then the result is rounded
  // to an integer with lrintf(). Multipliers are at most 2**21 in magnitude.
  const uint32_t exponent_adjustment = shift << 23;
  const float a_scaled_up = uint32_as_float(float_as_uint32(a_scale_magnitude) + exponent_adjustment);
  const float b_scaled_up = uint32_as_float(float_as_uint32(b_scale_magnitude) + exponent_adjustment);
  const int32_t a_multiplier_magnitude = (int32_t) lrintf(a_scaled_up);
  const int32_t b_multiplier_magnitude = (int32_t) lrintf(b_scaled_up);
  assert(math_max_s32(a_multiplier_magnitude, b_multiplier_magnitude) >= INT32_C(0x00100000));
  assert(a_multiplier_magnitude <= INT32_C(0x00200000));
  assert(b_multiplier_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_multiplier_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_magnitude;
  }
  int32_t b_multiplier = b_multiplier_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  int32_t bias = INT32_C(1) << (shift - 1);
  bias -= a_multiplier * (int32_t) a_zero_point;
  bias -= b_multiplier * (int32_t) b_zero_point;
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2.bias[lane] = bias;
  }

  // Each 32-bit multiplier is stored as separate low and high 16-bit halves.
  const uint16_t a_low_half = (uint16_t) a_multiplier;
  const uint16_t a_high_half = (uint16_t) ((uint32_t) a_multiplier >> 16);
  const uint16_t b_low_half = (uint16_t) b_multiplier;
  const uint16_t b_high_half = (uint16_t) ((uint32_t) b_multiplier >> 16);
  const int16_t output_zp = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse2.a_multiplier_lo[lane] = a_low_half;
    params->sse2.a_multiplier_hi[lane] = a_high_half;
    params->sse2.b_multiplier_lo[lane] = b_low_half;
    params->sse2.b_multiplier_hi[lane] = b_high_half;
    params->sse2.output_zero_point[lane] = output_zp;
  }
  params->sse2.shift = shift;
  params->sse2.b_multiplier = (uint32_t) b_multiplier;

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse2.output_min[lane] = output_min;
    params->sse2.output_max[lane] = output_max;
  }
  return sizeof(params->sse2);
}
4457 
// Initializes the sse4 parameters for the QU8 addition microkernels with output clamping.
//
// The two input-to-output scale ratios are converted into signed fixed-point multipliers that share one
// right shift, chosen from the binary exponent of the larger scale magnitude so that the larger multiplier
// has magnitude of at least 2**20. The bias folds in the rounding term (1 << (shift - 1)) and the
// contribution of both input zero points.
//
// Both scale magnitudes must lie in [2**-10, 2**8).
// Returns the size, in bytes, of the sse4 parameters structure.
size_t xnn_init_qu8_add_minmax_sse4_params(
  union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger magnitude decides the shared shift.
  const float largest_magnitude = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(largest_magnitude >= 0x1.0p-10f);
  assert(largest_magnitude < 0x1.0p+8f);
  // Exponent field (bits 23 and up) minus the bias of 127.
  // NOTE(review): assumes float_as_uint32 returns the IEEE 754 bit pattern - it is declared outside this file.
  const int32_t largest_exponent = (int32_t) (float_as_uint32(largest_magnitude) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - largest_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding (shift << 23) to the bit pattern raises the exponent field by shift, then the result is rounded
  // to an integer with lrintf(). Multipliers are at most 2**21 in magnitude.
  const uint32_t exponent_adjustment = shift << 23;
  const float a_scaled_up = uint32_as_float(float_as_uint32(a_scale_magnitude) + exponent_adjustment);
  const float b_scaled_up = uint32_as_float(float_as_uint32(b_scale_magnitude) + exponent_adjustment);
  const int32_t a_multiplier_magnitude = (int32_t) lrintf(a_scaled_up);
  const int32_t b_multiplier_magnitude = (int32_t) lrintf(b_scaled_up);
  assert(math_max_s32(a_multiplier_magnitude, b_multiplier_magnitude) >= INT32_C(0x00100000));
  assert(a_multiplier_magnitude <= INT32_C(0x00200000));
  assert(b_multiplier_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_multiplier_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_magnitude;
  }
  int32_t b_multiplier = b_multiplier_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  int32_t bias = INT32_C(1) << (shift - 1);
  bias -= a_multiplier * (int32_t) (uint32_t) a_zero_point;
  bias -= b_multiplier * (int32_t) (uint32_t) b_zero_point;

  for (size_t lane = 0; lane < 4; lane++) {
    params->sse4.bias[lane] = bias;
    params->sse4.a_multiplier[lane] = a_multiplier;
    params->sse4.b_multiplier[lane] = b_multiplier;
  }

  const uint64_t wide_shift = (uint64_t) shift;
  for (size_t lane = 0; lane < 2; lane++) {
    params->sse4.shift[lane] = wide_shift;
  }

  const int16_t output_zp = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse4.output_zero_point[lane] = output_zp;
  }

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse4.output_min[lane] = output_min;
    params->sse4.output_max[lane] = output_max;
  }
  return sizeof(params->sse4);
}
4516 
// Initializes the avx2 parameters for the QU8 addition microkernels with output clamping.
//
// The two input-to-output scale ratios are converted into signed fixed-point multipliers that share one
// right shift, chosen from the binary exponent of the larger scale magnitude so that the larger multiplier
// has magnitude of at least 2**20. The bias folds in the rounding term (1 << (shift - 1)) and the
// contribution of both input zero points.
//
// Both scale magnitudes must lie in [2**-10, 2**8).
// Returns the size, in bytes, of the avx2 parameters structure.
size_t xnn_init_qu8_add_minmax_avx2_params(
  union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger magnitude decides the shared shift.
  const float largest_magnitude = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(largest_magnitude >= 0x1.0p-10f);
  assert(largest_magnitude < 0x1.0p+8f);
  // Exponent field (bits 23 and up) minus the bias of 127.
  // NOTE(review): assumes float_as_uint32 returns the IEEE 754 bit pattern - it is declared outside this file.
  const int32_t largest_exponent = (int32_t) (float_as_uint32(largest_magnitude) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - largest_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding (shift << 23) to the bit pattern raises the exponent field by shift, then the result is rounded
  // to an integer with lrintf(). Multipliers are at most 2**21 in magnitude.
  const uint32_t exponent_adjustment = shift << 23;
  const float a_scaled_up = uint32_as_float(float_as_uint32(a_scale_magnitude) + exponent_adjustment);
  const float b_scaled_up = uint32_as_float(float_as_uint32(b_scale_magnitude) + exponent_adjustment);
  const int32_t a_multiplier_magnitude = (int32_t) lrintf(a_scaled_up);
  const int32_t b_multiplier_magnitude = (int32_t) lrintf(b_scaled_up);
  assert(math_max_s32(a_multiplier_magnitude, b_multiplier_magnitude) >= INT32_C(0x00100000));
  assert(a_multiplier_magnitude <= INT32_C(0x00200000));
  assert(b_multiplier_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_multiplier_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_magnitude;
  }
  int32_t b_multiplier = b_multiplier_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  int32_t bias = INT32_C(1) << (shift - 1);
  bias -= a_multiplier * (int32_t) (uint32_t) a_zero_point;
  bias -= b_multiplier * (int32_t) (uint32_t) b_zero_point;

  for (size_t lane = 0; lane < 8; lane++) {
    params->avx2.bias[lane] = bias;
    params->avx2.a_multiplier[lane] = a_multiplier;
    params->avx2.b_multiplier[lane] = b_multiplier;
  }

  const uint64_t wide_shift = (uint64_t) shift;
  for (size_t lane = 0; lane < 4; lane++) {
    params->avx2.shift[lane] = wide_shift;
  }

  const int16_t output_zp = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx2.output_zero_point[lane] = output_zp;
    params->avx2.output_min[lane] = output_min;
    params->avx2.output_max[lane] = output_max;
  }
  return sizeof(params->avx2);
}
4573 
// Initializes the avx512 parameters for the QU8 addition microkernels with output clamping.
//
// The two input-to-output scale ratios are converted into signed fixed-point multipliers that share one
// right shift, chosen from the binary exponent of the larger scale magnitude so that the larger multiplier
// has magnitude of at least 2**20. The bias folds in the rounding term (1 << (shift - 1)) and the
// contribution of both input zero points.
//
// Both scale magnitudes must lie in [2**-10, 2**8).
// Returns the size, in bytes, of the avx512 parameters structure.
size_t xnn_init_qu8_add_minmax_avx512_params(
  union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  const float a_scale_magnitude = fabsf(a_output_scale);
  const float b_scale_magnitude = fabsf(b_output_scale);
  assert(a_scale_magnitude >= 0x1.0p-10f);
  assert(b_scale_magnitude >= 0x1.0p-10f);
  assert(a_scale_magnitude < 0x1.0p+8f);
  assert(b_scale_magnitude < 0x1.0p+8f);

  // The larger magnitude decides the shared shift.
  const float largest_magnitude = math_max_f32(a_scale_magnitude, b_scale_magnitude);
  assert(largest_magnitude >= 0x1.0p-10f);
  assert(largest_magnitude < 0x1.0p+8f);
  // Exponent field (bits 23 and up) minus the bias of 127.
  // NOTE(review): assumes float_as_uint32 returns the IEEE 754 bit pattern - it is declared outside this file.
  const int32_t largest_exponent = (int32_t) (float_as_uint32(largest_magnitude) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - largest_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding (shift << 23) to the bit pattern raises the exponent field by shift, then the result is rounded
  // to an integer with lrintf(). Multipliers are at most 2**21 in magnitude.
  const uint32_t exponent_adjustment = shift << 23;
  const float a_scaled_up = uint32_as_float(float_as_uint32(a_scale_magnitude) + exponent_adjustment);
  const float b_scaled_up = uint32_as_float(float_as_uint32(b_scale_magnitude) + exponent_adjustment);
  const int32_t a_multiplier_magnitude = (int32_t) lrintf(a_scaled_up);
  const int32_t b_multiplier_magnitude = (int32_t) lrintf(b_scaled_up);
  assert(math_max_s32(a_multiplier_magnitude, b_multiplier_magnitude) >= INT32_C(0x00100000));
  assert(a_multiplier_magnitude <= INT32_C(0x00200000));
  assert(b_multiplier_magnitude <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  int32_t a_multiplier = a_multiplier_magnitude;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_magnitude;
  }
  int32_t b_multiplier = b_multiplier_magnitude;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_magnitude;
  }

  // bias = rounding - a_multiplier * a_zero_point - b_multiplier * b_zero_point
  int32_t bias = INT32_C(1) << (shift - 1);
  bias -= a_multiplier * (int32_t) (uint32_t) a_zero_point;
  bias -= b_multiplier * (int32_t) (uint32_t) b_zero_point;

  for (size_t lane = 0; lane < 16; lane++) {
    params->avx512.bias[lane] = bias;
    params->avx512.a_multiplier[lane] = a_multiplier;
    params->avx512.b_multiplier[lane] = b_multiplier;
  }

  const uint64_t wide_shift = (uint64_t) shift;
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx512.shift[lane] = wide_shift;
  }

  const int16_t output_zp = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 32; lane++) {
    params->avx512.output_zero_point[lane] = output_zp;
    params->avx512.output_min[lane] = output_min;
    params->avx512.output_max[lane] = output_max;
  }
  return sizeof(params->avx512);
}
4630 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
4631 
4632 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes the neon parameters for the QU8 addition microkernels with output clamping.
//
// The two input-to-output scale ratios are converted into signed fixed-point multipliers that share one
// shift, chosen from the binary exponent of the larger scale magnitude so that the larger multiplier has
// magnitude of at least 2**20. Unlike the x86 variants, no bias is precomputed: the input zero points are
// stored as-is, and the shift is stored negated in right_shift.
//
// Both scale magnitudes must lie in [2**-10, 2**8).
// Returns the size, in bytes, of the neon parameters structure.
size_t xnn_init_qu8_add_minmax_neon_params(
  union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // Compute requantization parameters.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);
  // Exponent field (bits 23 and up) minus the bias of 127.
  // NOTE(review): assumes float_as_uint32 returns the IEEE 754 bit pattern - it is declared outside this file.
  const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
  const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;

  // Shift is asserted to be in the [12, 30] range.
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding (shift << 23) to the bit pattern raises the exponent field by shift, then the result is rounded
  // to an integer with lrintf(). Multipliers are at most 2**21 in magnitude; the larger one is at least 2**20.
  const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
  const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // Re-apply the sign of each original scale.
  const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
  const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;

  params->neon.a_zero_point = a_zero_point;
  params->neon.b_zero_point = b_zero_point;
  params->neon.a_multiplier = a_multiplier;
  params->neon.b_multiplier = b_multiplier;
  // Convert to signed BEFORE negating: negating the unsigned shift would wrap to ~2**32, and converting
  // that out-of-range value to int32_t is implementation-defined.
  params->neon.right_shift = -(int32_t) shift;
  params->neon.output_zero_point = (int16_t) (uint16_t) output_zero_point;
  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
  return sizeof(params->neon);
}
4682 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
4683 
4684 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates params->wasmsimd for QU8 addition with min/max clamping.
//
// The float scales become signed fixed-point multipliers with a shared right
// shift; the rounding term and both zero-point contributions are folded into a
// single bias. Values are replicated across the array fields of
// params->wasmsimd.
//
// Asserts that |a_output_scale| and |b_output_scale| are in [2**-10, 2**8).
// Returns sizeof(params->wasmsimd).
size_t xnn_init_qu8_add_minmax_wasmsimd_params(
  union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  const float a_scale_mag = fabsf(a_output_scale);
  const float b_scale_mag = fabsf(b_output_scale);
  assert(a_scale_mag >= 0x1.0p-10f);
  assert(b_scale_mag >= 0x1.0p-10f);
  assert(a_scale_mag < 0x1.0p+8f);
  assert(b_scale_mag < 0x1.0p+8f);

  // The larger magnitude determines the shift shared by both multipliers.
  const float top_scale_mag = math_max_f32(a_scale_mag, b_scale_mag);
  assert(top_scale_mag >= 0x1.0p-10f);
  assert(top_scale_mag < 0x1.0p+8f);
  const int32_t top_exponent = (int32_t) (float_as_uint32(top_scale_mag) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - top_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Add shift to the exponent bits of each magnitude, then round to an integer.
  // The larger multiplier lands in [2**20, 2**21]; the other is no larger.
  const uint32_t exponent_bump = shift << 23;
  const int32_t a_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(a_scale_mag) + exponent_bump));
  const int32_t b_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(b_scale_mag) + exponent_bump));
  assert(math_max_s32(a_multiplier_mag, b_multiplier_mag) >= INT32_C(0x00100000));
  assert(a_multiplier_mag <= INT32_C(0x00200000));
  assert(b_multiplier_mag <= INT32_C(0x00200000));

  // Carry the sign of each scale over to its multiplier.
  int32_t a_multiplier = a_multiplier_mag;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_mag;
  }
  int32_t b_multiplier = b_multiplier_mag;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_mag;
  }

  // bias = rounding term minus each multiplier times its zero point.
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  params->wasmsimd.shift = shift;
  const int16_t zero_point_s16 = (int16_t) (uint16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    if (lane < 2) {
      params->wasmsimd.bias[lane] = bias;
      params->wasmsimd.a_multiplier[lane] = a_multiplier;
      params->wasmsimd.b_multiplier[lane] = b_multiplier;
    }
    if (lane < 4) {
      params->wasmsimd.output_zero_point[lane] = zero_point_s16;
    }
    params->wasmsimd.output_min[lane] = output_min;
    params->wasmsimd.output_max[lane] = output_max;
  }
  return sizeof(params->wasmsimd);
}
4741 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4742 
// Populates params->scalar for QU8 addition with min/max clamping.
//
// The float scales become signed fixed-point multipliers with a shared right
// shift; the rounding term and both zero-point contributions are folded into
// the stored bias. The output bounds are stored relative to the output zero
// point.
//
// Asserts that |a_output_scale| and |b_output_scale| are in [2**-10, 2**8).
// Returns sizeof(params->scalar).
size_t xnn_init_qu8_add_minmax_scalar_params(
  union xnn_qu8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  const float a_scale_mag = fabsf(a_output_scale);
  const float b_scale_mag = fabsf(b_output_scale);
  assert(a_scale_mag >= 0x1.0p-10f);
  assert(b_scale_mag >= 0x1.0p-10f);
  assert(a_scale_mag < 0x1.0p+8f);
  assert(b_scale_mag < 0x1.0p+8f);

  // The larger magnitude determines the shift shared by both multipliers.
  const float top_scale_mag = math_max_f32(a_scale_mag, b_scale_mag);
  assert(top_scale_mag >= 0x1.0p-10f);
  assert(top_scale_mag < 0x1.0p+8f);
  const int32_t top_exponent = (int32_t) (float_as_uint32(top_scale_mag) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - top_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Add shift to the exponent bits of each magnitude, then round to an integer.
  // The larger multiplier lands in [2**20, 2**21]; the other is no larger.
  const uint32_t exponent_bump = shift << 23;
  const int32_t a_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(a_scale_mag) + exponent_bump));
  const int32_t b_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(b_scale_mag) + exponent_bump));
  assert(math_max_s32(a_multiplier_mag, b_multiplier_mag) >= INT32_C(0x00100000));
  assert(a_multiplier_mag <= INT32_C(0x00200000));
  assert(b_multiplier_mag <= INT32_C(0x00200000));

  // Carry the sign of each scale over to its multiplier.
  int32_t a_multiplier = a_multiplier_mag;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_mag;
  }
  int32_t b_multiplier = b_multiplier_mag;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_mag;
  }

  // bias = rounding term minus each multiplier times its zero point.
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) (uint32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) (uint32_t) b_zero_point;
  const int32_t zero_point_s32 = (int32_t) (uint32_t) output_zero_point;

  params->scalar.bias = rounding - a_offset - b_offset;
  params->scalar.a_multiplier = a_multiplier;
  params->scalar.b_multiplier = b_multiplier;
  params->scalar.shift = shift;
  params->scalar.output_min_less_zero_point = (int32_t) (uint32_t) output_min - zero_point_s32;
  params->scalar.output_max_less_zero_point = (int32_t) (uint32_t) output_max - zero_point_s32;
  params->scalar.output_zero_point = zero_point_s32;
  return sizeof(params->scalar);
}
4792 
4793 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates params->sse2 for QS8 addition with min/max clamping.
//
// The float scales become signed fixed-point multipliers with a shared right
// shift; each multiplier is stored as separate low and high 16-bit halves, and
// the rounding term plus both zero-point contributions are folded into the
// bias. Values are replicated across the array fields of params->sse2.
//
// Asserts that |a_output_scale| and |b_output_scale| are in [2**-10, 2**8).
// Returns sizeof(params->sse2).
size_t xnn_init_qs8_add_minmax_sse2_params(
  union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float a_scale_mag = fabsf(a_output_scale);
  const float b_scale_mag = fabsf(b_output_scale);
  assert(a_scale_mag >= 0x1.0p-10f);
  assert(b_scale_mag >= 0x1.0p-10f);
  assert(a_scale_mag < 0x1.0p+8f);
  assert(b_scale_mag < 0x1.0p+8f);

  // The larger magnitude determines the shift shared by both multipliers.
  const float top_scale_mag = math_max_f32(a_scale_mag, b_scale_mag);
  assert(top_scale_mag >= 0x1.0p-10f);
  assert(top_scale_mag < 0x1.0p+8f);
  const int32_t top_exponent = (int32_t) (float_as_uint32(top_scale_mag) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - top_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Add shift to the exponent bits of each magnitude, then round to an integer.
  // The larger multiplier lands in [2**20, 2**21]; the other is no larger.
  const uint32_t exponent_bump = shift << 23;
  const int32_t a_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(a_scale_mag) + exponent_bump));
  const int32_t b_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(b_scale_mag) + exponent_bump));
  assert(math_max_s32(a_multiplier_mag, b_multiplier_mag) >= INT32_C(0x00100000));
  assert(a_multiplier_mag <= INT32_C(0x00200000));
  assert(b_multiplier_mag <= INT32_C(0x00200000));

  // Carry the sign of each scale over to its multiplier.
  int32_t a_multiplier = a_multiplier_mag;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_mag;
  }
  int32_t b_multiplier = b_multiplier_mag;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_mag;
  }

  // bias = rounding term minus each multiplier times its zero point.
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2.bias[lane] = bias;
  }

  // Split each multiplier's 32-bit pattern into 16-bit halves.
  const uint32_t a_multiplier_bits = (uint32_t) a_multiplier;
  const uint32_t b_multiplier_bits = (uint32_t) b_multiplier;
  const uint16_t a_lo = (uint16_t) a_multiplier_bits;
  const uint16_t a_hi = (uint16_t) (a_multiplier_bits >> 16);
  const uint16_t b_lo = (uint16_t) b_multiplier_bits;
  const uint16_t b_hi = (uint16_t) (b_multiplier_bits >> 16);

  params->sse2.shift = shift;
  params->sse2.b_multiplier = b_multiplier_bits;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse2.a_multiplier_lo[lane] = a_lo;
    params->sse2.a_multiplier_hi[lane] = a_hi;
    params->sse2.b_multiplier_lo[lane] = b_lo;
    params->sse2.b_multiplier_hi[lane] = b_hi;
    params->sse2.output_zero_point[lane] = (int16_t) output_zero_point;
    params->sse2.output_min[lane] = (int16_t) output_min;
    params->sse2.output_max[lane] = (int16_t) output_max;
  }
  return sizeof(params->sse2);
}
4857 
// Populates params->sse4_mul16 for QS8 addition with min/max clamping.
//
// The float scales become signed fixed-point multipliers with a shared right
// shift; each multiplier is stored as separate low and high 16-bit halves, and
// the rounding term plus both zero-point contributions are folded into the
// bias. Values are replicated across the array fields of params->sse4_mul16.
//
// Asserts that |a_output_scale| and |b_output_scale| are in [2**-10, 2**8).
// Returns sizeof(params->sse4_mul16).
size_t xnn_init_qs8_add_minmax_sse4_mul16_params(
  union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float a_scale_mag = fabsf(a_output_scale);
  const float b_scale_mag = fabsf(b_output_scale);
  assert(a_scale_mag >= 0x1.0p-10f);
  assert(b_scale_mag >= 0x1.0p-10f);
  assert(a_scale_mag < 0x1.0p+8f);
  assert(b_scale_mag < 0x1.0p+8f);

  // The larger magnitude determines the shift shared by both multipliers.
  const float top_scale_mag = math_max_f32(a_scale_mag, b_scale_mag);
  assert(top_scale_mag >= 0x1.0p-10f);
  assert(top_scale_mag < 0x1.0p+8f);
  const int32_t top_exponent = (int32_t) (float_as_uint32(top_scale_mag) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - top_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Add shift to the exponent bits of each magnitude, then round to an integer.
  // The larger multiplier lands in [2**20, 2**21]; the other is no larger.
  const uint32_t exponent_bump = shift << 23;
  const int32_t a_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(a_scale_mag) + exponent_bump));
  const int32_t b_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(b_scale_mag) + exponent_bump));
  assert(math_max_s32(a_multiplier_mag, b_multiplier_mag) >= INT32_C(0x00100000));
  assert(a_multiplier_mag <= INT32_C(0x00200000));
  assert(b_multiplier_mag <= INT32_C(0x00200000));

  // Carry the sign of each scale over to its multiplier.
  int32_t a_multiplier = a_multiplier_mag;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_mag;
  }
  int32_t b_multiplier = b_multiplier_mag;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_mag;
  }

  // bias = rounding term minus each multiplier times its zero point.
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse4_mul16.bias[lane] = bias;
  }

  // Split each multiplier's 32-bit pattern into 16-bit halves.
  const uint32_t a_multiplier_bits = (uint32_t) a_multiplier;
  const uint32_t b_multiplier_bits = (uint32_t) b_multiplier;
  const uint16_t a_lo = (uint16_t) a_multiplier_bits;
  const uint16_t a_hi = (uint16_t) (a_multiplier_bits >> 16);
  const uint16_t b_lo = (uint16_t) b_multiplier_bits;
  const uint16_t b_hi = (uint16_t) (b_multiplier_bits >> 16);

  params->sse4_mul16.shift = shift;
  params->sse4_mul16.b_multiplier = b_multiplier_bits;
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse4_mul16.a_multiplier_lo[lane] = a_lo;
    params->sse4_mul16.a_multiplier_hi[lane] = a_hi;
    params->sse4_mul16.b_multiplier_lo[lane] = b_lo;
    params->sse4_mul16.b_multiplier_hi[lane] = b_hi;
    params->sse4_mul16.output_zero_point[lane] = (int16_t) output_zero_point;
  }
  for (size_t lane = 0; lane < 16; lane++) {
    params->sse4_mul16.output_min[lane] = output_min;
    params->sse4_mul16.output_max[lane] = output_max;
  }
  return sizeof(params->sse4_mul16);
}
4923 
// Populates params->sse4_mul32 for QS8 addition with min/max clamping.
//
// The float scales become signed fixed-point multipliers with a shared right
// shift; the rounding term and both zero-point contributions are folded into a
// single bias. Values are replicated across the array fields of
// params->sse4_mul32.
//
// Asserts that |a_output_scale| and |b_output_scale| are in [2**-10, 2**8).
// Returns sizeof(params->sse4_mul32).
size_t xnn_init_qs8_add_minmax_sse4_mul32_params(
  union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float a_scale_mag = fabsf(a_output_scale);
  const float b_scale_mag = fabsf(b_output_scale);
  assert(a_scale_mag >= 0x1.0p-10f);
  assert(b_scale_mag >= 0x1.0p-10f);
  assert(a_scale_mag < 0x1.0p+8f);
  assert(b_scale_mag < 0x1.0p+8f);

  // The larger magnitude determines the shift shared by both multipliers.
  const float top_scale_mag = math_max_f32(a_scale_mag, b_scale_mag);
  assert(top_scale_mag >= 0x1.0p-10f);
  assert(top_scale_mag < 0x1.0p+8f);
  const int32_t top_exponent = (int32_t) (float_as_uint32(top_scale_mag) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - top_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Add shift to the exponent bits of each magnitude, then round to an integer.
  // The larger multiplier lands in [2**20, 2**21]; the other is no larger.
  const uint32_t exponent_bump = shift << 23;
  const int32_t a_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(a_scale_mag) + exponent_bump));
  const int32_t b_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(b_scale_mag) + exponent_bump));
  assert(math_max_s32(a_multiplier_mag, b_multiplier_mag) >= INT32_C(0x00100000));
  assert(a_multiplier_mag <= INT32_C(0x00200000));
  assert(b_multiplier_mag <= INT32_C(0x00200000));

  // Carry the sign of each scale over to its multiplier.
  int32_t a_multiplier = a_multiplier_mag;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_mag;
  }
  int32_t b_multiplier = b_multiplier_mag;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_mag;
  }

  // bias = rounding term minus each multiplier times its zero point.
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  const uint64_t shift_u64 = (uint64_t) shift;
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 16; lane++) {
    if (lane < 2) {
      params->sse4_mul32.shift[lane] = shift_u64;
    }
    if (lane < 4) {
      params->sse4_mul32.bias[lane] = bias;
      params->sse4_mul32.a_multiplier[lane] = a_multiplier;
      params->sse4_mul32.b_multiplier[lane] = b_multiplier;
    }
    if (lane < 8) {
      params->sse4_mul32.output_zero_point[lane] = zero_point_s16;
    }
    params->sse4_mul32.output_min[lane] = output_min;
    params->sse4_mul32.output_max[lane] = output_max;
  }
  return sizeof(params->sse4_mul32);
}
4982 
// Populates params->avx2 for QS8 addition with min/max clamping.
//
// The float scales become signed fixed-point multipliers with a shared right
// shift; the rounding term and both zero-point contributions are folded into a
// single bias. Values are replicated across the array fields of params->avx2.
//
// Asserts that |a_output_scale| and |b_output_scale| are in [2**-10, 2**8).
// Returns sizeof(params->avx2).
size_t xnn_init_qs8_add_minmax_avx2_params(
  union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float a_scale_mag = fabsf(a_output_scale);
  const float b_scale_mag = fabsf(b_output_scale);
  assert(a_scale_mag >= 0x1.0p-10f);
  assert(b_scale_mag >= 0x1.0p-10f);
  assert(a_scale_mag < 0x1.0p+8f);
  assert(b_scale_mag < 0x1.0p+8f);

  // The larger magnitude determines the shift shared by both multipliers.
  const float top_scale_mag = math_max_f32(a_scale_mag, b_scale_mag);
  assert(top_scale_mag >= 0x1.0p-10f);
  assert(top_scale_mag < 0x1.0p+8f);
  const int32_t top_exponent = (int32_t) (float_as_uint32(top_scale_mag) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - top_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Add shift to the exponent bits of each magnitude, then round to an integer.
  // The larger multiplier lands in [2**20, 2**21]; the other is no larger.
  const uint32_t exponent_bump = shift << 23;
  const int32_t a_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(a_scale_mag) + exponent_bump));
  const int32_t b_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(b_scale_mag) + exponent_bump));
  assert(math_max_s32(a_multiplier_mag, b_multiplier_mag) >= INT32_C(0x00100000));
  assert(a_multiplier_mag <= INT32_C(0x00200000));
  assert(b_multiplier_mag <= INT32_C(0x00200000));

  // Carry the sign of each scale over to its multiplier.
  int32_t a_multiplier = a_multiplier_mag;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_mag;
  }
  int32_t b_multiplier = b_multiplier_mag;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_mag;
  }

  // bias = rounding term minus each multiplier times its zero point.
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  const uint64_t shift_u64 = (uint64_t) shift;
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 16; lane++) {
    if (lane < 4) {
      params->avx2.shift[lane] = shift_u64;
    }
    if (lane < 8) {
      params->avx2.bias[lane] = bias;
      params->avx2.a_multiplier[lane] = a_multiplier;
      params->avx2.b_multiplier[lane] = b_multiplier;
    }
    params->avx2.output_zero_point[lane] = zero_point_s16;
    params->avx2.output_min[lane] = output_min;
    params->avx2.output_max[lane] = output_max;
  }
  return sizeof(params->avx2);
}
5039 
// Populates params->avx512 for QS8 addition with min/max clamping.
//
// The float scales become signed fixed-point multipliers with a shared right
// shift; the rounding term and both zero-point contributions are folded into a
// single bias. Values are replicated across the array fields of
// params->avx512.
//
// Asserts that |a_output_scale| and |b_output_scale| are in [2**-10, 2**8).
// Returns sizeof(params->avx512).
size_t xnn_init_qs8_add_minmax_avx512_params(
  union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float a_scale_mag = fabsf(a_output_scale);
  const float b_scale_mag = fabsf(b_output_scale);
  assert(a_scale_mag >= 0x1.0p-10f);
  assert(b_scale_mag >= 0x1.0p-10f);
  assert(a_scale_mag < 0x1.0p+8f);
  assert(b_scale_mag < 0x1.0p+8f);

  // The larger magnitude determines the shift shared by both multipliers.
  const float top_scale_mag = math_max_f32(a_scale_mag, b_scale_mag);
  assert(top_scale_mag >= 0x1.0p-10f);
  assert(top_scale_mag < 0x1.0p+8f);
  const int32_t top_exponent = (int32_t) (float_as_uint32(top_scale_mag) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - top_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Add shift to the exponent bits of each magnitude, then round to an integer.
  // The larger multiplier lands in [2**20, 2**21]; the other is no larger.
  const uint32_t exponent_bump = shift << 23;
  const int32_t a_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(a_scale_mag) + exponent_bump));
  const int32_t b_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(b_scale_mag) + exponent_bump));
  assert(math_max_s32(a_multiplier_mag, b_multiplier_mag) >= INT32_C(0x00100000));
  assert(a_multiplier_mag <= INT32_C(0x00200000));
  assert(b_multiplier_mag <= INT32_C(0x00200000));

  // Carry the sign of each scale over to its multiplier.
  int32_t a_multiplier = a_multiplier_mag;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_mag;
  }
  int32_t b_multiplier = b_multiplier_mag;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_mag;
  }

  // bias = rounding term minus each multiplier times its zero point.
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  const uint64_t shift_u64 = (uint64_t) shift;
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 32; lane++) {
    if (lane < 8) {
      params->avx512.shift[lane] = shift_u64;
    }
    if (lane < 16) {
      params->avx512.bias[lane] = bias;
      params->avx512.a_multiplier[lane] = a_multiplier;
      params->avx512.b_multiplier[lane] = b_multiplier;
    }
    params->avx512.output_zero_point[lane] = zero_point_s16;
    params->avx512.output_min[lane] = output_min;
    params->avx512.output_max[lane] = output_max;
  }
  return sizeof(params->avx512);
}
5096 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
5097 
5098 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Initializes params->neon for QS8 addition with min/max clamping.
//
// The two float scales are converted into signed fixed-point multipliers that
// share one right shift. The multipliers, the negated shift, the zero points
// and the output bounds are then stored in params->neon.
//
// Preconditions (checked with assert): |a_output_scale| and |b_output_scale|
// are both in [2**-10, 2**8).
//
// Returns sizeof(params->neon).
size_t xnn_init_qs8_add_minmax_neon_params(
  union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float abs_a_output_scale = fabsf(a_output_scale);
  const float abs_b_output_scale = fabsf(b_output_scale);
  assert(abs_a_output_scale >= 0x1.0p-10f);
  assert(abs_b_output_scale >= 0x1.0p-10f);
  assert(abs_a_output_scale < 0x1.0p+8f);
  assert(abs_b_output_scale < 0x1.0p+8f);

  // Compute requantization parameters from the unbiased exponent of the larger scale.
  const float max_abs_output_scale = math_max_f32(abs_a_output_scale, abs_b_output_scale);
  assert(max_abs_output_scale >= 0x1.0p-10f);
  assert(max_abs_output_scale < 0x1.0p+8f);
  const uint32_t max_scale_bits = float_as_uint32(max_abs_output_scale);
  const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;

  // Shift is in [12, 30] range.
  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - max_scale_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Adding (shift << 23) to the float bits scales each magnitude by 2**shift
  // (assuming float_as_uint32/uint32_as_float are IEEE-754 binary32 bit casts).
  // Multipliers are in [0, 2**21] range (rounding may reach 2**21 exactly),
  // largest multiplier is in [2**20, 2**21] range.
  const int32_t abs_a_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_a_output_scale) + (shift << 23)));
  const int32_t abs_b_multiplier = (int32_t) lrintf(uint32_as_float(float_as_uint32(abs_b_output_scale) + (shift << 23)));
  assert(math_max_s32(abs_a_multiplier, abs_b_multiplier) >= INT32_C(0x00100000));
  assert(abs_a_multiplier <= INT32_C(0x00200000));
  assert(abs_b_multiplier <= INT32_C(0x00200000));

  // Re-apply the sign of each input scale to its multiplier.
  const int32_t a_multiplier = signbit(a_output_scale) ? -abs_a_multiplier : abs_a_multiplier;
  const int32_t b_multiplier = signbit(b_output_scale) ? -abs_b_multiplier : abs_b_multiplier;

  params->neon.a_zero_point = a_zero_point;
  params->neon.b_zero_point = b_zero_point;
  params->neon.a_multiplier = a_multiplier;
  params->neon.b_multiplier = b_multiplier;
  // Convert to a signed type before negating: negating the unsigned shift would
  // wrap modulo 2**32 and depend on an implementation-defined conversion back
  // to int32_t.
  params->neon.right_shift = -(int32_t) shift;
  params->neon.output_zero_point = (int16_t) output_zero_point;
  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
  return sizeof(params->neon);
}
5148 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
5149 
5150 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates params->wasmsimd for QS8 addition with min/max clamping.
//
// The float scales become signed fixed-point multipliers with a shared right
// shift; the rounding term and both zero-point contributions are folded into a
// single bias. Values are replicated across the array fields of
// params->wasmsimd.
//
// Asserts that |a_output_scale| and |b_output_scale| are in [2**-10, 2**8).
// Returns sizeof(params->wasmsimd).
size_t xnn_init_qs8_add_minmax_wasmsimd_params(
  union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float a_scale_mag = fabsf(a_output_scale);
  const float b_scale_mag = fabsf(b_output_scale);
  assert(a_scale_mag >= 0x1.0p-10f);
  assert(b_scale_mag >= 0x1.0p-10f);
  assert(a_scale_mag < 0x1.0p+8f);
  assert(b_scale_mag < 0x1.0p+8f);

  // The larger magnitude determines the shift shared by both multipliers.
  const float top_scale_mag = math_max_f32(a_scale_mag, b_scale_mag);
  assert(top_scale_mag >= 0x1.0p-10f);
  assert(top_scale_mag < 0x1.0p+8f);
  const int32_t top_exponent = (int32_t) (float_as_uint32(top_scale_mag) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - top_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Add shift to the exponent bits of each magnitude, then round to an integer.
  // The larger multiplier lands in [2**20, 2**21]; the other is no larger.
  const uint32_t exponent_bump = shift << 23;
  const int32_t a_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(a_scale_mag) + exponent_bump));
  const int32_t b_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(b_scale_mag) + exponent_bump));
  assert(math_max_s32(a_multiplier_mag, b_multiplier_mag) >= INT32_C(0x00100000));
  assert(a_multiplier_mag <= INT32_C(0x00200000));
  assert(b_multiplier_mag <= INT32_C(0x00200000));

  // Carry the sign of each scale over to its multiplier.
  int32_t a_multiplier = a_multiplier_mag;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_mag;
  }
  int32_t b_multiplier = b_multiplier_mag;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_mag;
  }

  // bias = rounding term minus each multiplier times its zero point.
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t bias = rounding - a_offset - b_offset;

  params->wasmsimd.shift = shift;
  const int16_t zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    if (lane < 2) {
      params->wasmsimd.bias[lane] = bias;
      params->wasmsimd.a_multiplier[lane] = a_multiplier;
      params->wasmsimd.b_multiplier[lane] = b_multiplier;
    }
    if (lane < 4) {
      params->wasmsimd.output_zero_point[lane] = zero_point_s16;
    }
    params->wasmsimd.output_min[lane] = output_min;
    params->wasmsimd.output_max[lane] = output_max;
  }
  return sizeof(params->wasmsimd);
}
5207 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5208 
// Populates params->scalar for QS8 addition with min/max clamping.
//
// The float scales become signed fixed-point multipliers with a shared right
// shift; the rounding term and both zero-point contributions are folded into
// the stored bias. The output bounds are stored relative to the output zero
// point.
//
// Asserts that |a_output_scale| and |b_output_scale| are in [2**-10, 2**8).
// Returns sizeof(params->scalar).
size_t xnn_init_qs8_add_minmax_scalar_params(
  union xnn_qs8_add_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float a_output_scale,
  float b_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  const float a_scale_mag = fabsf(a_output_scale);
  const float b_scale_mag = fabsf(b_output_scale);
  assert(a_scale_mag >= 0x1.0p-10f);
  assert(b_scale_mag >= 0x1.0p-10f);
  assert(a_scale_mag < 0x1.0p+8f);
  assert(b_scale_mag < 0x1.0p+8f);

  // The larger magnitude determines the shift shared by both multipliers.
  const float top_scale_mag = math_max_f32(a_scale_mag, b_scale_mag);
  assert(top_scale_mag >= 0x1.0p-10f);
  assert(top_scale_mag < 0x1.0p+8f);
  const int32_t top_exponent = (int32_t) (float_as_uint32(top_scale_mag) >> 23) - 127;

  const uint32_t shift = (uint32_t) (20 /* multiplier bits */ - top_exponent);
  assert(shift <= 30);
  assert(shift >= 12);

  // Add shift to the exponent bits of each magnitude, then round to an integer.
  // The larger multiplier lands in [2**20, 2**21]; the other is no larger.
  const uint32_t exponent_bump = shift << 23;
  const int32_t a_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(a_scale_mag) + exponent_bump));
  const int32_t b_multiplier_mag = (int32_t) lrintf(uint32_as_float(float_as_uint32(b_scale_mag) + exponent_bump));
  assert(math_max_s32(a_multiplier_mag, b_multiplier_mag) >= INT32_C(0x00100000));
  assert(a_multiplier_mag <= INT32_C(0x00200000));
  assert(b_multiplier_mag <= INT32_C(0x00200000));

  // Carry the sign of each scale over to its multiplier.
  int32_t a_multiplier = a_multiplier_mag;
  if (signbit(a_output_scale)) {
    a_multiplier = -a_multiplier_mag;
  }
  int32_t b_multiplier = b_multiplier_mag;
  if (signbit(b_output_scale)) {
    b_multiplier = -b_multiplier_mag;
  }

  // bias = rounding term minus each multiplier times its zero point.
  const int32_t rounding = INT32_C(1) << (shift - 1);
  const int32_t a_offset = a_multiplier * (int32_t) a_zero_point;
  const int32_t b_offset = b_multiplier * (int32_t) b_zero_point;
  const int32_t zero_point_s32 = (int32_t) output_zero_point;

  params->scalar.bias = rounding - a_offset - b_offset;
  params->scalar.a_multiplier = a_multiplier;
  params->scalar.b_multiplier = b_multiplier;
  params->scalar.shift = shift;
  params->scalar.output_min_less_zero_point = (int32_t) output_min - zero_point_s32;
  params->scalar.output_max_less_zero_point = (int32_t) output_max - zero_point_s32;
  params->scalar.output_zero_point = zero_point_s32;
  return sizeof(params->scalar);
}
5258 
// Fills the `fp32_scalar` variant of the QU8 multiplication min/max parameters.
//
// `product_output_scale` is asserted to lie in [2**-16, 2**8). The output bounds
// are stored relative to `output_zero_point` as floats.
// Returns sizeof(params->fp32_scalar).
size_t xnn_init_qu8_mul_minmax_fp32_scalar_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // The differences are formed in uint32_t and then converted to int32_t
  // before being turned into floats.
  const int32_t min_less_zero_point = (int32_t) ((uint32_t) output_min - (uint32_t) output_zero_point);
  const int32_t max_less_zero_point = (int32_t) ((uint32_t) output_max - (uint32_t) output_zero_point);
  const int32_t zero_point_s32 = (int32_t) (uint32_t) output_zero_point;

  params->fp32_scalar.scale = product_output_scale;
  params->fp32_scalar.a_zero_point = (int16_t) (uint16_t) a_zero_point;
  params->fp32_scalar.b_zero_point = (int16_t) (uint16_t) b_zero_point;
  // 0x4B400000 is the IEEE 754 binary32 encoding of 12582912.0f.
  params->fp32_scalar.magic_bias = 12582912.0f;
  params->fp32_scalar.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point_s32;
  params->fp32_scalar.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->fp32_scalar);
}
5280 
5281 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `fp32_neon` variant of the QU8 multiplication min/max parameters.
//
// Both elements of each zero-point array receive the same value, and the output
// bounds are stored as passed in. `product_output_scale` is asserted to lie in
// [2**-16, 2**8). Returns sizeof(params->fp32_neon).
size_t xnn_init_qu8_mul_minmax_fp32_neon_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // 0x4B400000 is the IEEE 754 binary32 encoding of the magic bias 12582912.0f.
  const int32_t magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;

  for (size_t lane = 0; lane < 2; lane++) {
    params->fp32_neon.a_zero_point[lane] = a_zero_point;
    params->fp32_neon.b_zero_point[lane] = b_zero_point;
  }
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.scale = product_output_scale;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = magic_bias_less_output_zero_point;
  return sizeof(params->fp32_neon);
}
5305 
// Fills the `fp32_neonv8` variant of the QU8 multiplication min/max parameters.
//
// Unlike the `fp32_neon` variant, no magic-bias fields are written here; the
// output zero point is stored directly as an int16_t.
// `product_output_scale` is asserted to lie in [2**-16, 2**8).
// Returns sizeof(params->fp32_neonv8).
size_t xnn_init_qu8_mul_minmax_fp32_neonv8_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  for (size_t lane = 0; lane < 2; lane++) {
    params->fp32_neonv8.a_zero_point[lane] = a_zero_point;
    params->fp32_neonv8.b_zero_point[lane] = b_zero_point;
  }
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
  params->fp32_neonv8.scale = product_output_scale;
  return sizeof(params->fp32_neonv8);
}
5328 
// Fills the `rndnu_neon` variant of the QU8 multiplication min/max parameters.
//
// The float `product_output_scale` (asserted to lie in [2**-16, 2**8)) is
// decomposed into a fixed-point multiplier and a shift, and the shift is split
// into a pre-shift and a post-shift. Both shifts are stored negated in the
// `left_*_shift` fields. Returns sizeof(params->rndnu_neon).
size_t xnn_init_qu8_mul_minmax_rndnu_neon_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // Take the scale apart into its raw mantissa and biased-exponent fields.
  const uint32_t bits = float_as_uint32(product_output_scale);
  const uint32_t biased_exponent = bits >> 23;
  const uint32_t significand = (bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
  const int32_t multiplier = (int32_t) (significand << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 15] range. The subtraction is done in unsigned arithmetic
  // and the result is then converted to int32_t.
  const int32_t shift = (int32_t) ((uint32_t) (127 + 31 - 32) - biased_exponent);
  assert(shift >= -8);
  assert(shift < 16);

  // post_shift is at least 1 (so in [1, 15]); pre_shift is whatever remains.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  for (size_t lane = 0; lane < 2; lane++) {
    params->rndnu_neon.a_zero_point[lane] = a_zero_point;
    params->rndnu_neon.b_zero_point[lane] = b_zero_point;
  }
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
  return sizeof(params->rndnu_neon);
}
5370 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
5371 
5372 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `fp32_sse2` variant of the QU8 multiplication min/max parameters.
//
// Every value is replicated across its array: 8 entries for the zero points,
// 4 for the scale and 16 for the output bounds.
// `product_output_scale` is asserted to lie in [2**-16, 2**8).
// Returns sizeof(params->fp32_sse2).
size_t xnn_init_qu8_mul_minmax_fp32_sse2_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int16_t a_zero_point_s16 = (int16_t) (uint16_t) a_zero_point;
  const int16_t b_zero_point_s16 = (int16_t) (uint16_t) b_zero_point;
  const int16_t output_zero_point_s16 = (int16_t) (uint16_t) output_zero_point;

  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse2.a_zero_point[k] = a_zero_point_s16;
    params->fp32_sse2.b_zero_point[k] = b_zero_point_s16;
    params->fp32_sse2.output_zero_point[k] = output_zero_point_s16;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.scale[k] = product_output_scale;
  }
  for (size_t k = 0; k < 16; k++) {
    params->fp32_sse2.output_min[k] = output_min;
    params->fp32_sse2.output_max[k] = output_max;
  }
  return sizeof(params->fp32_sse2);
}
5401 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
5402 
5403 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `fp32_wasmsimd` variant of the QU8 multiplication min/max parameters.
//
// The lower bound is stored as `magic_min`: the bit pattern of the magic bias
// plus (output_min - output_zero_point). The upper bound is stored unchanged.
// `product_output_scale` is asserted to lie in [2**-16, 2**8).
// Returns sizeof(params->fp32_wasmsimd).
size_t xnn_init_qu8_mul_minmax_fp32_wasmsimd_params(
  union xnn_qu8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  uint8_t a_zero_point,
  uint8_t b_zero_point,
  uint8_t output_zero_point,
  float product_output_scale,
  uint8_t output_min,
  uint8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t zero_point_s32 = (int32_t) output_zero_point;
  const float min_delta = (float) ((int32_t) output_min - zero_point_s32);
  const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + min_delta);
  // 0x4B400000 is the IEEE 754 binary32 encoding of 12582912.0f.
  const int32_t magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point_s32;

  for (size_t k = 0; k < 8; k++) {
    params->fp32_wasmsimd.output_max[k] = output_max;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_wasmsimd.a_zero_point[k] = (int16_t) a_zero_point;
    params->fp32_wasmsimd.b_zero_point[k] = (int16_t) b_zero_point;
  }
  for (size_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.scale[k] = product_output_scale;
    params->fp32_wasmsimd.magic_bias[k] = 12582912.0f;
    params->fp32_wasmsimd.magic_min[k] = magic_min;
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[k] = magic_bias_less_output_zero_point;
  }
  return sizeof(params->fp32_wasmsimd);
}
5434 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5435 
// Fills the `fp32_scalar` variant of the QS8 multiplication min/max parameters.
//
// `product_output_scale` is asserted to lie in [2**-16, 2**8). The output bounds
// are stored relative to `output_zero_point` as floats.
// Returns sizeof(params->fp32_scalar).
size_t xnn_init_qs8_mul_minmax_fp32_scalar_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t zero_point_s32 = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point_s32;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point_s32;

  params->fp32_scalar.scale = product_output_scale;
  params->fp32_scalar.a_zero_point = (int16_t) a_zero_point;
  params->fp32_scalar.b_zero_point = (int16_t) b_zero_point;
  // 0x4B400000 is the IEEE 754 binary32 encoding of 12582912.0f.
  params->fp32_scalar.magic_bias = 12582912.0f;
  params->fp32_scalar.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point_s32;
  params->fp32_scalar.output_min_less_zero_point = (float) min_less_zero_point;
  params->fp32_scalar.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->fp32_scalar);
}
5457 
5458 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Fills the `fp32_neon` variant of the QS8 multiplication min/max parameters.
//
// Both elements of each zero-point array receive the same value, and the output
// bounds are stored as passed in. `product_output_scale` is asserted to lie in
// [2**-16, 2**8). Returns sizeof(params->fp32_neon).
size_t xnn_init_qs8_mul_minmax_fp32_neon_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // 0x4B400000 is the IEEE 754 binary32 encoding of the magic bias 12582912.0f.
  const int32_t magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;

  for (size_t lane = 0; lane < 2; lane++) {
    params->fp32_neon.a_zero_point[lane] = a_zero_point;
    params->fp32_neon.b_zero_point[lane] = b_zero_point;
  }
  params->fp32_neon.output_min = output_min;
  params->fp32_neon.output_max = output_max;
  params->fp32_neon.scale = product_output_scale;
  params->fp32_neon.magic_bias = 12582912.0f;
  params->fp32_neon.magic_bias_less_output_zero_point = magic_bias_less_output_zero_point;
  return sizeof(params->fp32_neon);
}
5482 
// Fills the `fp32_neonv8` variant of the QS8 multiplication min/max parameters.
//
// No magic-bias fields are written for this variant; the output zero point is
// stored directly as an int16_t.
// `product_output_scale` is asserted to lie in [2**-16, 2**8).
// Returns sizeof(params->fp32_neonv8).
size_t xnn_init_qs8_mul_minmax_fp32_neonv8_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  for (size_t lane = 0; lane < 2; lane++) {
    params->fp32_neonv8.a_zero_point[lane] = a_zero_point;
    params->fp32_neonv8.b_zero_point[lane] = b_zero_point;
  }
  params->fp32_neonv8.output_min = output_min;
  params->fp32_neonv8.output_max = output_max;
  params->fp32_neonv8.output_zero_point = (int16_t) output_zero_point;
  params->fp32_neonv8.scale = product_output_scale;
  return sizeof(params->fp32_neonv8);
}
5505 
// Fills the `rndnu_neon` variant of the QS8 multiplication min/max parameters.
//
// The float `product_output_scale` (asserted to lie in [2**-16, 2**8)) is
// decomposed into a fixed-point multiplier and a shift, and the shift is split
// into a pre-shift and a post-shift. Both shifts are stored negated in the
// `left_*_shift` fields. Returns sizeof(params->rndnu_neon).
size_t xnn_init_qs8_mul_minmax_rndnu_neon_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  // Take the scale apart into its raw mantissa and biased-exponent fields.
  const uint32_t bits = float_as_uint32(product_output_scale);
  const uint32_t biased_exponent = bits >> 23;
  const uint32_t significand = (bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);

  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
  const int32_t multiplier = (int32_t) (significand << 7);
  assert(multiplier >= INT32_C(0x40000000));
  assert(multiplier <= INT32_C(0x7FFFFF80));

  // Shift is in [-8, 15] range. The subtraction is done in unsigned arithmetic
  // and the result is then converted to int32_t.
  const int32_t shift = (int32_t) ((uint32_t) (127 + 31 - 32) - biased_exponent);
  assert(shift >= -8);
  assert(shift < 16);

  // post_shift is at least 1 (so in [1, 15]); pre_shift is whatever remains.
  const int32_t post_shift = math_max_s32(shift, 1);
  const int32_t pre_shift = shift - post_shift;

  for (size_t lane = 0; lane < 2; lane++) {
    params->rndnu_neon.a_zero_point[lane] = a_zero_point;
    params->rndnu_neon.b_zero_point[lane] = b_zero_point;
  }
  params->rndnu_neon.multiplier = multiplier;
  params->rndnu_neon.left_pre_shift = -pre_shift;
  params->rndnu_neon.left_post_shift = -post_shift;
  params->rndnu_neon.output_zero_point = (int16_t) output_zero_point;
  params->rndnu_neon.output_min = output_min;
  params->rndnu_neon.output_max = output_max;
  return sizeof(params->rndnu_neon);
}
5547 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
5548 
5549 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Fills the `fp32_sse2` variant of the QS8 multiplication min/max parameters.
//
// Zero points and output bounds are widened to int16_t and replicated across
// 8 entries each; the scale is replicated across 4 entries.
// `product_output_scale` is asserted to lie in [2**-16, 2**8).
// Returns sizeof(params->fp32_sse2).
size_t xnn_init_qs8_mul_minmax_fp32_sse2_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse2.scale[k] = product_output_scale;
  }
  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse2.a_zero_point[k] = (int16_t) a_zero_point;
    params->fp32_sse2.b_zero_point[k] = (int16_t) b_zero_point;
    params->fp32_sse2.output_zero_point[k] = (int16_t) output_zero_point;
    params->fp32_sse2.output_min[k] = (int16_t) output_min;
    params->fp32_sse2.output_max[k] = (int16_t) output_max;
  }
  return sizeof(params->fp32_sse2);
}
5578 
// Fills the `fp32_sse4` variant of the QS8 multiplication min/max parameters.
//
// Zero points are widened to int16_t and replicated across 8 entries, the scale
// across 4 entries, and the output bounds are stored unchanged in 16 entries.
// `product_output_scale` is asserted to lie in [2**-16, 2**8).
// Returns sizeof(params->fp32_sse4).
size_t xnn_init_qs8_mul_minmax_fp32_sse4_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  for (size_t k = 0; k < 4; k++) {
    params->fp32_sse4.scale[k] = product_output_scale;
  }
  for (size_t k = 0; k < 8; k++) {
    params->fp32_sse4.a_zero_point[k] = (int16_t) a_zero_point;
    params->fp32_sse4.b_zero_point[k] = (int16_t) b_zero_point;
    params->fp32_sse4.output_zero_point[k] = (int16_t) output_zero_point;
  }
  for (size_t k = 0; k < 16; k++) {
    params->fp32_sse4.output_min[k] = output_min;
    params->fp32_sse4.output_max[k] = output_max;
  }
  return sizeof(params->fp32_sse4);
}
5607 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
5608 
5609 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Fills the `fp32_wasmsimd` variant of the QS8 multiplication min/max parameters.
//
// The lower bound is stored as `magic_min`: the bit pattern of the magic bias
// plus (output_min - output_zero_point). The upper bound is stored unchanged.
// `product_output_scale` is asserted to lie in [2**-16, 2**8).
// Returns sizeof(params->fp32_wasmsimd).
size_t xnn_init_qs8_mul_minmax_fp32_wasmsimd_params(
  union xnn_qs8_mul_minmax_params params[XNN_MIN_ELEMENTS(1)],
  int8_t a_zero_point,
  int8_t b_zero_point,
  int8_t output_zero_point,
  float product_output_scale,
  int8_t output_min,
  int8_t output_max)
{
  assert(product_output_scale >= 0x1.0p-16f);
  assert(product_output_scale < 0x1.0p+8f);

  const int32_t zero_point_s32 = (int32_t) output_zero_point;
  const float min_delta = (float) ((int32_t) output_min - zero_point_s32);
  const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + min_delta);
  // 0x4B400000 is the IEEE 754 binary32 encoding of 12582912.0f.
  const int32_t magic_bias_less_output_zero_point = INT32_C(0x4B400000) - zero_point_s32;

  for (size_t k = 0; k < 8; k++) {
    params->fp32_wasmsimd.output_max[k] = output_max;
  }
  for (size_t k = 0; k < 4; k++) {
    params->fp32_wasmsimd.a_zero_point[k] = (int16_t) a_zero_point;
    params->fp32_wasmsimd.b_zero_point[k] = (int16_t) b_zero_point;
  }
  for (size_t k = 0; k < 2; k++) {
    params->fp32_wasmsimd.scale[k] = product_output_scale;
    params->fp32_wasmsimd.magic_bias[k] = 12582912.0f;
    params->fp32_wasmsimd.magic_min[k] = magic_min;
    params->fp32_wasmsimd.magic_bias_less_output_zero_point[k] = magic_bias_less_output_zero_point;
  }
  return sizeof(params->fp32_wasmsimd);
}
5640 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5641 
// Writes the fixed constants of the `scalar` variant of the F16->F32 conversion
// parameters. Takes no inputs other than the destination.
// Returns sizeof(params->scalar).
size_t xnn_init_f16_f32_cvt_scalar_params(
  union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  // Integer masks and thresholds.
  params->scalar.sign_mask = UINT32_C(0x80000000);
  params->scalar.magic_mask = UINT32_C(0x3F000000);
  params->scalar.exp_offset = UINT32_C(0x70000000);
  params->scalar.denorm_cutoff = UINT32_C(0x08000000);

  // Floating-point constants; note magic_mask above is the binary32 encoding
  // of the 0.5f magic bias.
  params->scalar.magic_bias = 0.5f;
  params->scalar.exp_scale = 0x1.0p-112f;

  return sizeof(params->scalar);
}
5653 
5654 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Writes the single constant of the `neon` variant of the F16->F32 conversion
// parameters: an exponent scale of 2**-112.
// Returns sizeof(params->neon).
size_t xnn_init_f16_f32_cvt_neon_params(
  union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  const float exp_scale = 0x1.0p-112f;
  params->neon.exp_scale = exp_scale;

  return sizeof(params->neon);
}
5661 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
5662 
5663 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Writes the fixed constants of the `sse_int16` variant of the F16->F32
// conversion parameters. The 16-bit constants are replicated across 8 entries
// and the float constants across 4 entries.
// Returns sizeof(params->sse_int16).
size_t xnn_init_f16_f32_cvt_sse_int16_params(
  union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  // 16-bit constants, 8 entries each.
  for (size_t k = 0; k < 8; k++) {
    params->sse_int16.sign_mask[k] = UINT16_C(0x8000);
    params->sse_int16.exp_offset[k] = UINT16_C(0x7000);
    params->sse_int16.magic_mask[k] = UINT16_C(0x3F00);
    params->sse_int16.denorm_cutoff[k] = INT16_C(0x0400);
  }
  // Float constants, 4 entries each.
  for (size_t k = 0; k < 4; k++) {
    params->sse_int16.exp_scale[k] = 0x1.0p-112f;
    params->sse_int16.magic_bias[k] = 0.5f;
  }
  return sizeof(params->sse_int16);
}
5685 
// Writes the fixed constants of the `sse_int32` variant of the F16->F32
// conversion parameters, replicating each across 4 entries.
// Here `magic_bias` is assigned the integer constant 0x3F000000, which is the
// binary32 encoding of 0.5f.
// Returns sizeof(params->sse_int32).
size_t xnn_init_f16_f32_cvt_sse_int32_params(
  union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k < 4; k++) {
    params->sse_int32.exp_scale[k] = 0x1.0p-112f;
    params->sse_int32.exp_offset[k] = UINT32_C(0x70000000);
    params->sse_int32.sign_mask[k] = UINT32_C(0x80000000);
    params->sse_int32.denorm_cutoff[k] = INT32_C(0x04000000);
    params->sse_int32.magic_bias[k] = UINT32_C(0x3F000000);
  }
  return sizeof(params->sse_int32);
}
5698 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
5699 
5700 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes the fixed constants of the `wasmsimd_int16` variant of the F16->F32
// conversion parameters. The 16-bit constants are replicated across 4 entries
// and the float constants across 2 entries.
// Returns sizeof(params->wasmsimd_int16).
size_t xnn_init_f16_f32_cvt_wasmsimd_int16_params(
  union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  // 16-bit constants, 4 entries each.
  for (size_t k = 0; k < 4; k++) {
    params->wasmsimd_int16.sign_mask[k] = UINT16_C(0x8000);
    params->wasmsimd_int16.exp_offset[k] = UINT16_C(0x7000);
    params->wasmsimd_int16.magic_mask[k] = UINT16_C(0x3F00);
    params->wasmsimd_int16.denorm_cutoff[k] = INT16_C(0x0400);
  }
  // Float constants, 2 entries each.
  for (size_t k = 0; k < 2; k++) {
    params->wasmsimd_int16.exp_scale[k] = 0x1.0p-112f;
    params->wasmsimd_int16.magic_bias[k] = 0.5f;
  }
  return sizeof(params->wasmsimd_int16);
}
5722 
// Writes the fixed constants of the `wasmsimd_int32` variant of the F16->F32
// conversion parameters, replicating each across 2 entries.
// Here `magic_bias` is assigned the integer constant 0x3F000000, which is the
// binary32 encoding of 0.5f.
// Returns sizeof(params->wasmsimd_int32).
size_t xnn_init_f16_f32_cvt_wasmsimd_int32_params(
  union xnn_f16_f32_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k < 2; k++) {
    params->wasmsimd_int32.exp_scale[k] = 0x1.0p-112f;
    params->wasmsimd_int32.exp_offset[k] = UINT32_C(0x70000000);
    params->wasmsimd_int32.sign_mask[k] = UINT32_C(0x80000000);
    params->wasmsimd_int32.denorm_cutoff[k] = INT32_C(0x04000000);
    params->wasmsimd_int32.magic_bias[k] = UINT32_C(0x3F000000);
  }
  return sizeof(params->wasmsimd_int32);
}
5735 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5736 
// Writes the fixed constants of the `scalar_bitcast` variant of the F32->F16
// conversion parameters. Takes no inputs other than the destination.
// Returns sizeof(params->scalar_bitcast).
size_t xnn_init_f32_f16_cvt_scalar_bitcast_params(
  union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  // Floating-point scale factors.
  params->scalar_bitcast.scale_to_inf = 0x1.0p+112f;
  params->scalar_bitcast.scale_to_zero = 0x1.0p-110f;

  // 32-bit masks and thresholds.
  params->scalar_bitcast.nonsign_mask = UINT32_C(0x7FFFFFFF);
  params->scalar_bitcast.expw_max = UINT32_C(0x7F800000);
  params->scalar_bitcast.exp_bias = UINT32_C(0x07800000);
  params->scalar_bitcast.bias_min = UINT32_C(0x40000000);

  // 16-bit constants.
  params->scalar_bitcast.exph_mask = UINT16_C(0x7C00);
  params->scalar_bitcast.manth_mask = UINT16_C(0x0FFF);
  params->scalar_bitcast.nanh = UINT16_C(0x7E00);

  return sizeof(params->scalar_bitcast);
}
5751 
// Writes the fixed constants of the `scalar_fabsf` variant of the F32->F16
// conversion parameters. The values match those of the `scalar_bitcast`
// variant, except that no `nonsign_mask` is written.
// Returns sizeof(params->scalar_fabsf).
size_t xnn_init_f32_f16_cvt_scalar_fabsf_params(
  union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  // Floating-point scale factors.
  params->scalar_fabsf.scale_to_inf = 0x1.0p+112f;
  params->scalar_fabsf.scale_to_zero = 0x1.0p-110f;

  // 32-bit masks and thresholds.
  params->scalar_fabsf.expw_max = UINT32_C(0x7F800000);
  params->scalar_fabsf.exp_bias = UINT32_C(0x07800000);
  params->scalar_fabsf.bias_min = UINT32_C(0x40000000);

  // 16-bit constants.
  params->scalar_fabsf.exph_mask = UINT16_C(0x7C00);
  params->scalar_fabsf.manth_mask = UINT16_C(0x0FFF);
  params->scalar_fabsf.nanh = UINT16_C(0x7E00);

  return sizeof(params->scalar_fabsf);
}
5765 
5766 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Writes the fixed constants of the `neon` variant of the F32->F16 conversion
// parameters: two float scale factors and two 32-bit exponent constants.
// Returns sizeof(params->neon).
size_t xnn_init_f32_f16_cvt_neon_params(
  union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  // Floating-point scale factors.
  params->neon.scale_to_inf = 0x1.0p+112f;
  params->neon.scale_to_zero = 0x1.0p-110f;

  // 32-bit exponent constants.
  params->neon.expw_max = UINT32_C(0x7F800000);
  params->neon.exp_bias = UINT32_C(0x07800000);

  return sizeof(params->neon);
}
5776 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
5777 
5778 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Writes the fixed constants of the `sse2` variant of the F32->F16 conversion
// parameters. 32-bit and float constants are replicated across 4 entries and
// 16-bit constants across 8 entries. `bias_min` alternates 0x8000 (even
// indices) and 0x4000 (odd indices).
// Returns sizeof(params->sse2).
size_t xnn_init_f32_f16_cvt_sse2_params(
  union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k < 4; k++) {
    params->sse2.scale_to_inf[k] = 0x1.0p+112f;
    params->sse2.scale_to_zero[k] = 0x1.0p-110f;
    params->sse2.nonsign_mask[k] = UINT32_C(0x7FFFFFFF);
    params->sse2.expw_max[k] = UINT32_C(0x7F800000);
    params->sse2.exp_bias[k] = UINT32_C(0x07800000);
    params->sse2.exph_mask[k] = UINT32_C(0x00007C00);
    params->sse2.manth_mask[k] = UINT32_C(0x00000FFF);
  }
  // Walk the 16-bit arrays one even/odd pair at a time.
  for (size_t k = 0; k < 8; k += 2) {
    params->sse2.bias_min[k] = INT16_C(0x8000);
    params->sse2.bias_min[k + 1] = INT16_C(0x4000);
    params->sse2.nanh[k] = UINT16_C(0x7E00);
    params->sse2.nanh[k + 1] = UINT16_C(0x7E00);
  }
  return sizeof(params->sse2);
}
5806 
// Fills the 14-entry `mask_table` of the `f16c` variant of the F32->F16
// conversion parameters: entries 0..6 are set to -1 and entries 7..13 to 0.
// Returns sizeof(params->f16c).
size_t xnn_init_f32_f16_cvt_f16c_params(
  union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k < 14; k++) {
    if (k < 7) {
      params->f16c.mask_table[k] = -1;
    } else {
      params->f16c.mask_table[k] = 0;
    }
  }
  return sizeof(params->f16c);
}
5818 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
5819 
5820 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Writes the fixed constants of the `wasmsimd` variant of the F32->F16
// conversion parameters. 32-bit and float constants are replicated across
// 2 entries and 16-bit constants across 4 entries. `bias_min` alternates
// 0x8000 (even indices) and 0x4000 (odd indices).
// Returns sizeof(params->wasmsimd).
size_t xnn_init_f32_f16_cvt_wasmsimd_params(
  union xnn_f32_f16_cvt_params params[XNN_MIN_ELEMENTS(1)])
{
  for (size_t k = 0; k < 2; k++) {
    params->wasmsimd.scale_to_inf[k] = 0x1.0p+112f;
    params->wasmsimd.scale_to_zero[k] = 0x1.0p-110f;
    params->wasmsimd.expw_max[k] = UINT32_C(0x7F800000);
    params->wasmsimd.exp_bias[k] = UINT32_C(0x07800000);
    params->wasmsimd.exph_mask[k] = UINT32_C(0x00007C00);
    params->wasmsimd.manth_mask[k] = UINT32_C(0x00000FFF);
  }
  // Walk the 16-bit arrays one even/odd pair at a time.
  for (size_t k = 0; k < 4; k += 2) {
    params->wasmsimd.bias_min[k] = INT16_C(0x8000);
    params->wasmsimd.bias_min[k + 1] = INT16_C(0x4000);
    params->wasmsimd.nanh[k] = UINT16_C(0x7E00);
    params->wasmsimd.nanh[k + 1] = UINT16_C(0x7E00);
  }
  return sizeof(params->wasmsimd);
}
5843 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5844 
// Fills the `scalar_fmagic` variant of the F32->QS8 conversion parameters.
//
// The output bounds are stored relative to `output_zero_point` as floats, next
// to the magic bias and the magic-bias bit pattern minus the zero point.
// Returns sizeof(params->scalar_fmagic).
size_t xnn_init_f32_qs8_cvt_scalar_fmagic_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point_s32 = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point_s32;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point_s32;

  params->scalar_fmagic.scale = scale;
  // 0x4B400000 is the IEEE 754 binary32 encoding of 12582912.0f.
  params->scalar_fmagic.magic_bias = 12582912.0f;
  params->scalar_fmagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point_s32;
  params->scalar_fmagic.output_min_less_zero_point = (float) min_less_zero_point;
  params->scalar_fmagic.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->scalar_fmagic);
}
5859 
// Fills the `scalar_imagic` variant of the F32->QS8 conversion parameters.
//
// The output bounds are stored as `magic_min` / `magic_max`: the bit patterns
// of the magic bias plus (bound - output_zero_point).
// Returns sizeof(params->scalar_imagic).
size_t xnn_init_f32_qs8_cvt_scalar_imagic_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point_s32 = (int32_t) output_zero_point;
  const float min_delta = (float) ((int32_t) output_min - zero_point_s32);
  const float max_delta = (float) ((int32_t) output_max - zero_point_s32);

  // Bit patterns of the biased bounds, taken via float_as_uint32.
  const int32_t magic_min = (int32_t) float_as_uint32(12582912.0f + min_delta);
  const int32_t magic_max = (int32_t) float_as_uint32(12582912.0f + max_delta);

  params->scalar_imagic.scale = scale;
  params->scalar_imagic.magic_bias = 12582912.0f;
  params->scalar_imagic.magic_min = magic_min;
  params->scalar_imagic.magic_max = magic_max;
  // 0x4B400000 is the IEEE 754 binary32 encoding of 12582912.0f.
  params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point_s32;
  return sizeof(params->scalar_imagic);
}
5876 
// Fills the `scalar_lrintf` variant of the F32->QS8 conversion parameters.
//
// The output bounds are stored relative to `output_zero_point` as floats, and
// the zero point itself is stored widened to int32_t.
// Returns sizeof(params->scalar_lrintf).
size_t xnn_init_f32_qs8_cvt_scalar_lrintf_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t zero_point_s32 = (int32_t) output_zero_point;
  const int32_t min_less_zero_point = (int32_t) output_min - zero_point_s32;
  const int32_t max_less_zero_point = (int32_t) output_max - zero_point_s32;

  params->scalar_lrintf.scale = scale;
  params->scalar_lrintf.output_zero_point = zero_point_s32;
  params->scalar_lrintf.output_min_less_zero_point = (float) min_less_zero_point;
  params->scalar_lrintf.output_max_less_zero_point = (float) max_less_zero_point;
  return sizeof(params->scalar_lrintf);
}
5890 
5891 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates the `neon` member of `union xnn_f32_qs8_cvt_params`.
//
// Stores the scale, the magic bias 12582912.0f (0x1.8p+23, bit pattern
// 0x4B400000), that bit pattern minus the output zero point, and the output
// clamping bounds unchanged.
//
// Returns the size in bytes of `params->neon`.
size_t xnn_init_f32_qs8_cvt_neon_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;

  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
  params->neon.scale = scale;
  params->neon.magic_bias = 12582912.0f;
  params->neon.magic_bias_less_zero_point = bias_bits_less_zero_point;
  return sizeof(params->neon);
}
5906 
// Populates the `neonv8` member of `union xnn_f32_qs8_cvt_params` with the
// scale, the output zero point widened to 16 bits, and the output bounds.
//
// Returns the size in bytes of `params->neonv8`.
size_t xnn_init_f32_qs8_cvt_neonv8_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;

  params->neonv8.output_min = output_min;
  params->neonv8.output_max = output_max;
  params->neonv8.output_zero_point = zero_point16;
  params->neonv8.scale = scale;
  return sizeof(params->neonv8);
}
5920 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
5921 
5922 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates the `sse2` member of `union xnn_f32_qs8_cvt_params`.
//
// Each value is replicated across its array: 4 entries for the float fields
// (scale, upper bound relative to the zero point) and 8 entries for the
// 16-bit fields (zero point, lower bound).
//
// Returns the size in bytes of `params->sse2`.
size_t xnn_init_f32_qs8_cvt_sse2_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const int16_t min16 = (int16_t) output_min;
  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 8; lane++) {
    params->sse2.output_zero_point[lane] = zero_point16;
    params->sse2.output_min[lane] = min16;
  }
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2.scale[lane] = scale;
    params->sse2.output_max_less_zero_point[lane] = max_offset;
  }
  return sizeof(params->sse2);
}
5941 
// Populates the `sse4` member of `union xnn_f32_qs8_cvt_params`.
//
// Replicates the scale and the zero-point-relative upper bound into 4 entries,
// the 16-bit zero point into 8 entries, and the lower bound into 16 entries.
//
// Returns the size in bytes of `params->sse4`.
size_t xnn_init_f32_qs8_cvt_sse4_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse4.output_min[lane] = output_min;
  }
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse4.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse4.scale[lane] = scale;
    params->sse4.output_max_less_zero_point[lane] = max_offset;
  }
  return sizeof(params->sse4);
}
5962 
// Populates the `avx` member of `union xnn_f32_qs8_cvt_params`.
//
// Replicates the scale, the zero-point-relative upper bound and the 16-bit
// zero point into 8 entries each, and the lower bound into 16 entries.
// `mask_table` receives 7 entries of -1 followed by 7 entries of 0.
//
// Returns the size in bytes of `params->avx`.
size_t xnn_init_f32_qs8_cvt_avx_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  // The three 8-entry arrays are filled in a single pass.
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.scale[lane] = scale;
    params->avx.output_max_less_zero_point[lane] = max_offset;
    params->avx.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx.output_min[lane] = output_min;
  }
  // First half of the table is all-ones (-1), second half is zero.
  for (size_t entry = 0; entry < 14; entry++) {
    params->avx.mask_table[entry] = (entry < 7) ? -1 : 0;
  }
  return sizeof(params->avx);
}
5989 
// Populates the `avx2` member of `union xnn_f32_qs8_cvt_params`.
//
// Replicates the scale and the zero-point-relative upper bound into 8 entries,
// the 16-bit zero point into 16 entries, and the lower bound into 32 entries.
// `shuffle_mask` is set to the fixed permutation {0, 4, 1, 5, 2, 6, 3, 7} and
// `mask_table` to 7 entries of -1 followed by 7 entries of 0.
//
// Returns the size in bytes of `params->avx2`.
size_t xnn_init_f32_qs8_cvt_avx2_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  static const int shuffle_order[8] = {0, 4, 1, 5, 2, 6, 3, 7};
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 8; lane++) {
    params->avx2.scale[lane] = scale;
    params->avx2.output_max_less_zero_point[lane] = max_offset;
    params->avx2.shuffle_mask[lane] = shuffle_order[lane];
  }
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx2.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 32; lane++) {
    params->avx2.output_min[lane] = output_min;
  }
  // First half of the table is all-ones (-1), second half is zero.
  for (size_t entry = 0; entry < 14; entry++) {
    params->avx2.mask_table[entry] = (entry < 7) ? -1 : 0;
  }
  return sizeof(params->avx2);
}
6024 
// Populates the `avx512` member of `union xnn_f32_qs8_cvt_params`.
//
// Replicates the scale and the zero-point-relative upper bound into 16
// entries, the 16-bit zero point into 32 entries, and the lower bound into 64
// entries. The two shuffle masks are fixed permutations copied from the tables
// below.
//
// Returns the size in bytes of `params->avx512`.
size_t xnn_init_f32_qs8_cvt_avx512_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  static const int shuffle512_order[16] = {
    0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
  };
  static const int shuffle256_order[8] = {0, 4, 2, 6, 1, 5, 3, 7};
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 16; lane++) {
    params->avx512.scale[lane] = scale;
    params->avx512.output_max_less_zero_point[lane] = max_offset;
    params->avx512.shuffle512_mask[lane] = shuffle512_order[lane];
  }
  for (size_t lane = 0; lane < 32; lane++) {
    params->avx512.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 64; lane++) {
    params->avx512.output_min[lane] = output_min;
  }
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx512.shuffle256_mask[lane] = shuffle256_order[lane];
  }
  return sizeof(params->avx512);
}
6069 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
6070 
6071 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates the `wasmsimd_cvt` member of `union xnn_f32_qs8_cvt_params`.
//
// Replicates the scale into 2 entries, the 16-bit zero point into 4 entries,
// and both output bounds into 8 entries each.
//
// Returns the size in bytes of `params->wasmsimd_cvt`.
size_t xnn_init_f32_qs8_cvt_wasmsimd_cvt_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;

  for (size_t lane = 0; lane < 8; lane++) {
    params->wasmsimd_cvt.output_min[lane] = output_min;
    params->wasmsimd_cvt.output_max[lane] = output_max;
  }
  for (size_t lane = 0; lane < 4; lane++) {
    params->wasmsimd_cvt.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd_cvt.scale[lane] = scale;
  }
  return sizeof(params->wasmsimd_cvt);
}
6091 
// Populates the `wasmsimd_magic` member of `union xnn_f32_qs8_cvt_params`.
//
// The magic bias 12582912.0f equals 0x1.8p+23 (bit pattern 0x4B400000).
// `magic_min` holds the bit pattern of the bias plus the zero-point-relative
// lower bound; `magic_bias_less_zero_point` holds the bias bit pattern minus
// the zero point. The float/int32 fields get 2 entries each and `output_max`
// gets 8 entries.
//
// Returns the size in bytes of `params->wasmsimd_magic`.
size_t xnn_init_f32_qs8_cvt_wasmsimd_magic_params(
  union xnn_f32_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t output_zero_point,
  int8_t output_min,
  int8_t output_max)
{
  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t min_bits = (int32_t) float_as_uint32(magic_bias + min_offset);
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (size_t lane = 0; lane < 8; lane++) {
    params->wasmsimd_magic.output_max[lane] = output_max;
  }
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd_magic.scale[lane] = scale;
    params->wasmsimd_magic.magic_bias[lane] = magic_bias;
    params->wasmsimd_magic.magic_min[lane] = min_bits;
    params->wasmsimd_magic.magic_bias_less_zero_point[lane] = bias_bits_less_zero_point;
  }
  return sizeof(params->wasmsimd_magic);
}
6113 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6114 
// Populates the `scalar_fmagic` member of `union xnn_f32_qu8_cvt_params`.
//
// Stores the scale, the output bounds relative to the zero point (as floats),
// the magic bias 12582912.0f (0x1.8p+23, bit pattern 0x4B400000), and that bit
// pattern minus the zero point.
//
// Returns the size in bytes of `params->scalar_fmagic`.
size_t xnn_init_f32_qu8_cvt_scalar_fmagic_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_offset = (int32_t) output_min - zero_point;
  const int32_t max_offset = (int32_t) output_max - zero_point;

  params->scalar_fmagic.scale = scale;
  params->scalar_fmagic.magic_bias = 12582912.0f;
  params->scalar_fmagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  params->scalar_fmagic.output_min_less_zero_point = (float) min_offset;
  params->scalar_fmagic.output_max_less_zero_point = (float) max_offset;
  return sizeof(params->scalar_fmagic);
}
6129 
// Populates the `scalar_imagic` member of `union xnn_f32_qu8_cvt_params`.
//
// The magic bias 12582912.0f equals 0x1.8p+23 (bit pattern 0x4B400000).
// `magic_min` / `magic_max` store the bit patterns of the bias offset by the
// zero-point-relative output bounds; `magic_bias_less_zero_point` stores the
// bias bit pattern minus the zero point.
//
// Returns the size in bytes of `params->scalar_imagic`.
size_t xnn_init_f32_qu8_cvt_scalar_imagic_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const float max_offset = (float) ((int32_t) output_max - zero_point);
  const float magic_bias = 12582912.0f;

  params->scalar_imagic.scale = scale;
  params->scalar_imagic.magic_bias = magic_bias;
  params->scalar_imagic.magic_min = (int32_t) float_as_uint32(magic_bias + min_offset);
  params->scalar_imagic.magic_max = (int32_t) float_as_uint32(magic_bias + max_offset);
  params->scalar_imagic.magic_bias_less_zero_point = INT32_C(0x4B400000) - zero_point;
  return sizeof(params->scalar_imagic);
}
6146 
// Populates the `scalar_lrintf` member of `union xnn_f32_qu8_cvt_params` with
// the scale, the output bounds expressed relative to the zero point (stored as
// floats), and the zero point widened to 32 bits.
//
// Returns the size in bytes of `params->scalar_lrintf`.
size_t xnn_init_f32_qu8_cvt_scalar_lrintf_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  const int32_t zero_point = (int32_t) output_zero_point;
  const int32_t min_offset = (int32_t) output_min - zero_point;
  const int32_t max_offset = (int32_t) output_max - zero_point;

  params->scalar_lrintf.scale = scale;
  params->scalar_lrintf.output_min_less_zero_point = (float) min_offset;
  params->scalar_lrintf.output_max_less_zero_point = (float) max_offset;
  params->scalar_lrintf.output_zero_point = zero_point;
  return sizeof(params->scalar_lrintf);
}
6160 
6161 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates the `neon` member of `union xnn_f32_qu8_cvt_params`.
//
// Stores the scale, the magic bias 12582912.0f (0x1.8p+23, bit pattern
// 0x4B400000), that bit pattern minus the output zero point, and the output
// clamping bounds unchanged.
//
// Returns the size in bytes of `params->neon`.
size_t xnn_init_f32_qu8_cvt_neon_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;

  params->neon.output_min = output_min;
  params->neon.output_max = output_max;
  params->neon.scale = scale;
  params->neon.magic_bias = 12582912.0f;
  params->neon.magic_bias_less_zero_point = bias_bits_less_zero_point;
  return sizeof(params->neon);
}
6176 
// Populates the `neonv8` member of `union xnn_f32_qu8_cvt_params` with the
// scale, the output zero point converted to 16 bits, and the output bounds.
//
// Returns the size in bytes of `params->neonv8`.
size_t xnn_init_f32_qu8_cvt_neonv8_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;

  params->neonv8.output_min = output_min;
  params->neonv8.output_max = output_max;
  params->neonv8.output_zero_point = zero_point16;
  params->neonv8.scale = scale;
  return sizeof(params->neonv8);
}
6190 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
6191 
6192 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates the `sse2` member of `union xnn_f32_qu8_cvt_params`.
//
// Replicates the scale and the zero-point-relative upper bound into 4 entries,
// the 16-bit zero point into 8 entries, and the lower bound into 16 entries.
//
// Returns the size in bytes of `params->sse2`.
size_t xnn_init_f32_qu8_cvt_sse2_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 16; lane++) {
    params->sse2.output_min[lane] = output_min;
  }
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse2.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2.scale[lane] = scale;
    params->sse2.output_max_less_zero_point[lane] = max_offset;
  }
  return sizeof(params->sse2);
}
6213 
// Populates the `avx` member of `union xnn_f32_qu8_cvt_params`.
//
// Replicates the scale, the zero-point-relative upper bound and the 16-bit
// zero point into 8 entries each, and the lower bound into 16 entries.
// `mask_table` receives 7 entries of -1 followed by 7 entries of 0.
//
// Returns the size in bytes of `params->avx`.
size_t xnn_init_f32_qu8_cvt_avx_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  // The three 8-entry arrays are filled in a single pass.
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.scale[lane] = scale;
    params->avx.output_max_less_zero_point[lane] = max_offset;
    params->avx.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx.output_min[lane] = output_min;
  }
  // First half of the table is all-ones (-1), second half is zero.
  for (size_t entry = 0; entry < 14; entry++) {
    params->avx.mask_table[entry] = (entry < 7) ? -1 : 0;
  }
  return sizeof(params->avx);
}
6240 
// Populates the `avx2` member of `union xnn_f32_qu8_cvt_params`.
//
// Replicates the scale and the zero-point-relative upper bound into 8 entries,
// the 16-bit zero point into 16 entries, and the lower bound into 32 entries.
// `shuffle_mask` is set to the fixed permutation {0, 4, 1, 5, 2, 6, 3, 7} and
// `mask_table` to 7 entries of -1 followed by 7 entries of 0.
//
// Returns the size in bytes of `params->avx2`.
size_t xnn_init_f32_qu8_cvt_avx2_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  static const int shuffle_order[8] = {0, 4, 1, 5, 2, 6, 3, 7};
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 8; lane++) {
    params->avx2.scale[lane] = scale;
    params->avx2.output_max_less_zero_point[lane] = max_offset;
    params->avx2.shuffle_mask[lane] = shuffle_order[lane];
  }
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx2.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 32; lane++) {
    params->avx2.output_min[lane] = output_min;
  }
  // First half of the table is all-ones (-1), second half is zero.
  for (size_t entry = 0; entry < 14; entry++) {
    params->avx2.mask_table[entry] = (entry < 7) ? -1 : 0;
  }
  return sizeof(params->avx2);
}
6275 
// Populates the `avx512` member of `union xnn_f32_qu8_cvt_params`.
//
// Replicates the scale and the zero-point-relative upper bound into 16
// entries, the 16-bit zero point into 32 entries, and the lower bound into 64
// entries. The two shuffle masks are fixed permutations copied from the tables
// below.
//
// Returns the size in bytes of `params->avx512`.
size_t xnn_init_f32_qu8_cvt_avx512_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  static const int shuffle512_order[16] = {
    0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
  };
  static const int shuffle256_order[8] = {0, 4, 2, 6, 1, 5, 3, 7};
  const int16_t zero_point16 = (int16_t) output_zero_point;
  const float max_offset = (float) ((int32_t) output_max - (int32_t) output_zero_point);

  for (size_t lane = 0; lane < 16; lane++) {
    params->avx512.scale[lane] = scale;
    params->avx512.output_max_less_zero_point[lane] = max_offset;
    params->avx512.shuffle512_mask[lane] = shuffle512_order[lane];
  }
  for (size_t lane = 0; lane < 32; lane++) {
    params->avx512.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 64; lane++) {
    params->avx512.output_min[lane] = output_min;
  }
  for (size_t lane = 0; lane < 8; lane++) {
    params->avx512.shuffle256_mask[lane] = shuffle256_order[lane];
  }
  return sizeof(params->avx512);
}
6320 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
6321 
6322 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates the `wasmsimd_cvt` member of `union xnn_f32_qu8_cvt_params`.
//
// Replicates the scale into 2 entries, the 16-bit zero point into 4 entries,
// and both output bounds into 8 entries each.
//
// Returns the size in bytes of `params->wasmsimd_cvt`.
size_t xnn_init_f32_qu8_cvt_wasmsimd_cvt_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  const int16_t zero_point16 = (int16_t) output_zero_point;

  for (size_t lane = 0; lane < 8; lane++) {
    params->wasmsimd_cvt.output_min[lane] = output_min;
    params->wasmsimd_cvt.output_max[lane] = output_max;
  }
  for (size_t lane = 0; lane < 4; lane++) {
    params->wasmsimd_cvt.output_zero_point[lane] = zero_point16;
  }
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd_cvt.scale[lane] = scale;
  }
  return sizeof(params->wasmsimd_cvt);
}
6342 
// Populates the `wasmsimd_magic` member of `union xnn_f32_qu8_cvt_params`.
//
// The magic bias 12582912.0f equals 0x1.8p+23 (bit pattern 0x4B400000).
// `magic_min` holds the bit pattern of the bias plus the zero-point-relative
// lower bound; `magic_bias_less_zero_point` holds the bias bit pattern minus
// the zero point. The float/int32 fields get 2 entries each and `output_max`
// gets 8 entries.
//
// Returns the size in bytes of `params->wasmsimd_magic`.
size_t xnn_init_f32_qu8_cvt_wasmsimd_magic_params(
  union xnn_f32_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t output_zero_point,
  uint8_t output_min,
  uint8_t output_max)
{
  const float magic_bias = 12582912.0f;
  const int32_t zero_point = (int32_t) output_zero_point;
  const float min_offset = (float) ((int32_t) output_min - zero_point);
  const int32_t min_bits = (int32_t) float_as_uint32(magic_bias + min_offset);
  const int32_t bias_bits_less_zero_point = INT32_C(0x4B400000) - zero_point;

  for (size_t lane = 0; lane < 8; lane++) {
    params->wasmsimd_magic.output_max[lane] = output_max;
  }
  for (size_t lane = 0; lane < 2; lane++) {
    params->wasmsimd_magic.scale[lane] = scale;
    params->wasmsimd_magic.magic_bias[lane] = magic_bias;
    params->wasmsimd_magic.magic_min[lane] = min_bits;
    params->wasmsimd_magic.magic_bias_less_zero_point[lane] = bias_bits_less_zero_point;
  }
  return sizeof(params->wasmsimd_magic);
}
6364 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6365 
// Populates the `scalar` member of `union xnn_qs8_cvt_params`.
//
// `input_output_scale` must lie in [2^-8, 2^7] (checked by assert). It is
// converted to a fixed-point multiplier with 8 fractional bits,
// lrintf(256 * scale), which therefore lies in [1, 32768].
//
// The stored bias is
//   output_zero_point * 256 - multiplier * input_zero_point + 0x80
// where 0x80 is one half in the same 8-fractional-bit representation.
//
// Returns the size in bytes of `params->scalar`.
size_t xnn_init_qs8_cvt_scalar_params(
  union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(256.0f * input_output_scale);
  assert(multiplier >= 1L);
  assert(multiplier <= 32768L);
  // The zero point may be negative, and left-shifting a negative signed value
  // is undefined behavior in C, so scale it by multiplication instead.
  const int32_t scaled_output_zero_point = (int32_t) output_zero_point * INT32_C(256);
  params->scalar.bias =
    scaled_output_zero_point - (int32_t) multiplier * (int32_t) input_zero_point + INT32_C(0x80);
  params->scalar.multiplier = (int32_t) multiplier;
  return sizeof(params->scalar);
}
6382 
6383 #if XNN_ARCH_ARM
// Populates the `armsimd32` member of `union xnn_qs8_cvt_params`.
//
// `input_output_scale` must lie in [2^-8, 2^7] (checked by assert). It is
// converted to a fixed-point multiplier with 17 fractional bits,
// lrintf(131072 * scale), which therefore lies in [512, 16777216].
//
// The negated input zero point is truncated to 16 bits and replicated into
// both halves of a 32-bit word (multiplying by 0x00010001). The stored bias is
// output_zero_point * 2 + 1.
//
// Returns the size in bytes of `params->armsimd32`.
size_t xnn_init_qs8_cvt_armsimd32_params(
  union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(131072.0f * input_output_scale);
  assert(multiplier >= 512L);
  assert(multiplier <= 16777216L);
  const uint16_t minus_input_zero_point = -(int16_t) input_zero_point;
  params->armsimd32.minus_input_zero_point = (uint32_t) minus_input_zero_point * UINT32_C(0x00010001);
  params->armsimd32.multiplier = (int32_t) multiplier;
  // The zero point may be negative, and left-shifting a negative signed value
  // is undefined behavior in C, so double it by multiplication instead.
  params->armsimd32.bias = (int32_t) output_zero_point * INT32_C(2) + INT32_C(1);
  return sizeof(params->armsimd32);
}
6402 #endif  // XNN_ARCH_ARM
6403 
6404 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates the `neon` member of `union xnn_qs8_cvt_params`.
//
// `input_output_scale` must lie in [2^-8, 2^7] (checked by assert). The
// multiplier is the negated scale in fixed point with 8 fractional bits,
// lrintf(-256 * scale), which therefore lies in [-32768, -1]. Both zero points
// are stored widened to 16 bits.
//
// Returns the size in bytes of `params->neon`.
size_t xnn_init_qs8_cvt_neon_params(
  union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long negated_multiplier = lrintf(-256.0f * input_output_scale);
  assert(negated_multiplier <= -1L);
  assert(negated_multiplier >= -32768L);

  const int16_t input_zero_point16 = (int16_t) input_zero_point;
  const int16_t output_zero_point16 = (int16_t) output_zero_point;
  params->neon.input_zero_point = input_zero_point16;
  params->neon.multiplier = (int16_t) negated_multiplier;
  params->neon.output_zero_point = output_zero_point16;
  return sizeof(params->neon);
}
6422 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
6423 
6424 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates the `sse2` member of `union xnn_qs8_cvt_params`.
//
// `input_output_scale` must lie in [2^-8, 2^7] (checked by assert). The
// multiplier is the negated scale in fixed point with 8 fractional bits,
// lrintf(-256 * scale), which therefore lies in [-32768, -1]; it is replicated
// into 8 16-bit entries.
//
// The bias, replicated into 4 32-bit entries, is
//   output_zero_point * 256 + multiplier * input_zero_point + 0x80
// where 0x80 is one half in the same 8-fractional-bit representation.
//
// Returns the size in bytes of `params->sse2`.
size_t xnn_init_qs8_cvt_sse2_params(
  union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(-256.0f * input_output_scale);
  assert(multiplier <= -1L);
  assert(multiplier >= -32768L);
  // The zero point may be negative, and left-shifting a negative signed value
  // is undefined behavior in C, so scale it by multiplication instead.
  const int32_t scaled_output_zero_point = (int32_t) output_zero_point * INT32_C(256);
  const int32_t bias =
    scaled_output_zero_point + (int32_t) multiplier * (int32_t) input_zero_point + INT32_C(0x80);
  for (uint32_t i = 0; i < 8; i++) {
    params->sse2.multiplier[i] = (int16_t) multiplier;
  }
  for (uint32_t i = 0; i < 4; i++) {
    params->sse2.bias[i] = bias;
  }
  return sizeof(params->sse2);
}
6446 
// Populates the `ssse3` member of `union xnn_qs8_cvt_params`.
//
// `input_output_scale` must lie in [2^-8, 2^7] (checked by assert). The
// multiplier is lrintf(-256 * scale), which therefore lies in [-32768, -1].
// The multiplier and both zero points are replicated into 8 16-bit entries.
//
// Returns the size in bytes of `params->ssse3`.
size_t xnn_init_qs8_cvt_ssse3_params(
  union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long negated_multiplier = lrintf(-256.0f * input_output_scale);
  assert(negated_multiplier <= -1L);
  assert(negated_multiplier >= -32768L);

  const int16_t input_zero_point16 = (int16_t) input_zero_point;
  const int16_t multiplier16 = (int16_t) negated_multiplier;
  const int16_t output_zero_point16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; lane++) {
    params->ssse3.input_zero_point[lane] = input_zero_point16;
    params->ssse3.multiplier[lane] = multiplier16;
    params->ssse3.output_zero_point[lane] = output_zero_point16;
  }
  return sizeof(params->ssse3);
}
6466 
// Populates the `avx2` member of `union xnn_qs8_cvt_params`.
//
// `input_output_scale` must lie in [2^-8, 2^7] (checked by assert). The
// multiplier is lrintf(-256 * scale), which therefore lies in [-32768, -1].
// The multiplier and both zero points are replicated into 16 16-bit entries.
//
// Returns the size in bytes of `params->avx2`.
size_t xnn_init_qs8_cvt_avx2_params(
  union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long negated_multiplier = lrintf(-256.0f * input_output_scale);
  assert(negated_multiplier <= -1L);
  assert(negated_multiplier >= -32768L);

  const int16_t input_zero_point16 = (int16_t) input_zero_point;
  const int16_t multiplier16 = (int16_t) negated_multiplier;
  const int16_t output_zero_point16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 16; lane++) {
    params->avx2.input_zero_point[lane] = input_zero_point16;
    params->avx2.multiplier[lane] = multiplier16;
    params->avx2.output_zero_point[lane] = output_zero_point16;
  }
  return sizeof(params->avx2);
}
6486 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
6487 
6488 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates the `wasmsimd` member of `union xnn_qs8_cvt_params`.
//
// `input_output_scale` must lie in [2^-8, 2^7] (checked by assert). The
// multiplier is lrintf(-256 * scale), which therefore lies in [-32768, -1].
// The multiplier and both zero points are replicated into 4 16-bit entries.
//
// Returns the size in bytes of `params->wasmsimd`.
size_t xnn_init_qs8_cvt_wasmsimd_params(
  union xnn_qs8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  int8_t input_zero_point,
  int8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long negated_multiplier = lrintf(-256.0f * input_output_scale);
  assert(negated_multiplier <= -1L);
  assert(negated_multiplier >= -32768L);

  const int16_t input_zero_point16 = (int16_t) input_zero_point;
  const int16_t multiplier16 = (int16_t) negated_multiplier;
  const int16_t output_zero_point16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 4; lane++) {
    params->wasmsimd.input_zero_point[lane] = input_zero_point16;
    params->wasmsimd.multiplier[lane] = multiplier16;
    params->wasmsimd.output_zero_point[lane] = output_zero_point16;
  }
  return sizeof(params->wasmsimd);
}
6508 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6509 
// Populates the `scalar` member of `union xnn_qs8_f32_cvt_params` with the
// scale and the zero point widened to 32 bits.
//
// Returns the size in bytes of `params->scalar`.
size_t xnn_init_qs8_f32_cvt_scalar_params(
  union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t zero_point)
{
  const int32_t zero_point32 = (int32_t) zero_point;

  params->scalar.scale = scale;
  params->scalar.zero_point = zero_point32;
  return sizeof(params->scalar);
}
6519 
6520 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates the `neon` member of `union xnn_qs8_f32_cvt_params` with the scale
// and the negated zero point, the latter written to both entries of
// `minus_zero_point`.
//
// Returns the size in bytes of `params->neon`.
size_t xnn_init_qs8_f32_cvt_neon_params(
  union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t zero_point)
{
  for (size_t lane = 0; lane < 2; lane++) {
    params->neon.minus_zero_point[lane] = -(int16_t) zero_point;
  }
  params->neon.scale = scale;
  return sizeof(params->neon);
}
6531 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
6532 
6533 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates the `sse2` member of `union xnn_qs8_f32_cvt_params`.
//
// Fills `sign_mask` with 16 bytes of 0x80 and `magic_exp` with 8 16-bit values
// of 0x4B00. `magic_bias` is the float value of (0x00800080 + zero_point),
// replicated into 4 entries alongside the scale.
//
// Returns the size in bytes of `params->sse2`.
size_t xnn_init_qs8_f32_cvt_sse2_params(
  union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t zero_point)
{
  const float bias = (float) (INT32_C(0x00800080) + (int32_t) zero_point);

  for (size_t lane = 0; lane < 4; lane++) {
    params->sse2.magic_bias[lane] = bias;
    params->sse2.scale[lane] = scale;
  }
  for (size_t lane = 0; lane < 8; lane++) {
    params->sse2.magic_exp[lane] = UINT16_C(0x4B00);
  }
  for (size_t lane = 0; lane < 16; lane++) {
    params->sse2.sign_mask[lane] = UINT8_C(0x80);
  }
  return sizeof(params->sse2);
}
6552 
// Populates the `sse4` member of `union xnn_qs8_f32_cvt_params`, replicating
// the negated zero point (as a 32-bit value) and the scale into 4 entries each.
//
// Returns the size in bytes of `params->sse4`.
size_t xnn_init_qs8_f32_cvt_sse4_params(
  union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t zero_point)
{
  const int32_t negated_zero_point = -(int32_t) zero_point;

  for (size_t lane = 0; lane < 4; lane++) {
    params->sse4.scale[lane] = scale;
    params->sse4.minus_zero_point[lane] = negated_zero_point;
  }
  return sizeof(params->sse4);
}
6564 
// Populates the `avx` member of `union xnn_qs8_f32_cvt_params`, replicating
// the negated zero point (as a 32-bit value) and the scale into 8 entries each.
//
// Returns the size in bytes of `params->avx`.
size_t xnn_init_qs8_f32_cvt_avx_params(
  union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t zero_point)
{
  const int32_t negated_zero_point = -(int32_t) zero_point;

  for (size_t lane = 0; lane < 8; lane++) {
    params->avx.scale[lane] = scale;
    params->avx.minus_zero_point[lane] = negated_zero_point;
  }
  return sizeof(params->avx);
}
6576 
// Populates the AVX512 variant of the QS8->F32 conversion parameters: all 16 lanes of
// minus_zero_point receive the negated zero point (widened to 32 bits before negation)
// and all 16 lanes of scale receive scale.
//
// Returns the size in bytes of the avx512 member of the parameter union.
size_t xnn_init_qs8_f32_cvt_avx512_params(
  union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t zero_point)
{
  const int32_t minus_zero_point = -(int32_t) zero_point;
  for (size_t lane = 0; lane < 16; ++lane) {
    params->avx512.scale[lane] = scale;
    params->avx512.minus_zero_point[lane] = minus_zero_point;
  }
  return sizeof(params->avx512);
}
6588 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
6589 
6590 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates the WAsm SIMD variant of the QS8->F32 conversion parameters: 2 lanes of
// scale receive scale and 4 lanes of minus_zero_point receive the negated zero point
// (widened to 16 bits before negation).
//
// Returns the size in bytes of the wasmsimd member of the parameter union.
size_t xnn_init_qs8_f32_cvt_wasmsimd_params(
  union xnn_qs8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  int8_t zero_point)
{
  const int16_t minus_zero_point = -(int16_t) zero_point;
  for (size_t lane = 0; lane < 2; ++lane) {
    params->wasmsimd.scale[lane] = scale;
  }
  for (size_t lane = 0; lane < 4; ++lane) {
    params->wasmsimd.minus_zero_point[lane] = minus_zero_point;
  }
  return sizeof(params->wasmsimd);
}
6604 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6605 
// Populates the scalar variant of the QU8->QU8 conversion parameters.
//
// The input/output scale ratio is converted to a fixed-point multiplier with 8
// fractional bits (256 * input_output_scale, rounded by lrintf). The stored bias is
//   (output_zero_point << 8) - multiplier * input_zero_point + 0x80
// where 0x80 is half of 256 (presumably a rounding term for a later shift by 8 -
// confirm against the microkernel).
//
// Builds with assertions enabled check that input_output_scale lies in [2**-8, 2**7]
// and that the multiplier lies in [1, 32768].
//
// Returns the size in bytes of the scalar member of the parameter union.
size_t xnn_init_qu8_cvt_scalar_params(
  union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(256.0f * input_output_scale);
  assert(multiplier >= 1L);
  assert(multiplier <= 32768L);

  const int32_t fixed_multiplier = (int32_t) multiplier;
  const int32_t shifted_output_zero_point = (int32_t) output_zero_point << 8;
  const int32_t scaled_input_zero_point = fixed_multiplier * (int32_t) input_zero_point;

  params->scalar.multiplier = fixed_multiplier;
  params->scalar.bias = shifted_output_zero_point - scaled_input_zero_point + INT32_C(0x80);
  return sizeof(params->scalar);
}
6622 
6623 #if XNN_ARCH_ARM
// Populates the ARM SIMD32 variant of the QU8->QU8 conversion parameters.
//
//   multiplier             - 131072 (2**17) * input_output_scale, rounded by lrintf
//   bias                   - (output_zero_point << 1) + 1
//   minus_input_zero_point - the negated input zero point as a 16-bit value, replicated
//                            into both 16-bit halves of a 32-bit word
//
// Builds with assertions enabled check that input_output_scale lies in [2**-8, 2**7]
// and that the multiplier lies in [512, 16777216].
//
// Returns the size in bytes of the armsimd32 member of the parameter union.
size_t xnn_init_qu8_cvt_armsimd32_params(
  union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(131072.0f * input_output_scale);
  assert(multiplier >= 512L);
  assert(multiplier <= 16777216L);

  params->armsimd32.multiplier = (int32_t) multiplier;
  params->armsimd32.bias = ((int32_t) output_zero_point << 1) + INT32_C(1);

  // Truncate the negated zero point to 16 bits, then duplicate it into the high halfword.
  const uint32_t minus_zero_point_lo = (uint16_t) -(int16_t) input_zero_point;
  params->armsimd32.minus_input_zero_point = minus_zero_point_lo | (minus_zero_point_lo << 16);
  return sizeof(params->armsimd32);
}
6642 #endif  // XNN_ARCH_ARM
6643 
6644 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates the NEON variant of the QU8->QU8 conversion parameters.
//
// The multiplier is stored negated: it is -256 * input_output_scale rounded by lrintf,
// narrowed to 16 bits. Both zero points are stored widened to 16 bits.
//
// Builds with assertions enabled check that input_output_scale lies in [2**-8, 2**7]
// and that the multiplier lies in [-32768, -1].
//
// Returns the size in bytes of the neon member of the parameter union.
size_t xnn_init_qu8_cvt_neon_params(
  union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(-256.0f * input_output_scale);
  assert(multiplier <= -1L);
  assert(multiplier >= -32768L);

  params->neon.multiplier = (int16_t) multiplier;
  params->neon.output_zero_point = (int16_t) output_zero_point;
  params->neon.input_zero_point = (uint16_t) input_zero_point;
  return sizeof(params->neon);
}
6662 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
6663 
6664 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates the SSE2 variant of the QU8->QU8 conversion parameters.
//
// The multiplier is 256 * input_output_scale rounded by lrintf and is broadcast to
// 8 lanes as an unsigned 16-bit value. The bias
//   (output_zero_point << 8) - multiplier * input_zero_point + 0x80
// is broadcast to 4 lanes.
//
// Builds with assertions enabled check that input_output_scale lies in [2**-8, 2**7]
// and that the multiplier lies in [1, 32768].
//
// Returns the size in bytes of the sse2 member of the parameter union.
size_t xnn_init_qu8_cvt_sse2_params(
  union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(256.0f * input_output_scale);
  assert(multiplier >= 1L);
  assert(multiplier <= 32768L);

  const int32_t shifted_output_zero_point = (int32_t) output_zero_point << 8;
  const int32_t scaled_input_zero_point = (int32_t) multiplier * (int32_t) input_zero_point;
  const int32_t bias = shifted_output_zero_point - scaled_input_zero_point + INT32_C(0x80);
  for (size_t lane = 0; lane < 4; ++lane) {
    params->sse2.bias[lane] = bias;
  }

  const uint16_t multiplier_u16 = (uint16_t) multiplier;
  for (size_t lane = 0; lane < 8; ++lane) {
    params->sse2.multiplier[lane] = multiplier_u16;
  }
  return sizeof(params->sse2);
}
6686 
// Populates the SSSE3 variant of the QU8->QU8 conversion parameters.
//
// Three values are broadcast to 8 lanes each: the input zero point widened to 16 bits,
// the negated multiplier (-256 * input_output_scale rounded by lrintf, narrowed to
// 16 bits), and the output zero point widened to 16 bits.
//
// Builds with assertions enabled check that input_output_scale lies in [2**-8, 2**7]
// and that the multiplier lies in [-32768, -1].
//
// Returns the size in bytes of the ssse3 member of the parameter union.
size_t xnn_init_qu8_cvt_ssse3_params(
  union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(-256.0f * input_output_scale);
  assert(multiplier <= -1L);
  assert(multiplier >= -32768L);

  const uint16_t input_zero_point_u16 = (uint16_t) input_zero_point;
  const int16_t multiplier_s16 = (int16_t) multiplier;
  const int16_t output_zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 8; ++lane) {
    params->ssse3.output_zero_point[lane] = output_zero_point_s16;
    params->ssse3.multiplier[lane] = multiplier_s16;
    params->ssse3.input_zero_point[lane] = input_zero_point_u16;
  }
  return sizeof(params->ssse3);
}
6706 
// Populates the AVX2 variant of the QU8->QU8 conversion parameters.
//
// Three values are broadcast to 16 lanes each: the input zero point widened to 16 bits,
// the negated multiplier (-256 * input_output_scale rounded by lrintf, narrowed to
// 16 bits), and the output zero point widened to 16 bits.
//
// Builds with assertions enabled check that input_output_scale lies in [2**-8, 2**7]
// and that the multiplier lies in [-32768, -1].
//
// Returns the size in bytes of the avx2 member of the parameter union.
size_t xnn_init_qu8_cvt_avx2_params(
  union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(-256.0f * input_output_scale);
  assert(multiplier <= -1L);
  assert(multiplier >= -32768L);

  const uint16_t input_zero_point_u16 = (uint16_t) input_zero_point;
  const int16_t multiplier_s16 = (int16_t) multiplier;
  const int16_t output_zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 16; ++lane) {
    params->avx2.output_zero_point[lane] = output_zero_point_s16;
    params->avx2.multiplier[lane] = multiplier_s16;
    params->avx2.input_zero_point[lane] = input_zero_point_u16;
  }
  return sizeof(params->avx2);
}
6726 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
6727 
6728 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates the WAsm SIMD variant of the QU8->QU8 conversion parameters.
//
// Three values are broadcast to 4 lanes each: the input zero point widened to 16 bits,
// the negated multiplier (-256 * input_output_scale rounded by lrintf, narrowed to
// 16 bits), and the output zero point widened to 16 bits.
//
// Builds with assertions enabled check that input_output_scale lies in [2**-8, 2**7]
// and that the multiplier lies in [-32768, -1].
//
// Returns the size in bytes of the wasmsimd member of the parameter union.
size_t xnn_init_qu8_cvt_wasmsimd_params(
  union xnn_qu8_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float input_output_scale,
  uint8_t input_zero_point,
  uint8_t output_zero_point)
{
  assert(input_output_scale >= 0x1.0p-8);
  assert(input_output_scale <= 0x1.0p+7);

  const long multiplier = lrintf(-256.0f * input_output_scale);
  assert(multiplier <= -1L);
  assert(multiplier >= -32768L);

  const uint16_t input_zero_point_u16 = (uint16_t) input_zero_point;
  const int16_t multiplier_s16 = (int16_t) multiplier;
  const int16_t output_zero_point_s16 = (int16_t) output_zero_point;
  for (size_t lane = 0; lane < 4; ++lane) {
    params->wasmsimd.output_zero_point[lane] = output_zero_point_s16;
    params->wasmsimd.multiplier[lane] = multiplier_s16;
    params->wasmsimd.input_zero_point[lane] = input_zero_point_u16;
  }
  return sizeof(params->wasmsimd);
}
6748 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6749 
// Populates the scalar variant of the QU8->F32 conversion parameters: stores scale
// unchanged and the zero point widened to 32 bits (not negated in this variant).
//
// Returns the size in bytes of the scalar member of the parameter union.
size_t xnn_init_qu8_f32_cvt_scalar_params(
  union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t zero_point)
{
  const int32_t widened_zero_point = (int32_t) zero_point;
  params->scalar.scale = scale;
  params->scalar.zero_point = widened_zero_point;
  return sizeof(params->scalar);
}
6759 
6760 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
// Populates the NEON variant of the QU8->F32 conversion parameters: stores scale and
// writes the negated zero point (widened to 16 bits before negation) into both
// elements of minus_zero_point.
//
// Returns the size in bytes of the neon member of the parameter union.
size_t xnn_init_qu8_f32_cvt_neon_params(
  union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t zero_point)
{
  const int16_t minus_zero_point = -(int16_t) zero_point;
  params->neon.scale = scale;
  for (size_t lane = 0; lane < 2; ++lane) {
    params->neon.minus_zero_point[lane] = minus_zero_point;
  }
  return sizeof(params->neon);
}
6771 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
6772 
6773 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Populates the SSE2 variant of the QU8->F32 conversion parameters.
//
// Every lane of each array is filled with the same value:
//   scale      - 4 lanes of scale
//   magic_bias - 4 lanes of (float) (0x00800000 + zero_point), i.e. 2**23 + zero_point
//   magic_exp  - 8 lanes of 0x4B00 (the upper 16 bits of the binary32 encoding of 2**23)
// NOTE(review): how the kernel combines magic_exp and magic_bias is not visible here -
// confirm against the microkernel.
//
// Returns the size in bytes of the sse2 member of the parameter union.
size_t xnn_init_qu8_f32_cvt_sse2_params(
  union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t zero_point)
{
  const float magic_bias = (float) (INT32_C(0x00800000) + (int32_t) zero_point);
  for (size_t lane = 0; lane < 4; ++lane) {
    params->sse2.scale[lane] = scale;
    params->sse2.magic_bias[lane] = magic_bias;
  }
  for (size_t lane = 0; lane < 8; ++lane) {
    params->sse2.magic_exp[lane] = UINT16_C(0x4B00);
  }
  return sizeof(params->sse2);
}
6789 
// Populates the SSE4 variant of the QU8->F32 conversion parameters: all 4 lanes of
// minus_zero_point receive the negated zero point (widened to 32 bits before negation)
// and all 4 lanes of scale receive scale.
//
// Returns the size in bytes of the sse4 member of the parameter union.
size_t xnn_init_qu8_f32_cvt_sse4_params(
  union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t zero_point)
{
  const int32_t minus_zero_point = -(int32_t) zero_point;
  for (size_t lane = 0; lane < 4; ++lane) {
    params->sse4.scale[lane] = scale;
    params->sse4.minus_zero_point[lane] = minus_zero_point;
  }
  return sizeof(params->sse4);
}
6801 
// Populates the AVX variant of the QU8->F32 conversion parameters: all 8 lanes of
// minus_zero_point receive the negated zero point (widened to 32 bits before negation)
// and all 8 lanes of scale receive scale.
//
// Returns the size in bytes of the avx member of the parameter union.
size_t xnn_init_qu8_f32_cvt_avx_params(
  union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t zero_point)
{
  const int32_t minus_zero_point = -(int32_t) zero_point;
  for (size_t lane = 0; lane < 8; ++lane) {
    params->avx.scale[lane] = scale;
    params->avx.minus_zero_point[lane] = minus_zero_point;
  }
  return sizeof(params->avx);
}
6813 
// Populates the AVX512 variant of the QU8->F32 conversion parameters: all 16 lanes of
// minus_zero_point receive the negated zero point (widened to 32 bits before negation)
// and all 16 lanes of scale receive scale.
//
// Returns the size in bytes of the avx512 member of the parameter union.
size_t xnn_init_qu8_f32_cvt_avx512_params(
  union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t zero_point)
{
  const int32_t minus_zero_point = -(int32_t) zero_point;
  for (size_t lane = 0; lane < 16; ++lane) {
    params->avx512.scale[lane] = scale;
    params->avx512.minus_zero_point[lane] = minus_zero_point;
  }
  return sizeof(params->avx512);
}
6825 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
6826 
6827 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
// Populates the WAsm SIMD variant of the QU8->F32 conversion parameters: 2 lanes of
// scale receive scale and 4 lanes of minus_zero_point receive the negated zero point
// (widened to 16 bits before negation).
//
// Returns the size in bytes of the wasmsimd member of the parameter union.
size_t xnn_init_qu8_f32_cvt_wasmsimd_params(
  union xnn_qu8_f32_cvt_params params[XNN_MIN_ELEMENTS(1)],
  float scale,
  uint8_t zero_point)
{
  const int16_t minus_zero_point = -(int16_t) zero_point;
  for (size_t lane = 0; lane < 2; ++lane) {
    params->wasmsimd.scale[lane] = scale;
  }
  for (size_t lane = 0; lane < 4; ++lane) {
    params->wasmsimd.minus_zero_point[lane] = minus_zero_point;
  }
  return sizeof(params->wasmsimd);
}
6841 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
6842