1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <stdbool.h>
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <string.h>
13 
14 #include <pthread.h>
15 
16 #ifndef __EMSCRIPTEN__
17   #include <cpuinfo.h>
18 #endif
19 
20 #include <xnnpack.h>
21 #include <xnnpack/argmaxpool.h>
22 #include <xnnpack/avgpool.h>
23 #include <xnnpack/bilinear.h>
24 #include <xnnpack/clamp.h>
25 #include <xnnpack/common.h>
26 #include <xnnpack/conv.h>
27 #include <xnnpack/dwconv.h>
28 #include <xnnpack/gavgpool.h>
29 #include <xnnpack/gemm.h>
30 #include <xnnpack/hswish.h>
31 #include <xnnpack/igemm.h>
32 #include <xnnpack/log.h>
33 #include <xnnpack/lut.h>
34 #include <xnnpack/maxpool.h>
35 #include <xnnpack/memory.h>
36 #include <xnnpack/pad.h>
37 #include <xnnpack/params.h>
38 #include <xnnpack/pavgpool.h>
39 #include <xnnpack/prelu.h>
40 #include <xnnpack/raddstoreexpminusmax.h>
41 #include <xnnpack/rmax.h>
42 #include <xnnpack/spmm.h>
43 #include <xnnpack/unpool.h>
44 #include <xnnpack/vadd.h>
45 #include <xnnpack/vbinary.h>
46 #include <xnnpack/vmulcaddc.h>
47 #include <xnnpack/vunary.h>
48 #include <xnnpack/zip.h>
49 
// Assembly micro-kernels are opt-out: if the build system did not define
// XNN_ENABLE_ASSEMBLY, default it to enabled (1). The per-arch #if
// XNN_ENABLE_ASSEMBLY branches below then select hand-written assembly
// kernels over the intrinsics-based fallbacks.
50 #ifndef XNN_ENABLE_ASSEMBLY
51   #define XNN_ENABLE_ASSEMBLY 1
52 #endif
53 
// One-time guard: ensures the micro-kernel dispatch table below is
// populated at most once per process (used with pthread_once).
54 static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
55 
// Global dispatch table of selected micro-kernels. Starts with
// .initialized = false; init() fills in the per-architecture kernel
// pointers and is expected to flip this flag on success.
56 struct xnn_parameters xnn_params = {
57   .initialized = false
58 };
59 
// NOTE(review): these stubs are only declared here, not defined — presumably
// they are provided by the PNaCl/Emscripten/WASM toolchain glue elsewhere in
// the build, and referencing them keeps the wasm f32 sub/min intrinsic
// wrappers linked in. Confirm against the wasm build scripts.
60 #if XNN_ARCH_PNACL || XNN_ARCH_ASMJS || XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
61   extern uint32_t xnn_stub_wasm_f32_sub(uint32_t a, uint32_t b);
62 #endif
63 #if XNN_ARCH_PNACL || XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
64   extern uint32_t xnn_stub_wasm_f32_min(uint32_t a, uint32_t b);
65 #endif
66 
67 static void init(void) {
68 #if XNN_ARCH_ARM
69   if (!cpuinfo_has_arm_neon()) {
70     xnn_log_error("XNNPACK initialization failed: NEON is not supported");
71     return;
72   }
73 
74   /**************************** Q8 micro-kernels ****************************/
75   #ifndef XNN_NO_Q8_OPERATORS
76     xnn_params.q8.gemm = (struct gemm_parameters) {
77       .gemm = (xnn_gemm_ukernel_function) xnn_q8_gemm_ukernel_4x8__neon,
78       .igemm = (xnn_igemm_ukernel_function) xnn_q8_igemm_ukernel_4x8__neon,
79       .mr = 4,
80       .nr = 8,
81     };
82 
83     #if XNN_ENABLE_ASSEMBLY
84       xnn_params.q8.dwconv[0] = (struct dwconv_parameters) {
85         .up = (xnn_dwconv_up_ukernel_function) xnn_q8_dwconv_ukernel_up8x9__aarch32_neon,
86         .cr = 8,
87         .mr = 9,
88       };
89     #else
90       xnn_params.q8.dwconv[0] = (struct dwconv_parameters) {
91         .up = (xnn_dwconv_up_ukernel_function) xnn_q8_dwconv_ukernel_up8x9__neon,
92         .cr = 8,
93         .mr = 9,
94       };
95     #endif
96     xnn_params.q8.avgpool = (struct avgpool_parameters) {
97       .up = (xnn_avgpool_up_ukernel_function) xnn_q8_avgpool_ukernel_up9__neon,
98       .mp = (xnn_avgpool_mp_ukernel_function) xnn_q8_avgpool_ukernel_mp9p8q__neon,
99       .mr = 9,
100       .qr = 8,
101     };
102     xnn_params.q8.gavgpool = (struct gavgpool_parameters) {
103       .up = (xnn_gavgpool_up_ukernel_function) xnn_q8_gavgpool_ukernel_up7__neon,
104       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_q8_gavgpool_ukernel_mp7p7q__neon,
105       .mr = 7,
106     };
107     xnn_params.q8.vadd = (xnn_vadd_ukernel_function) xnn_q8_vadd_ukernel__neon;
108   #endif  // XNN_NO_Q8_OPERATORS
109 
110   /**************************** U8 micro-kernels ****************************/
111   #ifndef XNN_NO_U8_OPERATORS
112     xnn_params.u8.maxpool = (struct maxpool_parameters) {
113       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__neon_c16,
114       .mr = 9,
115       .qr = 8,
116     };
117     xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__neon;
118     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
119     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
120   #endif  // XNN_NO_U8_OPERATORS
121 
122   /**************************** X8 micro-kernels ****************************/
123   #ifndef XNN_NO_X8_OPERATORS
124     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
125     xnn_params.x8.zip = (struct zip_parameters) {
126       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
127       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
128       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
129       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
130     };
131   #endif  // XNN_NO_X8_OPERATORS
132 
133   /**************************** F32 micro-kernels ****************************/
134   #ifndef XNN_NO_F32_OPERATORS
135     #if XNN_ENABLE_ASSEMBLY
136       switch (cpuinfo_get_core(0)->uarch) {
137         case cpuinfo_uarch_cortex_a53:
138         case cpuinfo_uarch_cortex_a55:
139           xnn_params.f32.gemm = (struct gemm_parameters) {
140             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53,
141             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64,
142             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
143             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
144             .mr = 4,
145             .nr = 8,
146           };
147           break;
148 
149         case cpuinfo_uarch_cortex_a57:
150         case cpuinfo_uarch_cortex_a72:
151         case cpuinfo_uarch_cortex_a73:
152           xnn_params.f32.gemm = (struct gemm_parameters) {
153             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75,
154             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75,
155             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
156             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
157             .mr = 4,
158             .nr = 8,
159           };
160           break;
161 
162         default:
163           xnn_params.f32.gemm = (struct gemm_parameters) {
164             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75,
165             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75,
166             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
167             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
168             .mr = 4,
169             .nr = 8,
170           };
171           break;
172       }
173     #else  // XNN_ENABLE_ASSEMBLY
174       xnn_params.f32.gemm = (struct gemm_parameters) {
175         .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__neon_lane_ld128,
176         .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__neon_lane_ld128,
177         .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neon_lane_ld64,
178         .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neon_lane_ld64,
179         .mr = 4,
180         .nr = 8,
181       };
182     #endif  // XNN_ENABLE_ASSEMBLY
183     xnn_params.f32.gemm2 = (struct gemm_parameters) {
184       .gemm = NULL,
185       .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neon_lane_ld64,
186       .mr = 4,
187       .nr = 2,
188     };
189     xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
190       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__psimd,
191       .cr = 4,
192       .mr = 4,
193     };
194     xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
195       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__neon,
196       .cr = 4,
197       .mr = 9,
198     };
199     xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
200       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__psimd,
201       .cr = 4,
202       .mr = 25,
203     };
204     xnn_params.f32.avgpool = (struct avgpool_parameters) {
205       .up = (xnn_avgpool_up_ukernel_function) xnn_f32_avgpool_ukernel_up9__neon,
206       .mp = (xnn_avgpool_mp_ukernel_function) xnn_f32_avgpool_ukernel_mp9p8q__neon,
207       .mr = 9,
208       .qr = 8,
209     };
210     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
211       .up = (xnn_pavgpool_up_ukernel_function) xnn_f32_pavgpool_ukernel_up9__neon,
212       .mp = (xnn_pavgpool_mp_ukernel_function) xnn_f32_pavgpool_ukernel_mp9p8q__neon,
213       .mr = 9,
214       .qr = 8,
215     };
216     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
217       .up = (xnn_gavgpool_up_ukernel_function) xnn_f32_gavgpool_ukernel_up7__neon,
218       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_f32_gavgpool_ukernel_mp7p7q__neon,
219       .mr = 7,
220     };
221     xnn_params.f32.maxpool = (struct maxpool_parameters) {
222       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__psimd_c4,
223       .mr = 9,
224       .qr = 8,
225     };
226     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
227       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__psimd_c4,
228       .mr = 4,
229     };
230     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
231       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__psimd_c4,
232       .mr = 9,
233     };
234     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
235       .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4,
236       .mr = 9,
237       .qr = 8,
238     };
239     xnn_params.f32.bilinear = (struct bilinear_parameters) {
240       .ukernel = (xnn_bilinear_ukernel_function) xnn_f32_bilinear_ukernel__neon_c8,
241       .pixel_tile = 1,
242       .channel_tile = 8,
243     };
244     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon;
245     xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon_x8;
246     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8;
247     xnn_params.f32.prelu = (struct prelu_parameters) {
248       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
249       .row_tile = 2,
250       .channel_tile = 8,
251     };
252     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x8;
253     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
254     xnn_params.f32.vadd = (struct vbinary_parameters) {
255       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__neon_x8,
256       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__neon_x8,
257       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__neon_x8,
258       .element_tile = 8,
259     };
260     xnn_params.f32.vdiv = (struct vbinary_parameters) {
261       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__scalar_x2,
262       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__scalar_x2,
263       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__scalar_x2,
264       .element_tile = 2,
265     };
266     xnn_params.f32.vmax = (struct vbinary_parameters) {
267       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
268       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
269       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
270       .element_tile = 8,
271     };
272     xnn_params.f32.vmin = (struct vbinary_parameters) {
273       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
274       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
275       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
276       .element_tile = 8,
277     };
278     xnn_params.f32.vmul = (struct vbinary_parameters) {
279       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__neon_x8,
280       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
281       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
282       .element_tile = 8,
283     };
284     xnn_params.f32.vsub = (struct vbinary_parameters) {
285       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__neon_x8,
286       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__neon_x8,
287       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__neon_x8,
288       .element_tile = 8,
289     };
290     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
291       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__neon_2x,
292       .channel_tile = 4,
293       .row_tile = 2,
294     };
295   #endif  // XNN_NO_F32_OPERATORS
296 
297   /**************************** X32 micro-kernels ****************************/
298   #ifndef XNN_NO_X32_OPERATORS
299     xnn_params.x32.pad = (struct pad_parameters) {
300       .ukernel = xnn_x32_pad_x2__neon,
301       .mr = 2,
302     };
303     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__psimd;
304     xnn_params.x32.zip = (struct zip_parameters) {
305       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
306       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
307       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
308       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
309     };
310   #endif  // XNN_NO_X32_OPERATORS
311 
312 #elif XNN_ARCH_ARM64
313 
314   /**************************** Q8 micro-kernels ****************************/
315   #ifndef XNN_NO_Q8_OPERATORS
316     xnn_params.q8.gemm = (struct gemm_parameters) {
317       .gemm = (xnn_gemm_ukernel_function) xnn_q8_gemm_ukernel_8x8__neon,
318       .igemm = (xnn_igemm_ukernel_function) xnn_q8_igemm_ukernel_8x8__neon,
319       .mr = 8,
320       .nr = 8,
321     };
322     xnn_params.q8.dwconv[0] = (struct dwconv_parameters) {
323       .up = (xnn_dwconv_up_ukernel_function) xnn_q8_dwconv_ukernel_up8x9__neon,
324       .cr = 8,
325       .mr = 9,
326     };
327     xnn_params.q8.avgpool = (struct avgpool_parameters) {
328       .up = (xnn_avgpool_up_ukernel_function) xnn_q8_avgpool_ukernel_up9__neon,
329       .mp = (xnn_avgpool_mp_ukernel_function) xnn_q8_avgpool_ukernel_mp9p8q__neon,
330       .mr = 9,
331       .qr = 8,
332     };
333     xnn_params.q8.gavgpool = (struct gavgpool_parameters) {
334       .up = (xnn_gavgpool_up_ukernel_function) xnn_q8_gavgpool_ukernel_up7__neon,
335       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_q8_gavgpool_ukernel_mp7p7q__neon,
336       .mr = 7,
337     };
338     xnn_params.q8.vadd = (xnn_vadd_ukernel_function) xnn_q8_vadd_ukernel__neon;
339   #endif  // XNN_NO_Q8_OPERATORS
340 
341   /**************************** U8 micro-kernels ****************************/
342   #ifndef XNN_NO_U8_OPERATORS
343     xnn_params.u8.maxpool = (struct maxpool_parameters) {
344       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__neon_c16,
345       .mr = 9,
346       .qr = 8,
347     };
348     xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__neon;
349     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
350     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
351   #endif  // XNN_NO_U8_OPERATORS
352 
353   /**************************** X8 micro-kernels ****************************/
354   #ifndef XNN_NO_X8_OPERATORS
355     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
356     xnn_params.x8.zip = (struct zip_parameters) {
357       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
358       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
359       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
360       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
361     };
362   #endif  // XNN_NO_X8_OPERATORS
363 
364   /**************************** F32 micro-kernels ****************************/
365   #ifndef XNN_NO_F32_OPERATORS
366     #if XNN_ENABLE_ASSEMBLY
367       switch (cpuinfo_get_core(0)->uarch) {
368         case cpuinfo_uarch_cortex_a57:
369           xnn_params.f32.gemm = (struct gemm_parameters) {
370             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
371             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57,
372             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
373             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
374             .mr = 6,
375             .nr = 8,
376           };
377           break;
378         case cpuinfo_uarch_cortex_a72:
379           xnn_params.f32.gemm = (struct gemm_parameters) {
380             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
381             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75,
382             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
383             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
384             .mr = 4,
385             .nr = 8,
386           };
387           break;
388         case cpuinfo_uarch_cortex_a75:
389         case cpuinfo_uarch_cortex_a76:
390         case cpuinfo_uarch_exynos_m3:
391         case cpuinfo_uarch_exynos_m4:
392           xnn_params.f32.gemm = (struct gemm_parameters) {
393             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
394             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75,
395             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
396             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
397             .mr = 6,
398             .nr = 8,
399           };
400           break;
401         case cpuinfo_uarch_exynos_m1:
402         case cpuinfo_uarch_exynos_m2:
403           xnn_params.f32.gemm = (struct gemm_parameters) {
404             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8s4__neonfma,
405             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8s4__neonfma,
406             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8s4__neonfma,
407             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8s4__neonfma,
408             .mr = 6,
409             .nr = 8,
410             .log2_sr = 2,
411           };
412           break;
413 
414         case cpuinfo_uarch_cortex_a53:
415         case cpuinfo_uarch_cortex_a55:
416           xnn_params.f32.gemm = (struct gemm_parameters) {
417             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
418             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53,
419             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
420             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53,
421             .mr = 6,
422             .nr = 8,
423           };
424           break;
425         case cpuinfo_uarch_cortex_a73:
426           xnn_params.f32.gemm = (struct gemm_parameters) {
427             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
428             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73,
429             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
430             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75,
431             .mr = 6,
432             .nr = 8,
433           };
434           break;
435         default:
436         case cpuinfo_uarch_cortex_a77:
437         case cpuinfo_uarch_exynos_m5:
438         case cpuinfo_uarch_kryo:
439           xnn_params.f32.gemm = (struct gemm_parameters) {
440             .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
441             .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57,
442             .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
443             .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57,
444             .mr = 4,
445             .nr = 8,
446           };
447           break;
448       }
449     #else  // XNN_ENABLE_ASSEMBLY
450       xnn_params.f32.gemm = (struct gemm_parameters) {
451         .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64,
452         .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64,
453         .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64,
454         .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64,
455         .mr = 6,
456         .nr = 8,
457       };
458     #endif  // XNN_ENABLE_ASSEMBLY
459 
460     xnn_params.f32.gemm2 = (struct gemm_parameters) {
461       .gemm = NULL,
462       .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64,
463       .mr = 4,
464       .nr = 2,
465     };
466     xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
467       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__psimd,
468       .cr = 4,
469       .mr = 4,
470     };
471     switch (cpuinfo_get_core(0)->uarch) {
472       case cpuinfo_uarch_kryo:
473         xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
474           .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__neonfma,
475           .cr = 4,
476           .mr = 9,
477         };
478         break;
479 #if XNN_ENABLE_ASSEMBLY
480       case cpuinfo_uarch_cortex_a53:
481       case cpuinfo_uarch_cortex_a55:
482         xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
483           .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__aarch64_neonfma_cortex_a55,
484           .cr = 4,
485           .mr = 9,
486         };
487         break;
488 #endif
489       default:
490         xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
491           .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__neonfma,
492           .cr = 8,
493           .mr = 9,
494         };
495         break;
496     }
497     xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
498       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__psimd,
499       .cr = 4,
500       .mr = 25,
501     };
502     xnn_params.f32.avgpool = (struct avgpool_parameters) {
503       .up = (xnn_avgpool_up_ukernel_function) xnn_f32_avgpool_ukernel_up9__neon,
504       .mp = (xnn_avgpool_mp_ukernel_function) xnn_f32_avgpool_ukernel_mp9p8q__neon,
505       .mr = 9,
506       .qr = 8,
507     };
508     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
509       .up = (xnn_pavgpool_up_ukernel_function) xnn_f32_pavgpool_ukernel_up9__neon,
510       .mp = (xnn_pavgpool_mp_ukernel_function) xnn_f32_pavgpool_ukernel_mp9p8q__neon,
511       .mr = 9,
512       .qr = 8,
513     };
514     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
515       .up = (xnn_gavgpool_up_ukernel_function) xnn_f32_gavgpool_ukernel_up7__neon,
516       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_f32_gavgpool_ukernel_mp7p7q__neon,
517       .mr = 7,
518     };
519     xnn_params.f32.maxpool = (struct maxpool_parameters) {
520       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__psimd_c4,
521       .mr = 9,
522       .qr = 8,
523     };
524     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
525       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__psimd_c4,
526       .mr = 4,
527     };
528     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
529       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__psimd_c4,
530       .mr = 9,
531     };
532     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
533       .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4,
534       .mr = 9,
535       .qr = 8,
536     };
537     xnn_params.f32.bilinear = (struct bilinear_parameters) {
538       .ukernel = (xnn_bilinear_ukernel_function) xnn_f32_bilinear_ukernel__neonfma_c8,
539       .pixel_tile = 1,
540       .channel_tile = 8,
541     };
542     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon;
543     xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neonfma_x8;
544     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16;
545     xnn_params.f32.prelu = (struct prelu_parameters) {
546       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
547       .row_tile = 2,
548       .channel_tile = 8,
549     };
550     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x16;
551     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
552     xnn_params.f32.vadd = (struct vbinary_parameters) {
553       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__neon_x8,
554       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__neon_x8,
555       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__neon_x8,
556       .element_tile = 8,
557     };
558     xnn_params.f32.vdiv = (struct vbinary_parameters) {
559       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__neon_x8,
560       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__neon_x8,
561       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__neon_x8,
562       .element_tile = 8,
563     };
564     xnn_params.f32.vmax = (struct vbinary_parameters) {
565       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
566       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
567       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
568       .element_tile = 8,
569     };
570     xnn_params.f32.vmin = (struct vbinary_parameters) {
571       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
572       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
573       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
574       .element_tile = 8,
575     };
576     xnn_params.f32.vmul = (struct vbinary_parameters) {
577       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__neon_x8,
578       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
579       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
580       .element_tile = 8,
581     };
582     xnn_params.f32.vsub = (struct vbinary_parameters) {
583       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__neon_x8,
584       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__neon_x8,
585       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__neon_x8,
586       .element_tile = 8,
587     };
588     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
589       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__neonfma_2x,
590       .channel_tile = 4,
591       .row_tile = 2,
592     };
593     #ifndef XNN_NO_NCHW_OPERATORS
594       xnn_params.f32.spmm = (struct spmm_parameters) {
595         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_ukernel_16x1__neonfma_pipelined,
596         .mr = 16,
597         .nr = 1,
598       };
599       xnn_params.f32.spmm2 = (struct spmm_parameters) {
600         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_ukernel_16x2__neonfma,
601         .mr = 16,
602         .nr = 2,
603       };
604       xnn_params.f32.spmm4 = (struct spmm_parameters) {
605         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_ukernel_16x4__neonfma,
606         .mr = 16,
607         .nr = 4,
608       };
609       xnn_params.f32.hwc2spchw_dconv3x3c3s2 = (struct hwc2spchw_dconv_parameters) {
610         .ukernel_with_symm_padding =
611           (xnn_conv_hwc2spchw_ukernel_function) xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__neonfma_2x2,
612         .output_channel_tile = 4,
613         .output_height_tile = 2,
614         .output_width_tile = 2,
615       };
616       xnn_params.f32.spchw_dwconv3x3 = (struct spchw_dwconv_parameters) {
617         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma,
618         .input_width_tile = 4,
619         .output_width_tile = 4,
620         .output_height_tile = 3,
621       };
622       xnn_params.f32.spchw_dwconv3x3s2 = (struct spchw_dwconv_parameters) {
623         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma,
624         .input_width_tile = 4,
625         .output_width_tile = 4,
626         .output_height_tile = 1,
627       };
628       xnn_params.f32.spchw_dwconv5x5 = (struct spchw_dwconv_parameters) {
629         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma,
630         .input_width_tile = 4,
631         .output_width_tile = 4,
632         .output_height_tile = 3,
633       };
634       xnn_params.f32.spchw_dwconv5x5s2 = (struct spchw_dwconv_parameters) {
635         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma,
636         .input_width_tile = 4,
637         .output_width_tile = 4,
638         .output_height_tile = 1,
639       };
640       xnn_params.f32.spchw_gavgpool = (struct spchw_gavgpool_parameters) {
641         .ukernel = (xnn_gavgpool_spchw_ukernel_function) xnn_f32_gavgpool_spchw_ukernel__neon_x4,
642         .channel_tile = 4,
643       };
644     #endif  // XNN_NO_NCHW_OPERATORS
645   #endif  // XNN_NO_F32_OPERATORS
646 
647   /**************************** X32 micro-kernels ****************************/
648   #ifndef XNN_NO_X32_OPERATORS
649     xnn_params.x32.pad = (struct pad_parameters) {
650       .ukernel = xnn_x32_pad_x2__neon,
651       .mr = 2,
652     };
653     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__psimd;
654     xnn_params.x32.zip = (struct zip_parameters) {
655       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
656       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
657       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
658       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
659     };
660   #endif  // XNN_NO_X32_OPERATORS
661 
662 #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
663   if (!cpuinfo_has_x86_sse2()) {
664     xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
665     return;
666   }
667 
668   /**************************** Q8 micro-kernels ****************************/
669   #ifndef XNN_NO_Q8_OPERATORS
670     xnn_params.q8.gemm = (struct gemm_parameters) {
671       .gemm = (xnn_gemm_ukernel_function) xnn_q8_gemm_ukernel_4x4c2__sse2,
672       .igemm = (xnn_igemm_ukernel_function) xnn_q8_igemm_ukernel_4x4c2__sse2,
673       .mr = 4,
674       .nr = 4,
675       .log2_kr = 1,
676     };
677     xnn_params.q8.dwconv[0] = (struct dwconv_parameters) {
678       .up = (xnn_dwconv_up_ukernel_function) xnn_q8_dwconv_ukernel_up8x9__sse2,
679       .cr = 8,
680       .mr = 9,
681     };
682     xnn_params.q8.avgpool = (struct avgpool_parameters) {
683       .up = (xnn_avgpool_up_ukernel_function) xnn_q8_avgpool_ukernel_up9__sse2,
684       .mp = (xnn_avgpool_mp_ukernel_function) xnn_q8_avgpool_ukernel_mp9p8q__sse2,
685       .mr = 9,
686       .qr = 8,
687     };
688     xnn_params.q8.gavgpool = (struct gavgpool_parameters) {
689       .up = (xnn_gavgpool_up_ukernel_function) xnn_q8_gavgpool_ukernel_up7__sse2,
690       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_q8_gavgpool_ukernel_mp7p7q__sse2,
691       .mr = 7,
692     };
693     xnn_params.q8.vadd = (xnn_vadd_ukernel_function) xnn_q8_vadd_ukernel__sse2;
694   #endif  // XNN_NO_Q8_OPERATORS
695 
696   /**************************** U8 micro-kernels ****************************/
697   #ifndef XNN_NO_U8_OPERATORS
698     xnn_params.u8.maxpool = (struct maxpool_parameters) {
699       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__sse2_c16,
700       .mr = 9,
701       .qr = 8,
702     };
703     xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__sse2;
704     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
705     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
706   #endif  // XNN_NO_U8_OPERATORS
707 
708   /**************************** X8 micro-kernels ****************************/
709   #ifndef XNN_NO_X8_OPERATORS
710     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
711     xnn_params.x8.zip = (struct zip_parameters) {
712       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
713       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
714       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
715       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
716     };
717   #endif  // XNN_NO_X8_OPERATORS
718 
719   /**************************** F32 micro-kernels ****************************/
720   #ifndef XNN_NO_F32_OPERATORS
    // F32 GEMM/IGEMM selection: walk down an ISA ladder
    // (AVX512F -> FMA3 -> AVX -> SSE baseline). The wider-vector kernels are
    // gated on !XNN_PLATFORM_MOBILE, i.e. they are only considered in
    // non-mobile builds.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.gemm = (struct gemm_parameters) {
        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_7x16__avx512f_broadcast,
        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_7x16__avx512f_broadcast,
        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x16__avx512f_broadcast,
        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x16__avx512f_broadcast,
        .mr = 7,
        .nr = 16,
      };
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
      // FMA3 tier additionally dispatches on the micro-architecture of core 0.
      switch (cpuinfo_get_core(0)->uarch) {
        case cpuinfo_uarch_zen:
          // AMD Zen gets the shuffle-based (s4, log2_sr = 2) variant —
          // presumably faster on that uarch; confirm with benchmarks.
          xnn_params.f32.gemm = (struct gemm_parameters) {
            .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast,
            .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast,
            .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast,
            .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast,
            .mr = 4,
            .nr = 16,
            .log2_sr = 2,
          };
          break;
        default:
          xnn_params.f32.gemm = (struct gemm_parameters) {
            .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x16__fma3_broadcast,
            .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x16__fma3_broadcast,
            .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x16__fma3_broadcast,
            .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x16__fma3_broadcast,
            .mr = 5,
            .nr = 16,
          };
          break;
      }
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.gemm = (struct gemm_parameters) {
        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x16__avx_broadcast,
        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x16__avx_broadcast,
        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x16__avx_broadcast,
        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x16__avx_broadcast,
        .mr = 5,
        .nr = 16,
      };
    } else {
      // SSE baseline (availability already guaranteed by the SSE2 check above).
      xnn_params.f32.gemm = (struct gemm_parameters) {
        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__sse_load1,
        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__sse_load1,
        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__sse_load1,
        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__sse_load1,
        .mr = 4,
        .nr = 8,
      };
    }
773     xnn_params.f32.gemm2 = (struct gemm_parameters) {
774       .gemm = NULL,
775       .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__sse,
776       .mr = 4,
777       .nr = 2,
778       .log2_kr = 2,
779     };
780     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
781       xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
782         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up16x4__avx512f,
783         .cr = 16,
784         .mr = 4,
785       };
786       xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
787         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up16x9__avx512f,
788         .cr = 16,
789         .mr = 9,
790       };
791       xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
792         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up16x25__avx512f,
793         .cr = 16,
794         .mr = 25,
795       };
796     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
797       xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
798         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up16x4__fma3,
799         .cr = 16,
800         .mr = 4,
801       };
802       xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
803         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up16x9__fma3,
804         .cr = 16,
805         .mr = 9,
806       };
807       xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
808         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x25__fma3,
809         .cr = 8,
810         .mr = 25,
811       };
812     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
813       xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
814         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up16x4__avx,
815         .cr = 16,
816         .mr = 4,
817       };
818       xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
819         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up16x9__avx,
820         .cr = 16,
821         .mr = 9,
822       };
823       xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
824         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x25__avx,
825         .cr = 8,
826         .mr = 25,
827       };
828     } else {
829       xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
830         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__sse,
831         .cr = 8,
832         .mr = 4,
833       };
834       xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
835         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__sse,
836         .cr = 8,
837         .mr = 9,
838       };
839       xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
840         .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up8x25__sse,
841         .cr = 8,
842         .mr = 25,
843       };
844     }
845     xnn_params.f32.avgpool = (struct avgpool_parameters) {
846       .up = (xnn_avgpool_up_ukernel_function) xnn_f32_avgpool_ukernel_up9__sse,
847       .mp = (xnn_avgpool_mp_ukernel_function) xnn_f32_avgpool_ukernel_mp9p8q__sse,
848       .mr = 9,
849       .qr = 8,
850     };
851     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
852       .up = (xnn_pavgpool_up_ukernel_function) xnn_f32_pavgpool_ukernel_up9__sse,
853       .mp = (xnn_pavgpool_mp_ukernel_function) xnn_f32_pavgpool_ukernel_mp9p8q__sse,
854       .mr = 9,
855       .qr = 8,
856     };
857     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
858       .up = (xnn_gavgpool_up_ukernel_function) xnn_f32_gavgpool_ukernel_up7__sse,
859       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_f32_gavgpool_ukernel_mp7p7q__sse,
860       .mr = 7,
861     };
862     xnn_params.f32.maxpool = (struct maxpool_parameters) {
863       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__sse_c4,
864       .mr = 9,
865       .qr = 8,
866     };
867     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
868       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
869       .mr = 4,
870     };
871     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
872       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
873       .mr = 9,
874     };
875     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
876       .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
877       .mr = 9,
878       .qr = 8,
879     };
880     xnn_params.f32.bilinear = (struct bilinear_parameters) {
881       .ukernel = (xnn_bilinear_ukernel_function) xnn_f32_bilinear_ukernel__sse_c8,
882       .pixel_tile = 1,
883       .channel_tile = 8,
884     };
    // Univector (element-wise) kernels: pick the widest available ISA;
    // AVX-and-wider variants are only considered on non-mobile builds.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx512f;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx;
    } else {
      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__sse;
    }
    // hswish additionally has an FMA3 tier between AVX512F and plain AVX.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__avx512f_x32;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__fma3_x16;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__avx_x16;
    } else {
      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__sse_x8;
    }
    // sigmoid only has AVX2 and SSE2 variants here.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
      xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x40;
    } else {
      xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__sse2_p5_div_x16;
    }
906     xnn_params.f32.prelu = (struct prelu_parameters) {
907       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
908       .row_tile = 2,
909       .channel_tile = 8,
910     };
911     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2;
912     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
913     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
914       xnn_params.f32.vadd = (struct vbinary_parameters) {
915         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__avx512f_x32,
916         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__avx512f_x32,
917         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__avx512f_x32,
918         .element_tile = 32,
919       };
920       xnn_params.f32.vdiv = (struct vbinary_parameters) {
921         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__avx512f_x32,
922         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__avx512f_x32,
923         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__avx512f_x32,
924         .element_tile = 32,
925       };
926       xnn_params.f32.vmax = (struct vbinary_parameters) {
927         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
928         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
929         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
930         .element_tile = 32,
931       };
932       xnn_params.f32.vmin = (struct vbinary_parameters) {
933         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
934         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
935         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
936         .element_tile = 32,
937       };
938       xnn_params.f32.vmul = (struct vbinary_parameters) {
939         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__avx512f_x32,
940         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__avx512f_x32,
941         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__avx512f_x32,
942         .element_tile = 32,
943       };
944       xnn_params.f32.vsub = (struct vbinary_parameters) {
945         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__avx512f_x32,
946         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__avx512f_x32,
947         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__avx512f_x32,
948         .element_tile = 32,
949       };
950     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
951       xnn_params.f32.vadd = (struct vbinary_parameters) {
952         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__avx_x16,
953         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__avx_x16,
954         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__avx_x16,
955         .element_tile = 16,
956       };
957       xnn_params.f32.vdiv = (struct vbinary_parameters) {
958         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__avx_x16,
959         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__avx_x16,
960         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__avx_x16,
961         .element_tile = 16,
962       };
963       xnn_params.f32.vmax = (struct vbinary_parameters) {
964         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
965         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
966         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
967         .element_tile = 16,
968       };
969       xnn_params.f32.vmin = (struct vbinary_parameters) {
970         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
971         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
972         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
973         .element_tile = 16,
974       };
975       xnn_params.f32.vmul = (struct vbinary_parameters) {
976         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__avx_x16,
977         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__avx_x16,
978         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__avx_x16,
979         .element_tile = 16,
980       };
981       xnn_params.f32.vsub = (struct vbinary_parameters) {
982         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__avx_x16,
983         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__avx_x16,
984         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__avx_x16,
985         .element_tile = 16,
986       };
987     } else {
988       xnn_params.f32.vadd = (struct vbinary_parameters) {
989         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__sse_x8,
990         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__sse_x8,
991         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__sse_x8,
992         .element_tile = 8,
993       };
994       xnn_params.f32.vdiv = (struct vbinary_parameters) {
995         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__sse_x8,
996         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__sse_x8,
997         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__sse_x8,
998         .element_tile = 8,
999       };
1000       xnn_params.f32.vmax = (struct vbinary_parameters) {
1001         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
1002         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
1003         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
1004         .element_tile = 8,
1005       };
1006       xnn_params.f32.vmin = (struct vbinary_parameters) {
1007         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
1008         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
1009         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
1010         .element_tile = 8,
1011       };
1012       xnn_params.f32.vmul = (struct vbinary_parameters) {
1013         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__sse_x8,
1014         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__sse_x8,
1015         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__sse_x8,
1016         .element_tile = 8,
1017       };
1018       xnn_params.f32.vsub = (struct vbinary_parameters) {
1019         .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__sse_x8,
1020         .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__sse_x8,
1021         .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__sse_x8,
1022         .element_tile = 8,
1023       };
1024     }
1025     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1026       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__sse_2x,
1027       .channel_tile = 4,
1028       .row_tile = 2,
1029     };
1030     #ifndef XNN_NO_NCHW_OPERATORS
1031       xnn_params.f32.spmm = (struct spmm_parameters) {
1032         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_ukernel_4x1__sse,
1033         .mr = 4,
1034         .nr = 1,
1035       };
1036       xnn_params.f32.spchw_dwconv3x3 = (struct spchw_dwconv_parameters) {
1037         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3p1__sse,
1038         .input_width_tile = 4,
1039         .output_width_tile = 4,
1040         .output_height_tile = 1,
1041       };
1042       xnn_params.f32.spchw_dwconv3x3s2 = (struct spchw_dwconv_parameters) {
1043         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse,
1044         .input_width_tile = 4,
1045         .output_width_tile = 4,
1046         .output_height_tile = 1,
1047       };
1048       xnn_params.f32.spchw_gavgpool = (struct spchw_gavgpool_parameters) {
1049         .ukernel = (xnn_gavgpool_spchw_ukernel_function) xnn_f32_gavgpool_spchw_ukernel__sse_x4,
1050         .channel_tile = 4,
1051       };
1052     #endif  // XNN_NO_NCHW_OPERATORS
1053   #endif  // XNN_NO_F32_OPERATORS
1054 
1055   /**************************** X32 micro-kernels ****************************/
1056   #ifndef XNN_NO_X32_OPERATORS
1057     xnn_params.x32.pad = (struct pad_parameters) {
1058       .ukernel = xnn_x32_pad_x2__sse2,
1059       .mr = 2,
1060     };
1061     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__psimd;
1062     xnn_params.x32.zip = (struct zip_parameters) {
1063       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
1064       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
1065       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
1066       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
1067     };
1068   #endif  // XNN_NO_X32_OPERATORS
1069 
1070 #elif XNN_ARCH_PNACL || XNN_ARCH_WASMSIMD
1071   // Unlike most other architectures, on x86/x86-64 when floating-point instructions
1072   // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
1073   // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
1074   // of two infinities (must produce NaN per IEEE 754 standard).
  // 'static volatile' forces an actual runtime load, so the compiler cannot
  // constant-fold the (-Inf) - (-Inf) probe; 0xFF800000 is the IEEE 754
  // single-precision bit pattern of -Inf.
  static volatile uint32_t minus_inf = UINT32_C(0xFF800000);
  const bool is_wasm_x86 = (int32_t) xnn_stub_wasm_f32_sub(minus_inf, minus_inf) < 0;
1077 
1078   /**************************** Q8 micro-kernels ****************************/
1079   #ifndef XNN_NO_Q8_OPERATORS
1080     xnn_params.q8.gemm = (struct gemm_parameters) {
1081       .gemm = (xnn_gemm_ukernel_function) xnn_q8_gemm_ukernel_2x2__scalar,
1082       .igemm = (xnn_igemm_ukernel_function) xnn_q8_igemm_ukernel_2x2__scalar,
1083       .mr = 2,
1084       .nr = 2,
1085     };
1086     xnn_params.q8.dwconv[0] = (struct dwconv_parameters) {
1087       .up = (xnn_dwconv_up_ukernel_function) xnn_q8_dwconv_ukernel_up1x9__scalar,
1088       .cr = 1,
1089       .mr = 9,
1090     };
1091     xnn_params.q8.avgpool = (struct avgpool_parameters) {
1092       .up = (xnn_avgpool_up_ukernel_function) xnn_q8_avgpool_ukernel_up9__scalar,
1093       .mp = (xnn_avgpool_mp_ukernel_function) xnn_q8_avgpool_ukernel_mp9p8q__scalar,
1094       .mr = 9,
1095       .qr = 8,
1096     };
1097     xnn_params.q8.gavgpool = (struct gavgpool_parameters) {
1098       .up = (xnn_gavgpool_up_ukernel_function) xnn_q8_gavgpool_ukernel_up7__scalar,
1099       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_q8_gavgpool_ukernel_mp7p7q__scalar,
1100       .mr = 7,
1101     };
1102     xnn_params.q8.vadd = (xnn_vadd_ukernel_function) xnn_q8_vadd_ukernel__scalar;
1103   #endif  // XNN_NO_Q8_OPERATORS
1104 
1105   /**************************** U8 micro-kernels ****************************/
1106   #ifndef XNN_NO_U8_OPERATORS
1107     xnn_params.u8.maxpool = (struct maxpool_parameters) {
1108       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__scalar_c1,
1109       .mr = 9,
1110       .qr = 8,
1111     };
1112     xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__scalar;
1113     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1114     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
1115   #endif  // XNN_NO_U8_OPERATORS
1116 
1117   /**************************** X8 micro-kernels ****************************/
1118   #ifndef XNN_NO_X8_OPERATORS
1119     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
1120     xnn_params.x8.zip = (struct zip_parameters) {
1121       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
1122       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
1123       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
1124       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
1125     };
1126   #endif  // XNN_NO_X8_OPERATORS
1127 
1128   /**************************** F32 micro-kernels ****************************/
1129   #ifndef XNN_NO_F32_OPERATORS
1130     if (is_wasm_x86) {
1131       xnn_params.f32.gemm = (struct gemm_parameters) {
1132         .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__psimd_splat,
1133         .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__psimd_splat,
1134         .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__psimd_splat,
1135         .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__psimd_splat,
1136         .mr = 4,
1137         .nr = 8,
1138       };
1139     } else {
1140       xnn_params.f32.gemm = (struct gemm_parameters) {
1141         .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_6x8s4__psimd,
1142         .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_6x8s4__psimd,
1143         .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_igemm_ukernel_1x8s4__psimd,
1144         .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8s4__psimd,
1145         .mr = 6,
1146         .nr = 8,
1147         .log2_sr = 2,
1148       };
1149     }
1150     xnn_params.f32.gemm2 = (struct gemm_parameters) {
1151       .gemm = NULL,
1152       .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__psimd,
1153       .mr = 4,
1154       .nr = 2,
1155       .log2_kr = 2,
1156     };
1157     xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
1158       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__psimd_acc2,
1159       .cr = 4,
1160       .mr = 4,
1161     };
1162     xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
1163       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__psimd_acc2,
1164       .cr = 4,
1165       .mr = 9,
1166     };
1167     xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
1168       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__psimd_acc2,
1169       .cr = 4,
1170       .mr = 25,
1171     };
1172     xnn_params.f32.avgpool = (struct avgpool_parameters) {
1173       .up = (xnn_avgpool_up_ukernel_function) xnn_f32_avgpool_ukernel_up9__psimd,
1174       .mp = (xnn_avgpool_mp_ukernel_function) xnn_f32_avgpool_ukernel_mp9p8q__psimd,
1175       .mr = 9,
1176       .qr = 8,
1177     };
1178     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
1179       .up = (xnn_pavgpool_up_ukernel_function) xnn_f32_pavgpool_ukernel_up9__psimd,
1180       .mp = (xnn_pavgpool_mp_ukernel_function) xnn_f32_pavgpool_ukernel_mp9p8q__psimd,
1181       .mr = 9,
1182       .qr = 8,
1183     };
1184     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
1185       .up = (xnn_gavgpool_up_ukernel_function) xnn_f32_gavgpool_ukernel_up7__psimd,
1186       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_f32_gavgpool_ukernel_mp7p7q__psimd,
1187       .mr = 7,
1188     };
1189     xnn_params.f32.maxpool = (struct maxpool_parameters) {
1190       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__psimd_c4,
1191       .mr = 9,
1192       .qr = 8,
1193     };
1194     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1195       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__psimd_c4,
1196       .mr = 4,
1197     };
1198     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1199       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__psimd_c4,
1200       .mr = 9,
1201     };
1202     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1203       .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__psimd_c4,
1204       .mr = 9,
1205       .qr = 8,
1206     };
1207     xnn_params.f32.bilinear = (struct bilinear_parameters) {
1208       .ukernel = (xnn_bilinear_ukernel_function) xnn_f32_bilinear_ukernel__psimd_c8,
1209       .pixel_tile = 1,
1210       .channel_tile = 8,
1211     };
1212     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__psimd;
1213     xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__psimd_x8;
1214     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__psimd_p5_div_x16;
1215     xnn_params.f32.prelu = (struct prelu_parameters) {
1216       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__psimd_2x8,
1217       .row_tile = 2,
1218       .channel_tile = 8,
1219     };
1220     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__psimd_p5_x16_acc2;
1221     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__psimd;
1222     xnn_params.f32.vadd = (struct vbinary_parameters) {
1223       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__psimd_x8,
1224       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__psimd_x8,
1225       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__psimd_x8,
1226       .element_tile = 8,
1227     };
1228     xnn_params.f32.vdiv = (struct vbinary_parameters) {
1229       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__psimd_x4,
1230       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__psimd_x4,
1231       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__psimd_x4,
1232       .element_tile = 4,
1233     };
1234     xnn_params.f32.vmax = (struct vbinary_parameters) {
1235       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__psimd_x8,
1236       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__psimd_x8,
1237       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__psimd_x8,
1238       .element_tile = 8,
1239     };
1240     xnn_params.f32.vmin = (struct vbinary_parameters) {
1241       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__psimd_x8,
1242       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__psimd_x8,
1243       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__psimd_x8,
1244       .element_tile = 8,
1245     };
1246     xnn_params.f32.vmul = (struct vbinary_parameters) {
1247       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__psimd_x8,
1248       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__psimd_x8,
1249       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__psimd_x8,
1250       .element_tile = 8,
1251     };
1252     xnn_params.f32.vsub = (struct vbinary_parameters) {
1253       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__psimd_x8,
1254       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__psimd_x8,
1255       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__psimd_x8,
1256       .element_tile = 8,
1257     };
1258     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1259       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__psimd_2x,
1260       .channel_tile = 4,
1261       .row_tile = 2,
1262     };
1263   #endif  // XNN_NO_F32_OPERATORS
1264 
1265   /**************************** X32 micro-kernels ****************************/
1266   #ifndef XNN_NO_X32_OPERATORS
1267     xnn_params.x32.pad = (struct pad_parameters) {
1268       .ukernel = xnn_x32_pad_x2__psimd,
1269       .mr = 2,
1270     };
1271     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__psimd;
1272     xnn_params.x32.zip = (struct zip_parameters) {
1273       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__psimd,
1274       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__psimd,
1275       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__psimd,
1276       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__psimd,
1277     };
1278   #endif  // XNN_NO_X32_OPERATORS
1279 
1280 #elif XNN_ARCH_WASM || XNN_ARCH_ASMJS
1281   // Unlike most other architectures, on x86/x86-64 when floating-point instructions
1282   // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
1283   // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
1284   // of two infinities (must produce NaN per IEEE 754 standard).
1285   static volatile uint32_t minus_inf = UINT32_C(0xFF800000);
1286   const bool is_wasm_x86 = (int32_t) xnn_stub_wasm_f32_sub(minus_inf, minus_inf) < 0;
1287 
1288   /**************************** Q8 micro-kernels ****************************/
1289   #ifndef XNN_NO_Q8_OPERATORS
1290     xnn_params.q8.gemm = (struct gemm_parameters) {
1291       .gemm = (xnn_gemm_ukernel_function) xnn_q8_gemm_ukernel_2x2__scalar,
1292       .igemm = (xnn_igemm_ukernel_function) xnn_q8_igemm_ukernel_2x2__scalar,
1293       .mr = 2,
1294       .nr = 2,
1295     };
1296     xnn_params.q8.dwconv[0] = (struct dwconv_parameters) {
1297       .up = (xnn_dwconv_up_ukernel_function) xnn_q8_dwconv_ukernel_up1x9__scalar,
1298       .cr = 1,
1299       .mr = 9,
1300     };
1301     xnn_params.q8.avgpool = (struct avgpool_parameters) {
1302       .up = (xnn_avgpool_up_ukernel_function) xnn_q8_avgpool_ukernel_up9__scalar,
1303       .mp = (xnn_avgpool_mp_ukernel_function) xnn_q8_avgpool_ukernel_mp9p8q__scalar,
1304       .mr = 9,
1305       .qr = 8,
1306     };
1307     xnn_params.q8.gavgpool = (struct gavgpool_parameters) {
1308       .up = (xnn_gavgpool_up_ukernel_function) xnn_q8_gavgpool_ukernel_up7__scalar,
1309       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_q8_gavgpool_ukernel_mp7p7q__scalar,
1310       .mr = 7,
1311     };
1312     xnn_params.q8.vadd = (xnn_vadd_ukernel_function) xnn_q8_vadd_ukernel__scalar;
1313   #endif  // XNN_NO_Q8_OPERATORS
1314 
1315   /**************************** U8 micro-kernels ****************************/
1316   #ifndef XNN_NO_U8_OPERATORS
1317     xnn_params.u8.maxpool = (struct maxpool_parameters) {
1318       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_ukernel_9p8x__scalar_c1,
1319       .mr = 9,
1320       .qr = 8,
1321     };
1322     xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__scalar;
1323     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1324     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
1325   #endif  // XNN_NO_U8_OPERATORS
1326 
1327   /**************************** X8 micro-kernels ****************************/
1328   #ifndef XNN_NO_X8_OPERATORS
1329     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
1330     xnn_params.x8.zip = (struct zip_parameters) {
1331       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
1332       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
1333       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
1334       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
1335     };
1336   #endif  // XNN_NO_X8_OPERATORS
1337 
1338   /**************************** F32 micro-kernels ****************************/
1339   #ifndef XNN_NO_F32_OPERATORS
1340     if (is_wasm_x86) {
1341       xnn_params.f32.gemm = (struct gemm_parameters) {
1342         .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar,
1343         .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar,
1344         .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm,
1345         .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm,
1346         .mr = 2,
1347         .nr = 4,
1348       };
1349     } else {
1350       xnn_params.f32.gemm = (struct gemm_parameters) {
1351         .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm,
1352         .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm,
1353         .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm,
1354         .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm,
1355         .mr = 4,
1356         .nr = 4,
1357       };
1358     }
1359     xnn_params.f32.gemm2 = (struct gemm_parameters) {
1360       .gemm = NULL,
1361       .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm,
1362       .mr = 4,
1363       .nr = 2,
1364     };
1365     xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
1366       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2,
1367       .cr = 1,
1368       .mr = 4,
1369     };
1370     xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
1371       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2,
1372       .cr = 1,
1373       .mr = 9,
1374     };
1375     xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
1376       .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2,
1377       .cr = 1,
1378       .mr = 25,
1379     };
1380     xnn_params.f32.avgpool = (struct avgpool_parameters) {
1381       .up = (xnn_avgpool_up_ukernel_function) xnn_f32_avgpool_ukernel_up9__wasm,
1382       .mp = (xnn_avgpool_mp_ukernel_function) xnn_f32_avgpool_ukernel_mp9p8q__wasm,
1383       .mr = 9,
1384       .qr = 8,
1385     };
1386     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
1387       .up = (xnn_pavgpool_up_ukernel_function) xnn_f32_pavgpool_ukernel_up9__wasm,
1388       .mp = (xnn_pavgpool_mp_ukernel_function) xnn_f32_pavgpool_ukernel_mp9p8q__wasm,
1389       .mr = 9,
1390       .qr = 8,
1391     };
1392     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
1393       .up = (xnn_gavgpool_up_ukernel_function) xnn_f32_gavgpool_ukernel_up7__wasm,
1394       .mp = (xnn_gavgpool_mp_ukernel_function) xnn_f32_gavgpool_ukernel_mp7p7q__wasm,
1395       .mr = 7,
1396     };
1397     xnn_params.f32.maxpool = (struct maxpool_parameters) {
1398       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__wasm_c1,
1399       .mr = 9,
1400       .qr = 8,
1401     };
1402     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1403       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
1404       .mr = 4,
1405     };
1406     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1407       .up = (xnn_argmaxpool_up_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
1408       .mr = 9,
1409     };
1410     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1411       .mp = (xnn_argmaxpool_mp_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
1412       .mr = 9,
1413       .qr = 8,
1414     };
1415     xnn_params.f32.bilinear = (struct bilinear_parameters) {
1416       .ukernel = (xnn_bilinear_ukernel_function) xnn_f32_bilinear_ukernel__scalar_c2,
1417       .pixel_tile = 1,
1418       .channel_tile = 2,
1419     };
1420     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasm;
1421     xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasm_x4;
1422     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x2;
1423     xnn_params.f32.prelu = (struct prelu_parameters) {
1424       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
1425       .row_tile = 4,
1426       .channel_tile = 4,
1427     };
1428     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
1429     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
1430     xnn_params.f32.vadd = (struct vbinary_parameters) {
1431       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasm_x4,
1432       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasm_x4,
1433       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasm_x4,
1434       .element_tile = 8,
1435     };
1436     xnn_params.f32.vdiv = (struct vbinary_parameters) {
1437       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasm_x2,
1438       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasm_x2,
1439       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasm_x2,
1440       .element_tile = 2,
1441     };
1442     xnn_params.f32.vmax = (struct vbinary_parameters) {
1443       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x4,
1444       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x4,
1445       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x4,
1446       .element_tile = 8,
1447     };
1448     xnn_params.f32.vmin = (struct vbinary_parameters) {
1449       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x4,
1450       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x4,
1451       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x4,
1452       .element_tile = 8,
1453     };
1454     xnn_params.f32.vmul = (struct vbinary_parameters) {
1455       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasm_x4,
1456       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasm_x4,
1457       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasm_x4,
1458       .element_tile = 8,
1459     };
1460     xnn_params.f32.vsub = (struct vbinary_parameters) {
1461       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasm_x4,
1462       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasm_x4,
1463       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasm_x4,
1464       .element_tile = 8,
1465     };
1466     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1467       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c1__wasm_2x,
1468       .channel_tile = 1,
1469       .row_tile = 2,
1470     };
1471     #ifndef XNN_NO_NCHW_OPERATORS
1472       xnn_params.f32.spmm = (struct spmm_parameters) {
1473         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_ukernel_8x1__scalar,
1474         .mr = 8,
1475         .nr = 1,
1476       };
1477       xnn_params.f32.spmm2 = (struct spmm_parameters) {
1478         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_ukernel_8x2__scalar,
1479         .mr = 8,
1480         .nr = 2,
1481       };
1482       xnn_params.f32.spmm4 = (struct spmm_parameters) {
1483         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_ukernel_8x4__scalar,
1484         .mr = 8,
1485         .nr = 4,
1486       };
1487       xnn_params.f32.hwc2spchw_dconv3x3c3s2 = (struct hwc2spchw_dconv_parameters) {
1488         .ukernel_with_symm_padding =
1489           (xnn_conv_hwc2spchw_ukernel_function) xnn_f32_conv_hwc2spchw_ukernel_3x3s2p1c3x4__scalar_1x1,
1490         .output_channel_tile = 4,
1491         .output_height_tile = 1,
1492         .output_width_tile = 1,
1493       };
1494       xnn_params.f32.spchw_dwconv3x3 = (struct spchw_dwconv_parameters) {
1495         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar,
1496         .input_width_tile = 1,
1497         .output_width_tile = 1,
1498         .output_height_tile = 1,
1499       };
1500       xnn_params.f32.spchw_dwconv3x3s2 = (struct spchw_dwconv_parameters) {
1501         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar,
1502         .input_width_tile = 1,
1503         .output_width_tile = 1,
1504         .output_height_tile = 1,
1505       };
1506       xnn_params.f32.spchw_dwconv5x5 = (struct spchw_dwconv_parameters) {
1507         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar,
1508         .input_width_tile = 1,
1509         .output_width_tile = 1,
1510         .output_height_tile = 1,
1511       };
1512       xnn_params.f32.spchw_dwconv5x5s2 = (struct spchw_dwconv_parameters) {
1513         .ukernel = (xnn_dwconv_spchw_ukernel_function) xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar,
1514         .input_width_tile = 1,
1515         .output_width_tile = 1,
1516         .output_height_tile = 1,
1517       };
1518       xnn_params.f32.spchw_gavgpool = (struct spchw_gavgpool_parameters) {
1519         .ukernel = (xnn_gavgpool_spchw_ukernel_function) xnn_f32_gavgpool_spchw_ukernel__scalar_x1,
1520         .channel_tile = 1,
1521       };
1522     #endif  // XNN_NO_NCHW_OPERATORS
1523   #endif  // XNN_NO_F32_OPERATORS
1524 
1525   /**************************** X32 micro-kernels ****************************/
1526   #ifndef XNN_NO_X32_OPERATORS
1527     xnn_params.x32.pad = (struct pad_parameters) {
1528       .ukernel = xnn_x32_pad_x2__scalar,
1529       .mr = 2,
1530     };
1531     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
1532     xnn_params.x32.zip = (struct zip_parameters) {
1533       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
1534       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
1535       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
1536       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
1537     };
1538   #endif  // XNN_NO_X32_OPERATORS
1539 
1540 #else
1541   #error "Unsupported architecture"
1542 #endif
1543   xnn_params.initialized = true;
1544 }
1545 
xnn_initialize(const struct xnn_allocator * allocator)1546 enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
1547   #ifndef __EMSCRIPTEN__
1548     if (!cpuinfo_initialize()) {
1549       return xnn_status_out_of_memory;
1550     }
1551   #endif
1552   pthread_once(&init_guard, &init);
1553   if (xnn_params.initialized) {
1554     if (allocator != NULL) {
1555       memcpy(&xnn_params.allocator, allocator, sizeof(struct xnn_allocator));
1556     } else {
1557       xnn_params.allocator.allocate = &xnn_allocate;
1558       xnn_params.allocator.reallocate = &xnn_reallocate;
1559       xnn_params.allocator.deallocate = &xnn_deallocate;
1560       xnn_params.allocator.aligned_allocate = &xnn_aligned_allocate;
1561       xnn_params.allocator.aligned_deallocate = &xnn_aligned_deallocate;
1562     }
1563     return xnn_status_success;
1564   } else {
1565     return xnn_status_unsupported_hardware;
1566   }
1567 }
1568 
xnn_deinitialize(void)1569 enum xnn_status xnn_deinitialize(void) {
1570   #ifndef __EMSCRIPTEN__
1571     cpuinfo_deinitialize();
1572   #endif
1573   return xnn_status_success;
1574 }
1575