• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <math.h>
10 #include <stdbool.h>
11 #include <stddef.h>
12 #include <stdint.h>
13 #include <string.h>
14 
15 #ifdef _WIN32
16   #include <windows.h>
17 #else
18   #include <pthread.h>
19 #endif
20 
21 #ifndef __EMSCRIPTEN__
22   #include <cpuinfo.h>
23 #endif
24 
25 #include <xnnpack.h>
26 #include <xnnpack/argmaxpool.h>
27 #include <xnnpack/avgpool.h>
28 #include <xnnpack/clamp.h>
29 #include <xnnpack/common.h>
30 #include <xnnpack/conv.h>
31 #include <xnnpack/dwconv.h>
32 #include <xnnpack/depthtospace.h>
33 #include <xnnpack/gavgpool.h>
34 #include <xnnpack/gemm.h>
35 #include <xnnpack/fill.h>
36 #include <xnnpack/hswish.h>
37 #include <xnnpack/ibilinear.h>
38 #include <xnnpack/igemm.h>
39 #include <xnnpack/log.h>
40 #include <xnnpack/lut.h>
41 #include <xnnpack/maxpool.h>
42 #include <xnnpack/memory.h>
43 #include <xnnpack/pad.h>
44 #include <xnnpack/params.h>
45 #include <xnnpack/pavgpool.h>
46 #include <xnnpack/prelu.h>
47 #include <xnnpack/raddstoreexpminusmax.h>
48 #include <xnnpack/rmax.h>
49 #include <xnnpack/spmm.h>
50 #include <xnnpack/unpool.h>
51 #include <xnnpack/vadd.h>
52 #include <xnnpack/vbinary.h>
53 #include <xnnpack/vmulcaddc.h>
54 #include <xnnpack/vunary.h>
55 #include <xnnpack/zip.h>
56 
// Default XNN_ENABLE_ASSEMBLY to 1 unless the build already defined it.
// init() tests this macro (#if XNN_ENABLE_ASSEMBLY) to choose between the
// per-microarchitecture aarch32 F32 GEMM/IGEMM micro-kernels and the generic
// NEON fallback micro-kernels.
57 #ifndef XNN_ENABLE_ASSEMBLY
58   #define XNN_ENABLE_ASSEMBLY 1
59 #endif
60 
// One-time initialization guard, declared with the platform's native primitive:
// INIT_ONCE on Windows, pthread_once_t everywhere else.
// NOTE(review): the call site is outside this view; presumably it ensures
// init() runs exactly once -- confirm against the code that uses init_guard.
61 #ifdef _WIN32
62   static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
63 #else
64   static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
65 #endif
66 
// Global XNNPACK parameter table. init() below fills in its per-datatype
// micro-kernel entries (xx, qs8, qu8, u8, x8, f32, x32) according to the CPU
// features it detects.
// init_flags starts at 0. NOTE(review): the code that stores the final flags
// into this field is not visible here -- confirm where it is written.
67 struct xnn_parameters xnn_params = {
68   .init_flags = 0
69 };
70 
init(void)71 static void init(void) {
72 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
73   // Unlike most other architectures, x86/x86-64 sets the sign bit of the output NaN when a
74   // floating-point instruction has no NaN arguments but produces a NaN result.
75   // We use this to distinguish x86/x86-64 from other architectures by subtracting
76   // two infinities (which must produce NaN per the IEEE 754 standard).
77   static const volatile float inf = INFINITY;
78   const bool is_wasm_x86 = signbit(inf - inf);
79 #endif
80   uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
81 
82 #if XNN_ARCH_ARM
83   #if XNN_PLATFORM_MOBILE
84     if (!cpuinfo_has_arm_neon()) {
85       xnn_log_error("XNNPACK initialization failed: NEON is not supported");
86       return;
87     }
88   #else
89     if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
90       xnn_log_error("XNNPACK initialization failed: VFP is not supported");
91       return;
92     }
93   #endif
94 
95   /**************************** XX micro-kernels ****************************/
96   #ifndef XNN_NO_XX_OPERATORS
97     init_flags |= XNN_INIT_FLAG_XX;
98 
99     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
100   #endif
101 
102   if (cpuinfo_has_arm_neon()) {
103     /**************************** QS8 micro-kernels ****************************/
104     #ifndef XNN_NO_QS8_OPERATORS
105       init_flags |= XNN_INIT_FLAG_QS8;
106 
107       if (cpuinfo_has_arm_neon_dot()) {
108         xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot);
109         xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot);
110         xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot);
111         xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot);
112         xnn_params.qs8.gemm.mr = 4;
113         xnn_params.qs8.gemm.nr = 8;
114         xnn_params.qs8.gemm.log2_kr = 2;
115       } else {
116         xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
117         xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
118         xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
119         xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
120         xnn_params.qs8.gemm.mr = 2;
121         xnn_params.qs8.gemm.nr = 8;
122         xnn_params.qs8.gemm.log2_kr = 1;
123       }
124 
125       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16;
126       xnn_params.qs8.dwconv[0].channel_tile = 8;
127       xnn_params.qs8.dwconv[0].primary_tile = 9;
128 
129       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
130         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c8_acc2,
131         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c8_acc2,
132         .mr = 7,
133       };
134 
135       xnn_params.qs8.vadd = (struct vbinary_parameters) {
136         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8,
137         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
138         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
139         .element_tile = 8,
140       };
141     #endif  // XNN_NO_QS8_OPERATORS
142 
143     /*************************** QU8 micro-kernels ***************************/
144     #ifndef XNN_NO_QU8_OPERATORS
145       init_flags |= XNN_INIT_FLAG_QU8;
146 
147       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_4x8__neon);
148       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_4x8__neon);
149       xnn_params.qu8.gemm.mr = 4;
150       xnn_params.qu8.gemm.nr = 8;
151 
152       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up8x9__neon;
153       xnn_params.qu8.dwconv[0].channel_tile = 8;
154       xnn_params.qu8.dwconv[0].primary_tile = 9;
155       xnn_params.qu8.avgpool = (struct avgpool_parameters) {
156         .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
157         .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
158         .mr = 9,
159         .qr = 8,
160       };
161       xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
162         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8,
163         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8,
164         .mr = 7,
165       };
166       xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon;
167     #endif  // XNN_NO_QU8_OPERATORS
168 
169     /**************************** U8 micro-kernels ****************************/
170     #ifndef XNN_NO_U8_OPERATORS
171       init_flags |= XNN_INIT_FLAG_U8;
172 
173       xnn_params.u8.maxpool = (struct maxpool_parameters) {
174         .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
175         .mr = 9,
176         .qr = 8,
177       };
178       xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__neon_x64;
179       xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
180       xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
181     #endif  // XNN_NO_U8_OPERATORS
182 
183     /**************************** X8 micro-kernels ****************************/
184     #ifndef XNN_NO_X8_OPERATORS
185       init_flags |= XNN_INIT_FLAG_X8;
186 
187       xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
188       xnn_params.x8.zip = (struct zip_parameters) {
189         .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
190         .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
191         .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
192         .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
193       };
194     #endif  // XNN_NO_X8_OPERATORS
195 
196     /**************************** F32 micro-kernels ****************************/
197     #ifndef XNN_NO_F32_OPERATORS
198       init_flags |= XNN_INIT_FLAG_F32;
199 
200       #if XNN_ENABLE_ASSEMBLY
201         switch (cpuinfo_get_uarch(0)->uarch) {
202           case cpuinfo_uarch_cortex_a5:
203           case cpuinfo_uarch_cortex_a7:
204             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
205             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
206             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
207             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
208             xnn_params.f32.gemm.mr = 4;
209             xnn_params.f32.gemm.nr = 8;
210             break;
211 
212           case cpuinfo_uarch_cortex_a53:
213           case cpuinfo_uarch_cortex_a55r0:
214             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
215             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
216             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
217             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
218             xnn_params.f32.gemm.mr = 4;
219             xnn_params.f32.gemm.nr = 8;
220             break;
221 
222           case cpuinfo_uarch_cortex_a55:
223             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
224             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
225             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
226             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
227             xnn_params.f32.gemm.mr = 4;
228             xnn_params.f32.gemm.nr = 8;
229             break;
230 
231           case cpuinfo_uarch_cortex_a57:
232           case cpuinfo_uarch_cortex_a72:
233           case cpuinfo_uarch_cortex_a73:
234             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_pld_cortex_a75);
235             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_pld_cortex_a75);
236             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
237             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
238             xnn_params.f32.gemm.mr = 4;
239             xnn_params.f32.gemm.nr = 8;
240             break;
241 
242           case cpuinfo_uarch_krait:
243           default:
244             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
245             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
246             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
247             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
248             xnn_params.f32.gemm.mr = 4;
249             xnn_params.f32.gemm.nr = 8;
250             break;
251         }
252         #if XNN_MAX_UARCH_TYPES > 1
253         {
254           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
255           const uint32_t mr = xnn_params.f32.gemm.mr;
256           const uint32_t nr = xnn_params.f32.gemm.nr;
257           const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
258           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
259             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
260             if (uarch_info == NULL) {
261               /* No more microarchitectures in the system */
262               break;
263             }
264 
265             switch (uarch_info->uarch) {
266               case cpuinfo_uarch_cortex_a53:
267               case cpuinfo_uarch_cortex_a55r0:
268                 if (mr == 4 && nr == 8 && log2_sr == 0) {
269                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
270                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
271                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
272                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
273                 }
274                 break;
275               case cpuinfo_uarch_cortex_a55:
276                 if (mr == 4 && nr == 8 && log2_sr == 0) {
277                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
278                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
279                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
280                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
281                 }
282                 break;
283               default:
284                 break;
285             }
286           }
287         }
288         #endif  // XNN_MAX_UARCH_TYPES > 1
289       #else  // XNN_ENABLE_ASSEMBLY
290         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
291         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
292         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
293         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
294         xnn_params.f32.gemm.mr = 4;
295         xnn_params.f32.gemm.nr = 8;
296       #endif  // XNN_ENABLE_ASSEMBLY
297       xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
298       xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
299       xnn_params.f32.gemm2.mr = 4;
300       xnn_params.f32.gemm2.nr = 2;
301 
302       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__neon;
303       xnn_params.f32.dwconv[0].channel_tile = 4,
304       xnn_params.f32.dwconv[0].primary_tile = 4,
305 
306       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__neon;
307       xnn_params.f32.dwconv[1].channel_tile = 4;
308       xnn_params.f32.dwconv[1].primary_tile = 9;
309 
310       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__neon_acc2;
311       xnn_params.f32.dwconv[2].channel_tile = 4;
312       xnn_params.f32.dwconv[2].primary_tile = 25;
313 
314       xnn_params.f32.avgpool = (struct avgpool_parameters) {
315         .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
316         .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
317         .mr = 9,
318         .qr = 8,
319       };
320       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
321         .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
322         .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
323         .mr = 9,
324         .qr = 8,
325       };
326       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
327         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
328         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
329         .mr = 7,
330       };
331       xnn_params.f32.maxpool = (struct maxpool_parameters) {
332         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
333         .mr = 9,
334         .qr = 8,
335       };
336       xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
337         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
338         .mr = 4,
339       };
340       xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
341         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
342         .mr = 9,
343       };
344       xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
345         .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
346         .mr = 9,
347         .qr = 8,
348       };
349       xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
350         .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
351         .pixel_tile = 1,
352         .channel_tile = 8,
353       };
354       xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8;
355       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon_x8;
356       if (cpuinfo_has_arm_neon_fma()) {
357         xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8;
358       } else {
359         xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8;
360       }
361       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon_x16;
362       xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8;
363       xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8;
364       if (cpuinfo_has_arm_neon_v8()) {
365         xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
366         xnn_params.f32.rndz  = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8;
367         xnn_params.f32.rndu  = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8;
368         xnn_params.f32.rndd  = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8;
369       } else {
370         xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8;
371         xnn_params.f32.rndz  = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8;
372         xnn_params.f32.rndu  = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8;
373         xnn_params.f32.rndd  = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8;
374       }
375       xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8;
376       xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8;
377       xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
378       xnn_params.f32.prelu = (struct prelu_parameters) {
379         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
380         .row_tile = 2,
381         .channel_tile = 8,
382       };
383       xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x8;
384       xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
385       xnn_params.f32.vadd = (struct vbinary_parameters) {
386         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
387         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
388         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
389         .element_tile = 8,
390       };
391       xnn_params.f32.vdiv = (struct vbinary_parameters) {
392         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
393         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
394         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
395         .element_tile = 2,
396       };
397       xnn_params.f32.vmax = (struct vbinary_parameters) {
398         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
399         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
400         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
401         .element_tile = 8,
402       };
403       xnn_params.f32.vmin = (struct vbinary_parameters) {
404         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
405         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
406         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
407         .element_tile = 8,
408       };
409       xnn_params.f32.vmul = (struct vbinary_parameters) {
410         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
411         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
412         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
413         .element_tile = 8,
414       };
415       xnn_params.f32.vsub = (struct vbinary_parameters) {
416         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
417         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
418         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
419         .element_tile = 8,
420       };
421       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
422         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
423         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
424         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
425         .element_tile = 8,
426       };
427       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
428         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
429         .channel_tile = 4,
430         .row_tile = 2,
431       };
432       #ifndef XNN_NO_NCHW_OPERATORS
433         init_flags |= XNN_INIT_FLAG_CHW_OPT;
434 
435         xnn_params.f32.spmm = (struct spmm_parameters) {
436           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
437           .mr = 32,
438           .nr = 1,
439         };
440         xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
441           .ukernel_with_symm_padding =
442             (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
443           .output_channel_tile = 4,
444           .output_height_tile = 2,
445           .output_width_tile = 2,
446         };
447         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
448           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
449           .output_width_tile = 4,
450           .output_height_tile = 2,
451         };
452         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
453           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
454           .output_width_tile = 4,
455           .output_height_tile = 1,
456         };
457         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
458           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
459           .output_width_tile = 4,
460           .output_height_tile = 1,
461         };
462         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
463           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
464           .output_width_tile = 4,
465           .output_height_tile = 1,
466         };
467         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
468           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
469           .channel_tile = 4,
470         };
471         xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
472           .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
473           .channel_tile = 1,
474           .pixel_tile = 4,
475         };
476       #endif  // XNN_NO_NCHW_OPERATORS
477     #endif  // XNN_NO_F32_OPERATORS
478 
479     /**************************** X32 micro-kernels ****************************/
480     #ifndef XNN_NO_X32_OPERATORS
481       init_flags |= XNN_INIT_FLAG_X32;
482 
483       xnn_params.x32.fill = (struct fill_parameters) {
484         .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__neon,
485         .row_tile = 1,
486       };
487       xnn_params.x32.pad = (struct pad_parameters) {
488         .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__neon,
489         .row_tile = 1,
490       };
491       xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
492       xnn_params.x32.zip = (struct zip_parameters) {
493         .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
494         .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
495         .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
496         .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
497       };
498       #ifndef XNN_NO_NCHW_OPERATORS
499         xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
500           .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
501           .channel_tile = 1,
502           .pixel_tile = 1,
503         };
504       #endif  // XNN_NO_NCHW_OPERATORS
505     #endif  // XNN_NO_X32_OPERATORS
506   } else if (!XNN_PLATFORM_MOBILE) {
507     /*************************** QU8 micro-kernels ***************************/
508     #ifndef XNN_NO_QU8_OPERATORS
509       init_flags |= XNN_INIT_FLAG_QU8;
510 
511       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_2x2__scalar);
512       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_2x2__scalar);
513       xnn_params.qu8.gemm.mr = 2;
514       xnn_params.qu8.gemm.nr = 2;
515 
516       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up1x9__scalar;
517       xnn_params.qu8.dwconv[0].channel_tile = 1;
518       xnn_params.qu8.dwconv[0].primary_tile = 9;
519 
520       xnn_params.qu8.avgpool = (struct avgpool_parameters) {
521         .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
522         .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
523         .mr = 9,
524         .qr = 8,
525       };
526       xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
527         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
528         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
529         .mr = 7,
530       };
531       xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar;
532     #endif  // XNN_NO_QU8_OPERATORS
533 
534     /**************************** U8 micro-kernels ****************************/
535     #ifndef XNN_NO_U8_OPERATORS
536       init_flags |= XNN_INIT_FLAG_U8;
537 
538       xnn_params.u8.maxpool = (struct maxpool_parameters) {
539         .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
540         .mr = 9,
541         .qr = 8,
542       };
543       xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__scalar_x4;
544       xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
545       xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
546     #endif  // XNN_NO_U8_OPERATORS
547 
548     /**************************** X8 micro-kernels ****************************/
549     #ifndef XNN_NO_X8_OPERATORS
550       init_flags |= XNN_INIT_FLAG_X8;
551 
552       xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
553       xnn_params.x8.zip = (struct zip_parameters) {
554         .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
555         .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
556         .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
557         .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
558       };
559     #endif  // XNN_NO_X8_OPERATORS
560 
561     /**************************** F32 micro-kernels ****************************/
562     #ifndef XNN_NO_F32_OPERATORS
563       init_flags |= XNN_INIT_FLAG_F32;
564 
565       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
566       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
567       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
568       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
569       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
570       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
571       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
572       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
573       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
574       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
575       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
576       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
577       xnn_params.f32.gemm.mr = 4;
578       xnn_params.f32.gemm.nr = 4;
579 
580       xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
581       xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
582       xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
583       xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
584       xnn_params.f32.gemm2.mr = 4;
585       xnn_params.f32.gemm2.nr = 2;
586 
587       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
588       xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
589       xnn_params.f32.dwconv[0].channel_tile = 1;
590       xnn_params.f32.dwconv[0].primary_tile = 4;
591 
592       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
593       xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
594       xnn_params.f32.dwconv[1].channel_tile = 1;
595       xnn_params.f32.dwconv[1].primary_tile = 9;
596 
597       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
598       xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
599       xnn_params.f32.dwconv[2].channel_tile = 1;
600       xnn_params.f32.dwconv[2].primary_tile = 25;
601 
602       xnn_params.f32.avgpool = (struct avgpool_parameters) {
603         .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
604         .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
605         .mr = 9,
606         .qr = 8,
607       };
608       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
609         .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
610         .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
611         .mr = 9,
612         .qr = 8,
613       };
614       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
615         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
616         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
617         .mr = 7,
618       };
619       xnn_params.f32.maxpool = (struct maxpool_parameters) {
620         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
621         .mr = 9,
622         .qr = 8,
623       };
624       xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
625         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
626         .mr = 4,
627       };
628       xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
629         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
630         .mr = 9,
631       };
632       xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
633         .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
634         .mr = 9,
635         .qr = 8,
636       };
637       xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
638         .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
639         .pixel_tile = 1,
640         .channel_tile = 2,
641       };
642       xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4;
643       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__scalar_x4;
644       xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4;
645       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__scalar_x4;
646       xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4;
647       xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4;
648       xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1;
649       xnn_params.f32.rndz  = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1;
650       xnn_params.f32.rndu  = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1;
651       xnn_params.f32.rndd  = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1;
652       xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x2;
653       xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4;
654       xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
655       xnn_params.f32.prelu = (struct prelu_parameters) {
656         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
657         .row_tile = 4,
658         .channel_tile = 4,
659       };
660       xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
661       xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
662       xnn_params.f32.vadd = (struct vbinary_parameters) {
663         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
664         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
665         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
666         .element_tile = 8,
667       };
668       xnn_params.f32.vdiv = (struct vbinary_parameters) {
669         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
670         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
671         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
672         .element_tile = 2,
673       };
674       xnn_params.f32.vmax = (struct vbinary_parameters) {
675         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
676         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
677         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
678         .element_tile = 8,
679       };
680       xnn_params.f32.vmin = (struct vbinary_parameters) {
681         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
682         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
683         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
684         .element_tile = 8,
685       };
686       xnn_params.f32.vmul = (struct vbinary_parameters) {
687         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
688         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
689         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
690         .element_tile = 8,
691       };
692       xnn_params.f32.vsub = (struct vbinary_parameters) {
693         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
694         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
695         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
696         .element_tile = 8,
697       };
698       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
699         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
700         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
701         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
702         .element_tile = 8,
703       };
704       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
705         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
706         .channel_tile = 1,
707         .row_tile = 2,
708       };
709       #ifndef XNN_NO_NCHW_OPERATORS
710         init_flags |= XNN_INIT_FLAG_CHW_OPT;
711 
712         xnn_params.f32.spmm = (struct spmm_parameters) {
713           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
714           .mr = 8,
715           .nr = 1,
716         };
717         xnn_params.f32.spmm2 = (struct spmm_parameters) {
718           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
719           .mr = 8,
720           .nr = 2,
721         };
722         xnn_params.f32.spmm4 = (struct spmm_parameters) {
723           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
724           .mr = 8,
725           .nr = 4,
726         };
727         xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
728           .ukernel_with_symm_padding =
729             (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
730           .output_channel_tile = 4,
731           .output_height_tile = 1,
732           .output_width_tile = 1,
733         };
734         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
735           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
736           .output_width_tile = 1,
737           .output_height_tile = 4,
738         };
739         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
740           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
741           .output_width_tile = 1,
742           .output_height_tile = 2,
743         };
744         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
745           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
746           .output_width_tile = 1,
747           .output_height_tile = 2,
748         };
749         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
750           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
751           .output_width_tile = 1,
752           .output_height_tile = 2,
753         };
754         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
755           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
756           .channel_tile = 1,
757         };
758         xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
759           .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
760           .channel_tile = 1,
761           .pixel_tile = 4,
762         };
763       #endif  // XNN_NO_NCHW_OPERATORS
764     #endif  // XNN_NO_F32_OPERATORS
765 
766     /**************************** X32 micro-kernels ****************************/
767     #ifndef XNN_NO_X32_OPERATORS
768       init_flags |= XNN_INIT_FLAG_X32;
769 
770       xnn_params.x32.fill = (struct fill_parameters) {
771         .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_int,
772         .row_tile = 1,
773       };
774       xnn_params.x32.pad = (struct pad_parameters) {
775         .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__scalar_int,
776         .row_tile = 1,
777       };
778       xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
779       xnn_params.x32.zip = (struct zip_parameters) {
780         .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
781         .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
782         .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
783         .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
784       };
785       #ifndef XNN_NO_NCHW_OPERATORS
786         xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
787           .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
788           .channel_tile = 1,
789           .pixel_tile = 1,
790         };
791       #endif  // XNN_NO_NCHW_OPERATORS
792     #endif  // XNN_NO_X32_OPERATORS
793   }
794 
795 #elif XNN_ARCH_ARM64
796 
797   /**************************** XX micro-kernels ****************************/
798   #ifndef XNN_NO_XX_OPERATORS
799     init_flags |= XNN_INIT_FLAG_XX;
800 
801     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
802   #endif
803 
804   /**************************** QS8 micro-kernels ****************************/
805   #ifndef XNN_NO_QS8_OPERATORS
806     init_flags |= XNN_INIT_FLAG_QS8;
807 
808     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
809       #if XNN_ENABLE_ASSEMBLY
810         if (cpuinfo_has_arm_neon_dot()) {
811           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
812           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
813           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
814           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
815           xnn_params.qs8.gemm.mr = 4;
816           xnn_params.qs8.gemm.nr = 16;
817           xnn_params.qs8.gemm.log2_kr = 2;
818         } else {
819           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
820           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
821           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
822           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
823           xnn_params.qs8.gemm.mr = 2;
824           xnn_params.qs8.gemm.nr = 8;
825           xnn_params.qs8.gemm.log2_kr = 3;
826         }
827       #else  // !XNN_ENABLE_ASSEMBLY
828         if (cpuinfo_has_arm_neon_dot()) {
829           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot);
830           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
831           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot);
832           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
833           xnn_params.qs8.gemm.mr = 4;
834           xnn_params.qs8.gemm.nr = 16;
835           xnn_params.qs8.gemm.log2_kr = 2;
836         } else {
837           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
838           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
839           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
840           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
841           xnn_params.qs8.gemm.mr = 2;
842           xnn_params.qs8.gemm.nr = 8;
843           xnn_params.qs8.gemm.log2_kr = 1;
844         }
845       #endif  // XNN_ENABLE_ASSEMBLY
846     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
847       #if XNN_ENABLE_ASSEMBLY
848         if (cpuinfo_has_arm_neon_dot()) {
849           switch (cpuinfo_get_core(0)->uarch) {
850             case cpuinfo_uarch_cortex_a55:
851               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
852               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
853               break;
854             default:
855               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
856               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
857               break;
858           }
859           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
860           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
861           xnn_params.qs8.gemm.mr = 4;
862           xnn_params.qs8.gemm.nr = 16;
863           xnn_params.qs8.gemm.log2_kr = 2;
864         } else {
865           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
866           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
867           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
868           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
869           xnn_params.qs8.gemm.mr = 2;
870           xnn_params.qs8.gemm.nr = 8;
871           xnn_params.qs8.gemm.log2_kr = 3;
872         }
873         #if XNN_MAX_UARCH_TYPES > 1
874         {
875           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
876           const uint32_t mr = xnn_params.qs8.gemm.mr;
877           const uint32_t nr = xnn_params.qs8.gemm.nr;
878           const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
879           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
880             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
881             if (uarch_info == NULL) {
882               /* No more microarchitectures in the system */
883               break;
884             }
885 
886             switch (uarch_info->uarch) {
887               case cpuinfo_uarch_cortex_a55:
888                 if (mr == 4 && nr == 16 && log2_kr == 2) {
889                   xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55;
890                   xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55;
891                   xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot;
892                   xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot;
893                 }
894                 break;
895               default:
896                 break;
897             }
898           }
899         }
900         #endif  // XNN_MAX_UARCH_TYPES > 1
901       #else  // !XNN_ENABLE_ASSEMBLY
902         if (cpuinfo_has_arm_neon_dot()) {
903           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot);
904           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
905           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot);
906           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
907           xnn_params.qs8.gemm.mr = 4;
908           xnn_params.qs8.gemm.nr = 16;
909           xnn_params.qs8.gemm.log2_kr = 2;
910         } else {
911           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
912           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
913           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
914           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
915           xnn_params.qs8.gemm.mr = 2;
916           xnn_params.qs8.gemm.nr = 8;
917           xnn_params.qs8.gemm.log2_kr = 1;
918         }
919       #endif  // XNN_ENABLE_ASSEMBLY
920     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
921 
922     xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16;
923     xnn_params.qs8.dwconv[0].channel_tile = 8;
924     xnn_params.qs8.dwconv[0].primary_tile = 9;
925 
926     xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
927       .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c8_acc2,
928       .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c8_acc2,
929       .mr = 7,
930     };
931 
932     xnn_params.qs8.vadd = (struct vbinary_parameters) {
933       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8,
934       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
935       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
936       .element_tile = 8,
937     };
938   #endif  // XNN_NO_QS8_OPERATORS
939 
940   /**************************** QU8 micro-kernels ****************************/
941   #ifndef XNN_NO_QU8_OPERATORS
942     init_flags |= XNN_INIT_FLAG_QU8;
943 
944     xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_8x8__neon);
945     xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_8x8__neon);
946     xnn_params.qu8.gemm.mr = 8;
947     xnn_params.qu8.gemm.nr = 8;
948 
949     xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up8x9__neon;
950     xnn_params.qu8.dwconv[0].channel_tile = 8;
951     xnn_params.qu8.dwconv[0].primary_tile = 9;
952 
953     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
954       .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
955       .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
956       .mr = 9,
957       .qr = 8,
958     };
959     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
960       .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8,
961       .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8,
962       .mr = 7,
963     };
964     xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon;
965   #endif  // XNN_NO_QU8_OPERATORS
966 
967   /**************************** U8 micro-kernels ****************************/
968   #ifndef XNN_NO_U8_OPERATORS
969     init_flags |= XNN_INIT_FLAG_U8;
970 
971     xnn_params.u8.maxpool = (struct maxpool_parameters) {
972       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
973       .mr = 9,
974       .qr = 8,
975     };
976     xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__neon_x64;
977     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
978     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
979   #endif  // XNN_NO_U8_OPERATORS
980 
981   /**************************** X8 micro-kernels ****************************/
982   #ifndef XNN_NO_X8_OPERATORS
983     init_flags |= XNN_INIT_FLAG_X8;
984 
985     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
986     xnn_params.x8.zip = (struct zip_parameters) {
987       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
988       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
989       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
990       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
991     };
992   #endif  // XNN_NO_X8_OPERATORS
993 
994   /**************************** F16 micro-kernels ****************************/
995   #ifndef XNN_NO_F16_OPERATORS
996     if (cpuinfo_has_arm_neon_fp16_arith()) {
997       init_flags |= XNN_INIT_FLAG_F16;
998 
999       #if XNN_ENABLE_ASSEMBLY
1000         xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
1001         xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
1002       #else
1003         xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
1004         xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1005       #endif
1006       xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
1007       xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1008       xnn_params.f16.gemm.mr = 6;
1009       xnn_params.f16.gemm.nr = 16;
1010 
1011       xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
1012       xnn_params.f16.dwconv[0].channel_tile = 16;
1013       xnn_params.f16.dwconv[0].primary_tile = 4;
1014 
1015       xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
1016       xnn_params.f16.dwconv[1].channel_tile = 16;
1017       xnn_params.f16.dwconv[1].primary_tile = 9;
1018 
1019       xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
1020       xnn_params.f16.dwconv[2].channel_tile = 8;
1021       xnn_params.f16.dwconv[2].primary_tile = 25;
1022 
1023       xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
1024         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
1025         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
1026         .mr = 7,
1027       };
1028       xnn_params.f16.vadd = (struct vbinary_parameters) {
1029         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
1030         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
1031         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
1032         .element_tile = 16,
1033       };
1034       xnn_params.f16.vmul = (struct vbinary_parameters) {
1035         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
1036         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
1037         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
1038         .element_tile = 16,
1039       };
1040       xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
1041         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
1042         .channel_tile = 8,
1043         .row_tile = 2,
1044       };
1045       xnn_params.f16.hswish = (xnn_univector_ukernel_function) xnn_f16_hswish_ukernel__neonfp16arith_x16;
1046     }
1047   #endif  // XNN_NO_F16_OPERATORS
1048 
1049   /**************************** F32 micro-kernels ****************************/
1050   #ifndef XNN_NO_F32_OPERATORS
1051     init_flags |= XNN_INIT_FLAG_F32;
1052 
1053     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1054       #if XNN_ENABLE_ASSEMBLY
1055         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
1056         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
1057         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
1058         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
1059         xnn_params.f32.gemm.mr = 6;
1060         xnn_params.f32.gemm.nr = 8;
1061       #else  // !XNN_ENABLE_ASSEMBLY
1062         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
1063         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
1064         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
1065         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
1066         xnn_params.f32.gemm.mr = 6;
1067         xnn_params.f32.gemm.nr = 8;
1068        #endif  // XNN_ENABLE_ASSEMBLY
1069     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1070       #if XNN_ENABLE_ASSEMBLY
1071         switch (cpuinfo_get_core(0)->uarch) {
1072           case cpuinfo_uarch_cortex_a57:
1073             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57);
1074             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57);
1075             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57);
1076             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57);
1077             xnn_params.f32.gemm.mr = 6;
1078             xnn_params.f32.gemm.nr = 8;
1079             break;
1080           case cpuinfo_uarch_cortex_a72:
1081             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
1082             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
1083             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
1084             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
1085             xnn_params.f32.gemm.mr = 4;
1086             xnn_params.f32.gemm.nr = 8;
1087             break;
1088           case cpuinfo_uarch_cortex_a75:
1089           case cpuinfo_uarch_cortex_a76:
1090           case cpuinfo_uarch_exynos_m3:
1091           case cpuinfo_uarch_exynos_m4:
1092             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
1093             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
1094             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
1095             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
1096             xnn_params.f32.gemm.mr = 6;
1097             xnn_params.f32.gemm.nr = 8;
1098             break;
1099           case cpuinfo_uarch_exynos_m1:
1100           case cpuinfo_uarch_exynos_m2:
1101             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
1102             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
1103             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
1104             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
1105             xnn_params.f32.gemm.mr = 6;
1106             xnn_params.f32.gemm.nr = 8;
1107             xnn_params.f32.gemm.log2_sr = 2;
1108             break;
1109           case cpuinfo_uarch_cortex_a53:
1110           case cpuinfo_uarch_cortex_a55r0:
1111             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
1112             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
1113             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
1114             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
1115             xnn_params.f32.gemm.mr = 6;
1116             xnn_params.f32.gemm.nr = 8;
1117             break;
1118           case cpuinfo_uarch_cortex_a55:
1119             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
1120             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
1121             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
1122             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
1123             xnn_params.f32.gemm.mr = 6;
1124             xnn_params.f32.gemm.nr = 8;
1125             break;
1126           case cpuinfo_uarch_cortex_a73:
1127             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
1128             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
1129             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
1130             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
1131             xnn_params.f32.gemm.mr = 6;
1132             xnn_params.f32.gemm.nr = 8;
1133             break;
1134           default:
1135           case cpuinfo_uarch_cortex_a77:
1136           case cpuinfo_uarch_exynos_m5:
1137           case cpuinfo_uarch_kryo:
1138             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57);
1139             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57);
1140             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57);
1141             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57);
1142             xnn_params.f32.gemm.mr = 4;
1143             xnn_params.f32.gemm.nr = 8;
1144             break;
1145         }
1146         #if XNN_MAX_UARCH_TYPES > 1
1147         {
1148           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1149           const uint32_t mr = xnn_params.f32.gemm.mr;
1150           const uint32_t nr = xnn_params.f32.gemm.nr;
1151           const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
1152           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1153             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1154             if (uarch_info == NULL) {
1155               /* No more microarchitectures in the system */
1156               break;
1157             }
1158 
1159             switch (uarch_info->uarch) {
1160               case cpuinfo_uarch_cortex_a53:
1161               case cpuinfo_uarch_cortex_a55r0:
1162                 if (mr == 6 && nr == 8 && log2_sr == 0) {
1163                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
1164                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
1165                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1166                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1167                 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
1168                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
1169                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
1170                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1171                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1172                 }
1173                 break;
1174               case cpuinfo_uarch_cortex_a55:
1175                 if (mr == 6 && nr == 8 && log2_sr == 0) {
1176                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
1177                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
1178                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1179                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1180                 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
1181                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
1182                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
1183                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1184                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1185                 }
1186                 break;
1187               default:
1188                 break;
1189             }
1190           }
1191         }
1192         #endif  // XNN_MAX_UARCH_TYPES > 1
1193       #else  // !XNN_ENABLE_ASSEMBLY
1194         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
1195         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
1196         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
1197         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
1198         xnn_params.f32.gemm.mr = 6;
1199         xnn_params.f32.gemm.nr = 8;
1200       #endif  // XNN_ENABLE_ASSEMBLY
1201     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1202     xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
1203     xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
1204     xnn_params.f32.gemm2.mr = 4;
1205     xnn_params.f32.gemm2.nr = 2;
1206 
1207     xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
1208     xnn_params.f32.dwconv[0].channel_tile = 8;
1209     xnn_params.f32.dwconv[0].primary_tile = 4;
1210 
1211     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1212       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
1213       xnn_params.f32.dwconv[1].channel_tile = 8;
1214       xnn_params.f32.dwconv[1].primary_tile = 9;
1215     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1216       switch (cpuinfo_get_core(0)->uarch) {
1217         case cpuinfo_uarch_kryo:
1218           xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__neonfma;
1219           xnn_params.f32.dwconv[1].channel_tile = 4;
1220           xnn_params.f32.dwconv[1].primary_tile = 9;
1221           break;
1222         #if XNN_ENABLE_ASSEMBLY
1223           case cpuinfo_uarch_cortex_a53:
1224           case cpuinfo_uarch_cortex_a55r0:
1225           case cpuinfo_uarch_cortex_a55:
1226             xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
1227             xnn_params.f32.dwconv[1].channel_tile = 4;
1228             xnn_params.f32.dwconv[1].primary_tile = 9;
1229             break;
1230         #endif  // XNN_ENABLE_ASSEMBLY
1231         default:
1232           xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
1233           xnn_params.f32.dwconv[1].channel_tile = 8;
1234           xnn_params.f32.dwconv[1].primary_tile = 9;
1235           break;
1236       }
1237     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1238 
1239     xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__neonfma_acc2;
1240     xnn_params.f32.dwconv[2].channel_tile = 4;
1241     xnn_params.f32.dwconv[2].primary_tile = 25;
1242 
1243     xnn_params.f32.avgpool = (struct avgpool_parameters) {
1244       .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
1245       .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
1246       .mr = 9,
1247       .qr = 8,
1248     };
1249     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
1250       .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
1251       .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
1252       .mr = 9,
1253       .qr = 8,
1254     };
1255     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
1256       .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
1257       .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
1258       .mr = 7,
1259     };
1260     xnn_params.f32.maxpool = (struct maxpool_parameters) {
1261       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
1262       .mr = 9,
1263       .qr = 8,
1264     };
1265     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1266       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
1267       .mr = 4,
1268     };
1269     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1270       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
1271       .mr = 9,
1272     };
1273     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1274       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
1275       .mr = 9,
1276       .qr = 8,
1277     };
1278     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1279       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
1280       .pixel_tile = 1,
1281       .channel_tile = 8,
1282     };
1283     xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8;
1284     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon_x8;
1285     xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16;
1286     xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon_x16;
1287     xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8;
1288     xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8;
1289     xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
1290     xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8;
1291     xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8;
1292     xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8;
1293     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16;
1294     xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8;
1295     xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4;
1296     xnn_params.f32.prelu = (struct prelu_parameters) {
1297       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
1298       .row_tile = 2,
1299       .channel_tile = 8,
1300     };
1301     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x16;
1302     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
1303     xnn_params.f32.vadd = (struct vbinary_parameters) {
1304       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
1305       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1306       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1307       .element_tile = 8,
1308     };
1309     xnn_params.f32.vdiv = (struct vbinary_parameters) {
1310       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
1311       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
1312       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
1313       .element_tile = 8,
1314     };
1315     xnn_params.f32.vmax = (struct vbinary_parameters) {
1316       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
1317       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1318       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1319       .element_tile = 8,
1320     };
1321     xnn_params.f32.vmin = (struct vbinary_parameters) {
1322       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
1323       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1324       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1325       .element_tile = 8,
1326     };
1327     xnn_params.f32.vmul = (struct vbinary_parameters) {
1328       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
1329       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1330       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1331       .element_tile = 8,
1332     };
1333     xnn_params.f32.vsub = (struct vbinary_parameters) {
1334       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
1335       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
1336       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
1337       .element_tile = 8,
1338     };
1339     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
1340       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
1341       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1342       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1343       .element_tile = 8,
1344     };
1345     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1346       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
1347       .channel_tile = 4,
1348       .row_tile = 2,
1349     };
1350     #ifndef XNN_NO_NCHW_OPERATORS
1351       init_flags |= XNN_INIT_FLAG_CHW_OPT;
1352 
1353       xnn_params.f32.spmm = (struct spmm_parameters) {
1354         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
1355         .mr = 32,
1356         .nr = 1,
1357       };
1358       xnn_params.f32.spmm2 = (struct spmm_parameters) {
1359         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
1360         .mr = 32,
1361         .nr = 2,
1362       };
1363       xnn_params.f32.spmm4 = (struct spmm_parameters) {
1364         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
1365         .mr = 32,
1366         .nr = 4,
1367       };
1368       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1369         .ukernel_with_symm_padding =
1370           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
1371         .output_channel_tile = 4,
1372         .output_height_tile = 2,
1373         .output_width_tile = 2,
1374       };
1375       xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1376         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
1377         .output_width_tile = 4,
1378         .output_height_tile = 3,
1379       };
1380       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1381         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
1382         .output_width_tile = 4,
1383         .output_height_tile = 2,
1384       };
1385       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1386         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
1387         .output_width_tile = 4,
1388         .output_height_tile = 4,
1389       };
1390       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1391         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
1392         .output_width_tile = 4,
1393         .output_height_tile = 1,
1394       };
1395       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1396         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
1397         .channel_tile = 4,
1398       };
1399       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1400         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
1401         .channel_tile = 1,
1402         .pixel_tile = 4,
1403       };
1404     #endif  // XNN_NO_NCHW_OPERATORS
1405   #endif  // XNN_NO_F32_OPERATORS
1406 
1407   /**************************** X32 micro-kernels ****************************/
1408   #ifndef XNN_NO_X32_OPERATORS
1409     init_flags |= XNN_INIT_FLAG_X32;
1410 
1411     xnn_params.x32.fill = (struct fill_parameters) {
1412       .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__neon,
1413       .row_tile = 1,
1414     };
1415     xnn_params.x32.pad = (struct pad_parameters) {
1416       .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__neon,
1417       .row_tile = 1,
1418     };
1419     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
1420     xnn_params.x32.zip = (struct zip_parameters) {
1421       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
1422       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
1423       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
1424       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
1425     };
1426     #ifndef XNN_NO_NCHW_OPERATORS
1427       xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1428         .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
1429         .channel_tile = 1,
1430         .pixel_tile = 1,
1431       };
1432     #endif  // XNN_NO_NCHW_OPERATORS
1433   #endif  // XNN_NO_X32_OPERATORS
1434 
1435 #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1436   if (!cpuinfo_has_x86_sse2()) {
1437     xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
1438     return;
1439   }
1440 
1441   /**************************** XX micro-kernels ****************************/
1442   #ifndef XNN_NO_XX_OPERATORS
1443     init_flags |= XNN_INIT_FLAG_XX;
1444 
1445     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1446   #endif
1447 
1448   /**************************** QS8 micro-kernels ****************************/
1449   #ifndef XNN_NO_QS8_OPERATORS
1450     init_flags |= XNN_INIT_FLAG_QS8;
1451 
1452     if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
1453       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx);
1454       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx);
1455       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx);
1456       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx);
1457       xnn_params.qs8.gemm.mr = 4;
1458       xnn_params.qs8.gemm.nr = 16;
1459       xnn_params.qs8.gemm.log2_kr = 3;
1460     } else if (cpuinfo_has_x86_xop()) {
1461       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
1462       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64);
1463       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64);
1464       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64);
1465       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64);
1466       xnn_params.qs8.gemm.mr = 2;
1467       xnn_params.qs8.gemm.nr = 4;
1468       xnn_params.qs8.gemm.log2_kr = 3;
1469     } else if (cpuinfo_has_x86_avx2()) {
1470       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2);
1471       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2);
1472       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2);
1473       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2);
1474       xnn_params.qs8.gemm.mr = 3;
1475       xnn_params.qs8.gemm.nr = 8;
1476       xnn_params.qs8.gemm.log2_kr = 3;
1477     } else if (cpuinfo_has_x86_sse4_1()) {
1478       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64);
1479       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64);
1480       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64);
1481       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64);
1482       xnn_params.qs8.gemm.mr = 3;
1483       xnn_params.qs8.gemm.nr = 4;
1484       xnn_params.qs8.gemm.log2_kr = 3;
1485     } else if (cpuinfo_has_x86_ssse3()) {
1486       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64);
1487       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld64);
1488       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64);
1489       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64);
1490       xnn_params.qs8.gemm.mr = 3;
1491       xnn_params.qs8.gemm.nr = 4;
1492       xnn_params.qs8.gemm.log2_kr = 3;
1493     } else {
1494       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64);
1495       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld64);
1496       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64);
1497       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64);
1498       xnn_params.qs8.gemm.mr = 3;
1499       xnn_params.qs8.gemm.nr = 4;
1500       xnn_params.qs8.gemm.log2_kr = 3;
1501     }
1502 
1503     if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
1504       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32;
1505       xnn_params.qs8.dwconv[0].channel_tile = 32;
1506     } else if (cpuinfo_has_x86_avx2()) {
1507       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32;
1508       xnn_params.qs8.dwconv[0].channel_tile = 16;
1509     } else if (cpuinfo_has_x86_sse4_1()) {
1510       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16;
1511       xnn_params.qs8.dwconv[0].channel_tile = 8;
1512     } else if (cpuinfo_has_x86_ssse3()) {
1513       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__ssse3_mul16;
1514       xnn_params.qs8.dwconv[0].channel_tile = 8;
1515     } else if (cpuinfo_has_x86_sse2()) {
1516       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__sse2_mul16;
1517       xnn_params.qs8.dwconv[0].channel_tile = 8;
1518     }
1519     xnn_params.qs8.dwconv[0].primary_tile = 9;
1520 
1521     if (cpuinfo_has_x86_sse4_1()) {
1522       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1523         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__sse41_c8_acc2,
1524         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c8_acc2,
1525         .mr = 7,
1526       };
1527     } else if (cpuinfo_has_x86_ssse3()) {
1528       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1529         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__ssse3_c8_acc2,
1530         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__ssse3_c8_acc2,
1531         .mr = 7,
1532       };
1533     } else if (cpuinfo_has_x86_sse2()) {
1534       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1535         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c8_acc2,
1536         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c8_acc2,
1537         .mr = 7,
1538       };
1539     }
1540 
1541     if (cpuinfo_has_x86_xop()) {
1542       xnn_params.qs8.vadd = (struct vbinary_parameters) {
1543         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
1544         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
1545         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
1546         .element_tile = 8,
1547       };
1548     } else if (cpuinfo_has_x86_sse4_1()) {
1549       xnn_params.qs8.vadd = (struct vbinary_parameters) {
1550         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
1551         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
1552         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
1553         .element_tile = 8,
1554       };
1555     } else {
1556       xnn_params.qs8.vadd = (struct vbinary_parameters) {
1557         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
1558         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
1559         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
1560         .element_tile = 8,
1561       };
1562     }
1563   #endif  // XNN_NO_QS8_OPERATORS
1564 
1565   /**************************** QU8 micro-kernels ****************************/
1566   #ifndef XNN_NO_QU8_OPERATORS
1567     init_flags |= XNN_INIT_FLAG_QU8;
1568 
1569     xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2);
1570     xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2);
1571     xnn_params.qu8.gemm.mr = 4;
1572     xnn_params.qu8.gemm.nr = 4;
1573     xnn_params.qu8.gemm.log2_kr = 1;
1574 
1575     xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2;
1576     xnn_params.qu8.dwconv[0].channel_tile = 8;
1577     xnn_params.qu8.dwconv[0].primary_tile = 9;
1578 
1579     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
1580       .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
1581       .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
1582       .mr = 9,
1583       .qr = 8,
1584     };
1585     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
1586       .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8,
1587       .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__sse2_c8,
1588       .mr = 7,
1589     };
1590     xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2;
1591   #endif  // XNN_NO_QU8_OPERATORS
1592 
1593   /**************************** U8 micro-kernels ****************************/
1594   #ifndef XNN_NO_U8_OPERATORS
1595     init_flags |= XNN_INIT_FLAG_U8;
1596 
1597     xnn_params.u8.maxpool = (struct maxpool_parameters) {
1598       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
1599       .mr = 9,
1600       .qr = 8,
1601     };
1602     xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__sse2_x64;
1603     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1604     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
1605   #endif  // XNN_NO_U8_OPERATORS
1606 
1607   /**************************** X8 micro-kernels ****************************/
1608   #ifndef XNN_NO_X8_OPERATORS
1609     init_flags |= XNN_INIT_FLAG_X8;
1610 
1611     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
1612     xnn_params.x8.zip = (struct zip_parameters) {
1613       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
1614       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
1615       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
1616       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
1617     };
1618   #endif  // XNN_NO_X8_OPERATORS
1619 
1620   /**************************** F32 micro-kernels ****************************/
1621   #ifndef XNN_NO_F32_OPERATORS
1622     init_flags |= XNN_INIT_FLAG_F32;
1623 
1624     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1625       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
1626       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
1627       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
1628       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
1629       xnn_params.f32.gemm.mr = 7;
1630       xnn_params.f32.gemm.nr = 16;
1631     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
1632       switch (cpuinfo_get_core(0)->uarch) {
1633         case cpuinfo_uarch_zen:
1634         case cpuinfo_uarch_dhyana:
1635           xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
1636           xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
1637           xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
1638           xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
1639           xnn_params.f32.gemm.mr = 4;
1640           xnn_params.f32.gemm.nr = 16;
1641           xnn_params.f32.gemm.log2_sr = 2;
1642           break;
1643         default:
1644           xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
1645           xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
1646           xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
1647           xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
1648           xnn_params.f32.gemm.mr = 5;
1649           xnn_params.f32.gemm.nr = 16;
1650           break;
1651       }
1652     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1653       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
1654       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
1655       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
1656       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
1657       xnn_params.f32.gemm.mr = 5;
1658       xnn_params.f32.gemm.nr = 16;
1659     } else {
1660       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
1661       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
1662       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
1663       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
1664       xnn_params.f32.gemm.mr = 4;
1665       xnn_params.f32.gemm.nr = 8;
1666     }
1667     xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
1668     xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
1669     xnn_params.f32.gemm2.mr = 4;
1670     xnn_params.f32.gemm2.nr = 2;
1671     xnn_params.f32.gemm2.log2_kr = 2;
1672 
1673     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1674       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
1675       xnn_params.f32.dwconv[0].channel_tile = 16;
1676       xnn_params.f32.dwconv[0].primary_tile = 4;
1677 
1678       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
1679       xnn_params.f32.dwconv[1].channel_tile = 16;
1680       xnn_params.f32.dwconv[1].primary_tile = 9;
1681 
1682       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
1683       xnn_params.f32.dwconv[2].channel_tile = 16;
1684       xnn_params.f32.dwconv[2].primary_tile = 25;
1685     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
1686       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
1687       xnn_params.f32.dwconv[0].channel_tile = 16;
1688       xnn_params.f32.dwconv[0].primary_tile = 4;
1689 
1690       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
1691       xnn_params.f32.dwconv[1].channel_tile = 16;
1692       xnn_params.f32.dwconv[1].primary_tile = 9;
1693 
1694       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
1695       xnn_params.f32.dwconv[2].channel_tile = 8;
1696       xnn_params.f32.dwconv[2].primary_tile = 25;
1697     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1698       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
1699       xnn_params.f32.dwconv[0].channel_tile = 16;
1700       xnn_params.f32.dwconv[0].primary_tile = 4;
1701 
1702       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
1703       xnn_params.f32.dwconv[1].channel_tile = 16;
1704       xnn_params.f32.dwconv[1].primary_tile = 9;
1705 
1706       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
1707       xnn_params.f32.dwconv[2].channel_tile = 8;
1708       xnn_params.f32.dwconv[2].primary_tile = 25;
1709     } else {
1710       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
1711       xnn_params.f32.dwconv[0].channel_tile = 8;
1712       xnn_params.f32.dwconv[0].primary_tile = 4;
1713 
1714       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
1715       xnn_params.f32.dwconv[1].channel_tile = 8;
1716       xnn_params.f32.dwconv[1].primary_tile = 9;
1717 
1718       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
1719       xnn_params.f32.dwconv[2].channel_tile = 8;
1720       xnn_params.f32.dwconv[2].primary_tile = 25;
1721     }
1722     xnn_params.f32.avgpool = (struct avgpool_parameters) {
1723       .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
1724       .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
1725       .mr = 9,
1726       .qr = 8,
1727     };
1728     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
1729       .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
1730       .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
1731       .mr = 9,
1732       .qr = 8,
1733     };
1734     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
1735       .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
1736       .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
1737       .mr = 7,
1738     };
1739     xnn_params.f32.maxpool = (struct maxpool_parameters) {
1740       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
1741       .mr = 9,
1742       .qr = 8,
1743     };
1744     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1745       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
1746       .mr = 4,
1747     };
1748     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1749       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
1750       .mr = 9,
1751     };
1752     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1753       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
1754       .mr = 9,
1755       .qr = 8,
1756     };
1757     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1758       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
1759       .pixel_tile = 1,
1760       .channel_tile = 8,
1761     };
1762     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1763       xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16;
1764     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1765       xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx_x16;
1766     } else {
1767       xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__sse_x8;
1768     }
1769     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1770       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx512f_x16;
1771     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1772       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx_x16;
1773     } else {
1774       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__sse_x8;
1775     }
1776     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1777       xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64;
1778     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
1779       xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56;
1780     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1781       xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32;
1782     } else {
1783       xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12;
1784     }
1785     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1786       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__avx512f_x16;
1787     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
1788       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__fma3_x16;
1789     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1790       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__avx_x16;
1791     } else {
1792       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__sse_x8;
1793     }
1794     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1795       xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16;
1796     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1797       xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16;
1798     } else if (cpuinfo_has_x86_sse4_1()) {
1799       xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8;
1800     } else {
1801       xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8;
1802     }
1803     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1804       xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16;
1805     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1806       xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx_x16;
1807     } else {
1808       xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__sse_x8;
1809     }
1810     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1811       xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16;
1812       xnn_params.f32.rndz  = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16;
1813       xnn_params.f32.rndu  = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16;
1814       xnn_params.f32.rndd  = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16;
1815     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1816       xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16;
1817       xnn_params.f32.rndz  = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16;
1818       xnn_params.f32.rndu  = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16;
1819       xnn_params.f32.rndd  = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16;
1820     } else if (cpuinfo_has_x86_sse4_1()) {
1821       xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8;
1822       xnn_params.f32.rndz  = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8;
1823       xnn_params.f32.rndu  = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8;
1824       xnn_params.f32.rndd  = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8;
1825     } else {
1826       xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8;
1827       xnn_params.f32.rndz  = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8;
1828       xnn_params.f32.rndu  = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8;
1829       xnn_params.f32.rndd  = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8;
1830     }
1831     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1832       xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64;
1833     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
1834       xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x40;
1835     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1836       xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__avx_rr2_p5_nr2_x40;
1837     } else if (cpuinfo_has_x86_sse4_1()) {
1838       xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__sse41_lut64_p2_div_x8;
1839     } else {
1840       xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__sse2_lut64_p2_div_x8;
1841     }
1842     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1843       xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16;
1844     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1845       xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16;
1846     } else {
1847       xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8;
1848     }
1849     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1850       xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8;
1851     } else {
1852       xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4;
1853     }
1854     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1855       xnn_params.f32.prelu = (struct prelu_parameters) {
1856         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
1857         .row_tile = 2,
1858         .channel_tile = 16,
1859       };
1860     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1861       xnn_params.f32.prelu = (struct prelu_parameters) {
1862         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
1863         .row_tile = 2,
1864         .channel_tile = 16,
1865       };
1866     } else if (cpuinfo_has_x86_sse4_1()) {
1867       xnn_params.f32.prelu = (struct prelu_parameters) {
1868         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
1869         .row_tile = 2,
1870         .channel_tile = 8,
1871       };
1872     } else {
1873       xnn_params.f32.prelu = (struct prelu_parameters) {
1874         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
1875         .row_tile = 2,
1876         .channel_tile = 8,
1877       };
1878     }
1879     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2;
1880     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
1881     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1882       xnn_params.f32.vadd = (struct vbinary_parameters) {
1883         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
1884         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
1885         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
1886         .element_tile = 32,
1887       };
1888       xnn_params.f32.vdiv = (struct vbinary_parameters) {
1889         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
1890         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
1891         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
1892         .element_tile = 32,
1893       };
1894       xnn_params.f32.vmax = (struct vbinary_parameters) {
1895         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
1896         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
1897         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
1898         .element_tile = 32,
1899       };
1900       xnn_params.f32.vmin = (struct vbinary_parameters) {
1901         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
1902         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
1903         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
1904         .element_tile = 32,
1905       };
1906       xnn_params.f32.vmul = (struct vbinary_parameters) {
1907         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
1908         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
1909         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
1910         .element_tile = 32,
1911       };
1912       xnn_params.f32.vsub = (struct vbinary_parameters) {
1913         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
1914         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
1915         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
1916         .element_tile = 32,
1917       };
1918       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
1919         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
1920         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
1921         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
1922         .element_tile = 32,
1923       };
1924     } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
1925       xnn_params.f32.vadd = (struct vbinary_parameters) {
1926         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
1927         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
1928         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
1929         .element_tile = 16,
1930       };
1931       xnn_params.f32.vdiv = (struct vbinary_parameters) {
1932         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
1933         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
1934         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
1935         .element_tile = 16,
1936       };
1937       xnn_params.f32.vmax = (struct vbinary_parameters) {
1938         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
1939         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
1940         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
1941         .element_tile = 16,
1942       };
1943       xnn_params.f32.vmin = (struct vbinary_parameters) {
1944         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
1945         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
1946         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
1947         .element_tile = 16,
1948       };
1949       xnn_params.f32.vmul = (struct vbinary_parameters) {
1950         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
1951         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
1952         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
1953         .element_tile = 16,
1954       };
1955       xnn_params.f32.vsub = (struct vbinary_parameters) {
1956         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
1957         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
1958         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
1959         .element_tile = 16,
1960       };
1961       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
1962         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
1963         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
1964         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
1965         .element_tile = 16,
1966       };
1967     } else {
1968       xnn_params.f32.vadd = (struct vbinary_parameters) {
1969         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
1970         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
1971         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
1972         .element_tile = 8,
1973       };
1974       xnn_params.f32.vdiv = (struct vbinary_parameters) {
1975         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
1976         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
1977         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
1978         .element_tile = 8,
1979       };
1980       xnn_params.f32.vmax = (struct vbinary_parameters) {
1981         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
1982         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
1983         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
1984         .element_tile = 8,
1985       };
1986       xnn_params.f32.vmin = (struct vbinary_parameters) {
1987         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
1988         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
1989         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
1990         .element_tile = 8,
1991       };
1992       xnn_params.f32.vmul = (struct vbinary_parameters) {
1993         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
1994         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
1995         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
1996         .element_tile = 8,
1997       };
1998       xnn_params.f32.vsub = (struct vbinary_parameters) {
1999         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
2000         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
2001         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
2002         .element_tile = 8,
2003       };
2004       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
2005         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
2006         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
2007         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
2008         .element_tile = 8,
2009       };
2010     }
2011     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
2012       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
2013       .channel_tile = 4,
2014       .row_tile = 2,
2015     };
2016     #ifndef XNN_NO_NCHW_OPERATORS
2017       // Sparse microkernels on x86 currently target only SSE, and on processors
2018       // with AVX ISA dense inference is expected to be faster than sparse.
2019       if (!cpuinfo_has_x86_avx()) {
2020         init_flags |= XNN_INIT_FLAG_CHW_OPT;
2021       }
2022 
2023       xnn_params.f32.spmm = (struct spmm_parameters) {
2024         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
2025         .mr = 32,
2026         .nr = 1,
2027       };
2028       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
2029         .ukernel_with_symm_padding =
2030           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
2031         .output_channel_tile = 4,
2032         .output_height_tile = 2,
2033         .output_width_tile = 2,
2034       };
2035       if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_ssse3()) {
2036         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2037           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
2038           .output_width_tile = 4,
2039           .output_height_tile = 2,
2040         };
2041       } else {
2042         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2043           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
2044           .output_width_tile = 4,
2045           .output_height_tile = 2,
2046         };
2047       }
2048       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
2049         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
2050         .output_width_tile = 4,
2051         .output_height_tile = 1,
2052       };
2053       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
2054         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
2055         .output_width_tile = 4,
2056         .output_height_tile = 4,
2057       };
2058       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2059         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
2060         .output_width_tile = 4,
2061         .output_height_tile = 2,
2062       };
2063       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2064         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
2065         .channel_tile = 4,
2066       };
2067       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
2068         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
2069         .channel_tile = 1,
2070         .pixel_tile = 4,
2071       };
2072     #endif  // XNN_NO_NCHW_OPERATORS
2073   #endif  // XNN_NO_F32_OPERATORS
2074 
2075   /**************************** X32 micro-kernels ****************************/
2076   #ifndef XNN_NO_X32_OPERATORS
2077     init_flags |= XNN_INIT_FLAG_X32;
2078 
2079     xnn_params.x32.fill = (struct fill_parameters) {
2080       .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__sse,
2081       .row_tile = 1,
2082     };
2083     xnn_params.x32.pad = (struct pad_parameters) {
2084       .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__sse,
2085       .row_tile = 1,
2086     };
2087     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
2088     xnn_params.x32.zip = (struct zip_parameters) {
2089       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
2090       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
2091       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
2092       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
2093     };
2094     #ifndef XNN_NO_NCHW_OPERATORS
2095       xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2096         .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
2097         .channel_tile = 1,
2098         .pixel_tile = 1,
2099       };
2100     #endif  // XNN_NO_NCHW_OPERATORS
2101   #endif  // XNN_NO_X32_OPERATORS
2102 
2103 #elif XNN_ARCH_WASMSIMD
2104 
2105   /**************************** XX micro-kernels ****************************/
2106   #ifndef XNN_NO_XX_OPERATORS
2107     init_flags |= XNN_INIT_FLAG_XX;
2108 
2109     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
2110   #endif
2111 
2112   /**************************** QS8 micro-kernels ****************************/
2113   #ifndef XNN_NO_QS8_OPERATORS
2114     init_flags |= XNN_INIT_FLAG_QS8;
2115 
2116     xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64);
2117     xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64);
2118     xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64);
2119     xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64);
2120     xnn_params.qs8.gemm.mr = 3;
2121     xnn_params.qs8.gemm.nr = 4;
2122     xnn_params.qs8.gemm.log2_kr = 3;
2123 
2124     xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16;
2125     xnn_params.qs8.dwconv[0].channel_tile = 8;
2126     xnn_params.qs8.dwconv[0].primary_tile = 9;
2127 
2128     xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
2129       .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__wasmsimd_c8_acc2,
2130       .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__wasmsimd_c8_acc2,
2131       .mr = 7,
2132     };
2133 
2134     xnn_params.qs8.vadd = (struct vbinary_parameters) {
2135       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8,
2136       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8,
2137       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8,
2138       .element_tile = 8,
2139     };
2140   #endif  // XNN_NO_QS8_OPERATORS
2141 
2142   /**************************** QU8 micro-kernels ****************************/
2143   #ifndef XNN_NO_QU8_OPERATORS
2144     init_flags |= XNN_INIT_FLAG_QU8;
2145 
2146     xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_2x2__scalar);
2147     xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_2x2__scalar);
2148     xnn_params.qu8.gemm.mr = 2;
2149     xnn_params.qu8.gemm.nr = 2;
2150 
2151     xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up1x9__scalar;
2152     xnn_params.qu8.dwconv[0].channel_tile = 1;
2153     xnn_params.qu8.dwconv[0].primary_tile = 9;
2154 
2155     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
2156       .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
2157       .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
2158       .mr = 9,
2159       .qr = 8,
2160     };
2161     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
2162       .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
2163       .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
2164       .mr = 7,
2165     };
2166     xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar;
2167   #endif  // XNN_NO_QU8_OPERATORS
2168 
2169   /**************************** U8 micro-kernels ****************************/
2170   #ifndef XNN_NO_U8_OPERATORS
2171     init_flags |= XNN_INIT_FLAG_U8;
2172 
2173     xnn_params.u8.maxpool = (struct maxpool_parameters) {
2174       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
2175       .mr = 9,
2176       .qr = 8,
2177     };
2178     xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__scalar_x4;
2179     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
2180     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
2181   #endif  // XNN_NO_U8_OPERATORS
2182 
2183   /**************************** X8 micro-kernels ****************************/
2184   #ifndef XNN_NO_X8_OPERATORS
2185     init_flags |= XNN_INIT_FLAG_X8;
2186 
2187     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
2188     xnn_params.x8.zip = (struct zip_parameters) {
2189       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
2190       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
2191       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
2192       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
2193     };
2194   #endif  // XNN_NO_X8_OPERATORS
2195 
2196   /**************************** F32 micro-kernels ****************************/
2197   #ifndef XNN_NO_F32_OPERATORS
2198     init_flags |= XNN_INIT_FLAG_F32;
2199 
2200     if (is_wasm_x86) {
2201       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
2202       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
2203       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
2204       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
2205       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
2206       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
2207       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
2208       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
2209       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
2210       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
2211       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
2212       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
2213       xnn_params.f32.gemm.mr = 4;
2214       xnn_params.f32.gemm.nr = 8;
2215 
2216       xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
2217       xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
2218       xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
2219       xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
2220       xnn_params.f32.gemm2.mr = 4;
2221       xnn_params.f32.gemm2.nr = 2;
2222       xnn_params.f32.gemm2.log2_kr = 2;
2223     } else {
2224       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
2225       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
2226       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
2227       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
2228       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
2229       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
2230       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
2231       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
2232       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
2233       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
2234       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
2235       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
2236       xnn_params.f32.gemm.mr = 5;
2237       xnn_params.f32.gemm.nr = 8;
2238 
2239       xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
2240       xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
2241       xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
2242       xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
2243       xnn_params.f32.gemm2.mr = 4;
2244       xnn_params.f32.gemm2.nr = 2;
2245       xnn_params.f32.gemm2.log2_kr = 2;
2246     }
2247 
2248     if (is_wasm_x86) {
2249       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
2250       xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
2251       xnn_params.f32.dwconv[0].channel_tile = 8;
2252       xnn_params.f32.dwconv[0].primary_tile = 4;
2253 
2254       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
2255       xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
2256       xnn_params.f32.dwconv[1].channel_tile = 8;
2257       xnn_params.f32.dwconv[1].primary_tile = 9;
2258     } else {
2259       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
2260       xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
2261       xnn_params.f32.dwconv[0].channel_tile = 4;
2262       xnn_params.f32.dwconv[0].primary_tile = 4;
2263 
2264       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
2265       xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
2266       xnn_params.f32.dwconv[1].channel_tile = 4;
2267       xnn_params.f32.dwconv[1].primary_tile = 9;
2268     }
2269 
2270     xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
2271     xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
2272     xnn_params.f32.dwconv[2].channel_tile = 4;
2273     xnn_params.f32.dwconv[2].primary_tile = 25;
2274 
2275     if (is_wasm_x86) {
2276       xnn_params.f32.avgpool = (struct avgpool_parameters) {
2277         .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
2278         .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
2279         .mr = 9,
2280         .qr = 8,
2281       };
2282       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
2283         .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
2284         .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
2285         .mr = 9,
2286         .qr = 8,
2287       };
2288       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
2289         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
2290         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
2291         .mr = 7,
2292       };
2293     } else {
2294       xnn_params.f32.avgpool = (struct avgpool_parameters) {
2295         .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
2296         .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
2297         .mr = 9,
2298         .qr = 8,
2299       };
2300       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
2301         .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
2302         .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
2303         .mr = 9,
2304         .qr = 8,
2305       };
2306       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
2307         .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
2308         .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
2309         .mr = 7,
2310       };
2311     }
2312     if (is_wasm_x86) {
2313       xnn_params.f32.maxpool = (struct maxpool_parameters) {
2314         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
2315         .mr = 9,
2316         .qr = 8,
2317       };
2318     } else {
2319       xnn_params.f32.maxpool = (struct maxpool_parameters) {
2320         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
2321         .mr = 9,
2322         .qr = 8,
2323       };
2324     }
2325     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
2326       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
2327       .mr = 4,
2328     };
2329     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
2330       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
2331       .mr = 9,
2332     };
2333     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
2334       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
2335       .mr = 9,
2336       .qr = 8,
2337     };
2338     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
2339       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
2340       .pixel_tile = 1,
2341       .channel_tile = 8,
2342     };
2343     xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8;
2344     if (is_wasm_x86) {
2345       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_x86_x8;
2346     } else {
2347       xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_arm_x8;
2348     }
2349     if (is_wasm_x86) {
2350       xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20;
2351     } else {
2352       xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20;
2353     }
2354     xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_x16;
2355     if (is_wasm_x86) {
2356       xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8;
2357     } else {
2358       xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8;
2359     }
2360     xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8;
2361     xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_relu_ukernel__wasmsimd_x16;
2362     xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8;
2363     if (is_wasm_x86) {
2364       xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8;
2365     } else {
2366       xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8;
2367     }
2368     xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8;
2369     xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8;
2370     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x16;
2371     xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8;
2372     xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8;
2373     if (is_wasm_x86) {
2374       xnn_params.f32.prelu = (struct prelu_parameters) {
2375         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
2376         .row_tile = 2,
2377         .channel_tile = 8,
2378       };
2379     } else {
2380       xnn_params.f32.prelu = (struct prelu_parameters) {
2381         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
2382         .row_tile = 2,
2383         .channel_tile = 8,
2384       };
2385     }
2386     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x16_acc2;
2387     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_arm;
2388     if (is_wasm_x86) {
2389       xnn_params.f32.vadd = (struct vbinary_parameters) {
2390         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
2391         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
2392         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
2393         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
2394         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
2395         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
2396         .element_tile = 16,
2397       };
2398       xnn_params.f32.vdiv = (struct vbinary_parameters) {
2399         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
2400         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
2401         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
2402         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
2403         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
2404         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
2405         .element_tile = 16,
2406       };
2407       xnn_params.f32.vmax = (struct vbinary_parameters) {
2408         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
2409         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
2410         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
2411         .element_tile = 16,
2412       };
2413       xnn_params.f32.vmin = (struct vbinary_parameters) {
2414         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
2415         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
2416         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
2417 
2418         .element_tile = 16,
2419       };
2420       xnn_params.f32.vmul = (struct vbinary_parameters) {
2421         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
2422         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
2423         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
2424         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
2425         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
2426         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
2427         .element_tile = 16,
2428       };
2429       xnn_params.f32.vsub = (struct vbinary_parameters) {
2430         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
2431         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
2432         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
2433         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
2434         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
2435         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
2436         .element_tile = 16,
2437       };
2438     } else {
2439       xnn_params.f32.vadd = (struct vbinary_parameters) {
2440         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
2441         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
2442         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
2443         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
2444         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
2445         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
2446         .element_tile = 16,
2447       };
2448       xnn_params.f32.vdiv = (struct vbinary_parameters) {
2449         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
2450         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
2451         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
2452         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
2453         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
2454         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
2455         .element_tile = 16,
2456       };
2457       xnn_params.f32.vmax = (struct vbinary_parameters) {
2458         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
2459         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
2460         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
2461         .element_tile = 16,
2462       };
2463       xnn_params.f32.vmin = (struct vbinary_parameters) {
2464         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
2465         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
2466         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
2467         .element_tile = 16,
2468       };
2469       xnn_params.f32.vmul = (struct vbinary_parameters) {
2470         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
2471         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
2472         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
2473         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
2474         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
2475         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
2476         .element_tile = 16,
2477       };
2478       xnn_params.f32.vsub = (struct vbinary_parameters) {
2479         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
2480         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
2481         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
2482         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
2483         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
2484         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
2485         .element_tile = 16,
2486       };
2487     }
2488     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
2489       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
2490       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
2491       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
2492       .element_tile = 16,
2493     };
2494     if (is_wasm_x86) {
2495       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
2496         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
2497         .channel_tile = 4,
2498         .row_tile = 2,
2499       };
2500     } else {
2501       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
2502         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
2503         .channel_tile = 4,
2504         .row_tile = 2,
2505       };
2506     }
2507     #ifndef XNN_NO_NCHW_OPERATORS
2508       init_flags |= XNN_INIT_FLAG_CHW_OPT;
2509 
2510       if (is_wasm_x86) {
2511         xnn_params.f32.spmm = (struct spmm_parameters) {
2512           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
2513           .mr = 32,
2514           .nr = 1,
2515         };
2516       } else {
2517         xnn_params.f32.spmm = (struct spmm_parameters) {
2518           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
2519           .mr = 32,
2520           .nr = 1,
2521         };
2522       }
2523       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
2524         .ukernel_with_symm_padding =
2525           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
2526         .output_channel_tile = 4,
2527         .output_height_tile = 2,
2528         .output_width_tile = 2,
2529       };
2530       if (is_wasm_x86) {
2531         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2532           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
2533           .output_width_tile = 4,
2534           .output_height_tile = 2,
2535         };
2536         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
2537           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
2538           .output_width_tile = 4,
2539           .output_height_tile = 1,
2540         };
2541         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
2542           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
2543           .output_width_tile = 4,
2544           .output_height_tile = 3,
2545         };
2546         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2547           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
2548           .output_width_tile = 4,
2549           .output_height_tile = 1,
2550         };
2551       } else {
2552         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2553           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
2554           .output_width_tile = 4,
2555           .output_height_tile = 2,
2556         };
2557         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
2558           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
2559           .output_width_tile = 4,
2560           .output_height_tile = 1,
2561         };
2562         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
2563           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
2564           .output_width_tile = 4,
2565           .output_height_tile = 3,
2566         };
2567         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2568           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
2569           .output_width_tile = 4,
2570           .output_height_tile = 1,
2571         };
2572       }
2573       if (is_wasm_x86) {
2574         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2575           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
2576           .channel_tile = 4,
2577         };
2578       } else {
2579         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2580           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
2581           .channel_tile = 4,
2582         };
2583       }
2584       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
2585         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
2586         .channel_tile = 1,
2587         .pixel_tile = 4,
2588       };
2589     #endif  // XNN_NO_NCHW_OPERATORS
2590   #endif  // XNN_NO_F32_OPERATORS
2591 
2592   /**************************** X32 micro-kernels ****************************/
2593   #ifndef XNN_NO_X32_OPERATORS
2594     init_flags |= XNN_INIT_FLAG_X32;
2595 
2596     xnn_params.x32.fill = (struct fill_parameters) {
2597       .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__wasmsimd,
2598       .row_tile = 1,
2599     };
2600     xnn_params.x32.pad = (struct pad_parameters) {
2601       .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__wasmsimd,
2602       .row_tile = 1,
2603     };
2604     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
2605     xnn_params.x32.zip = (struct zip_parameters) {
2606       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
2607       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
2608       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
2609       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
2610     };
2611     #ifndef XNN_NO_NCHW_OPERATORS
2612       xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2613         .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
2614         .channel_tile = 1,
2615         .pixel_tile = 1,
2616       };
2617     #endif  // XNN_NO_NCHW_OPERATORS
2618   #endif  // XNN_NO_X32_OPERATORS
2619 
2620 #elif XNN_ARCH_WASM
2621 
2622   /**************************** XX micro-kernels ****************************/
2623   #ifndef XNN_NO_XX_OPERATORS
2624     init_flags |= XNN_INIT_FLAG_XX;
2625 
2626     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
2627   #endif
2628 
2629   /**************************** QU8 micro-kernels ****************************/
2630   #ifndef XNN_NO_QU8_OPERATORS
2631     init_flags |= XNN_INIT_FLAG_QU8;
2632 
2633     xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_2x2__scalar);
2634     xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_2x2__scalar);
2635     xnn_params.qu8.gemm.mr = 2;
2636     xnn_params.qu8.gemm.nr = 2;
2637 
2638     xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up1x9__scalar;
2639     xnn_params.qu8.dwconv[0].channel_tile = 1;
2640     xnn_params.qu8.dwconv[0].primary_tile = 9;
2641 
2642     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
2643       .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
2644       .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
2645       .mr = 9,
2646       .qr = 8,
2647     };
2648     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
2649       .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
2650       .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
2651       .mr = 7,
2652     };
2653     xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar;
2654   #endif  // XNN_NO_QU8_OPERATORS
2655 
2656   /**************************** U8 micro-kernels ****************************/
2657   #ifndef XNN_NO_U8_OPERATORS
2658     init_flags |= XNN_INIT_FLAG_U8;
2659 
2660     xnn_params.u8.maxpool = (struct maxpool_parameters) {
2661       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
2662       .mr = 9,
2663       .qr = 8,
2664     };
2665     xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__scalar_x4;
2666     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
2667     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
2668   #endif  // XNN_NO_U8_OPERATORS
2669 
2670   /**************************** X8 micro-kernels ****************************/
2671   #ifndef XNN_NO_X8_OPERATORS
2672     init_flags |= XNN_INIT_FLAG_X8;
2673 
2674     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
2675     xnn_params.x8.zip = (struct zip_parameters) {
2676       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
2677       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
2678       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
2679       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
2680     };
2681   #endif  // XNN_NO_X8_OPERATORS
2682 
2683   /**************************** F32 micro-kernels ****************************/
2684   #ifndef XNN_NO_F32_OPERATORS
2685     init_flags |= XNN_INIT_FLAG_F32;
2686 
2687     if (is_wasm_x86) {
2688       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
2689       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
2690       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
2691       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
2692       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
2693       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
2694       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
2695       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
2696       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
2697       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
2698       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
2699       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
2700       xnn_params.f32.gemm.mr = 2;
2701       xnn_params.f32.gemm.nr = 4;
2702     } else {
2703       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
2704       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
2705       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
2706       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
2707       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
2708       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
2709       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
2710       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
2711       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm);
2712       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm);
2713       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
2714       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
2715       xnn_params.f32.gemm.mr = 4;
2716       xnn_params.f32.gemm.nr = 4;
2717     }
2718     xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
2719     xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm),
2720     xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__wasm);
2721     xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm),
2722     xnn_params.f32.gemm2.mr = 4;
2723     xnn_params.f32.gemm2.nr = 2;
2724 
2725     xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
2726     xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2;
2727     xnn_params.f32.dwconv[0].channel_tile = 1;
2728     xnn_params.f32.dwconv[0].primary_tile = 4;
2729 
2730     xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
2731     xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2;
2732     xnn_params.f32.dwconv[1].channel_tile = 1;
2733     xnn_params.f32.dwconv[1].primary_tile = 9;
2734 
2735     xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
2736     xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2;
2737     xnn_params.f32.dwconv[2].channel_tile = 1;
2738     xnn_params.f32.dwconv[2].primary_tile = 25;
2739 
2740     xnn_params.f32.avgpool = (struct avgpool_parameters) {
2741       .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
2742       .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
2743       .mr = 9,
2744       .qr = 8,
2745     };
2746     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
2747       .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
2748       .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
2749       .mr = 9,
2750       .qr = 8,
2751     };
2752     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
2753       .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
2754       .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
2755       .mr = 7,
2756     };
2757     xnn_params.f32.maxpool = (struct maxpool_parameters) {
2758       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
2759       .mr = 9,
2760       .qr = 8,
2761     };
2762     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
2763       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
2764       .mr = 4,
2765     };
2766     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
2767       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
2768       .mr = 9,
2769     };
2770     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
2771       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
2772       .mr = 9,
2773       .qr = 8,
2774     };
2775     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
2776       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
2777       .pixel_tile = 1,
2778       .channel_tile = 2,
2779     };
2780     xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4;
2781     xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasm_x4;
2782     if (is_wasm_x86) {
2783       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__scalar_x4;
2784     } else {
2785       xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasm_x4;
2786     }
2787     if (is_wasm_x86) {
2788       xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2;
2789     } else {
2790       xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6;
2791     }
2792     xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4;
2793     xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4;
2794     if (is_wasm_x86) {
2795       xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_relu_ukernel__scalar_x8;
2796     } else {
2797       xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_relu_ukernel__wasm_x8;
2798     }
2799     xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4;
2800     xnn_params.f32.rndz  = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4;
2801     xnn_params.f32.rndu  = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4;
2802     xnn_params.f32.rndd  = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4;
2803     xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x2;
2804     xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4;
2805     xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
2806     if (is_wasm_x86) {
2807       xnn_params.f32.prelu = (struct prelu_parameters) {
2808         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
2809         .row_tile = 2,
2810         .channel_tile = 4,
2811       };
2812     } else {
2813       xnn_params.f32.prelu = (struct prelu_parameters) {
2814         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
2815         .row_tile = 2,
2816         .channel_tile = 4,
2817       };
2818     }
2819     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
2820     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
2821     xnn_params.f32.vadd = (struct vbinary_parameters) {
2822       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
2823       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
2824       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
2825       .element_tile = 8,
2826     };
2827     xnn_params.f32.vdiv = (struct vbinary_parameters) {
2828       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
2829       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
2830       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
2831       .element_tile = 8,
2832     };
2833     xnn_params.f32.vmax = (struct vbinary_parameters) {
2834       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
2835       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
2836       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
2837       .element_tile = 8,
2838     };
2839     xnn_params.f32.vmin = (struct vbinary_parameters) {
2840       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
2841       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
2842       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
2843       .element_tile = 8,
2844     };
2845     xnn_params.f32.vmul = (struct vbinary_parameters) {
2846       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
2847       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
2848       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
2849       .element_tile = 8,
2850     };
2851     xnn_params.f32.vsub = (struct vbinary_parameters) {
2852       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
2853       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
2854       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
2855       .element_tile = 8,
2856     };
2857     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
2858       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
2859       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
2860       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
2861       .element_tile = 8,
2862     };
2863     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
2864       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
2865       .channel_tile = 1,
2866       .row_tile = 2,
2867     };
2868     #ifndef XNN_NO_NCHW_OPERATORS
2869       init_flags |= XNN_INIT_FLAG_CHW_OPT;
2870 
2871       xnn_params.f32.spmm = (struct spmm_parameters) {
2872         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
2873         .mr = 8,
2874         .nr = 1,
2875       };
2876       xnn_params.f32.spmm2 = (struct spmm_parameters) {
2877         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
2878         .mr = 8,
2879         .nr = 2,
2880       };
2881       xnn_params.f32.spmm4 = (struct spmm_parameters) {
2882         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
2883         .mr = 8,
2884         .nr = 4,
2885       };
2886       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
2887         .ukernel_with_symm_padding =
2888           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
2889         .output_channel_tile = 4,
2890         .output_height_tile = 1,
2891         .output_width_tile = 1,
2892       };
2893       xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2894         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
2895         .output_width_tile = 1,
2896         .output_height_tile = 2,
2897       };
2898       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
2899         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
2900         .output_width_tile = 1,
2901         .output_height_tile = 1,
2902       };
2903       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
2904         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
2905         .output_width_tile = 1,
2906         .output_height_tile = 1,
2907       };
2908       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2909         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
2910         .output_width_tile = 1,
2911         .output_height_tile = 1,
2912       };
2913       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2914         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
2915         .channel_tile = 1,
2916       };
2917       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
2918         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
2919         .channel_tile = 1,
2920         .pixel_tile = 4,
2921       };
2922     #endif  // XNN_NO_NCHW_OPERATORS
2923   #endif  // XNN_NO_F32_OPERATORS
2924 
2925   /**************************** X32 micro-kernels ****************************/
2926   #ifndef XNN_NO_X32_OPERATORS
2927     init_flags |= XNN_INIT_FLAG_X32;
2928 
2929     xnn_params.x32.fill = (struct fill_parameters) {
2930       .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_float,
2931       .row_tile = 1,
2932     };
2933     xnn_params.x32.pad = (struct pad_parameters) {
2934       .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__scalar_float,
2935       .row_tile = 1,
2936     };
2937     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
2938     xnn_params.x32.zip = (struct zip_parameters) {
2939       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
2940       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
2941       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
2942       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
2943     };
2944     #ifndef XNN_NO_NCHW_OPERATORS
2945       xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2946         .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
2947         .channel_tile = 1,
2948         .pixel_tile = 1,
2949       };
2950     #endif  // XNN_NO_NCHW_OPERATORS
2951   #endif  // XNN_NO_X32_OPERATORS
2952 
2953 #else
2954   #error "Unsupported architecture"
2955 #endif
2956   xnn_params.init_flags = init_flags;
2957 }
2958 
#if defined(_WIN32)
  // Adapter that gives init() the callback signature required by
  // InitOnceExecuteOnce. None of the three arguments is used; the callback
  // runs init() and unconditionally returns TRUE.
  static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
    (void) init_once;
    (void) parameter;
    (void) context;

    init();
    return TRUE;
  }
#endif  // defined(_WIN32)
2965 
xnn_initialize(const struct xnn_allocator * allocator)2966 enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
2967   #ifndef __EMSCRIPTEN__
2968     if (!cpuinfo_initialize()) {
2969       return xnn_status_out_of_memory;
2970     }
2971   #endif
2972   #ifdef _WIN32
2973     InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
2974   #else
2975     pthread_once(&init_guard, &init);
2976   #endif
2977   if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
2978     if (allocator != NULL) {
2979       memcpy(&xnn_params.allocator, allocator, sizeof(struct xnn_allocator));
2980     } else {
2981       xnn_params.allocator.allocate = &xnn_allocate;
2982       xnn_params.allocator.reallocate = &xnn_reallocate;
2983       xnn_params.allocator.deallocate = &xnn_deallocate;
2984       xnn_params.allocator.aligned_allocate = &xnn_aligned_allocate;
2985       xnn_params.allocator.aligned_deallocate = &xnn_aligned_deallocate;
2986     }
2987     return xnn_status_success;
2988   } else {
2989     return xnn_status_unsupported_hardware;
2990   }
2991 }
2992 
xnn_deinitialize(void)2993 enum xnn_status xnn_deinitialize(void) {
2994   #ifndef __EMSCRIPTEN__
2995     cpuinfo_deinitialize();
2996   #endif
2997   return xnn_status_success;
2998 }
2999