1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <math.h>
10 #include <stdbool.h>
11 #include <stddef.h>
12 #include <stdint.h>
13 #include <string.h>
14 
15 #ifdef _WIN32
16   #include <windows.h>
17 #else
18   #include <pthread.h>
19 #endif
20 
21 #ifdef _MSC_VER
22   #include <intrin.h>
23 #endif
24 
25 #ifndef __EMSCRIPTEN__
26   #include <cpuinfo.h>
27 #endif
28 
29 #include <xnnpack.h>
30 #include <xnnpack/allocator.h>
31 #include <xnnpack/argmaxpool.h>
32 #include <xnnpack/avgpool.h>
33 #include <xnnpack/common.h>
34 #include <xnnpack/conv.h>
35 #include <xnnpack/dwconv.h>
36 #include <xnnpack/depthtospace.h>
37 #include <xnnpack/gavgpool.h>
38 #include <xnnpack/gemm.h>
39 #include <xnnpack/fill.h>
40 #include <xnnpack/ibilinear.h>
41 #include <xnnpack/igemm.h>
42 #include <xnnpack/log.h>
43 #include <xnnpack/lut.h>
44 #include <xnnpack/maxpool.h>
45 #include <xnnpack/pad.h>
46 #include <xnnpack/params.h>
47 #include <xnnpack/params-init.h>
48 #include <xnnpack/pavgpool.h>
49 #include <xnnpack/prelu.h>
50 #include <xnnpack/raddstoreexpminusmax.h>
51 #include <xnnpack/rmax.h>
52 #include <xnnpack/spmm.h>
53 #include <xnnpack/unpool.h>
54 #include <xnnpack/vaddsub.h>
55 #include <xnnpack/vbinary.h>
56 #include <xnnpack/vcvt.h>
57 #include <xnnpack/vmul.h>
58 #include <xnnpack/vmulcaddc.h>
59 #include <xnnpack/vunary.h>
60 #include <xnnpack/zip.h>
61 
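// Assembly micro-kernels are built in by default; a build can opt out by defining XNN_ENABLE_ASSEMBLY=0.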
62 #ifndef XNN_ENABLE_ASSEMBLY
63   #define XNN_ENABLE_ASSEMBLY 1
64 #endif
65 
66 #if XNN_PLATFORM_WINDOWS
67   static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
68 #else
69   static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
70 #endif
71 
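// Allocator handed to xnn_initialize(); presumably read inside init() when the parameter table is filled in (that code is outside this excerpt).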
72 static const struct xnn_allocator* volatile init_allocator = NULL;
73 
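// Global dispatch table: the selected micro-kernels and their tile parameters. init_flags records which operator classes were successfully initialized.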
74 struct xnn_parameters xnn_params = {
75   .init_flags = 0
76 };
77 
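// init() below is the once-only body. A minimal sketch of how it is presumably dispatched
// (the actual call sites live further down in this file, outside this excerpt):
//   #if XNN_PLATFORM_WINDOWS
//     InitOnceExecuteOnce(&init_guard, &init_windows_wrapper, NULL, NULL);
//   #else
//     pthread_once(&init_guard, &init);
//   #endif
// where init_windows_wrapper is a hypothetical BOOL CALLBACK adapter around init().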
78 static void init(void) {
79 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
80   // Unlike most other architectures, on x86/x86-64 when floating-point instructions
81   // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
82   // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
83   // of two infinities (must produce NaN per IEEE 754 standard).
84   static const volatile float inf = INFINITY;
85   const bool is_wasm_x86 = signbit(inf - inf);
86 #endif
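  // is_wasm_x86 is consulted later (outside this excerpt) to pick WAsm SIMD kernel variants that lower better on x86 hosts.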
87   uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
88 
89 #if XNN_ARCH_ARM
90   #if XNN_PLATFORM_MOBILE
91     if (!cpuinfo_has_arm_neon()) {
92       xnn_log_error("XNNPACK initialization failed: NEON is not supported");
93       return;
94     }
95   #else
96     if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
97       xnn_log_error("XNNPACK initialization failed: VFP is not supported");
98       return;
99     }
100   #endif
101 
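  // Everything inside this block assumes NEON; ARM targets without NEON fall through to fallback kernels configured later in the file (outside this excerpt).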
102   if (cpuinfo_has_arm_neon()) {
103     /**************************** QC8 AArch32 micro-kernels ****************************/
104     #ifndef XNN_NO_QC8_OPERATORS
105       init_flags |= XNN_INIT_FLAG_QC8;
106 
107       #if XNN_ENABLE_ASSEMBLY
108         if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
109           switch (cpuinfo_get_uarch(0)->uarch) {
110             case cpuinfo_uarch_cortex_a55:
111               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
112               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
113               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
114               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
115               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
116               xnn_params.qc8.gemm.mr = 4;
117               xnn_params.qc8.gemm.nr = 8;
118               xnn_params.qc8.gemm.log2_kr = 2;
119               break;
120             default:
121               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
122               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
123               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
124               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
125               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
126               xnn_params.qc8.gemm.mr = 4;
127               xnn_params.qc8.gemm.nr = 8;
128               xnn_params.qc8.gemm.log2_kr = 2;
129               break;
130           }
131         } else {
132           switch (cpuinfo_get_uarch(0)->uarch) {
133             case cpuinfo_uarch_cortex_a7:
134               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
135               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
136               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
137               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
138               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
139               xnn_params.qc8.gemm.mr = 4;
140               xnn_params.qc8.gemm.nr = 8;
141               break;
142             case cpuinfo_uarch_cortex_a35:
143               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
144               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
145               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
146               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
147               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
148               xnn_params.qc8.gemm.mr = 4;
149               xnn_params.qc8.gemm.nr = 8;
150               break;
151             case cpuinfo_uarch_cortex_a53:
152               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53);
153               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
154               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
155               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
156               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
157               xnn_params.qc8.gemm.mr = 4;
158               xnn_params.qc8.gemm.nr = 8;
159               break;
160             case cpuinfo_uarch_cortex_a55r0:
161             case cpuinfo_uarch_kryo:
162               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53);
163               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
164               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
165               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
166               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
167               xnn_params.qc8.gemm.mr = 4;
168               xnn_params.qc8.gemm.nr = 8;
169               break;
170             case cpuinfo_uarch_cortex_a72:
171             case cpuinfo_uarch_exynos_m1:
172             case cpuinfo_uarch_exynos_m2:
173             case cpuinfo_uarch_exynos_m3:
174               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
175               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
176               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
177               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
178               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
179               xnn_params.qc8.gemm.mr = 4;
180               xnn_params.qc8.gemm.nr = 8;
181               break;
182 
183             default:
184               if (cpuinfo_has_arm_neon_v8()) {
185                 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
186                 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
187                 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
188                 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
189                 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
190                 xnn_params.qc8.gemm.mr = 4;
191                 xnn_params.qc8.gemm.nr = 8;
192               } else {
193                 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
194                 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
195                 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
196                 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
197                 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
198                 xnn_params.qc8.gemm.mr = 4;
199                 xnn_params.qc8.gemm.nr = 8;
200               }
201               break;
202           }
203         }
204         #if XNN_MAX_UARCH_TYPES > 1
205         {
206           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
207           const uint32_t mr = xnn_params.qc8.gemm.mr;
208           const uint32_t nr = xnn_params.qc8.gemm.nr;
209           const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
210           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
211             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
212             if (uarch_info == NULL) {
213               /* No more microarchitectures in the system */
214               break;
215             }
216 
217             switch (uarch_info->uarch) {
218               case cpuinfo_uarch_cortex_a55:
219                 if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
220                   xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
221                   xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
222                   xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot;
223                   xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot;
224                 }
225                 break;
226               case cpuinfo_uarch_cortex_a53:
227                 if (mr == 4 && nr == 8 && log2_kr == 0) {
228                   xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53;
229                   xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64;
230                   xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
231                   xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
232                 }
233                 break;
234               case cpuinfo_uarch_cortex_a55r0:
235                 if (mr == 4 && nr == 8 && log2_kr == 0) {
236                   xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53;
237                   xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64;
238                   xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
239                   xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
240                 }
241                 break;
242 
243               default:
244                 break;
245             }
246           }
247         }
248         #endif  // XNN_MAX_UARCH_TYPES > 1
249       #else  // XNN_ENABLE_ASSEMBLY
250         if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
251           xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot);
252           xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__neondot);
253           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
254           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
255           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
256           xnn_params.qc8.gemm.mr = 4;
257           xnn_params.qc8.gemm.nr = 8;
258           xnn_params.qc8.gemm.log2_kr = 2;
259         } else if (cpuinfo_has_arm_v8()) {
260           xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
261           xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
262           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
263           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
264           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
265           xnn_params.qc8.gemm.mr = 2;
266           xnn_params.qc8.gemm.nr = 8;
267           xnn_params.qc8.gemm.log2_kr = 1;
268           xnn_params.qc8.gemm.log2_sr = 2;
269         } else {
270           xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
271           xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
272           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
273           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
274           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
275           xnn_params.qc8.gemm.mr = 2;
276           xnn_params.qc8.gemm.nr = 8;
277           xnn_params.qc8.gemm.log2_kr = 1;
278           xnn_params.qc8.gemm.log2_sr = 2;
279         }
280       #endif  // XNN_ENABLE_ASSEMBLY
281 
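      // QC8 depthwise convolution: up16x9 covers 3x3 kernels (9 taps, 16 channels per pass), up8x25 covers 5x5 kernels (25 taps, 8 channels per pass); the *_neonv8_* variants rely on ARMv8 instructions available in AArch32 mode.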
282       if (cpuinfo_has_arm_neon_v8()) {
283         xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
284         xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
285         xnn_params.qc8.dwconv[0].channel_tile = 16;
286         xnn_params.qc8.dwconv[0].primary_tile = 9;
287         xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mla8_ld64;
288         xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
289         xnn_params.qc8.dwconv[1].channel_tile = 8;
290         xnn_params.qc8.dwconv[1].primary_tile = 25;
291       } else {
292         xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neon_mla8_ld64;
293         xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neon_params;
294         xnn_params.qc8.dwconv[0].channel_tile = 16;
295         xnn_params.qc8.dwconv[0].primary_tile = 9;
296         xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mla8_ld64;
297         xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neon_params;
298         xnn_params.qc8.dwconv[1].channel_tile = 8;
299         xnn_params.qc8.dwconv[1].primary_tile = 25;
300       }
301     #endif  // XNN_NO_QC8_OPERATORS
302 
303     /**************************** QS8 AArch32 micro-kernels ****************************/
304     #ifndef XNN_NO_QS8_OPERATORS
305       init_flags |= XNN_INIT_FLAG_QS8;
306 
307       #if XNN_ENABLE_ASSEMBLY
308         if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
309           switch (cpuinfo_get_uarch(0)->uarch) {
310             case cpuinfo_uarch_cortex_a55:
311               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
312               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
313               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
314               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
315               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
316               xnn_params.qs8.gemm.mr = 4;
317               xnn_params.qs8.gemm.nr = 8;
318               xnn_params.qs8.gemm.log2_kr = 2;
319               break;
320             default:
321               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
322               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
323               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
324               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
325               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
326               xnn_params.qs8.gemm.mr = 4;
327               xnn_params.qs8.gemm.nr = 8;
328               xnn_params.qs8.gemm.log2_kr = 2;
329               break;
330           }
331         } else {
332           switch (cpuinfo_get_uarch(0)->uarch) {
333             case cpuinfo_uarch_cortex_a7:
334               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
335               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
336               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
337               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
338               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
339               xnn_params.qs8.gemm.mr = 4;
340               xnn_params.qs8.gemm.nr = 8;
341               break;
342             case cpuinfo_uarch_cortex_a35:
343               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
344               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
345               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
346               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
347               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
348               xnn_params.qs8.gemm.mr = 4;
349               xnn_params.qs8.gemm.nr = 8;
350               break;
351             case cpuinfo_uarch_cortex_a53:
352               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
353               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
354               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
355               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
356               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
357               xnn_params.qs8.gemm.mr = 4;
358               xnn_params.qs8.gemm.nr = 8;
359               break;
360             case cpuinfo_uarch_cortex_a55r0:
361             case cpuinfo_uarch_kryo:
362               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
363               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
364               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
365               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
366               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
367               xnn_params.qs8.gemm.mr = 4;
368               xnn_params.qs8.gemm.nr = 8;
369               break;
370             case cpuinfo_uarch_cortex_a72:
371             case cpuinfo_uarch_exynos_m1:
372             case cpuinfo_uarch_exynos_m2:
373             case cpuinfo_uarch_exynos_m3:
374               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
375               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
376               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
377               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
378               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
379               xnn_params.qs8.gemm.mr = 4;
380               xnn_params.qs8.gemm.nr = 8;
381               break;
382             default:
383               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
384               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
385               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
386               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
387               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
388               xnn_params.qs8.gemm.mr = 4;
389               xnn_params.qs8.gemm.nr = 8;
390               break;
391           }
392         }
393         #if XNN_MAX_UARCH_TYPES > 1
394         {
395           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
396           const uint32_t mr = xnn_params.qs8.gemm.mr;
397           const uint32_t nr = xnn_params.qs8.gemm.nr;
398           const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
399           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
400             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
401             if (uarch_info == NULL) {
402               /* No more microarchitectures in the system */
403               break;
404             }
405 
406             switch (uarch_info->uarch) {
407               case cpuinfo_uarch_cortex_a55:
408                 if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
409                   xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
410                   xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
411                   xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot;
412                   xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot;
413                 }
414                 break;
415               case cpuinfo_uarch_cortex_a53:
416                 if (mr == 4 && nr == 8 && log2_kr == 0) {
417                   xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
418                   xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64;
419                   xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
420                   xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
421                 }
422                 break;
423               case cpuinfo_uarch_cortex_a55r0:
424                 if (mr == 4 && nr == 8 && log2_kr == 0) {
425                   xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
426                   xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64;
427                   xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
428                   xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
429                 }
430                 break;
431               default:
432                 break;
433             }
434           }
435         }
436         #endif  // XNN_MAX_UARCH_TYPES > 1
437       #else  // XNN_ENABLE_ASSEMBLY
438         if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
439           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
440           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
441           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
442           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
443           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
444           xnn_params.qs8.gemm.mr = 4;
445           xnn_params.qs8.gemm.nr = 8;
446           xnn_params.qs8.gemm.log2_kr = 2;
447         } else {
448           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
449           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
450           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
451           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
452           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
453           xnn_params.qs8.gemm.mr = 2;
454           xnn_params.qs8.gemm.nr = 8;
455           xnn_params.qs8.gemm.log2_kr = 1;
456           xnn_params.qs8.gemm.log2_sr = 2;
457         }
458       #endif  // XNN_ENABLE_ASSEMBLY
459 
460       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
461       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
462       xnn_params.qs8.dwconv[0].channel_tile = 16;
463       xnn_params.qs8.dwconv[0].primary_tile = 9;
464       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64;
465       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
466       xnn_params.qs8.dwconv[1].channel_tile = 8;
467       xnn_params.qs8.dwconv[1].primary_tile = 25;
468 
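      // Global average pooling: the 7x unipass kernel handles inputs of up to row_tile (7) rows; taller inputs go through the 7p7x multipass kernel, which accumulates in 7-row chunks.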
469       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
470         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
471         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
472         .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
473         .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
474         .row_tile = 7,
475         .channel_tile = 8,
476       };
477 
478       xnn_params.qs8.vadd = (struct vbinary_parameters) {
479         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16,
480         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
481         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
482         .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
483         .element_tile = 16,
484       };
485       xnn_params.qs8.vmul = (struct vbinary_parameters) {
486         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
487         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
488         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
489         .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
490         .element_tile = 16,
491       };
492     #endif  // XNN_NO_QS8_OPERATORS
493 
494     /*************************** QU8 AArch32 micro-kernels ***************************/
495     #ifndef XNN_NO_QU8_OPERATORS
496       init_flags |= XNN_INIT_FLAG_QU8;
497 
498       #if XNN_ENABLE_ASSEMBLY
499         if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
500           xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
501           xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
502           xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
503           xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
504           xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
505           xnn_params.qu8.gemm.mr = 4;
506           xnn_params.qu8.gemm.nr = 8;
507           xnn_params.qu8.gemm.log2_kr = 2;
508         } else {
509           switch (cpuinfo_get_uarch(0)->uarch) {
510             case cpuinfo_uarch_cortex_a7:
511               xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
512               xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
513               xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
514               xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
515               xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
516               xnn_params.qu8.gemm.mr = 4;
517               xnn_params.qu8.gemm.nr = 8;
518               break;
519             case cpuinfo_uarch_cortex_a35:
520               xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
521               xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
522               xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
523               xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
524               xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
525               xnn_params.qu8.gemm.mr = 4;
526               xnn_params.qu8.gemm.nr = 8;
527               break;
528             case cpuinfo_uarch_cortex_a53:
529               xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
530               xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
531               xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
532               xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
533               xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
534               xnn_params.qu8.gemm.mr = 4;
535               xnn_params.qu8.gemm.nr = 8;
536               break;
537             case cpuinfo_uarch_cortex_a55r0:
538             case cpuinfo_uarch_kryo:
539               xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
540               xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
541               xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
542               xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
543               xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
544               xnn_params.qu8.gemm.mr = 4;
545               xnn_params.qu8.gemm.nr = 8;
546               break;
547             case cpuinfo_uarch_cortex_a72:
548             case cpuinfo_uarch_exynos_m1:
549             case cpuinfo_uarch_exynos_m2:
550             case cpuinfo_uarch_exynos_m3:
551               xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
552               xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
553               xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
554               xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
555               xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
556               xnn_params.qu8.gemm.mr = 4;
557               xnn_params.qu8.gemm.nr = 8;
558               break;
559             default:
560               xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
561               xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
562               xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
563               xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
564               xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
565               xnn_params.qu8.gemm.mr = 4;
566               xnn_params.qu8.gemm.nr = 8;
567               break;
568           }
569         }
570         #if XNN_MAX_UARCH_TYPES > 1
571         {
572           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
573           const uint32_t mr = xnn_params.qu8.gemm.mr;
574           const uint32_t nr = xnn_params.qu8.gemm.nr;
575           const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
576           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
577             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
578             if (uarch_info == NULL) {
579               /* No more microarchitectures in the system */
580               break;
581             }
582 
583             switch (uarch_info->uarch) {
584               case cpuinfo_uarch_cortex_a53:
585                 if (mr == 4 && nr == 8 && log2_kr == 0) {
586                   xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
587                   xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64;
588                   xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
589                   xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
590                 }
591                 break;
592               case cpuinfo_uarch_cortex_a55r0:
593                 if (mr == 4 && nr == 8 && log2_kr == 0) {
594                   xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
595                   xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64;
596                   xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
597                   xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
598                 }
599                 break;
600               default:
601                 break;
602             }
603           }
604         }
605         #endif  // XNN_MAX_UARCH_TYPES > 1
606       #else  // XNN_ENABLE_ASSEMBLY
607         if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
608           xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
609           xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
610           xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
611           xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
612           xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
613           xnn_params.qu8.gemm.mr = 4;
614           xnn_params.qu8.gemm.nr = 8;
615           xnn_params.qu8.gemm.log2_kr = 2;
616         } else {
617           xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
618           xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
619           xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
620           xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
621           xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
622           xnn_params.qu8.gemm.mr = 2;
623           xnn_params.qu8.gemm.nr = 8;
624           xnn_params.qu8.gemm.log2_kr = 1;
625           xnn_params.qu8.gemm.log2_sr = 2;
626         }
627       #endif  // XNN_ENABLE_ASSEMBLY
628 
629       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
630       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
631       xnn_params.qu8.dwconv[0].channel_tile = 16;
632       xnn_params.qu8.dwconv[0].primary_tile = 9;
633       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
634       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
635       xnn_params.qu8.dwconv[1].channel_tile = 8;
636       xnn_params.qu8.dwconv[1].primary_tile = 25;
637 
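      // Average pooling: the 9x unipass kernel handles pooling windows of up to 9 elements; larger windows use the 9p8x multipass kernel, consuming 8 more elements per increment.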
638       xnn_params.qu8.avgpool = (struct avgpool_parameters) {
639         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
640         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
641         .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
642         .primary_tile = 9,
643         .incremental_tile = 8,
644         .channel_tile = 8,
645       };
646       xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
647         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
648         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
649         .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
650         .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
651         .row_tile = 7,
652         .channel_tile = 8,
653       };
654       xnn_params.qu8.vadd = (struct vbinary_parameters) {
655         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16,
656         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
657         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
658         .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
659         .element_tile = 8,
660       };
661       xnn_params.qu8.vmul = (struct vbinary_parameters) {
662         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
663         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
664         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
665         .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
666         .element_tile = 16,
667       };
668     #endif  // XNN_NO_QU8_OPERATORS
669 
670     /**************************** S8 AArch32 micro-kernels ****************************/
671     #ifndef XNN_NO_S8_OPERATORS
672       init_flags |= XNN_INIT_FLAG_S8;
673 
674       xnn_params.s8.clamp = (struct vunary_parameters) {
675         .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
676         .init.s8_minmax = xnn_init_s8_minmax_neon_params,
677         .element_tile = 64,
678       };
679       xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
680         .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c8,
681         .pixel_tile = 1,
682         .channel_tile = 8,
683       };
684       xnn_params.s8.maxpool = (struct maxpool_parameters) {
685         .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
686         .init.s8 = xnn_init_s8_minmax_neon_params,
687         .mr = 9,
688         .qr = 8,
689       };
690     #endif  // XNN_NO_S8_OPERATORS
691 
692     /**************************** U8 AArch32 micro-kernels ****************************/
693     #ifndef XNN_NO_U8_OPERATORS
694       init_flags |= XNN_INIT_FLAG_U8;
695 
696       xnn_params.u8.clamp = (struct vunary_parameters) {
697         .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
698         .init.u8_minmax = xnn_init_u8_minmax_neon_params,
699         .element_tile = 64,
700       };
701       xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
702         .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c8,
703         .pixel_tile = 1,
704         .channel_tile = 8,
705       };
706       xnn_params.u8.maxpool = (struct maxpool_parameters) {
707         .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
708         .init.u8 = xnn_init_u8_minmax_neon_params,
709         .mr = 9,
710         .qr = 8,
711       };
712       xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
713       xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
714     #endif  // XNN_NO_U8_OPERATORS
715 
716     /**************************** X8 AArch32 micro-kernels ****************************/
717     #ifndef XNN_NO_X8_OPERATORS
718       init_flags |= XNN_INIT_FLAG_X8;
719 
720       xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
721       xnn_params.x8.zip = (struct zip_parameters) {
722         .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
723         .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
724         .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
725         .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
726       };
727     #endif  // XNN_NO_X8_OPERATORS
728 
729     /**************************** F32 AArch32 micro-kernels ****************************/
730     #ifndef XNN_NO_F32_OPERATORS
731       init_flags |= XNN_INIT_FLAG_F32;
732 
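      // F32 GEMM/IGEMM selection keys off the big core's microarchitecture; every AArch32 NEON variant below uses a 4x8 tile (mr = 4, nr = 8).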
733       #if XNN_ENABLE_ASSEMBLY
734         switch (cpuinfo_get_uarch(0)->uarch) {
735           case cpuinfo_uarch_cortex_a5:
736           case cpuinfo_uarch_cortex_a7:
737             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
738             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
739             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
740             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
741             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
742             xnn_params.f32.gemm.mr = 4;
743             xnn_params.f32.gemm.nr = 8;
744             break;
745 
746           case cpuinfo_uarch_cortex_a53:
747           case cpuinfo_uarch_cortex_a55r0:
748             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
749             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
750             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
751             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
752             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
753             xnn_params.f32.gemm.mr = 4;
754             xnn_params.f32.gemm.nr = 8;
755             break;
756 
757           case cpuinfo_uarch_cortex_a35:
758           case cpuinfo_uarch_cortex_a55:
759             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
760             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
761             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
762             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
763             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
764             xnn_params.f32.gemm.mr = 4;
765             xnn_params.f32.gemm.nr = 8;
766             break;
767 
768           case cpuinfo_uarch_cortex_a57:
769           case cpuinfo_uarch_cortex_a72:
770           case cpuinfo_uarch_cortex_a73:
771             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
772             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
773             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
774             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
775             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
776             xnn_params.f32.gemm.mr = 4;
777             xnn_params.f32.gemm.nr = 8;
778             break;
779 
780           case cpuinfo_uarch_krait:
781           default:
782             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
783             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
784             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
785             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
786             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
787             xnn_params.f32.gemm.mr = 4;
788             xnn_params.f32.gemm.nr = 8;
789             break;
790         }
791         #if XNN_MAX_UARCH_TYPES > 1
792         {
793           /* Choose micro-kernels for the little cores to match the micro-kernel configuration (mr/nr/sr) selected for the big core */
794           const uint32_t mr = xnn_params.f32.gemm.mr;
795           const uint32_t nr = xnn_params.f32.gemm.nr;
796           const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
797           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
798             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
799             if (uarch_info == NULL) {
800               /* No more microarchitectures in the system */
801               break;
802             }
803 
804             switch (uarch_info->uarch) {
805               case cpuinfo_uarch_cortex_a53:
806               case cpuinfo_uarch_cortex_a55r0:
807                 if (mr == 4 && nr == 8 && log2_sr == 0) {
808                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
809                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
810                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
811                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
812                 }
813                 break;
814               case cpuinfo_uarch_cortex_a55:
815                 if (mr == 4 && nr == 8 && log2_sr == 0) {
816                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
817                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
818                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
819                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
820                 }
821                 break;
822               default:
823                 break;
824             }
825           }
826         }
827         #endif  // XNN_MAX_UARCH_TYPES > 1
828       #else  // XNN_ENABLE_ASSEMBLY
829         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
830         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
831         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
832         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
833         xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
834         xnn_params.f32.gemm.mr = 4;
835         xnn_params.f32.gemm.nr = 8;
836       #endif  // XNN_ENABLE_ASSEMBLY
837       xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
838       xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
839       xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
840       xnn_params.f32.gemm2.mr = 4;
841       xnn_params.f32.gemm2.nr = 2;
842 
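      // F32 depthwise convolution (unipass) micro-kernels, indexed by kernel size:
      // dwconv[0..3] cover primary tiles of 3, 4, 9, and 25 taps (e.g. 3x3 -> 9, 5x5 -> 25).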
843       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neon;
844       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
845       xnn_params.f32.dwconv[0].channel_tile = 8;
846       xnn_params.f32.dwconv[0].primary_tile = 3;
847 
848       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neon;
849       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
850       xnn_params.f32.dwconv[1].channel_tile = 8;
851       xnn_params.f32.dwconv[1].primary_tile = 4;
852 
853       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neon;
854       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
855       xnn_params.f32.dwconv[2].channel_tile = 8;
856       xnn_params.f32.dwconv[2].primary_tile = 9;
857 
858       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neon_acc2;
859       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
860       xnn_params.f32.dwconv[3].channel_tile = 8;
861       xnn_params.f32.dwconv[3].primary_tile = 25;
862 
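      // Pooling micro-kernels: avgpool/pavgpool handle windowed average pooling (pavgpool is the
      // variant that takes per-output scale factors, e.g. when padding changes the effective
      // window size), while gavgpool performs global average pooling across whole rows.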
863       xnn_params.f32.avgpool = (struct avgpool_parameters) {
864         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
865         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
866         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
867         .primary_tile = 9,
868         .incremental_tile = 8,
869         .channel_tile = 4,
870       };
871       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
872         .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
873         .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
874         .primary_tile = 9,
875         .incremental_tile = 8,
876         .channel_tile = 4,
877       };
878       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
879         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
880         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
881         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
882         .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
883         .row_tile = 7,
884         .channel_tile = 4,
885       };
886       xnn_params.f32.maxpool = (struct maxpool_parameters) {
887         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
888         .init.f32 = xnn_init_f32_minmax_scalar_params,
889         .mr = 9,
890         .qr = 8,
891       };
892       xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
893         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
894         .mr = 4,
895       };
896       xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
897         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
898         .mr = 9,
899       };
900       xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
901         .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
902         .mr = 9,
903         .qr = 8,
904       };
905       xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
906         .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
907         .pixel_tile = 1,
908         .channel_tile = 8,
909       };
910       xnn_params.f32.abs = (struct vunary_parameters) {
911         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
912         .element_tile = 8,
913       };
914       xnn_params.f32.clamp = (struct vunary_parameters) {
915         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
916         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
917         .element_tile = 8,
918       };
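      // ELU: prefer the NEON-FMA polynomial approximation when FMA is available, otherwise
      // fall back to the plain-NEON LUT-based kernel.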
919       if (cpuinfo_has_arm_neon_fma()) {
920         xnn_params.f32.elu = (struct vunary_parameters) {
921           .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8,
922           .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_p6_params,
923           .element_tile = 8,
924         };
925       } else {
926         xnn_params.f32.elu = (struct vunary_parameters) {
927           .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8,
928           .init.f32_elu = xnn_init_f32_elu_neon_rr2_lut16_p3_params,
929           .element_tile = 8,
930         };
931       }
932       xnn_params.f32.hswish = (struct vunary_parameters) {
933         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
934         .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
935         .element_tile = 16,
936       };
937       xnn_params.f32.lrelu = (struct vunary_parameters) {
938         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
939         .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
940         .element_tile = 8,
941       };
942       xnn_params.f32.neg = (struct vunary_parameters) {
943         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
944         .element_tile = 8,
945       };
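      // Rounding: ARMv8 NEON has direct round-to-nearest/zero/up/down instructions (VRINTN/Z/P/M),
      // so the neonv8 kernels are used when available and emulated NEON variants otherwise.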
946       if (cpuinfo_has_arm_neon_v8()) {
947         xnn_params.f32.rndne = (struct vunary_parameters) {
948           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
949           .element_tile = 8,
950         };
951         xnn_params.f32.rndz = (struct vunary_parameters) {
952           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
953           .element_tile = 8,
954         };
955         xnn_params.f32.rndu = (struct vunary_parameters) {
956           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
957           .element_tile = 8,
958         };
959         xnn_params.f32.rndd = (struct vunary_parameters) {
960           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
961           .element_tile = 8,
962         };
963       } else {
964         xnn_params.f32.rndne = (struct vunary_parameters) {
965           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8,
966           .element_tile = 8,
967         };
968         xnn_params.f32.rndz = (struct vunary_parameters) {
969           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8,
970           .element_tile = 8,
971         };
972         xnn_params.f32.rndu = (struct vunary_parameters) {
973           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8,
974           .element_tile = 8,
975         };
976         xnn_params.f32.rndd = (struct vunary_parameters) {
977           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8,
978           .element_tile = 8,
979         };
980       }
981       xnn_params.f32.sigmoid = (struct vunary_parameters) {
982         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8,
983         .init.f32_sigmoid = xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params,
984         .element_tile = 8,
985       };
986       xnn_params.f32.sqr = (struct vunary_parameters) {
987         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
988         .element_tile = 8,
989       };
990       xnn_params.f32.sqrt = (struct vunary_parameters) {
991         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
992         .element_tile = 1,
993       };
994       xnn_params.f32.prelu = (struct prelu_parameters) {
995         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
996         .row_tile = 2,
997         .channel_tile = 8,
998       };
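      // rmax and raddstoreexpminusmax together back the numerically stable softmax:
      // rmax finds the row maximum, then exp(x - max) is computed, stored, and reduced.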
999       xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1000         .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8,
1001         .init = xnn_init_f32_expminus_neon_rr2_lut64_p2_params,
1002         .element_tile = 8,
1003       };
1004       xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
1005       xnn_params.f32.vadd = (struct vbinary_parameters) {
1006         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
1007         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1008         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1009         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1010         .element_tile = 8,
1011       };
1012       xnn_params.f32.vdiv = (struct vbinary_parameters) {
1013         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1014         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1015         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
1016         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1017         .element_tile = 2,
1018       };
1019       xnn_params.f32.vmax = (struct vbinary_parameters) {
1020         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
1021         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1022         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1023         .element_tile = 8,
1024       };
1025       xnn_params.f32.vmin = (struct vbinary_parameters) {
1026         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
1027         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1028         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1029         .element_tile = 8,
1030       };
1031       xnn_params.f32.vmul = (struct vbinary_parameters) {
1032         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
1033         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1034         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1035         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1036         .element_tile = 8,
1037       };
1038       xnn_params.f32.vsub = (struct vbinary_parameters) {
1039         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
1040         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
1041         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
1042         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1043         .element_tile = 8,
1044       };
1045       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
1046         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
1047         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1048         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1049         .element_tile = 8,
1050       };
1051       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1052         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
1053         .init.f32 = xnn_init_f32_minmax_scalar_params,
1054         .channel_tile = 4,
1055         .row_tile = 2,
1056       };
1057       #ifndef XNN_NO_NCHW_OPERATORS
1058         init_flags |= XNN_INIT_FLAG_CHW_OPT;
1059 
1060         xnn_params.f32.spmm = (struct spmm_parameters) {
1061           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
1062           .mr = 32,
1063           .nr = 1,
1064         };
1065         xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1066           .ukernel_with_symm_padding =
1067             (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
1068           .output_channel_tile = 4,
1069           .output_height_tile = 2,
1070           .output_width_tile = 2,
1071         };
1072         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1073           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
1074           .output_width_tile = 4,
1075           .output_height_tile = 2,
1076         };
1077         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1078           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
1079           .output_width_tile = 4,
1080           .output_height_tile = 1,
1081         };
1082         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1083           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
1084           .output_width_tile = 4,
1085           .output_height_tile = 1,
1086         };
1087         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1088           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
1089           .output_width_tile = 4,
1090           .output_height_tile = 1,
1091         };
1092         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1093           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
1094           .channel_tile = 4,
1095         };
1096         xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1097           .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
1098           .channel_tile = 1,
1099           .pixel_tile = 8,
1100         };
1101       #endif  // XNN_NO_NCHW_OPERATORS
1102     #endif  // XNN_NO_F32_OPERATORS
1103 
1104     /*************************** VCVT AArch32 micro-kernels ***************************/
1105     #ifndef XNN_NO_VCVT_OPERATORS
1106       init_flags |= XNN_INIT_FLAG_VCVT;
1107 
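      // F16<->F32 conversion: use the native NEON-FP16 conversion instructions when present,
      // otherwise integer-based NEON emulation.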
1108       if (cpuinfo_has_arm_neon_fp16()) {
1109         xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1110           .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
1111           .element_tile = 16,
1112         };
1113         xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1114           .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
1115           .element_tile = 16,
1116         };
1117       } else {
1118         xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1119           .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
1120           .init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params,
1121           .element_tile = 16,
1122         };
1123         xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1124           .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neon_x8,
1125           .init.f32_f16_cvt = xnn_init_f32_f16_cvt_neon_params,
1126           .element_tile = 8,
1127         };
1128       }
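      // F32->QS8/QU8 quantization: ARMv8 adds float->int conversion with round-to-nearest,
      // so the neonv8 kernels are preferred when available.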
1129       if (cpuinfo_has_arm_neon_v8()) {
1130         xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1131           .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
1132           .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
1133           .element_tile = 32,
1134         };
1135         xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1136           .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
1137           .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
1138           .element_tile = 32,
1139         };
1140       } else {
1141         xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1142           .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neon_x32,
1143           .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neon_params,
1144           .element_tile = 32,
1145         };
1146         xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1147           .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neon_x32,
1148           .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neon_params,
1149           .element_tile = 32,
1150         };
1151       }
1152       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1153         .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
1154         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
1155         .element_tile = 32,
1156       };
1157       xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1158         .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
1159         .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
1160         .element_tile = 32,
1161       };
1162     #endif  // XNN_NO_VCVT_OPERATORS
1163 
1164     /**************************** X32 AArch32 micro-kernels ****************************/
1165     #ifndef XNN_NO_X32_OPERATORS
1166       init_flags |= XNN_INIT_FLAG_X32;
1167 
1168       xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
1169       xnn_params.x32.zip = (struct zip_parameters) {
1170         .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
1171         .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
1172         .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
1173         .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
1174       };
1175       #ifndef XNN_NO_NCHW_OPERATORS
1176         xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1177           .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
1178           .channel_tile = 1,
1179           .pixel_tile = 1,
1180         };
1181       #endif  // XNN_NO_NCHW_OPERATORS
1182     #endif  // XNN_NO_X32_OPERATORS
1183 
1184     /**************************** XX AArch32 micro-kernels ****************************/
1185     #ifndef XNN_NO_XX_OPERATORS
1186       init_flags |= XNN_INIT_FLAG_XX;
1187 
1188       xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1189       xnn_params.xx.fill = (struct fill_parameters) {
1190         .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
1191         .row_tile = 1,
1192       };
1193       xnn_params.xx.pad = (struct pad_parameters) {
1194         .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
1195         .row_tile = 1,
1196       };
1197     #endif  // XNN_NO_XX_OPERATORS
1198 
1199   } else if (!XNN_PLATFORM_MOBILE) {
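    // NEON is unavailable: register portable scalar micro-kernels instead. This branch is only
    // reachable on non-mobile ARM targets; on mobile, initialization already failed above when
    // NEON was missing.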
1200 
1201     /*************************** QS8 AArch32 Pre-NEON micro-kernels ***************************/
1202     #ifndef XNN_NO_QS8_OPERATORS
1203       init_flags |= XNN_INIT_FLAG_QS8;
1204 
1205       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1206       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1207       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1208       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1209       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
1210       xnn_params.qs8.gemm.mr = 2;
1211       xnn_params.qs8.gemm.nr = 2;
1212 
1213       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1214       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
1215       xnn_params.qs8.dwconv[0].channel_tile = 1;
1216       xnn_params.qs8.dwconv[0].primary_tile = 9;
1217       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1218       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
1219       xnn_params.qs8.dwconv[1].channel_tile = 1;
1220       xnn_params.qs8.dwconv[1].primary_tile = 25;
1221 
1222       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1223         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1224         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1225         .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1226         .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1227         .row_tile = 7,
1228         .channel_tile = 1,
1229       };
1230       xnn_params.qs8.vadd = (struct vbinary_parameters) {
1231         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x1,
1232         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
1233         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
1234         .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
1235         .element_tile = 1,
1236       };
1237       xnn_params.qs8.vmul = (struct vbinary_parameters) {
1238         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
1239         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1240         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1241         .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
1242         .element_tile = 4,
1243       };
1244     #endif  // XNN_NO_QS8_OPERATORS
1245 
1246     /*************************** QU8 AArch32 Pre-NEON micro-kernels ***************************/
1247     #ifndef XNN_NO_QU8_OPERATORS
1248       init_flags |= XNN_INIT_FLAG_QU8;
1249 
1250       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1251       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
1252       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1253       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
1254       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
1255       xnn_params.qu8.gemm.mr = 2;
1256       xnn_params.qu8.gemm.nr = 2;
1257 
1258       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1259       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
1260       xnn_params.qu8.dwconv[0].channel_tile = 1;
1261       xnn_params.qu8.dwconv[0].primary_tile = 9;
1262       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1263       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
1264       xnn_params.qu8.dwconv[1].channel_tile = 1;
1265       xnn_params.qu8.dwconv[1].primary_tile = 25;
1266 
1267       xnn_params.qu8.avgpool = (struct avgpool_parameters) {
1268         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
1269         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
1270         .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
1271         .primary_tile = 9,
1272         .incremental_tile = 8,
1273         .channel_tile = 1,
1274       };
1275       xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
1276         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1277         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1278         .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1279         .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1280         .row_tile = 7,
1281         .channel_tile = 1,
1282       };
1283       xnn_params.qu8.vadd = (struct vbinary_parameters) {
1284         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x1,
1285         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
1286         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
1287         .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
1288         .element_tile = 1,
1289       };
1290       xnn_params.qu8.vmul = (struct vbinary_parameters) {
1291         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
1292         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1293         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1294         .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
1295         .element_tile = 4,
1296       };
1297     #endif  // XNN_NO_QU8_OPERATORS
1298 
1299     /**************************** S8 AArch32 Pre-NEON micro-kernels ****************************/
1300     #ifndef XNN_NO_S8_OPERATORS
1301       init_flags |= XNN_INIT_FLAG_S8;
1302 
1303       xnn_params.s8.clamp = (struct vunary_parameters) {
1304         .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
1305         .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
1306         .element_tile = 4,
1307       };
1308       xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
1309         .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
1310         .pixel_tile = 1,
1311         .channel_tile = 1,
1312       };
1313       xnn_params.s8.maxpool = (struct maxpool_parameters) {
1314         .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1315         .init.s8 = xnn_init_s8_minmax_scalar_params,
1316         .mr = 9,
1317         .qr = 8,
1318       };
1319     #endif  // XNN_NO_S8_OPERATORS
1320 
1321     /**************************** U8 AArch32 Pre-NEON micro-kernels ****************************/
1322     #ifndef XNN_NO_U8_OPERATORS
1323       init_flags |= XNN_INIT_FLAG_U8;
1324 
1325       xnn_params.u8.clamp = (struct vunary_parameters) {
1326         .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
1327         .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
1328         .element_tile = 4,
1329       };
1330       xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
1331         .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
1332         .pixel_tile = 1,
1333         .channel_tile = 1,
1334       };
1335       xnn_params.u8.maxpool = (struct maxpool_parameters) {
1336         .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1337         .init.u8 = xnn_init_u8_minmax_scalar_params,
1338         .mr = 9,
1339         .qr = 8,
1340       };
1341       xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1342       xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
1343     #endif  // XNN_NO_U8_OPERATORS
1344 
1345     /**************************** X8 AArch32 Pre-NEON micro-kernels ****************************/
1346     #ifndef XNN_NO_X8_OPERATORS
1347       init_flags |= XNN_INIT_FLAG_X8;
1348 
1349       xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
1350       xnn_params.x8.zip = (struct zip_parameters) {
1351         .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
1352         .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
1353         .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
1354         .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
1355       };
1356     #endif  // XNN_NO_X8_OPERATORS
1357 
1358     /**************************** F32 AArch32 Pre-NEON micro-kernels ****************************/
1359     #ifndef XNN_NO_F32_OPERATORS
1360       init_flags |= XNN_INIT_FLAG_F32;
1361 
1362       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
1363       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
1364       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
1365       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
1366       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
1367       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
1368       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
1369       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
1370       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
1371       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
1372       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
1373       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
1374       xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
1375       xnn_params.f32.gemm.mr = 4;
1376       xnn_params.f32.gemm.nr = 4;
1377 
1378       xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
1379       xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar);
1380       xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
1381       xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar);
1382       xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
1383       xnn_params.f32.gemm2.mr = 4;
1384       xnn_params.f32.gemm2.nr = 2;
1385 
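      // Scalar depthwise convolution micro-kernels; both clamped (minmax) and unclamped (linear)
      // unipass variants are registered for each primary tile (3, 4, 9, 25).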
1386       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
1387       xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
1388       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
1389       xnn_params.f32.dwconv[0].channel_tile = 1;
1390       xnn_params.f32.dwconv[0].primary_tile = 3;
1391 
1392       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
1393       xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
1394       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
1395       xnn_params.f32.dwconv[1].channel_tile = 1;
1396       xnn_params.f32.dwconv[1].primary_tile = 4;
1397 
1398       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
1399       xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
1400       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
1401       xnn_params.f32.dwconv[2].channel_tile = 1;
1402       xnn_params.f32.dwconv[2].primary_tile = 9;
1403 
1404       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
1405       xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
1406       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
1407       xnn_params.f32.dwconv[3].channel_tile = 1;
1408       xnn_params.f32.dwconv[3].primary_tile = 25;
1409 
1410       xnn_params.f32.avgpool = (struct avgpool_parameters) {
1411         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
1412         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
1413         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1414         .primary_tile = 9,
1415         .incremental_tile = 8,
1416         .channel_tile = 1,
1417       };
1418       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
1419         .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
1420         .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
1421         .primary_tile = 9,
1422         .incremental_tile = 8,
1423         .channel_tile = 1,
1424       };
1425       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
1426         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
1427         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
1428         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1429         .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
1430         .row_tile = 7,
1431         .channel_tile = 1,
1432       };
1433       xnn_params.f32.maxpool = (struct maxpool_parameters) {
1434         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
1435         .init.f32 = xnn_init_f32_minmax_scalar_params,
1436         .mr = 9,
1437         .qr = 8,
1438       };
1439       xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1440         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
1441         .mr = 4,
1442       };
1443       xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1444         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
1445         .mr = 9,
1446       };
1447       xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1448         .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
1449         .mr = 9,
1450         .qr = 8,
1451       };
1452       xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1453         .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
1454         .pixel_tile = 1,
1455         .channel_tile = 2,
1456       };
1457       xnn_params.f32.abs = (struct vunary_parameters) {
1458         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
1459         .element_tile = 4,
1460       };
1461       xnn_params.f32.clamp = (struct vunary_parameters) {
1462         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
1463         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1464         .element_tile = 4,
1465       };
1466       xnn_params.f32.elu = (struct vunary_parameters) {
1467         .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
1468         .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
1469         .element_tile = 4,
1470       };
1471       xnn_params.f32.hswish = (struct vunary_parameters) {
1472         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
1473         .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
1474         .element_tile = 4,
1475       };
1476       xnn_params.f32.lrelu = (struct vunary_parameters) {
1477         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
1478         .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
1479         .element_tile = 4,
1480       };
1481       xnn_params.f32.neg = (struct vunary_parameters) {
1482         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
1483         .element_tile = 4,
1484       };
1485       xnn_params.f32.rndne = (struct vunary_parameters) {
1486         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
1487         .element_tile = 1,
1488       };
1489       xnn_params.f32.rndz = (struct vunary_parameters) {
1490         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
1491         .element_tile = 1,
1492       };
1493       xnn_params.f32.rndu = (struct vunary_parameters) {
1494         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
1495         .element_tile = 1,
1496       };
1497       xnn_params.f32.rndd = (struct vunary_parameters) {
1498         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
1499         .element_tile = 1,
1500       };
1501       xnn_params.f32.sigmoid = (struct vunary_parameters) {
1502         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
1503         .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
1504         .element_tile = 2,
1505       };
1506       xnn_params.f32.sqr = (struct vunary_parameters) {
1507         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
1508         .element_tile = 4,
1509       };
1510       xnn_params.f32.sqrt = (struct vunary_parameters) {
1511         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
1512         .element_tile = 1,
1513       };
1514       xnn_params.f32.prelu = (struct prelu_parameters) {
1515         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
1516         .row_tile = 2,
1517         .channel_tile = 4,
1518       };
1519       xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1520         .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
1521         .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
1522         .element_tile = 4,
1523       };
1524       xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
1525       xnn_params.f32.vadd = (struct vbinary_parameters) {
1526         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
1527         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
1528         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
1529         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1530         .element_tile = 8,
1531       };
1532       xnn_params.f32.vdiv = (struct vbinary_parameters) {
1533         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1534         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1535         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
1536         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1537         .element_tile = 2,
1538       };
1539       xnn_params.f32.vmax = (struct vbinary_parameters) {
1540         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
1541         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
1542         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
1543         .element_tile = 8,
1544       };
1545       xnn_params.f32.vmin = (struct vbinary_parameters) {
1546         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
1547         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
1548         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
1549         .element_tile = 8,
1550       };
1551       xnn_params.f32.vmul = (struct vbinary_parameters) {
1552         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
1553         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
1554         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
1555         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1556         .element_tile = 8,
1557       };
1558       xnn_params.f32.vsub = (struct vbinary_parameters) {
1559         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
1560         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
1561         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
1562         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1563         .element_tile = 8,
1564       };
1565       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
1566         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
1567         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
1568         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
1569         .element_tile = 8,
1570       };
1571       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1572         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
1573         .init.f32 = xnn_init_f32_minmax_scalar_params,
1574         .channel_tile = 1,
1575         .row_tile = 2,
1576       };
1577       #ifndef XNN_NO_NCHW_OPERATORS
1578         init_flags |= XNN_INIT_FLAG_CHW_OPT;
1579 
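        // Sparse matrix-dense matrix multiplication (SpMM) micro-kernels for sparse NCHW
        // inference; spmm2 and spmm4 are the nr = 2 and nr = 4 block variants.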
1580         xnn_params.f32.spmm = (struct spmm_parameters) {
1581           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
1582           .mr = 8,
1583           .nr = 1,
1584         };
1585         xnn_params.f32.spmm2 = (struct spmm_parameters) {
1586           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
1587           .mr = 8,
1588           .nr = 2,
1589         };
1590         xnn_params.f32.spmm4 = (struct spmm_parameters) {
1591           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
1592           .mr = 8,
1593           .nr = 4,
1594         };
1595         xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1596           .ukernel_with_symm_padding =
1597             (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
1598           .output_channel_tile = 4,
1599           .output_height_tile = 1,
1600           .output_width_tile = 1,
1601         };
1602         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1603           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
1604           .output_width_tile = 1,
1605           .output_height_tile = 4,
1606         };
1607         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1608           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
1609           .output_width_tile = 1,
1610           .output_height_tile = 2,
1611         };
1612         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1613           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
1614           .output_width_tile = 1,
1615           .output_height_tile = 2,
1616         };
1617         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1618           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
1619           .output_width_tile = 1,
1620           .output_height_tile = 2,
1621         };
1622         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1623           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
1624           .channel_tile = 1,
1625         };
1626         xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1627           .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
1628           .channel_tile = 1,
1629           .pixel_tile = 4,
1630         };
1631       #endif  // XNN_NO_NCHW_OPERATORS
1632     #endif  // XNN_NO_F32_OPERATORS
1633 
1634     /*************************** VCVT AArch32 Pre-NEON micro-kernels ***************************/
1635     #ifndef XNN_NO_VCVT_OPERATORS
1636       init_flags |= XNN_INIT_FLAG_VCVT;
1637 
1638       xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1639         .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
1640         .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
1641         .element_tile = 4,
1642       };
1643       xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1644         .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
1645         .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
1646         .element_tile = 2,
1647       };
1648       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1649         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x4,
1650         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
1651         .element_tile = 4,
1652       };
1653       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1654         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x4,
1655         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
1656         .element_tile = 4,
1657       };
1658       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1659         .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
1660         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
1661         .element_tile = 4,
1662       };
1663       xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1664         .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
1665         .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
1666         .element_tile = 4,
1667       };
1668     #endif  // XNN_NO_VCVT_OPERATORS
1669 
1670     /**************************** X32 AArch32 Pre-NEON micro-kernels ****************************/
1671     #ifndef XNN_NO_X32_OPERATORS
1672       init_flags |= XNN_INIT_FLAG_X32;
1673 
1674       xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
1675       xnn_params.x32.zip = (struct zip_parameters) {
1676         .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
1677         .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
1678         .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
1679         .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
1680       };
1681       #ifndef XNN_NO_NCHW_OPERATORS
1682         xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1683           .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
1684           .channel_tile = 1,
1685           .pixel_tile = 1,
1686         };
1687       #endif  // XNN_NO_NCHW_OPERATORS
1688     #endif  // XNN_NO_X32_OPERATORS
1689 
1690     /**************************** XX AArch32 Pre-NEON micro-kernels ****************************/
1691     #ifndef XNN_NO_XX_OPERATORS
1692       init_flags |= XNN_INIT_FLAG_XX;
1693 
1694       xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1695       xnn_params.xx.fill = (struct fill_parameters) {
1696         .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
1697         .row_tile = 1,
1698       };
1699       xnn_params.xx.pad = (struct pad_parameters) {
1700         .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
1701         .row_tile = 1,
1702       };
1703     #endif  // XNN_NO_XX_OPERATORS
1704   }
1705 
1706 #elif XNN_ARCH_ARM64
1707 
1708   /**************************** QC8 AArch64 micro-kernels ****************************/
1709   #ifndef XNN_NO_QC8_OPERATORS
1710     init_flags |= XNN_INIT_FLAG_QC8;
1711 
1712     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
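      // On iOS/macOS there is no per-core micro-architecture dispatch; kernel selection depends
      // only on whether the NEON dot-product (SDOT/UDOT) extension is available.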
1713       #if XNN_ENABLE_ASSEMBLY
1714         if (cpuinfo_has_arm_neon_dot()) {
1715           xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1716           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1717           xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1718           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1719           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1720           xnn_params.qc8.gemm.mr = 4;
1721           xnn_params.qc8.gemm.nr = 16;
1722           xnn_params.qc8.gemm.log2_kr = 2;
1723         } else {
1724           xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1725           xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1726           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1727           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1728           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1729           xnn_params.qc8.gemm.mr = 2;
1730           xnn_params.qc8.gemm.nr = 8;
1731           xnn_params.qc8.gemm.log2_kr = 3;
1732         }
1733       #else  // !XNN_ENABLE_ASSEMBLY
1734         if (cpuinfo_has_arm_neon_dot()) {
1735           xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1736           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1737           xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1738           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1739           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1740           xnn_params.qc8.gemm.mr = 4;
1741           xnn_params.qc8.gemm.nr = 16;
1742           xnn_params.qc8.gemm.log2_kr = 2;
1743         } else {
1744           xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1745           xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1746           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1747           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1748           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1749           xnn_params.qc8.gemm.mr = 2;
1750           xnn_params.qc8.gemm.nr = 8;
1751           xnn_params.qc8.gemm.log2_kr = 1;
1752           xnn_params.qc8.gemm.log2_sr = 2;
1753         }
1754       #endif  // XNN_ENABLE_ASSEMBLY
1755     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1756       #if XNN_ENABLE_ASSEMBLY
1757         if (cpuinfo_has_arm_neon_dot()) {
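          // The c4 kernels rely on the ARMv8.2 SDOT (dot-product) instruction; the switch
          // below only picks a scheduling variant tuned for the detected big-core uarch.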
1758           switch (cpuinfo_get_core(0)->uarch) {
1759             case cpuinfo_uarch_cortex_a55:
1760               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1761               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1762               break;
1763             case cpuinfo_uarch_cortex_x1:
1764             case cpuinfo_uarch_cortex_a78:
1765               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1766               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1767               break;
1768             default:
1769               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1770               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1771               break;
1772           }
1773           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1774           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1775           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1776           xnn_params.qc8.gemm.mr = 4;
1777           xnn_params.qc8.gemm.nr = 16;
1778           xnn_params.qc8.gemm.log2_kr = 2;
1779         } else {
1780           switch (cpuinfo_get_core(0)->uarch) {
1781             case cpuinfo_uarch_cortex_a35:
1782               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1783               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1784               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1785               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1786               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1787               xnn_params.qc8.gemm.mr = 4;
1788               xnn_params.qc8.gemm.nr = 16;
1789               break;
1790 
1791             case cpuinfo_uarch_cortex_a53:
1792             case cpuinfo_uarch_cortex_a55r0:
1793               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1794               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1795               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1796               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1797               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1798               xnn_params.qc8.gemm.mr = 4;
1799               xnn_params.qc8.gemm.nr = 16;
1800               break;
1801 
1802             case cpuinfo_uarch_cortex_a72:
1803             case cpuinfo_uarch_cortex_a73:
1804             case cpuinfo_uarch_kryo:
1805               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1806               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1807               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1808               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1809               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1810               xnn_params.qc8.gemm.mr = 2;
1811               xnn_params.qc8.gemm.nr = 8;
1812               xnn_params.qc8.gemm.log2_kr = 3;
1813               break;
1814 
1815             default:
1816               xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1817               xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1818               xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1819               xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1820               xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1821               xnn_params.qc8.gemm.mr = 2;
1822               xnn_params.qc8.gemm.nr = 8;
1823               xnn_params.qc8.gemm.log2_kr = 3;
1824               break;
1825           }
1826         }
1827         #if XNN_MAX_UARCH_TYPES > 1
1828         {
1829           /* Choose micro-kernels for little cores according to the micro-kernel specification for the big core */
1830           const uint32_t mr = xnn_params.qc8.gemm.mr;
1831           const uint32_t nr = xnn_params.qc8.gemm.nr;
1832           const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
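          // Walk the remaining microarchitecture clusters and substitute kernels tuned for
          // known little cores, but only when the tile/packing geometry (mr, nr, kr) matches
          // the big-core choice, so one packed-weight layout works on every cluster.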
1833           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1834             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1835             if (uarch_info == NULL) {
1836               /* No more microarchitectures in the system */
1837               break;
1838             }
1839 
1840             switch (uarch_info->uarch) {
1841               case cpuinfo_uarch_cortex_a53:
1842               case cpuinfo_uarch_cortex_a55r0:
1843                 if (mr == 2 && nr == 8 && log2_kr == 3) {
1844                   xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1845                   xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1846                   xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1847                   xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1848                 }
1849                 break;
1850 
1851               case cpuinfo_uarch_cortex_a55:
1852                 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
1853                   xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1854                   xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1855                   xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot;
1856                   xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot;
1857                 }
1858                 break;
1859               default:
1860                 break;
1861             }
1862           }
1863         }
1864         #endif  // XNN_MAX_UARCH_TYPES > 1
1865       #else  // !XNN_ENABLE_ASSEMBLY
1866         if (cpuinfo_has_arm_neon_dot()) {
1867           xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1868           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1869           xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1870           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1871           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1872           xnn_params.qc8.gemm.mr = 4;
1873           xnn_params.qc8.gemm.nr = 16;
1874           xnn_params.qc8.gemm.log2_kr = 2;
1875         } else {
1876           xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1877           xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1878           xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1879           xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1880           xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1881           xnn_params.qc8.gemm.mr = 2;
1882           xnn_params.qc8.gemm.nr = 8;
1883           xnn_params.qc8.gemm.log2_kr = 1;
1884           xnn_params.qc8.gemm.log2_sr = 2;
1885         }
1886       #endif  // XNN_ENABLE_ASSEMBLY
1887     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1888 
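    // Depthwise convolution kernels: primary_tile 9 and 25 correspond to 3x3 and 5x5
    // filters, and channel_tile is the number of channels processed per loop iteration.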
1889     xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
1890     xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1891     xnn_params.qc8.dwconv[0].channel_tile = 16;
1892     xnn_params.qc8.dwconv[0].primary_tile = 9;
1893     xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__neonv8_mla8_ld64;
1894     xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1895     xnn_params.qc8.dwconv[1].channel_tile = 16;
1896     xnn_params.qc8.dwconv[1].primary_tile = 25;
1897   #endif  // XNN_NO_QC8_OPERATORS
1898 
1899   /**************************** QS8 AArch64 micro-kernels ****************************/
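  // QS8 kernels use per-tensor quantization with rndnu (rounding) requantization
  // parameters; the selection logic mirrors the QC8 block above.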
1900   #ifndef XNN_NO_QS8_OPERATORS
1901     init_flags |= XNN_INIT_FLAG_QS8;
1902 
1903     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1904       #if XNN_ENABLE_ASSEMBLY
1905         if (cpuinfo_has_arm_neon_dot()) {
1906           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1907           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1908           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1909           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1910           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1911           xnn_params.qs8.gemm.mr = 4;
1912           xnn_params.qs8.gemm.nr = 16;
1913           xnn_params.qs8.gemm.log2_kr = 2;
1914         } else {
1915           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1916           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1917           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1918           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1919           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1920           xnn_params.qs8.gemm.mr = 2;
1921           xnn_params.qs8.gemm.nr = 8;
1922           xnn_params.qs8.gemm.log2_kr = 3;
1923         }
1924       #else  // !XNN_ENABLE_ASSEMBLY
1925         if (cpuinfo_has_arm_neon_dot()) {
1926           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1927           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1928           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1929           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1930           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1931           xnn_params.qs8.gemm.mr = 4;
1932           xnn_params.qs8.gemm.nr = 16;
1933           xnn_params.qs8.gemm.log2_kr = 2;
1934         } else {
1935           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1936           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1937           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1938           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1939           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1940           xnn_params.qs8.gemm.mr = 2;
1941           xnn_params.qs8.gemm.nr = 8;
1942           xnn_params.qs8.gemm.log2_kr = 1;
1943           xnn_params.qs8.gemm.log2_sr = 2;
1944         }
1945       #endif  // XNN_ENABLE_ASSEMBLY
1946     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1947       #if XNN_ENABLE_ASSEMBLY
1948         if (cpuinfo_has_arm_neon_dot()) {
1949           switch (cpuinfo_get_core(0)->uarch) {
1950             case cpuinfo_uarch_cortex_a55:
1951               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1952               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1953               break;
1954             case cpuinfo_uarch_cortex_x1:
1955             case cpuinfo_uarch_cortex_a78:
1956               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1957               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1958               break;
1959             default:
1960               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
1961               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
1962               break;
1963           }
1964           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1965           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1966           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1967           xnn_params.qs8.gemm.mr = 4;
1968           xnn_params.qs8.gemm.nr = 16;
1969           xnn_params.qs8.gemm.log2_kr = 2;
1970         } else {
1971           switch (cpuinfo_get_core(0)->uarch) {
1972             case cpuinfo_uarch_cortex_a35:
1973               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1974               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1975               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1976               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1977               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1978               xnn_params.qs8.gemm.mr = 4;
1979               xnn_params.qs8.gemm.nr = 16;
1980               break;
1981 
1982             case cpuinfo_uarch_cortex_a53:
1983             case cpuinfo_uarch_cortex_a55r0:
1984               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1985               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1986               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1987               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1988               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1989               xnn_params.qs8.gemm.mr = 4;
1990               xnn_params.qs8.gemm.nr = 16;
1991               break;
1992 
1993             case cpuinfo_uarch_cortex_a72:
1994             case cpuinfo_uarch_cortex_a73:
1995             case cpuinfo_uarch_kryo:
1996               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1997               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1998               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1999               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
2000               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2001               xnn_params.qs8.gemm.mr = 2;
2002               xnn_params.qs8.gemm.nr = 8;
2003               xnn_params.qs8.gemm.log2_kr = 3;
2004               break;
2005 
2006             default:
2007               xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2008               xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2009               xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2010               xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2011               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2012               xnn_params.qs8.gemm.mr = 2;
2013               xnn_params.qs8.gemm.nr = 8;
2014               xnn_params.qs8.gemm.log2_kr = 3;
2015               break;
2016           }
2017         }
2018         #if XNN_MAX_UARCH_TYPES > 1
2019         {
2020           /* Choose micro-kernels for little cores according to the micro-kernel specification for the big core */
2021           const uint32_t mr = xnn_params.qs8.gemm.mr;
2022           const uint32_t nr = xnn_params.qs8.gemm.nr;
2023           const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
2024           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2025             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2026             if (uarch_info == NULL) {
2027               /* No more microarchitectures in the system */
2028               break;
2029             }
2030 
2031             switch (uarch_info->uarch) {
2032               case cpuinfo_uarch_cortex_a53:
2033               case cpuinfo_uarch_cortex_a55r0:
2034                 if (mr == 2 && nr == 8 && log2_kr == 3) {
2035                   xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2036                   xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2037                   xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2038                   xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2039                 }
2040                 break;
2041 
2042               case cpuinfo_uarch_cortex_a55:
2043                 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
2044                   xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2045                   xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2046                   xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot;
2047                   xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot;
2048                 }
2049                 break;
2050               default:
2051                 break;
2052             }
2053           }
2054         }
2055         #endif  // XNN_MAX_UARCH_TYPES > 1
2056       #else  // !XNN_ENABLE_ASSEMBLY
2057         if (cpuinfo_has_arm_neon_dot()) {
2058           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2059           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2060           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2061           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2062           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2063           xnn_params.qs8.gemm.mr = 4;
2064           xnn_params.qs8.gemm.nr = 16;
2065           xnn_params.qs8.gemm.log2_kr = 2;
2066         } else {
2067           xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2068           xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2069           xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2070           xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2071           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2072           xnn_params.qs8.gemm.mr = 2;
2073           xnn_params.qs8.gemm.nr = 8;
2074           xnn_params.qs8.gemm.log2_kr = 1;
2075           xnn_params.qs8.gemm.log2_sr = 2;
2076         }
2077       #endif  // XNN_ENABLE_ASSEMBLY
2078     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2079 
2080     xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
2081     xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2082     xnn_params.qs8.dwconv[0].channel_tile = 16;
2083     xnn_params.qs8.dwconv[0].primary_tile = 9;
2084     xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64;
2085     xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2086     xnn_params.qs8.dwconv[1].channel_tile = 16;
2087     xnn_params.qs8.dwconv[1].primary_tile = 25;
2088 
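    // Global average pooling: the 7x unipass kernel handles inputs of up to row_tile rows
    // in a single pass, while the 7p7x multipass kernel accumulates taller inputs in
    // chunks of seven rows.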
2089     xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
2090       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2091       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2092       .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
2093       .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
2094       .row_tile = 7,
2095       .channel_tile = 8,
2096     };
2097 
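    // Elementwise binary kernels: op_ukernel takes two full tensors, opc_ukernel
    // broadcasts a scalar second operand, and ropc_ukernel is the reversed-operand
    // broadcast; addition and multiplication are commutative, so opc and ropc point to
    // the same kernel here.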
2098     xnn_params.qs8.vadd = (struct vbinary_parameters) {
2099       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32,
2100       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
2101       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
2102       .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
2103       .element_tile = 32,
2104     };
2105     xnn_params.qs8.vmul = (struct vbinary_parameters) {
2106       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
2107       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2108       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2109       .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
2110       .element_tile = 16,
2111     };
2112   #endif  // XNN_NO_QS8_OPERATORS
2113 
2114   /**************************** QU8 AArch64 micro-kernels ****************************/
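  // QU8 kernels handle unsigned 8-bit quantization (asymmetric, with zero points);
  // selection parallels the QS8 block, again keyed on the dot-product extension and the
  // detected microarchitecture.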
2115   #ifndef XNN_NO_QU8_OPERATORS
2116     init_flags |= XNN_INIT_FLAG_QU8;
2117 
2118     #if XNN_ENABLE_ASSEMBLY
2119       if (cpuinfo_has_arm_neon_dot()) {
2120         switch (cpuinfo_get_core(0)->uarch) {
2121           case cpuinfo_uarch_cortex_a55:
2122             xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2123             xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2124             xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2125             xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2126             xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2127             xnn_params.qu8.gemm.mr = 4;
2128             xnn_params.qu8.gemm.nr = 16;
2129             xnn_params.qu8.gemm.log2_kr = 2;
2130             break;
2131           default:
2132             xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2133             xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2134             xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2135             xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2136             xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2137             xnn_params.qu8.gemm.mr = 4;
2138             xnn_params.qu8.gemm.nr = 16;
2139             xnn_params.qu8.gemm.log2_kr = 2;
2140             break;
2141         }
2142       } else {
2143         switch (cpuinfo_get_core(0)->uarch) {
2144           case cpuinfo_uarch_cortex_a53:
2145           case cpuinfo_uarch_cortex_a55r0:
2146             xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
2147             xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
2148             xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2149             xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2150             xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2151             xnn_params.qu8.gemm.mr = 4;
2152             xnn_params.qu8.gemm.nr = 16;
2153             break;
2154 
2155           case cpuinfo_uarch_cortex_a57:
2156           case cpuinfo_uarch_cortex_a72:
2157           case cpuinfo_uarch_cortex_a73:
2158           case cpuinfo_uarch_cortex_a75:
2159           case cpuinfo_uarch_cortex_a76:
2160           case cpuinfo_uarch_exynos_m1:
2161           case cpuinfo_uarch_exynos_m2:
2162           case cpuinfo_uarch_exynos_m3:
2163           case cpuinfo_uarch_exynos_m4:
2164             xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
2165             xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
2166             xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2167             xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2168             xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2169             xnn_params.qu8.gemm.mr = 4;
2170             xnn_params.qu8.gemm.nr = 16;
2171             break;
2172 
2173           case cpuinfo_uarch_kryo:
2174             xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2175             xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2176             xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2177             xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2178             xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2179             xnn_params.qu8.gemm.mr = 4;
2180             xnn_params.qu8.gemm.nr = 16;
2181             break;
2182 
2183           default:
2184             xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
2185             xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
2186             xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2187             xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2188             xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2189             xnn_params.qu8.gemm.mr = 4;
2190             xnn_params.qu8.gemm.nr = 16;
2191             break;
2192         }
2193       }
2194       #if XNN_MAX_UARCH_TYPES > 1
2195       {
2196         /* Choose micro-kernels for little cores according to the micro-kernel specification for the big core */
2197         const uint32_t mr = xnn_params.qu8.gemm.mr;
2198         const uint32_t nr = xnn_params.qu8.gemm.nr;
2199         const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
2200         for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2201           const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2202           if (uarch_info == NULL) {
2203             /* No more microarchitectures in the system */
2204             break;
2205           }
2206 
2207           switch (uarch_info->uarch) {
2208             case cpuinfo_uarch_cortex_a53:
2209             case cpuinfo_uarch_cortex_a55r0:
2210               if (mr == 4 && nr == 16 && log2_kr == 0) {
2211                 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
2212                 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
2213               }
2214               break;
2215 
2216             case cpuinfo_uarch_cortex_a55:
2217               if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
2218                 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2219                 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2220               }
2221               break;
2222             default:
2223               break;
2224           }
2225         }
2226       }
2227       #endif  // XNN_MAX_UARCH_TYPES > 1
2228     #else  // !XNN_ENABLE_ASSEMBLY
2229       if (cpuinfo_has_arm_neon_dot()) {
2230         xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2231         xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2232         xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2233         xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2234         xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2235         xnn_params.qu8.gemm.mr = 4;
2236         xnn_params.qu8.gemm.nr = 16;
2237         xnn_params.qu8.gemm.log2_kr = 2;
2238       } else {
2239         xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2240         xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2241         xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2242         xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2243         xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2244         xnn_params.qu8.gemm.mr = 4;
2245         xnn_params.qu8.gemm.nr = 16;
2246       }
2247     #endif  // XNN_ENABLE_ASSEMBLY
2248 
2249     xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
2250     xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2251     xnn_params.qu8.dwconv[0].channel_tile = 16;
2252     xnn_params.qu8.dwconv[0].primary_tile = 9;
2253     xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
2254     xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2255     xnn_params.qu8.dwconv[1].channel_tile = 8;
2256     xnn_params.qu8.dwconv[1].primary_tile = 25;
2257 
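    // Average pooling: the 9x unipass kernel covers pooling windows of up to nine
    // elements; larger windows fall back to the 9p8x multipass kernel, which consumes
    // incremental_tile (8) additional elements per extra pass.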
2258     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
2259       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
2260       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
2261       .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
2262       .primary_tile = 9,
2263       .incremental_tile = 8,
2264       .channel_tile = 8,
2265     };
2266     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
2267       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2268       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2269       .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
2270       .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
2271       .row_tile = 7,
2272       .channel_tile = 8,
2273     };
2274     xnn_params.qu8.vadd = (struct vbinary_parameters) {
2275       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32,
2276       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2277       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2278       .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
2279       .element_tile = 8,
2280     };
2281     xnn_params.qu8.vmul = (struct vbinary_parameters) {
2282       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
2283       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2284       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2285       .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
2286       .element_tile = 16,
2287     };
2288   #endif  // XNN_NO_QU8_OPERATORS
2289 
2290   /**************************** S8 AArch64 micro-kernels ****************************/
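  // S8 operators are non-convolution signed 8-bit kernels (clamp, bilinear resize,
  // max pooling); they only need min/max clamping parameters rather than full
  // convolution requantization state.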
2291   #ifndef XNN_NO_S8_OPERATORS
2292     init_flags |= XNN_INIT_FLAG_S8;
2293 
2294     xnn_params.s8.clamp = (struct vunary_parameters) {
2295       .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
2296       .init.s8_minmax = xnn_init_s8_minmax_neon_params,
2297       .element_tile = 64,
2298     };
2299     xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
2300       .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c16,
2301       .pixel_tile = 1,
2302       .channel_tile = 16,
2303     };
2304     xnn_params.s8.maxpool = (struct maxpool_parameters) {
2305       .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
2306       .init.s8 = xnn_init_s8_minmax_neon_params,
2307       .mr = 9,
2308       .qr = 8,
2309     };
2310   #endif  // XNN_NO_S8_OPERATORS
2311 
2312   /**************************** U8 AArch64 micro-kernels ****************************/
2313   #ifndef XNN_NO_U8_OPERATORS
2314     init_flags |= XNN_INIT_FLAG_U8;
2315 
2316     xnn_params.u8.clamp = (struct vunary_parameters) {
2317       .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
2318       .init.u8_minmax = xnn_init_u8_minmax_neon_params,
2319       .element_tile = 64,
2320     };
2321     xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
2322       .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c16,
2323       .pixel_tile = 1,
2324       .channel_tile = 16,
2325     };
2326     xnn_params.u8.maxpool = (struct maxpool_parameters) {
2327       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
2328       .init.u8 = xnn_init_u8_minmax_neon_params,
2329       .mr = 9,
2330       .qr = 8,
2331     };
2332     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
2333     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
2334   #endif  // XNN_NO_U8_OPERATORS
2335 
2336   /**************************** X8 AArch64 micro-kernels ****************************/
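  // X8 kernels treat elements as opaque bytes: lut performs a 256-entry table lookup
  // (TBX-based on NEON), and the zip kernels interleave 2, 3, 4, or a variable number
  // (xm) of channels.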
2337   #ifndef XNN_NO_X8_OPERATORS
2338     init_flags |= XNN_INIT_FLAG_X8;
2339 
2340     xnn_params.x8.lut = xnn_x8_lut_ukernel__neon_tbx128x4_x64;
2341     xnn_params.x8.zip = (struct zip_parameters) {
2342       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
2343       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
2344       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
2345       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
2346     };
2347   #endif  // XNN_NO_X8_OPERATORS
2348 
2349   /**************************** F16 AArch64 micro-kernels ****************************/
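  // F16 operators require the ARMv8.2 half-precision arithmetic extension; without it,
  // XNN_INIT_FLAG_F16 stays unset and F16 operators remain unavailable.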
2350   #ifndef XNN_NO_F16_OPERATORS
2351     if (cpuinfo_has_arm_neon_fp16_arith()) {
2352       init_flags |= XNN_INIT_FLAG_F16;
2353       xnn_params.f16.gemm.mr = 6;
2354       xnn_params.f16.gemm.nr = 16;
2355 
2356       #if XNN_ENABLE_ASSEMBLY
2357         switch (cpuinfo_get_core(0)->uarch) {
2358           case cpuinfo_uarch_cortex_a55:
2359             xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
2360             break;
2361 
2362           case cpuinfo_uarch_cortex_a75:
2363           case cpuinfo_uarch_cortex_x1:
2364             xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
2365             break;
2366 
2367           default:
2368             xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
2369             break;
2370         }
2371         xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
2372 
2373         #if XNN_MAX_UARCH_TYPES > 1
2374         {
2375           /* Choose micro-kernels for little cores according to the micro-kernel specification for the big core */
2376           const uint32_t mr = xnn_params.f16.gemm.mr;
2377           const uint32_t nr = xnn_params.f16.gemm.nr;
2378           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2379             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2380             if (uarch_info == NULL) {
2381               /* No more microarchitectures in the system */
2382               break;
2383             }
2384 
2385             switch (uarch_info->uarch) {
2386               case cpuinfo_uarch_cortex_a55:
2387                 if (mr == 6 && nr == 16) {
2388                   xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
2389                 }
2390                 break;
2391 
2392               case cpuinfo_uarch_cortex_a55r0:
2393                 if (mr == 6 && nr == 16) {
2394                   xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64;
2395                 }
2396                 break;
2397 
2398               /* Cortex A75 is the medium core in the Exynos 9820 (whose big core is the Exynos M4) */
2399               case cpuinfo_uarch_cortex_a75:
2400                 if (mr == 6 && nr == 16) {
2401                   xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75;
2402                 }
2403                 break;
2404 
2405               default:
2406                 break;
2407             }
2408           }
2409         }
2410         #endif  // XNN_MAX_UARCH_TYPES > 1
2411       #else  // !XNN_ENABLE_ASSEMBLY
2412         xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2413         xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2414       #endif  // XNN_ENABLE_ASSEMBLY
2415       xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2416       xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2417       xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_neon_params;
2418 
2419       xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
2420       xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_neon_params;
2421       xnn_params.f16.dwconv[0].channel_tile = 16;
2422       xnn_params.f16.dwconv[0].primary_tile = 4;
2423 
2424       xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
2425       xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_neon_params;
2426       xnn_params.f16.dwconv[1].channel_tile = 16;
2427       xnn_params.f16.dwconv[1].primary_tile = 9;
2428 
2429       xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
2430       xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_neon_params;
2431       xnn_params.f16.dwconv[2].channel_tile = 8;
2432       xnn_params.f16.dwconv[2].primary_tile = 25;
2433 
2434       xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
2435         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
2436         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
2437         .init.f16 = xnn_init_f16_scaleminmax_neon_params,
2438         .update.f16 = xnn_update_f16_scaleminmax_neon_params,
2439         .row_tile = 7,
2440         .channel_tile = 8,
2441       };
2442 
2443       xnn_params.f16.maxpool = (struct maxpool_parameters) {
2444         .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8,
2445         .init.f16 = xnn_init_f16_minmax_neon_params,
2446         .mr = 9,
2447         .qr = 8,
2448       };
2449 
2450       xnn_params.f16.prelu = (struct prelu_parameters) {
2451         .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__neonfp16arith_2x16,
2452         .row_tile = 2,
2453         .channel_tile = 16,
2454       };
2455 
2456       xnn_params.f16.vadd = (struct vbinary_parameters) {
2457         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
2458         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
2459         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
2460         .init.f16_minmax = xnn_init_f16_minmax_neon_params,
2461         .element_tile = 16,
2462       };
2463       xnn_params.f16.vmul = (struct vbinary_parameters) {
2464         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
2465         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
2466         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
2467         .init.f16_minmax = xnn_init_f16_minmax_neon_params,
2468         .element_tile = 16,
2469       };
2470       xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
2471         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
2472         .init.f16 = xnn_init_f16_minmax_neon_params,
2473         .channel_tile = 8,
2474         .row_tile = 2,
2475       };
2476 
2477       xnn_params.f16.hswish = (struct vunary_parameters) {
2478         .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__neonfp16arith_x16,
2479         .init.f16_hswish = xnn_init_f16_hswish_neon_params,
2480         .element_tile = 16,
2481       };
2482     }
2483   #endif  // XNN_NO_F16_OPERATORS
2484 
2485   /**************************** F32 AArch64 micro-kernels ****************************/
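  // F32 GEMM/IGEMM selection: the AArch64 variants below use a 6x8 output tile (4x8 on
  // Cortex-A72) and differ mainly in instruction scheduling and prefetching; when
  // XNN_ENABLE_JIT is enabled, matching runtime code generators are registered alongside
  // the prebuilt assembly kernels.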
2486   #ifndef XNN_NO_F32_OPERATORS
2487     init_flags |= XNN_INIT_FLAG_F32;
2488 
2489     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2490       #if XNN_ENABLE_ASSEMBLY
2491         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2492         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2493         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2494         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2495         xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2496         xnn_params.f32.gemm.mr = 6;
2497         xnn_params.f32.gemm.nr = 8;
2498       #else  // !XNN_ENABLE_ASSEMBLY
2499         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2500         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2501         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2502         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2503         xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2504         xnn_params.f32.gemm.mr = 6;
2505         xnn_params.f32.gemm.nr = 8;
2506       #endif  // XNN_ENABLE_ASSEMBLY
2507     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
2508       #if XNN_ENABLE_ASSEMBLY
2509         switch (cpuinfo_get_core(0)->uarch) {
2510           case cpuinfo_uarch_cortex_a57:
2511             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2512             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2513             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2514             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2515             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2516             xnn_params.f32.gemm.mr = 6;
2517             xnn_params.f32.gemm.nr = 8;
2518             break;
2519           case cpuinfo_uarch_cortex_a72:
2520             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2521             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2522             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2523             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2524             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2525             xnn_params.f32.gemm.mr = 4;
2526             xnn_params.f32.gemm.nr = 8;
2527             break;
2528           case cpuinfo_uarch_cortex_a75:
2529           case cpuinfo_uarch_cortex_a76:
2530           case cpuinfo_uarch_exynos_m3:
2531           case cpuinfo_uarch_exynos_m4:
2532             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2533             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2534             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2535             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2536             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2537             xnn_params.f32.gemm.mr = 6;
2538             xnn_params.f32.gemm.nr = 8;
2539             #if XNN_ENABLE_JIT
2540               xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2541               xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2542               xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2543               xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2544             #endif
2545             break;
2546           case cpuinfo_uarch_exynos_m1:
2547           case cpuinfo_uarch_exynos_m2:
2548             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
2549             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
2550             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
2551             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
2552             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2553             xnn_params.f32.gemm.mr = 6;
2554             xnn_params.f32.gemm.nr = 8;
2555             xnn_params.f32.gemm.log2_sr = 2;
2556             break;
2557           case cpuinfo_uarch_cortex_a53:
2558           case cpuinfo_uarch_cortex_a55r0:
2559             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2560             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2561             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2562             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2563             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2564             xnn_params.f32.gemm.mr = 6;
2565             xnn_params.f32.gemm.nr = 8;
2566             break;
2567           case cpuinfo_uarch_cortex_a35:
2568           case cpuinfo_uarch_cortex_a55:
2569             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2570             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2571             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2572             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2573             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2574             xnn_params.f32.gemm.mr = 6;
2575             xnn_params.f32.gemm.nr = 8;
2576             break;
2577           case cpuinfo_uarch_cortex_a73:
2578             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
2579             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
2580             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2581             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2582             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2583             xnn_params.f32.gemm.mr = 6;
2584             xnn_params.f32.gemm.nr = 8;
2585             break;
2586           case cpuinfo_uarch_cortex_a77:
2587           case cpuinfo_uarch_exynos_m5:
2588           case cpuinfo_uarch_kryo:
2589             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2590             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2591             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2592             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2593             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2594             xnn_params.f32.gemm.mr = 4;
2595             xnn_params.f32.gemm.nr = 8;
2596             break;
2597           case cpuinfo_uarch_cortex_a78:
2598           case cpuinfo_uarch_cortex_x1:
2599           default:
2600             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
2601             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
2602             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2603             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2604             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2605             xnn_params.f32.gemm.mr = 6;
2606             xnn_params.f32.gemm.nr = 8;
2607             break;
2608         }
2609         #if XNN_MAX_UARCH_TYPES > 1
2610         {
2611           /* Choose micro-kernels for little cores according to the micro-kernel specification chosen for the big core */
2612           const uint32_t mr = xnn_params.f32.gemm.mr;
2613           const uint32_t nr = xnn_params.f32.gemm.nr;
2614           const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
2615           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2616             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2617             if (uarch_info == NULL) {
2618               /* No more microarchitectures in the system */
2619               break;
2620             }
2621 
2622             switch (uarch_info->uarch) {
2623               case cpuinfo_uarch_cortex_a53:
2624               case cpuinfo_uarch_cortex_a55r0:
2625                 if (mr == 6 && nr == 8 && log2_sr == 0) {
2626                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2627                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2628                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2629                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2630                 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
2631                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2632                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2633                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2634                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2635                 }
2636                 break;
2637               case cpuinfo_uarch_cortex_a55:
2638                 if (mr == 6 && nr == 8 && log2_sr == 0) {
2639                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2640                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2641                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2642                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2643                 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
2644                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2645                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2646                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2647                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2648                 }
2649                 break;
2650               default:
2651                 break;
2652             }
2653           }
2654         }
2655         #endif  // XNN_MAX_UARCH_TYPES > 1
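        // The per-index .function[] entries filled in above give heterogeneous
        // (big.LITTLE) systems a micro-kernel tuned for each core cluster.
        // Illustrative sketch only (not part of this file), assuming a cpuinfo
        // accessor that returns the calling core's uarch index:
        //   const uint32_t uarch_index = cpuinfo_get_current_uarch_index();
        //   xnn_gemm_ukernel_function gemm = xnn_params.f32.gemm.minmax.gemm.function[uarch_index];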
2656       #else  // !XNN_ENABLE_ASSEMBLY
2657         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2658         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2659         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2660         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2661         xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2662         xnn_params.f32.gemm.mr = 6;
2663         xnn_params.f32.gemm.nr = 8;
2664       #endif  // XNN_ENABLE_ASSEMBLY
2665     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2666     xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
2667     xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
2668     xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
2669     xnn_params.f32.gemm2.mr = 4;
2670     xnn_params.f32.gemm2.nr = 2;
2671 
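    // F32 depthwise convolution micro-kernels: one dwconv[] slot per primary tile
    // (3, 4, 9, and 25 taps; the 9- and 25-tap slots presumably serve 3x3 and 5x5 kernels).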
2672     xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neonfma;
2673     xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
2674     xnn_params.f32.dwconv[0].channel_tile = 8;
2675     xnn_params.f32.dwconv[0].primary_tile = 3;
2676 
2677     xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
2678     xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
2679     xnn_params.f32.dwconv[1].channel_tile = 8;
2680     xnn_params.f32.dwconv[1].primary_tile = 4;
2681 
2682     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2683       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2684       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2685       xnn_params.f32.dwconv[2].channel_tile = 8;
2686       xnn_params.f32.dwconv[2].primary_tile = 9;
2687     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
2688       switch (cpuinfo_get_core(0)->uarch) {
2689         case cpuinfo_uarch_kryo:
2690           xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2691           xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2692           xnn_params.f32.dwconv[2].channel_tile = 8;
2693           xnn_params.f32.dwconv[2].primary_tile = 9;
2694           break;
2695         #if XNN_ENABLE_ASSEMBLY
2696           case cpuinfo_uarch_cortex_a53:
2697           case cpuinfo_uarch_cortex_a55r0:
2698           case cpuinfo_uarch_cortex_a55:
2699             xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
2700             xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2701             xnn_params.f32.dwconv[2].channel_tile = 4;
2702             xnn_params.f32.dwconv[2].primary_tile = 9;
2703             break;
2704         #endif  // XNN_ENABLE_ASSEMBLY
2705         default:
2706           xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2707           xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2708           xnn_params.f32.dwconv[2].channel_tile = 8;
2709           xnn_params.f32.dwconv[2].primary_tile = 9;
2710           break;
2711       }
2712     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2713 
2714     xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neonfma_acc2;
2715     xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
2716     xnn_params.f32.dwconv[3].channel_tile = 8;
2717     xnn_params.f32.dwconv[3].primary_tile = 25;
2718 
2719     xnn_params.f32.avgpool = (struct avgpool_parameters) {
2720       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
2721       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
2722       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2723       .primary_tile = 9,
2724       .incremental_tile = 8,
2725       .channel_tile = 4,
2726     };
2727     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
2728       .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
2729       .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
2730       .primary_tile = 9,
2731       .incremental_tile = 8,
2732       .channel_tile = 4,
2733     };
2734     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
2735       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
2736       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
2737       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2738       .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
2739       .row_tile = 7,
2740       .channel_tile = 4,
2741     };
2742     xnn_params.f32.maxpool = (struct maxpool_parameters) {
2743       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
2744       .init.f32 = xnn_init_f32_minmax_scalar_params,
2745       .mr = 9,
2746       .qr = 8,
2747     };
2748     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
2749       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
2750       .mr = 4,
2751     };
2752     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
2753       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
2754       .mr = 9,
2755     };
2756     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
2757       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
2758       .mr = 9,
2759       .qr = 8,
2760     };
2761     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
2762       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
2763       .pixel_tile = 1,
2764       .channel_tile = 8,
2765     };
2766     xnn_params.f32.abs = (struct vunary_parameters) {
2767       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
2768       .element_tile = 8,
2769     };
2770     xnn_params.f32.clamp = (struct vunary_parameters) {
2771       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
2772       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2773       .element_tile = 8,
2774     };
2775     xnn_params.f32.elu = (struct vunary_parameters) {
2776       .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16,
2777       .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
2778       .element_tile = 16,
2779     };
2780     xnn_params.f32.hswish = (struct vunary_parameters) {
2781       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
2782       .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
2783       .element_tile = 16,
2784     };
2785     xnn_params.f32.lrelu = (struct vunary_parameters) {
2786       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
2787       .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
2788       .element_tile = 8,
2789     };
2790     xnn_params.f32.neg = (struct vunary_parameters) {
2791       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
2792       .element_tile = 8,
2793     };
2794     xnn_params.f32.rndne = (struct vunary_parameters) {
2795       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
2796       .element_tile = 8,
2797     };
2798     xnn_params.f32.rndz = (struct vunary_parameters) {
2799       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
2800       .element_tile = 8,
2801     };
2802     xnn_params.f32.rndu = (struct vunary_parameters) {
2803       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
2804       .element_tile = 8,
2805     };
2806     xnn_params.f32.rndd = (struct vunary_parameters) {
2807       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
2808       .element_tile = 8,
2809     };
2810     xnn_params.f32.sigmoid = (struct vunary_parameters) {
2811       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16,
2812       .init.f32_sigmoid = xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params,
2813       .element_tile = 16,
2814     };
2815     xnn_params.f32.sqr = (struct vunary_parameters) {
2816       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
2817       .element_tile = 8,
2818     };
2819     xnn_params.f32.sqrt = (struct vunary_parameters) {
2820       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4,
2821       .element_tile = 4,
2822     };
2823     xnn_params.f32.prelu = (struct prelu_parameters) {
2824       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
2825       .row_tile = 2,
2826       .channel_tile = 8,
2827     };
2828     xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
2829       .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16,
2830       .init = xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params,
2831       .element_tile = 16,
2832     };
2833     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
2834     xnn_params.f32.vadd = (struct vbinary_parameters) {
2835       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
2836       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
2837       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
2838       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2839       .element_tile = 8,
2840     };
2841     xnn_params.f32.vdiv = (struct vbinary_parameters) {
2842       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
2843       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
2844       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
2845       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2846       .element_tile = 8,
2847     };
2848     xnn_params.f32.vmax = (struct vbinary_parameters) {
2849       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
2850       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
2851       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
2852       .element_tile = 8,
2853     };
2854     xnn_params.f32.vmin = (struct vbinary_parameters) {
2855       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
2856       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
2857       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
2858       .element_tile = 8,
2859     };
2860     xnn_params.f32.vmul = (struct vbinary_parameters) {
2861       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
2862       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
2863       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
2864       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2865       .element_tile = 8,
2866     };
2867     xnn_params.f32.vsub = (struct vbinary_parameters) {
2868       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
2869       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
2870       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
2871       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2872       .element_tile = 8,
2873     };
2874     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
2875       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
2876       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
2877       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
2878       .element_tile = 8,
2879     };
2880     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
2881       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
2882       .init.f32 = xnn_init_f32_minmax_scalar_params,
2883       .channel_tile = 4,
2884       .row_tile = 2,
2885     };
2886     #ifndef XNN_NO_NCHW_OPERATORS
2887       init_flags |= XNN_INIT_FLAG_CHW_OPT;
2888 
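      // Micro-kernels below serve the NCHW (channels-first) path: sparse GEMM (spmm),
      // direct HWC->CHW 3x3-stride-2 convolution, CHW depthwise convolutions,
      // channel-wise global average pooling, and CHW bilinear interpolation.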
2889       xnn_params.f32.spmm = (struct spmm_parameters) {
2890         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
2891         .mr = 32,
2892         .nr = 1,
2893       };
2894       xnn_params.f32.spmm2 = (struct spmm_parameters) {
2895         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
2896         .mr = 32,
2897         .nr = 2,
2898       };
2899       xnn_params.f32.spmm4 = (struct spmm_parameters) {
2900         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
2901         .mr = 32,
2902         .nr = 4,
2903       };
2904       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
2905         .ukernel_with_symm_padding =
2906           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
2907         .output_channel_tile = 4,
2908         .output_height_tile = 2,
2909         .output_width_tile = 2,
2910       };
2911       xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2912         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
2913         .output_width_tile = 4,
2914         .output_height_tile = 3,
2915       };
2916       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
2917         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
2918         .output_width_tile = 4,
2919         .output_height_tile = 2,
2920       };
2921       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
2922         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
2923         .output_width_tile = 4,
2924         .output_height_tile = 4,
2925       };
2926       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2927         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
2928         .output_width_tile = 4,
2929         .output_height_tile = 1,
2930       };
2931       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2932         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
2933         .channel_tile = 4,
2934       };
2935       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
2936         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
2937         .channel_tile = 1,
2938         .pixel_tile = 8,
2939       };
2940     #endif  // XNN_NO_NCHW_OPERATORS
2941   #endif  // XNN_NO_F32_OPERATORS
2942 
2943   /*************************** VCVT AArch64 micro-kernels ***************************/
2944   #ifndef XNN_NO_VCVT_OPERATORS
2945     init_flags |= XNN_INIT_FLAG_VCVT;
2946 
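    // Conversion micro-kernels: F16<->F32 use NEON-FP16 paths, while the F32<->QS8/QU8
    // quantization paths use ARMv8 ("neonv8") kernels, as the kernel suffixes indicate.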
2947     xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
2948       .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
2949       .element_tile = 16,
2950     };
2951     xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
2952       .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
2953       .element_tile = 16,
2954     };
2955     xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
2956       .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
2957       .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
2958       .element_tile = 32,
2959     };
2960     xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
2961       .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
2962       .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
2963       .element_tile = 32,
2964     };
2965     xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
2966       .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
2967       .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
2968       .element_tile = 32,
2969     };
2970     xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
2971       .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
2972       .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
2973       .element_tile = 32,
2974     };
2975   #endif  // XNN_NO_VCVT_OPERATORS
2976 
2977   /**************************** X32 AArch64 micro-kernels ****************************/
2978   #ifndef XNN_NO_X32_OPERATORS
2979     init_flags |= XNN_INIT_FLAG_X32;
2980 
2981     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
2982     xnn_params.x32.zip = (struct zip_parameters) {
2983       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
2984       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
2985       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
2986       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
2987     };
2988     #ifndef XNN_NO_NCHW_OPERATORS
2989       xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2990         .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
2991         .channel_tile = 1,
2992         .pixel_tile = 1,
2993       };
2994     #endif  // XNN_NO_NCHW_OPERATORS
2995   #endif  // XNN_NO_X32_OPERATORS
2996 
2997   /**************************** XX AArch64 micro-kernels ****************************/
2998   #ifndef XNN_NO_XX_OPERATORS
2999     init_flags |= XNN_INIT_FLAG_XX;
3000 
3001     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
3002     xnn_params.xx.fill = (struct fill_parameters) {
3003       .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
3004       .row_tile = 1,
3005     };
3006     xnn_params.xx.pad = (struct pad_parameters) {
3007       .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
3008       .row_tile = 1,
3009     };
3010   #endif  // XNN_NO_XX_OPERATORS
3011 
3012 #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3013   if (!cpuinfo_has_x86_sse2()) {
3014     xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
3015     return;
3016   }
3017 
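  // All x86/x86-64 paths below assume at least SSE2 (checked above). Each operator
  // class then picks the widest ISA that cpuinfo reports, in priority order:
  // AVX512 (SKX-level) > XOP > AVX2 > AVX > SSE4.1 > SSE2.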
3018   /**************************** QC8 x86 micro-kernels ****************************/
3019   #ifndef XNN_NO_QC8_OPERATORS
3020     init_flags |= XNN_INIT_FLAG_QC8;
3021 
3022     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3023       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3024       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3025       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3026       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3027       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx512_params;
3028       xnn_params.qc8.gemm.mr = 4;
3029       xnn_params.qc8.gemm.nr = 16;
3030       xnn_params.qc8.gemm.log2_kr = 3;
3031     } else if (cpuinfo_has_x86_xop()) {
3032       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3033       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3034       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3035       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3036       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3037       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3038       xnn_params.qc8.gemm.mr = 2;
3039       xnn_params.qc8.gemm.nr = 4;
3040       xnn_params.qc8.gemm.log2_kr = 3;
3041     } else if (cpuinfo_has_x86_avx2()) {
3042       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3043       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3044       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3045       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3046       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx2_params;
3047       xnn_params.qc8.gemm.mr = 3;
3048       xnn_params.qc8.gemm.nr = 8;
3049       xnn_params.qc8.gemm.log2_kr = 3;
3050     } else if (cpuinfo_has_x86_avx()) {
3051       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3052       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3053       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3054       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3055       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3056       xnn_params.qc8.gemm.mr = 2;
3057       xnn_params.qc8.gemm.nr = 4;
3058       xnn_params.qc8.gemm.log2_kr = 3;
3059     } else if (cpuinfo_has_x86_sse4_1()) {
3060       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3061       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3062       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3063       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3064       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3065       xnn_params.qc8.gemm.mr = 3;
3066       xnn_params.qc8.gemm.nr = 4;
3067       xnn_params.qc8.gemm.log2_kr = 3;
3068     } else {
3069       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3070       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3071       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3072       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3073       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse2_params;
3074       xnn_params.qc8.gemm.mr = 3;
3075       xnn_params.qc8.gemm.nr = 4;
3076       xnn_params.qc8.gemm.log2_kr = 3;
3077     }
3078 
3079     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3080       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3081       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx512_params;
3082       xnn_params.qc8.dwconv[0].channel_tile = 32;
3083       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3084       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx512_params;
3085       xnn_params.qc8.dwconv[1].channel_tile = 32;
3086     } else if (cpuinfo_has_x86_xop()) {
3087       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3088       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
3089       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3090       xnn_params.qc8.dwconv[0].channel_tile = 16;
3091       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
3092       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3093       xnn_params.qc8.dwconv[1].channel_tile = 16;
3094     } else if (cpuinfo_has_x86_avx2()) {
3095       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3096       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx2_params;
3097       xnn_params.qc8.dwconv[0].channel_tile = 16;
3098       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3099       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx2_params;
3100       xnn_params.qc8.dwconv[1].channel_tile = 16;
3101     } else if (cpuinfo_has_x86_avx()) {
3102       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
3103       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3104       xnn_params.qc8.dwconv[0].channel_tile = 16;
3105       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
3106       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3107       xnn_params.qc8.dwconv[1].channel_tile = 16;
3108     } else if (cpuinfo_has_x86_sse4_1()) {
3109       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
3110       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3111       xnn_params.qc8.dwconv[0].channel_tile = 8;
3112       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
3113       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3114       xnn_params.qc8.dwconv[1].channel_tile = 8;
3115     } else if (cpuinfo_has_x86_sse2()) {
3116       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3117       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse2_params;
3118       xnn_params.qc8.dwconv[0].channel_tile = 8;
3119       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3120       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse2_params;
3121       xnn_params.qc8.dwconv[1].channel_tile = 8;
3122     }
3123     xnn_params.qc8.dwconv[0].primary_tile = 9;
3124     xnn_params.qc8.dwconv[1].primary_tile = 25;
3125   #endif  // XNN_NO_QC8_OPERATORS
3126 
3127   /**************************** QS8 x86 micro-kernels ****************************/
3128   #ifndef XNN_NO_QS8_OPERATORS
3129     init_flags |= XNN_INIT_FLAG_QS8;
3130 
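    // The QS8 selection mirrors the QC8 chain above, but with qs8_* kernels and
    // qs8_conv_minmax_fp32 parameter initializers instead of the per-channel qc8 ones.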
3131     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3132       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3133       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3134       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3135       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3136       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
3137       xnn_params.qs8.gemm.mr = 4;
3138       xnn_params.qs8.gemm.nr = 16;
3139       xnn_params.qs8.gemm.log2_kr = 3;
3140     } else if (cpuinfo_has_x86_xop()) {
3141       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3142       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3143       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3144       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3145       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3146       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3147       xnn_params.qs8.gemm.mr = 2;
3148       xnn_params.qs8.gemm.nr = 4;
3149       xnn_params.qs8.gemm.log2_kr = 3;
3150     } else if (cpuinfo_has_x86_avx2()) {
3151       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3152       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3153       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3154       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3155       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
3156       xnn_params.qs8.gemm.mr = 3;
3157       xnn_params.qs8.gemm.nr = 8;
3158       xnn_params.qs8.gemm.log2_kr = 3;
3159     } else if (cpuinfo_has_x86_avx()) {
3160       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3161       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3162       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3163       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3164       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3165       xnn_params.qs8.gemm.mr = 2;
3166       xnn_params.qs8.gemm.nr = 4;
3167       xnn_params.qs8.gemm.log2_kr = 3;
3168     } else if (cpuinfo_has_x86_sse4_1()) {
3169       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3170       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3171       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3172       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3173       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3174       xnn_params.qs8.gemm.mr = 3;
3175       xnn_params.qs8.gemm.nr = 4;
3176       xnn_params.qs8.gemm.log2_kr = 3;
3177     } else {
3178       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3179       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3180       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3181       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3182       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
3183       xnn_params.qs8.gemm.mr = 3;
3184       xnn_params.qs8.gemm.nr = 4;
3185       xnn_params.qs8.gemm.log2_kr = 3;
3186     }
3187 
3188     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3189       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3190       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
3191       xnn_params.qs8.dwconv[0].channel_tile = 32;
3192       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3193       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
3194       xnn_params.qs8.dwconv[1].channel_tile = 32;
3195     } else if (cpuinfo_has_x86_xop()) {
3196       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3197       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
3198       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3199       xnn_params.qs8.dwconv[0].channel_tile = 16;
3200       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
3201       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3202       xnn_params.qs8.dwconv[1].channel_tile = 16;
3203     } else if (cpuinfo_has_x86_avx2()) {
3204       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3205       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
3206       xnn_params.qs8.dwconv[0].channel_tile = 16;
3207       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3208       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
3209       xnn_params.qs8.dwconv[1].channel_tile = 16;
3210     } else if (cpuinfo_has_x86_avx()) {
3211       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
3212       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3213       xnn_params.qs8.dwconv[0].channel_tile = 16;
3214       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
3215       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3216       xnn_params.qs8.dwconv[1].channel_tile = 16;
3217     } else if (cpuinfo_has_x86_sse4_1()) {
3218       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16;
3219       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3220       xnn_params.qs8.dwconv[0].channel_tile = 8;
3221       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16;
3222       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3223       xnn_params.qs8.dwconv[1].channel_tile = 8;
3224     } else if (cpuinfo_has_x86_sse2()) {
3225       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16;
3226       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
3227       xnn_params.qs8.dwconv[0].channel_tile = 8;
3228       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16;
3229       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
3230       xnn_params.qs8.dwconv[1].channel_tile = 8;
3231     }
3232     xnn_params.qs8.dwconv[0].primary_tile = 9;
3233     xnn_params.qs8.dwconv[1].primary_tile = 25;
3234 
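    // Global average pooling only distinguishes SSE4.1 from the SSE2 baseline.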
3235     if (cpuinfo_has_x86_sse4_1()) {
3236       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
3237         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3238         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
3239         .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse4_params,
3240         .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse4_params,
3241         .row_tile = 7,
3242         .channel_tile = 8,
3243       };
3244     } else {
3245       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
3246         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3247         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
3248         .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse2_params,
3249         .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse2_params,
3250         .row_tile = 7,
3251         .channel_tile = 8,
3252       };
3253     }
3254 
3255     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3256       xnn_params.qs8.vadd = (struct vbinary_parameters) {
3257         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
3258         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3259         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3260         .init.qs8_addsub = xnn_init_qs8_add_minmax_avx512_params,
3261         .element_tile = 16,
3262       };
3263     } else if (cpuinfo_has_x86_xop()) {
3264       xnn_params.qs8.vadd = (struct vbinary_parameters) {
3265         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
3266         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3267         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3268         .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
3269         .element_tile = 8,
3270       };
3271     } else if (cpuinfo_has_x86_avx2()) {
3272       xnn_params.qs8.vadd = (struct vbinary_parameters) {
3273         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
3274         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3275         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3276         .init.qs8_addsub = xnn_init_qs8_add_minmax_avx2_params,
3277         .element_tile = 16,
3278       };
3279     } else if (cpuinfo_has_x86_avx()) {
3280       xnn_params.qs8.vadd = (struct vbinary_parameters) {
3281         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
3282         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3283         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3284         .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
3285         .element_tile = 8,
3286       };
3287     } else if (cpuinfo_has_x86_sse4_1()) {
3288       xnn_params.qs8.vadd = (struct vbinary_parameters) {
3289         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
3290         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3291         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3292         .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul16_params,
3293         .element_tile = 8,
3294       };
3295     } else {
3296       xnn_params.qs8.vadd = (struct vbinary_parameters) {
3297         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
3298         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3299         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3300         .init.qs8_addsub = xnn_init_qs8_add_minmax_sse2_params,
3301         .element_tile = 8,
3302       };
3303     }
3304     if (cpuinfo_has_x86_avx()) {
3305       xnn_params.qs8.vmul = (struct vbinary_parameters) {
3306         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3307         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3308         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3309         .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3310         .element_tile = 16,
3311       };
3312     } else if (cpuinfo_has_x86_sse4_1()) {
3313       xnn_params.qs8.vmul = (struct vbinary_parameters) {
3314         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3315         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3316         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3317         .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3318         .element_tile = 16,
3319       };
3320     } else {
3321       xnn_params.qs8.vmul = (struct vbinary_parameters) {
3322         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3323         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3324         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3325         .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse2_params,
3326         .element_tile = 8,
3327       };
3328     }
3329   #endif  // XNN_NO_QS8_OPERATORS
3330 
3331   /**************************** QU8 x86 micro-kernels ****************************/
3332   #ifndef XNN_NO_QU8_OPERATORS
3333     init_flags |= XNN_INIT_FLAG_QU8;
3334 
3335     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3336       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3337       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3338       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3339       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3340       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3341       xnn_params.qu8.gemm.mr = 4;
3342       xnn_params.qu8.gemm.nr = 16;
3343       xnn_params.qu8.gemm.log2_kr = 3;
3344     } else if (cpuinfo_has_x86_xop()) {
3345       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3346       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3347       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3348       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3349       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3350       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3351       xnn_params.qu8.gemm.mr = 2;
3352       xnn_params.qu8.gemm.nr = 4;
3353       xnn_params.qu8.gemm.log2_kr = 3;
3354     } else if (cpuinfo_has_x86_avx2()) {
3355       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3356       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3357       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3358       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3359       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3360       xnn_params.qu8.gemm.mr = 3;
3361       xnn_params.qu8.gemm.nr = 8;
3362       xnn_params.qu8.gemm.log2_kr = 3;
3363     } else if (cpuinfo_has_x86_avx()) {
3364       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3365       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3366       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3367       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3368       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3369       xnn_params.qu8.gemm.mr = 2;
3370       xnn_params.qu8.gemm.nr = 4;
3371       xnn_params.qu8.gemm.log2_kr = 3;
3372     } else if (cpuinfo_has_x86_sse4_1()) {
3373       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3374       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3375       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3376       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3377       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3378       xnn_params.qu8.gemm.mr = 3;
3379       xnn_params.qu8.gemm.nr = 4;
3380       xnn_params.qu8.gemm.log2_kr = 3;
3381     } else {
3382       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3383       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3384       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3385       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3386       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3387       xnn_params.qu8.gemm.mr = 3;
3388       xnn_params.qu8.gemm.nr = 4;
3389       xnn_params.qu8.gemm.log2_kr = 3;
3390     }
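    // The GEMM tile parameters follow directly from the microkernel name:
    // e.g. _4x16c8__avx512skx means mr = 4, nr = 16, kr = 8 (log2_kr = 3),
    // and _2x4c8__xop_ld64 means mr = 2, nr = 4, kr = 8.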
3391 
3392     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3393       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3394       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3395       xnn_params.qu8.dwconv[0].channel_tile = 32;
3396       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3397       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3398       xnn_params.qu8.dwconv[1].channel_tile = 32;
3399     } else if (cpuinfo_has_x86_xop()) {
3400       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3401       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32;
3402       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3403       xnn_params.qu8.dwconv[0].channel_tile = 16;
3404       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32;
3405       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3406       xnn_params.qu8.dwconv[1].channel_tile = 16;
3407     } else if (cpuinfo_has_x86_avx2()) {
3408       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3409       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3410       xnn_params.qu8.dwconv[0].channel_tile = 16;
3411       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3412       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3413       xnn_params.qu8.dwconv[1].channel_tile = 16;
3414     } else if (cpuinfo_has_x86_avx()) {
3415       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16;
3416       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3417       xnn_params.qu8.dwconv[0].channel_tile = 16;
3418       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16;
3419       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3420       xnn_params.qu8.dwconv[1].channel_tile = 16;
3421     } else if (cpuinfo_has_x86_sse4_1()) {
3422       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
3423       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3424       xnn_params.qu8.dwconv[0].channel_tile = 8;
3425       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
3426       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3427       xnn_params.qu8.dwconv[1].channel_tile = 8;
3428     } else if (cpuinfo_has_x86_sse2()) {
3429       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3430       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3431       xnn_params.qu8.dwconv[0].channel_tile = 8;
3432       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3433       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3434       xnn_params.qu8.dwconv[1].channel_tile = 8;
3435     }
3436     xnn_params.qu8.dwconv[0].primary_tile = 9;
3437     xnn_params.qu8.dwconv[1].primary_tile = 25;
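    // dwconv[0] covers 9-tap (3x3) and dwconv[1] 25-tap (5x5) depthwise kernels;
    // channel_tile matches the "upN" part of the selected microkernel name.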
3438 
3439     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
3440       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
3441       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
3442       .init.qu8 = xnn_init_qu8_avgpool_minmax_sse2_params,
3443       .primary_tile = 9,
3444       .incremental_tile = 8,
3445       .channel_tile = 8,
3446     };
3447     if (cpuinfo_has_x86_sse4_1()) {
3448       xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3449         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3450         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
3451         .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params,
3452         .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params,
3453         .row_tile = 7,
3454         .channel_tile = 8,
3455       };
3456     } else {
3457       xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3458         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3459         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
3460         .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
3461         .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params,
3462         .row_tile = 7,
3463         .channel_tile = 8,
3464       };
3465     }
3466 
3467     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3468       xnn_params.qu8.vadd = (struct vbinary_parameters) {
3469         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
3470         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3471         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3472         .init.qu8_addsub = xnn_init_qu8_add_minmax_avx512_params,
3473         .element_tile = 16,
3474       };
3475     } else if (cpuinfo_has_x86_xop()) {
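      // As above, XOP is checked before AVX2: AMD Excavator supports both,
      // but performs better with XOP microkernels.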
3476       xnn_params.qu8.vadd = (struct vbinary_parameters) {
3477         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
3478         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3479         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3480         .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
3481         .element_tile = 8,
3482       };
3483     } else if (cpuinfo_has_x86_avx2()) {
3484       xnn_params.qu8.vadd = (struct vbinary_parameters) {
3485         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
3486         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3487         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3488         .init.qu8_addsub = xnn_init_qu8_add_minmax_avx2_params,
3489         .element_tile = 16,
3490       };
3491     } else if (cpuinfo_has_x86_avx()) {
3492       xnn_params.qu8.vadd = (struct vbinary_parameters) {
3493         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
3494         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3495         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3496         .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
3497         .element_tile = 8,
3498       };
3499     } else if (cpuinfo_has_x86_sse4_1()) {
3500       xnn_params.qu8.vadd = (struct vbinary_parameters) {
3501         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
3502         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3503         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3504         .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
3505         .element_tile = 8,
3506       };
3507     } else {
3508       xnn_params.qu8.vadd = (struct vbinary_parameters) {
3509         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
3510         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3511         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3512         .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
3513         .element_tile = 8,
3514       };
3515     }
3516     if (cpuinfo_has_x86_avx()) {
3517       xnn_params.qu8.vmul = (struct vbinary_parameters) {
3518         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3519         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3520         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3521         .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3522         .element_tile = 16,
3523       };
3524     } else if (cpuinfo_has_x86_sse4_1()) {
3525       xnn_params.qu8.vmul = (struct vbinary_parameters) {
3526         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3527         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3528         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3529         .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3530         .element_tile = 16,
3531       };
3532     } else {
3533       xnn_params.qu8.vmul = (struct vbinary_parameters) {
3534         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3535         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3536         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3537         .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3538         .element_tile = 8,
3539       };
3540     }
3541   #endif  // XNN_NO_QU8_OPERATORS
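
  // Illustrative usage sketch (not part of this file's logic): the tables filled in above
  // are only consulted after this init() has run via the once-guard, which applications
  // trigger through the public entry point, e.g.:
  //
  //   #include <xnnpack.h>
  //
  //   if (xnn_initialize(/*allocator=*/NULL) != xnn_status_success) {
  //     // required ISA features were not detected; operators cannot be created
  //   }
  //   // operator-creation calls then read the microkernel pointers and tile sizes
  //   // that were selected above from xnn_params.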
3542 
3543   /**************************** S8 x86 micro-kernels ****************************/
3544   #ifndef XNN_NO_S8_OPERATORS
3545     init_flags |= XNN_INIT_FLAG_S8;
3546 
3547     if (cpuinfo_has_x86_sse4_1()) {
3548       xnn_params.s8.clamp = (struct vunary_parameters) {
3549         .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse41_x64,
3550         .init.s8_minmax = xnn_init_s8_minmax_sse4_params,
3551         .element_tile = 64,
3552       };
3553       xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3554         .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse41_c16,
3555         .pixel_tile = 1,
3556         .channel_tile = 16,
3557       };
3558       xnn_params.s8.maxpool = (struct maxpool_parameters) {
3559         .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16,
3560         .init.s8 = xnn_init_s8_minmax_sse4_params,
3561         .mr = 9,
3562         .qr = 8,
3563       };
3564     } else {
3565       xnn_params.s8.clamp = (struct vunary_parameters) {
3566         .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse2_x64,
3567         .init.s8_minmax = xnn_init_s8_minmax_sse2_params,
3568         .element_tile = 64,
3569       };
3570       xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3571         .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse2_c8,
3572         .pixel_tile = 1,
3573         .channel_tile = 8,
3574       };
3575       xnn_params.s8.maxpool = (struct maxpool_parameters) {
3576         .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16,
3577         .init.s8 = xnn_init_s8_minmax_sse2_params,
3578         .mr = 9,
3579         .qr = 8,
3580       };
3581     }
3582   #endif  // XNN_NO_S8_OPERATORS
3583 
3584   /**************************** U8 x86 micro-kernels ****************************/
3585   #ifndef XNN_NO_U8_OPERATORS
3586     init_flags |= XNN_INIT_FLAG_U8;
3587 
3588     xnn_params.u8.clamp = (struct vunary_parameters) {
3589       .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__sse2_x64,
3590       .init.u8_minmax = xnn_init_u8_minmax_sse2_params,
3591       .element_tile = 64,
3592     };
3593     if (cpuinfo_has_x86_sse4_1()) {
3594       xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3595         .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse41_c16,
3596         .pixel_tile = 1,
3597         .channel_tile = 16,
3598       };
3599     } else {
3600       xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3601         .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse2_c8,
3602         .pixel_tile = 1,
3603         .channel_tile = 8,
3604       };
3605     }
3606     xnn_params.u8.maxpool = (struct maxpool_parameters) {
3607       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
3608       .init.u8 = xnn_init_u8_minmax_sse2_params,
3609       .mr = 9,
3610       .qr = 8,
3611     };
3612     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
3613     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
3614   #endif  // XNN_NO_U8_OPERATORS
3615 
3616   /**************************** X8 x86 micro-kernels ****************************/
3617   #ifndef XNN_NO_X8_OPERATORS
3618     init_flags |= XNN_INIT_FLAG_X8;
3619 
3620     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3621       xnn_params.x8.lut = xnn_x8_lut_ukernel__avx512skx_vpshufb_x64;
3622     } else if (cpuinfo_has_x86_avx2()) {
3623       xnn_params.x8.lut = xnn_x8_lut_ukernel__avx2_x128;
3624     } else if (cpuinfo_has_x86_avx()) {
3625       xnn_params.x8.lut = xnn_x8_lut_ukernel__avx_x64;
3626     } else {
3627       // Note: SSSE3 version is usually slower than scalar
3628       xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
3629     }
3630     xnn_params.x8.zip = (struct zip_parameters) {
3631       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
3632       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
3633       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
3634       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
3635     };
3636   #endif  // XNN_NO_X8_OPERATORS
3637 
3638   /**************************** F16 x86 micro-kernels ****************************/
3639   #ifndef XNN_NO_F16_OPERATORS
3640     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
3641       init_flags |= XNN_INIT_FLAG_F16;
3642 
3643       xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
3644       xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
3645       xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
3646       xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
3647       xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_avx_params;
3648       xnn_params.f16.gemm.mr = 4;
3649       xnn_params.f16.gemm.nr = 16;
3650 
3651       xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__fma3;
3652       xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
3653       xnn_params.f16.dwconv[0].channel_tile = 16;
3654       xnn_params.f16.dwconv[0].primary_tile = 4;
3655 
3656       xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__fma3;
3657       xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
3658       xnn_params.f16.dwconv[1].channel_tile = 16;
3659       xnn_params.f16.dwconv[1].primary_tile = 9;
3660 
3661       xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2;
3662       xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
3663       xnn_params.f16.dwconv[2].channel_tile = 8;
3664       xnn_params.f16.dwconv[2].primary_tile = 25;
3665 
3666       xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
3667         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
3668         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
3669         .init.f16 = xnn_init_f16_scaleminmax_avx_params,
3670         .update.f16 = xnn_update_f16_scaleminmax_avx_params,
3671         .row_tile = 7,
3672         .channel_tile = 8,
3673       };
3674 
3675       xnn_params.f16.maxpool = (struct maxpool_parameters) {
3676         .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8,
3677         .init.f16 = xnn_init_f16_minmax_avx_params,
3678         .mr = 9,
3679         .qr = 8,
3680       };
3681 
3682       xnn_params.f16.prelu = (struct prelu_parameters) {
3683         .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__f16c_2x16,
3684         .row_tile = 2,
3685         .channel_tile = 16,
3686       };
3687 
3688       xnn_params.f16.vadd = (struct vbinary_parameters) {
3689         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__f16c_x16,
3690         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3691         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3692         .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3693         .element_tile = 16,
3694       };
3695       xnn_params.f16.vmul = (struct vbinary_parameters) {
3696         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__f16c_x16,
3697         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3698         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3699         .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3700         .element_tile = 16,
3701       };
3702       xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
3703         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
3704         .init.f16 = xnn_init_f16_minmax_avx_params,
3705         .channel_tile = 8,
3706         .row_tile = 2,
3707       };
3708       xnn_params.f16.hswish = (struct vunary_parameters) {
3709         .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__f16c_x16,
3710         .init.f16_hswish = xnn_init_f16_hswish_avx_params,
3711         .element_tile = 16,
3712       };
3713     }
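    // Note: XNN_INIT_FLAG_F16 is set only inside this branch, so F16 operators remain
    // unavailable unless AVX2 is detected on a non-mobile build. The selected microkernels
    // additionally use F16C and FMA3 instructions, which AVX2-capable x86 CPUs provide
    // in practice.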
3714   #endif  // XNN_NO_F16_OPERATORS
3715 
3716   /**************************** F32 x86 micro-kernels ****************************/
3717   #ifndef XNN_NO_F32_OPERATORS
3718     init_flags |= XNN_INIT_FLAG_F32;
3719 
3720     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3721       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
3722       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
3723       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
3724       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
3725       xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
3726       xnn_params.f32.gemm.mr = 7;
3727       xnn_params.f32.gemm.nr = 16;
3728     } else if (cpuinfo_has_x86_fma3()) {
3729       switch (cpuinfo_get_core(0)->uarch) {
3730         case cpuinfo_uarch_zen:
3731         case cpuinfo_uarch_dhyana:
3732           xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
3733           xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
3734           xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
3735           xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
3736           xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
3737           xnn_params.f32.gemm.mr = 4;
3738           xnn_params.f32.gemm.nr = 16;
3739           xnn_params.f32.gemm.log2_sr = 2;
3740           break;
3741         default:
3742           xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
3743           xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
3744           xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
3745           xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
3746           xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
3747           xnn_params.f32.gemm.mr = 5;
3748           xnn_params.f32.gemm.nr = 16;
3749           break;
3750       }
3751     } else if (cpuinfo_has_x86_avx()) {
3752       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
3753       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
3754       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
3755       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
3756       xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
3757       xnn_params.f32.gemm.mr = 5;
3758       xnn_params.f32.gemm.nr = 16;
3759     } else {
3760       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
3761       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
3762       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
3763       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
3764       xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_sse_params;
3765       xnn_params.f32.gemm.mr = 4;
3766       xnn_params.f32.gemm.nr = 8;
3767     }
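    // As with the quantized GEMMs, mr/nr mirror the kernel name (7x16 for AVX-512,
    // 5x16 for the FMA3/AVX defaults, 4x8 for SSE); the Zen/Dhyana path picks the
    // shuffled 4x16s4 variant, hence log2_sr = 2.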
3768     xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
3769     xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
3770     xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_sse_params;
3771     xnn_params.f32.gemm2.mr = 4;
3772     xnn_params.f32.gemm2.nr = 2;
3773     xnn_params.f32.gemm2.log2_kr = 2;
3774 
3775     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3776       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx512f;
3777       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
3778       xnn_params.f32.dwconv[0].channel_tile = 16;
3779       xnn_params.f32.dwconv[0].primary_tile = 3;
3780 
3781       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
3782       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
3783       xnn_params.f32.dwconv[1].channel_tile = 16;
3784       xnn_params.f32.dwconv[1].primary_tile = 4;
3785 
3786       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
3787       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
3788       xnn_params.f32.dwconv[2].channel_tile = 16;
3789       xnn_params.f32.dwconv[2].primary_tile = 9;
3790 
3791       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
3792       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
3793       xnn_params.f32.dwconv[3].channel_tile = 16;
3794       xnn_params.f32.dwconv[3].primary_tile = 25;
3795     } else if (cpuinfo_has_x86_fma3()) {
3796       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__fma3;
3797       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
3798       xnn_params.f32.dwconv[0].channel_tile = 16;
3799       xnn_params.f32.dwconv[0].primary_tile = 3;
3800 
3801       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
3802       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
3803       xnn_params.f32.dwconv[1].channel_tile = 16;
3804       xnn_params.f32.dwconv[1].primary_tile = 4;
3805 
3806       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
3807       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
3808       xnn_params.f32.dwconv[2].channel_tile = 16;
3809       xnn_params.f32.dwconv[2].primary_tile = 9;
3810 
3811       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
3812       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3813       xnn_params.f32.dwconv[3].channel_tile = 8;
3814       xnn_params.f32.dwconv[3].primary_tile = 25;
3815     } else if (cpuinfo_has_x86_avx()) {
3816       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx;
3817       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
3818       xnn_params.f32.dwconv[0].channel_tile = 16;
3819       xnn_params.f32.dwconv[0].primary_tile = 3;
3820 
3821       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
3822       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
3823       xnn_params.f32.dwconv[1].channel_tile = 16;
3824       xnn_params.f32.dwconv[1].primary_tile = 4;
3825 
3826       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
3827       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
3828       xnn_params.f32.dwconv[2].channel_tile = 16;
3829       xnn_params.f32.dwconv[2].primary_tile = 9;
3830 
3831       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
3832       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3833       xnn_params.f32.dwconv[3].channel_tile = 8;
3834       xnn_params.f32.dwconv[3].primary_tile = 25;
3835     } else {
3836       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__sse;
3837       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_sse_params;
3838       xnn_params.f32.dwconv[0].channel_tile = 8;
3839       xnn_params.f32.dwconv[0].primary_tile = 3;
3840 
3841       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
3842       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_sse_params;
3843       xnn_params.f32.dwconv[1].channel_tile = 8;
3844       xnn_params.f32.dwconv[1].primary_tile = 4;
3845 
3846       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
3847       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_sse_params;
3848       xnn_params.f32.dwconv[2].channel_tile = 8;
3849       xnn_params.f32.dwconv[2].primary_tile = 9;
3850 
3851       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
3852       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_sse_params;
3853       xnn_params.f32.dwconv[3].channel_tile = 8;
3854       xnn_params.f32.dwconv[3].primary_tile = 25;
3855     }
3856     xnn_params.f32.avgpool = (struct avgpool_parameters) {
3857       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
3858       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
3859       .init.f32 = xnn_init_f32_scaleminmax_sse_params,
3860       .primary_tile = 9,
3861       .incremental_tile = 8,
3862       .channel_tile = 4,
3863     };
3864     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
3865       .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
3866       .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
3867       .primary_tile = 9,
3868       .incremental_tile = 8,
3869       .channel_tile = 4,
3870     };
3871     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
3872       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
3873       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
3874       .init.f32 = xnn_init_f32_scaleminmax_sse_params,
3875       .update.f32 = xnn_update_f32_scaleminmax_sse_params,
3876       .row_tile = 7,
3877       .channel_tile = 4,
3878     };
3879     xnn_params.f32.maxpool = (struct maxpool_parameters) {
3880       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
3881       .init.f32 = xnn_init_f32_minmax_sse_params,
3882       .mr = 9,
3883       .qr = 8,
3884     };
3885     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
3886       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
3887       .mr = 4,
3888     };
3889     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
3890       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
3891       .mr = 9,
3892     };
3893     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
3894       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
3895       .mr = 9,
3896       .qr = 8,
3897     };
3898     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
3899       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
3900       .pixel_tile = 1,
3901       .channel_tile = 8,
3902     };
3903     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3904       xnn_params.f32.abs = (struct vunary_parameters) {
3905         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16,
3906         .init.f32_abs = xnn_init_f32_abs_avx512_params,
3907         .element_tile = 16,
3908       };
3909     } else if (cpuinfo_has_x86_avx()) {
3910       xnn_params.f32.abs = (struct vunary_parameters) {
3911         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx_x16,
3912         .init.f32_abs = xnn_init_f32_abs_avx_params,
3913         .element_tile = 16,
3914       };
3915     } else {
3916       xnn_params.f32.abs = (struct vunary_parameters) {
3917         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__sse_x8,
3918         .init.f32_abs = xnn_init_f32_abs_sse_params,
3919         .element_tile = 8,
3920       };
3921     }
3922     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3923       xnn_params.f32.clamp = (struct vunary_parameters) {
3924         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx512f_x16,
3925         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3926         .element_tile = 16,
3927       };
3928     } else if (cpuinfo_has_x86_avx()) {
3929       xnn_params.f32.clamp = (struct vunary_parameters) {
3930         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx_x16,
3931         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
3932         .element_tile = 16,
3933       };
3934     } else {
3935       xnn_params.f32.clamp = (struct vunary_parameters) {
3936         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__sse_x8,
3937         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
3938         .element_tile = 8,
3939       };
3940     }
3941     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3942       xnn_params.f32.elu = (struct vunary_parameters) {
3943         .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64,
3944         .init.f32_elu = xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
3945         .element_tile = 64,
3946       };
3947     } else if (cpuinfo_has_x86_avx2()) {
3948       xnn_params.f32.elu = (struct vunary_parameters) {
3949         .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56,
3950         .init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
3951         .element_tile = 56,
3952       };
3953     } else if (cpuinfo_has_x86_avx()) {
3954       xnn_params.f32.elu = (struct vunary_parameters) {
3955         .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32,
3956         .init.f32_elu = xnn_init_f32_elu_avx_rr2_lut4_p4_params,
3957         .element_tile = 32,
3958       };
3959     } else {
3960       xnn_params.f32.elu = (struct vunary_parameters) {
3961         .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12,
3962         .init.f32_elu = xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
3963         .element_tile = 12,
3964       };
3965     }
3966     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3967       xnn_params.f32.hswish = (struct vunary_parameters) {
3968         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx512f_x16,
3969         .init.f32_hswish = xnn_init_f32_hswish_avx512_params,
3970         .element_tile = 16,
3971       };
3972     } else if (cpuinfo_has_x86_fma3()) {
3973       xnn_params.f32.hswish = (struct vunary_parameters) {
3974         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__fma3_x16,
3975         .init.f32_hswish = xnn_init_f32_hswish_avx_params,
3976         .element_tile = 16,
3977       };
3978     } else if (cpuinfo_has_x86_avx()) {
3979       xnn_params.f32.hswish = (struct vunary_parameters) {
3980         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx_x16,
3981         .init.f32_hswish = xnn_init_f32_hswish_avx_params,
3982         .element_tile = 16,
3983       };
3984     } else {
3985       xnn_params.f32.hswish = (struct vunary_parameters) {
3986         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__sse_x8,
3987         .init.f32_hswish = xnn_init_f32_hswish_sse_params,
3988         .element_tile = 8,
3989       };
3990     }
3991     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3992       xnn_params.f32.lrelu = (struct vunary_parameters) {
3993         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16,
3994         .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
3995         .element_tile = 16,
3996       };
3997     } else if (cpuinfo_has_x86_avx()) {
3998       xnn_params.f32.lrelu = (struct vunary_parameters) {
3999         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16,
4000         .init.f32_lrelu = xnn_init_f32_lrelu_avx_params,
4001         .element_tile = 16,
4002       };
4003     } else if (cpuinfo_has_x86_sse4_1()) {
4004       xnn_params.f32.lrelu = (struct vunary_parameters) {
4005         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8,
4006         .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
4007         .element_tile = 8,
4008       };
4009     } else {
4010       xnn_params.f32.lrelu = (struct vunary_parameters) {
4011         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8,
4012         .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
4013         .element_tile = 8,
4014       };
4015     }
4016     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4017       xnn_params.f32.neg = (struct vunary_parameters) {
4018         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16,
4019         .init.f32_neg = xnn_init_f32_neg_avx512_params,
4020         .element_tile = 16,
4021       };
4022     } else if (cpuinfo_has_x86_avx()) {
4023       xnn_params.f32.neg = (struct vunary_parameters) {
4024         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx_x16,
4025         .init.f32_neg = xnn_init_f32_neg_avx_params,
4026         .element_tile = 16,
4027       };
4028     } else {
4029       xnn_params.f32.neg = (struct vunary_parameters) {
4030         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__sse_x8,
4031         .init.f32_neg = xnn_init_f32_neg_sse_params,
4032         .element_tile = 8,
4033       };
4034     }
4035     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4036       xnn_params.f32.rndne = (struct vunary_parameters) {
4037         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16,
4038         .element_tile = 16,
4039       };
4040       xnn_params.f32.rndz = (struct vunary_parameters) {
4041         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16,
4042         .element_tile = 16,
4043       };
4044       xnn_params.f32.rndu = (struct vunary_parameters) {
4045         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16,
4046         .element_tile = 16,
4047       };
4048       xnn_params.f32.rndd = (struct vunary_parameters) {
4049         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16,
4050         .element_tile = 16,
4051       };
4052     } else if (cpuinfo_has_x86_avx()) {
4053       xnn_params.f32.rndne = (struct vunary_parameters) {
4054         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16,
4055         .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4056         .element_tile = 16,
4057       };
4058       xnn_params.f32.rndz = (struct vunary_parameters) {
4059         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16,
4060         .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4061         .element_tile = 16,
4062       };
4063       xnn_params.f32.rndu = (struct vunary_parameters) {
4064         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16,
4065         .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4066         .element_tile = 16,
4067       };
4068       xnn_params.f32.rndd = (struct vunary_parameters) {
4069         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16,
4070         .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4071         .element_tile = 16,
4072       };
4073     } else if (cpuinfo_has_x86_sse4_1()) {
4074       xnn_params.f32.rndne = (struct vunary_parameters) {
4075         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8,
4076         .element_tile = 8,
4077       };
4078       xnn_params.f32.rndz = (struct vunary_parameters) {
4079         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8,
4080         .element_tile = 8,
4081       };
4082       xnn_params.f32.rndu = (struct vunary_parameters) {
4083         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8,
4084         .element_tile = 8,
4085       };
4086       xnn_params.f32.rndd = (struct vunary_parameters) {
4087         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8,
4088         .element_tile = 8,
4089       };
4090     } else {
4091       xnn_params.f32.rndne = (struct vunary_parameters) {
4092         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8,
4093         .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4094         .element_tile = 8,
4095       };
4096       xnn_params.f32.rndz = (struct vunary_parameters) {
4097         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8,
4098         .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4099         .element_tile = 8,
4100       };
4101       xnn_params.f32.rndu = (struct vunary_parameters) {
4102         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8,
4103         .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4104         .element_tile = 8,
4105       };
4106       xnn_params.f32.rndd = (struct vunary_parameters) {
4107         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8,
4108         .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4109         .element_tile = 8,
4110       };
4111     }
4112     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4113       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4114         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64,
4115         .init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params,
4116         .element_tile = 64,
4117       };
4118     } else if (cpuinfo_has_x86_avx2()) {
4119       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4120         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40,
4121         .init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params,
4122         .element_tile = 40,
4123       };
4124     } else if (cpuinfo_has_x86_avx()) {
4125       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4126         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40,
4127         .init.f32_sigmoid = xnn_init_f32_sigmoid_avx_rr2_p5_params,
4128         .element_tile = 40,
4129       };
4130     } else if (cpuinfo_has_x86_sse4_1()) {
4131       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4132         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8,
4133         .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4134         .element_tile = 8,
4135       };
4136     } else {
4137       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4138         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_x8,
4139         .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4140         .element_tile = 8,
4141       };
4142     }
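    // Wider ISAs use proportionally larger sigmoid tiles (x64 for AVX-512, x40 for
    // AVX/AVX2, x8 for SSE), again matching the _xN suffix; the remaining suffix parts
    // (rr*, lut*, p*, div/nr2) appear to encode the range-reduction and polynomial
    // approximation variant used by each microkernel.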
4143     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4144       xnn_params.f32.sqr = (struct vunary_parameters) {
4145         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16,
4146         .element_tile = 16,
4147       };
4148     } else if (cpuinfo_has_x86_avx()) {
4149       xnn_params.f32.sqr = (struct vunary_parameters) {
4150         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16,
4151         .init.f32_default = xnn_init_f32_default_avx_params,
4152         .element_tile = 16,
4153       };
4154     } else {
4155       xnn_params.f32.sqr = (struct vunary_parameters) {
4156         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8,
4157         .element_tile = 8,
4158       };
4159     }
4160     if (cpuinfo_has_x86_avx()) {
4161       xnn_params.f32.sqrt = (struct vunary_parameters) {
4162         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
4163         .init.f32_sqrt = xnn_init_f32_sqrt_avx_params,
4164         .element_tile = 8,
4165       };
4166     } else {
4167       xnn_params.f32.sqrt = (struct vunary_parameters) {
4168         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4,
4169         .element_tile = 4,
4170       };
4171     }
4172     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4173       xnn_params.f32.prelu = (struct prelu_parameters) {
4174         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
4175         .row_tile = 2,
4176         .channel_tile = 16,
4177       };
4178     } else if (cpuinfo_has_x86_avx()) {
4179       xnn_params.f32.prelu = (struct prelu_parameters) {
4180         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
4181         .row_tile = 2,
4182         .channel_tile = 16,
4183       };
4184     } else if (cpuinfo_has_x86_sse4_1()) {
4185       xnn_params.f32.prelu = (struct prelu_parameters) {
4186         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
4187         .row_tile = 2,
4188         .channel_tile = 8,
4189       };
4190     } else {
4191       xnn_params.f32.prelu = (struct prelu_parameters) {
4192         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
4193         .row_tile = 2,
4194         .channel_tile = 8,
4195       };
4196     }
4197     xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
4198       .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2,
4199       .init = xnn_init_f32_expminus_sse2_rr2_p5_params,
4200       .element_tile = 20,
4201     };
4202     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
4203     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4204       xnn_params.f32.vadd = (struct vbinary_parameters) {
4205         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
4206         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
4207         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
4208         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4209         .element_tile = 32,
4210       };
4211       xnn_params.f32.vdiv = (struct vbinary_parameters) {
4212         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
4213         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
4214         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
4215         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4216         .element_tile = 32,
4217       };
4218       xnn_params.f32.vmax = (struct vbinary_parameters) {
4219         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
4220         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
4221         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
4222         .element_tile = 32,
4223       };
4224       xnn_params.f32.vmin = (struct vbinary_parameters) {
4225         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
4226         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
4227         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
4228         .element_tile = 32,
4229       };
4230       xnn_params.f32.vmul = (struct vbinary_parameters) {
4231         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
4232         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
4233         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
4234         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4235         .element_tile = 32,
4236       };
4237       xnn_params.f32.vsub = (struct vbinary_parameters) {
4238         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
4239         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
4240         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
4241         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4242         .element_tile = 32,
4243       };
4244       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
4245         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
4246         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
4247         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
4248         .element_tile = 32,
4249       };
4250     } else if (cpuinfo_has_x86_avx()) {
4251       xnn_params.f32.vadd = (struct vbinary_parameters) {
4252         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
4253         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
4254         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
4255         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4256         .element_tile = 16,
4257       };
4258       xnn_params.f32.vdiv = (struct vbinary_parameters) {
4259         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
4260         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
4261         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
4262         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4263         .element_tile = 16,
4264       };
4265       xnn_params.f32.vmax = (struct vbinary_parameters) {
4266         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
4267         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
4268         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
4269         .init.f32_default = xnn_init_f32_default_avx_params,
4270         .element_tile = 16,
4271       };
4272       xnn_params.f32.vmin = (struct vbinary_parameters) {
4273         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
4274         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
4275         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
4276         .init.f32_default = xnn_init_f32_default_avx_params,
4277         .element_tile = 16,
4278       };
4279       xnn_params.f32.vmul = (struct vbinary_parameters) {
4280         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
4281         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
4282         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
4283         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4284         .element_tile = 16,
4285       };
4286       xnn_params.f32.vsub = (struct vbinary_parameters) {
4287         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
4288         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
4289         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
4290         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4291         .element_tile = 16,
4292       };
4293       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
4294         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
4295         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
4296         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
4297         .init.f32_default = xnn_init_f32_default_avx_params,
4298         .element_tile = 16,
4299       };
4300     } else {
4301       xnn_params.f32.vadd = (struct vbinary_parameters) {
4302         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
4303         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
4304         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
4305         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
4306         .element_tile = 8,
4307       };
4308       xnn_params.f32.vdiv = (struct vbinary_parameters) {
4309         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
4310         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
4311         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
4312         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
4313         .element_tile = 8,
4314       };
4315       xnn_params.f32.vmax = (struct vbinary_parameters) {
4316         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
4317         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
4318         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
4319         .element_tile = 8,
4320       };
4321       xnn_params.f32.vmin = (struct vbinary_parameters) {
4322         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
4323         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
4324         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
4325         .element_tile = 8,
4326       };
4327       xnn_params.f32.vmul = (struct vbinary_parameters) {
4328         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
4329         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
4330         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
4331         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
4332         .element_tile = 8,
4333       };
4334       xnn_params.f32.vsub = (struct vbinary_parameters) {
4335         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
4336         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
4337         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
4338         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
4339         .element_tile = 8,
4340       };
4341       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
4342         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
4343         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
4344         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
4345         .element_tile = 8,
4346       };
4347     }
4348     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
4349       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
4350       .init.f32 = xnn_init_f32_minmax_sse_params,
4351       .channel_tile = 4,
4352       .row_tile = 2,
4353     };
4354     #ifndef XNN_NO_NCHW_OPERATORS
4355       // Sparse microkernels on x86 currently target only SSE, and on processors
4356       // with the AVX ISA, dense inference is expected to be faster than sparse.
4357       if (!cpuinfo_has_x86_avx()) {
4358         init_flags |= XNN_INIT_FLAG_CHW_OPT;
4359       }
4360 
4361       xnn_params.f32.spmm = (struct spmm_parameters) {
4362         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
4363         .mr = 32,
4364         .nr = 1,
4365       };
4366       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
4367         .ukernel_with_symm_padding =
4368           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
4369         .output_channel_tile = 4,
4370         .output_height_tile = 2,
4371         .output_width_tile = 2,
4372       };
4373       if (cpuinfo_has_x86_ssse3()) {
4374         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
4375           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
4376           .output_width_tile = 4,
4377           .output_height_tile = 2,
4378         };
4379       } else {
4380         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
4381           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
4382           .output_width_tile = 4,
4383           .output_height_tile = 2,
4384         };
4385       }
4386       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
4387         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
4388         .output_width_tile = 4,
4389         .output_height_tile = 1,
4390       };
4391       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
4392         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
4393         .output_width_tile = 4,
4394         .output_height_tile = 4,
4395       };
4396       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
4397         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
4398         .output_width_tile = 4,
4399         .output_height_tile = 2,
4400       };
4401       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
4402         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
4403         .channel_tile = 4,
4404       };
4405       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
4406         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__sse_p8,
4407         .channel_tile = 1,
4408         .pixel_tile = 8,
4409       };
4410     #endif  // XNN_NO_NCHW_OPERATORS
4411   #endif  // XNN_NO_F32_OPERATORS
4412 
4413   /*************************** VCVT x86 micro-kernels ***************************/
4414   #ifndef XNN_NO_VCVT_OPERATORS
4415     init_flags |= XNN_INIT_FLAG_VCVT;
4416 
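         // F16<->F32 conversion kernels: prefer AVX512-SKX (F+BW+DQ+VL), then F16C, then AVX, then SSE4.1, with SSE2 as the baseline.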
4417     if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4418       xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4419         .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
4420         .element_tile = 16,
4421       };
4422       xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4423         .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx512skx_x16,
4424         .element_tile = 16,
4425       };
4426     } else if (cpuinfo_has_x86_f16c()) {
4427       xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4428         .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16,
4429         .element_tile = 16,
4430       };
4431       xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4432         .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__f16c_x16,
4433         .init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params,
4434         .element_tile = 16,
4435       };
4436     } else if (cpuinfo_has_x86_avx()) {
4437       xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4438         .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
4439         .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4440         .element_tile = 16,
4441       };
4442       xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4443         .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx_x24,
4444         .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4445         .element_tile = 24,
4446       };
4447     } else if (cpuinfo_has_x86_sse4_1()) {
4448       xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4449         .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
4450         .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4451         .element_tile = 16,
4452       };
4453       xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4454         .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse41_x8,
4455         .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4456         .element_tile = 8,
4457       };
4458     } else {
4459       xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
4460         .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
4461         .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
4462         .element_tile = 32,
4463       };
4464       xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
4465         .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16,
4466         .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
4467         .element_tile = 16,
4468       };
4469     }
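         // F32->QS8 quantization kernels: AVX512-SKX, AVX2, AVX, SSE4.1, then the SSE2 baseline.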
4470     if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4471       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4472         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx512skx_x128,
4473         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params,
4474         .element_tile = 128,
4475       };
4476     } else if (cpuinfo_has_x86_avx2()) {
4477       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4478         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx2_x64,
4479         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params,
4480         .element_tile = 64,
4481       };
4482     } else if (cpuinfo_has_x86_avx()) {
4483       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4484         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx_x32,
4485         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
4486         .element_tile = 32,
4487       };
4488     } else if (cpuinfo_has_x86_sse4_1()) {
4489       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4490         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
4491         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
4492         .element_tile = 32,
4493       };
4494     } else {
4495       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
4496         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse2_x32,
4497         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse2_params,
4498         .element_tile = 32,
4499       };
4500     }
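         // F32->QU8 quantization kernels follow the same ladder, but without a dedicated SSE4.1 variant.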
4501     if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4502       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4503         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx512skx_x128,
4504         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params,
4505         .element_tile = 128,
4506       };
4507     } else if (cpuinfo_has_x86_avx2()) {
4508       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4509         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx2_x64,
4510         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params,
4511         .element_tile = 64,
4512       };
4513     } else if (cpuinfo_has_x86_avx()) {
4514       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4515         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx_x32,
4516         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
4517         .element_tile = 32,
4518       };
4519     } else {
4520       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
4521         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
4522         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
4523         .element_tile = 32,
4524       };
4525     }
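         // QS8/QU8->F32 dequantization kernels: AVX512-SKX, AVX2, AVX, SSE4.1, then the SSE2 baseline.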
4526     if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
4527       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4528         .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx512skx_x32,
4529         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params,
4530         .element_tile = 32,
4531       };
4532       xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4533         .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx512skx_x32,
4534         .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params,
4535         .element_tile = 32,
4536       };
4537     } else if (cpuinfo_has_x86_avx2()) {
4538       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4539         .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
4540         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
4541         .element_tile = 16,
4542       };
4543       xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4544         .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
4545         .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
4546         .element_tile = 16,
4547       };
4548     } else if (cpuinfo_has_x86_avx()) {
4549       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4550         .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx_x32,
4551         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
4552         .element_tile = 32,
4553       };
4554       xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4555         .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx_x32,
4556         .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
4557         .element_tile = 32,
4558       };
4559     } else if (cpuinfo_has_x86_sse4_1()) {
4560       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4561         .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
4562         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse4_params,
4563         .element_tile = 16,
4564       };
4565       xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4566         .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse41_x16,
4567         .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse4_params,
4568         .element_tile = 16,
4569       };
4570     } else {
4571       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
4572         .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse2_x32,
4573         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse2_params,
4574         .element_tile = 32,
4575       };
4576       xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
4577         .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse2_x32,
4578         .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse2_params,
4579         .element_tile = 32,
4580       };
4581     }
4582   #endif  // XNN_NO_VCVT_OPERATORS
4583 
4584   /**************************** X32 x86 micro-kernels ****************************/
4585   #ifndef XNN_NO_X32_OPERATORS
4586     init_flags |= XNN_INIT_FLAG_X32;
4587 
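         // X32 kernels treat data as opaque 32-bit elements (unpool and zip micro-kernels).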
4588     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
4589     xnn_params.x32.zip = (struct zip_parameters) {
4590       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
4591       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
4592       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
4593       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
4594     };
4595     #ifndef XNN_NO_NCHW_OPERATORS
4596       xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
4597         .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
4598         .channel_tile = 1,
4599         .pixel_tile = 1,
4600       };
4601     #endif  // XNN_NO_NCHW_OPERATORS
4602   #endif  // XNN_NO_X32_OPERATORS
4603 
4604   /**************************** XX x86 micro-kernels ****************************/
4605   #ifndef XNN_NO_XX_OPERATORS
4606     init_flags |= XNN_INIT_FLAG_XX;
4607 
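         // XX kernels are data-type-agnostic byte-level operations (copy, fill, pad).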
4608     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
4609     xnn_params.xx.fill = (struct fill_parameters) {
4610       .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
4611       .row_tile = 1,
4612     };
4613     xnn_params.xx.pad = (struct pad_parameters) {
4614       .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__sse2,
4615       .row_tile = 1,
4616     };
4617   #endif  // XNN_NO_XX_OPERATORS
4618 
4619 #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4620 
4621   /**************************** QC8 WAsm SIMD micro-kernels ****************************/
4622   #ifndef XNN_NO_QC8_OPERATORS
4623     init_flags |= XNN_INIT_FLAG_QC8;
4624 
4625     #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4626       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4627       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4628       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4629       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4630       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4631       xnn_params.qc8.gemm.mr = 4;
4632       xnn_params.qc8.gemm.nr = 4;
4633       xnn_params.qc8.gemm.log2_kr = 1;
4634       xnn_params.qc8.gemm.log2_sr = 2;
4635     #else  // XNN_WASMSIMD_VERSION >= 88
4636       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4637       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4638       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4639       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4640       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4641       xnn_params.qc8.gemm.mr = 3;
4642       xnn_params.qc8.gemm.nr = 4;
4643       xnn_params.qc8.gemm.log2_kr = 3;
4644     #endif  // XNN_WASMSIMD_VERSION >= 88
4645 
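         // QC8 depthwise-convolution kernels: 16-channel tiles for 9-tap (3x3) and 25-tap (5x5) filters.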
4646     xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
4647     xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4648     xnn_params.qc8.dwconv[0].channel_tile = 16;
4649     xnn_params.qc8.dwconv[0].primary_tile = 9;
4650     xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
4651     xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4652     xnn_params.qc8.dwconv[1].channel_tile = 16;
4653     xnn_params.qc8.dwconv[1].primary_tile = 25;
4654   #endif  // XNN_NO_QC8_OPERATORS
4655 
4656   /**************************** QS8 WAsm SIMD micro-kernels ****************************/
4657   #ifndef XNN_NO_QS8_OPERATORS
4658     init_flags |= XNN_INIT_FLAG_QS8;
4659 
4660     #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4661       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4662       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4663       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4664       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4665       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
4666       xnn_params.qs8.gemm.mr = 4;
4667       xnn_params.qs8.gemm.nr = 4;
4668       xnn_params.qs8.gemm.log2_kr = 1;
4669       xnn_params.qs8.gemm.log2_sr = 2;
4670     #else  // XNN_WASMSIMD_VERSION >= 88
4671       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4672       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4673       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4674       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4675       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
4676       xnn_params.qs8.gemm.mr = 3;
4677       xnn_params.qs8.gemm.nr = 4;
4678       xnn_params.qs8.gemm.log2_kr = 3;
4679     #endif  // XNN_WASMSIMD_VERSION >= 88
4680 
4681     xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
4682     xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
4683     xnn_params.qs8.dwconv[0].channel_tile = 16;
4684     xnn_params.qs8.dwconv[0].primary_tile = 9;
4685     xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
4686     xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
4687     xnn_params.qs8.dwconv[1].channel_tile = 16;
4688     xnn_params.qs8.dwconv[1].primary_tile = 25;
4689 
4690     xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
4691       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
4692       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
4693       .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params,
4694       .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params,
4695       .row_tile = 7,
4696       .channel_tile = 16,
4697     };
4698 
4699     xnn_params.qs8.vadd = (struct vbinary_parameters) {
4700       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32,
4701       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
4702       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
4703       .init.qs8_addsub = xnn_init_qs8_add_minmax_wasmsimd_params,
4704       .element_tile = 32,
4705     };
4706     xnn_params.qs8.vmul = (struct vbinary_parameters) {
4707       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4708       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4709       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4710       .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_wasmsimd_params,
4711       .element_tile = 8,
4712     };
4713   #endif  // XNN_NO_QS8_OPERATORS
4714 
4715   /**************************** QU8 WAsm SIMD micro-kernels ****************************/
4716   #ifndef XNN_NO_QU8_OPERATORS
4717     init_flags |= XNN_INIT_FLAG_QU8;
4718 
4719     #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4720       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4721       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4722       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4723       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4724       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4725       xnn_params.qu8.gemm.mr = 4;
4726       xnn_params.qu8.gemm.nr = 4;
4727       xnn_params.qu8.gemm.log2_kr = 1;
4728       xnn_params.qu8.gemm.log2_sr = 2;
4729     #else  // XNN_WASMSIMD_VERSION >= 88
4730       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
4731       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
4732       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
4733       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
4734       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4735       xnn_params.qu8.gemm.mr = 3;
4736       xnn_params.qu8.gemm.nr = 4;
4737       xnn_params.qu8.gemm.log2_kr = 3;
4738     #endif  // XNN_WASMSIMD_VERSION >= 88
4739 
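         // QU8 depthwise-convolution kernels: 8-channel tiles for 9-tap (3x3) and 25-tap (5x5) filters.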
4740     xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16;
4741     xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4742     xnn_params.qu8.dwconv[0].channel_tile = 8;
4743     xnn_params.qu8.dwconv[0].primary_tile = 9;
4744     xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16;
4745     xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
4746     xnn_params.qu8.dwconv[1].channel_tile = 8;
4747     xnn_params.qu8.dwconv[1].primary_tile = 25;
4748 
4749     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
4750       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
4751       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
4752       .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
4753       .primary_tile = 9,
4754       .incremental_tile = 8,
4755       .channel_tile = 1,
4756     };
4757     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
4758       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
4759       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
4760       .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params,
4761       .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params,
4762       .row_tile = 7,
4763       .channel_tile = 16,
4764     };
4765 
4766     xnn_params.qu8.vadd = (struct vbinary_parameters) {
4767       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32,
4768       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
4769       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
4770       .init.qu8_addsub = xnn_init_qu8_add_minmax_wasmsimd_params,
4771       .element_tile = 32,
4772     };
4773     xnn_params.qu8.vmul = (struct vbinary_parameters) {
4774       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4775       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4776       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
4777       .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_wasmsimd_params,
4778       .element_tile = 8,
4779     };
4780   #endif  // XNN_NO_QU8_OPERATORS
4781 
4782   /**************************** S8 WAsm SIMD micro-kernels ****************************/
4783   #ifndef XNN_NO_S8_OPERATORS
4784     init_flags |= XNN_INIT_FLAG_S8;
4785 
4786     xnn_params.s8.clamp = (struct vunary_parameters) {
4787       .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__wasmsimd_x64,
4788       .init.s8_minmax = xnn_init_s8_minmax_wasmsimd_params,
4789       .element_tile = 64,
4790     };
4791     #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4792       xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4793         .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
4794         .pixel_tile = 1,
4795         .channel_tile = 8,
4796       };
4797     #else  // XNN_WASMSIMD_VERSION >= 88
4798       xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
4799         .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_mul32_c8,
4800         .pixel_tile = 1,
4801         .channel_tile = 8,
4802       };
4803     #endif  // XNN_WASMSIMD_VERSION >= 88
4804     xnn_params.s8.maxpool = (struct maxpool_parameters) {
4805       .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
4806       .init.s8 = xnn_init_s8_minmax_wasmsimd_params,
4807       .mr = 9,
4808       .qr = 8,
4809     };
4810   #endif  // XNN_NO_S8_OPERATORS
4811 
4812   /**************************** U8 WAsm SIMD micro-kernels ****************************/
4813   #ifndef XNN_NO_U8_OPERATORS
4814     init_flags |= XNN_INIT_FLAG_U8;
4815 
4816     xnn_params.u8.clamp = (struct vunary_parameters) {
4817       .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__wasmsimd_x64,
4818       .init.u8_minmax = xnn_init_u8_minmax_wasmsimd_params,
4819       .element_tile = 64,
4820     };
4821     #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4822       xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4823         .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
4824         .pixel_tile = 1,
4825         .channel_tile = 8,
4826       };
4827     #else  // XNN_WASMSIMD_VERSION >= 88
4828       xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
4829         .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_mul32_c8,
4830         .pixel_tile = 1,
4831         .channel_tile = 8,
4832       };
4833     #endif  // XNN_WASMSIMD_VERSION >= 88
4834     xnn_params.u8.maxpool = (struct maxpool_parameters) {
4835       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
4836       .init.u8 = xnn_init_u8_minmax_wasmsimd_params,
4837       .mr = 9,
4838       .qr = 8,
4839     };
4840     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
4841     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
4842   #endif  // XNN_NO_U8_OPERATORS
4843 
4844   /**************************** X8 WAsm SIMD micro-kernels ****************************/
4845   #ifndef XNN_NO_X8_OPERATORS
4846     init_flags |= XNN_INIT_FLAG_X8;
4847 
4848     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
4849     xnn_params.x8.zip = (struct zip_parameters) {
4850       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
4851       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
4852       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
4853       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
4854     };
4855   #endif  // XNN_NO_X8_OPERATORS
4856 
4857   /**************************** F32 WAsm SIMD micro-kernels ****************************/
4858   #ifndef XNN_NO_F32_OPERATORS
4859     init_flags |= XNN_INIT_FLAG_F32;
4860 
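         // GEMM/IGEMM kernels are selected via the runtime is_wasm_x86 check: a 4x8 tile tuned for x86 hosts vs. a 5x8 tile tuned for ARM hosts.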
4861     if (is_wasm_x86) {
4862       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
4863       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
4864       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
4865       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
4866       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
4867       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
4868       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
4869       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
4870       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
4871       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
4872       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
4873       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
4874       xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4875       xnn_params.f32.gemm.mr = 4;
4876       xnn_params.f32.gemm.nr = 8;
4877 
4878       xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
4879       xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
4880       xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
4881       xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
4882       xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4883       xnn_params.f32.gemm2.mr = 4;
4884       xnn_params.f32.gemm2.nr = 2;
4885       xnn_params.f32.gemm2.log2_kr = 2;
4886     } else {
4887       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
4888       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
4889       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
4890       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
4891       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
4892       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
4893       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
4894       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
4895       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
4896       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
4897       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
4898       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
4899       xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4900       xnn_params.f32.gemm.mr = 5;
4901       xnn_params.f32.gemm.nr = 8;
4902 
4903       xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4904       xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4905       xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
4906       xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
4907       xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4908       xnn_params.f32.gemm2.mr = 4;
4909       xnn_params.f32.gemm2.nr = 2;
4910       xnn_params.f32.gemm2.log2_kr = 2;
4911     }
4912 
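         // Depthwise-convolution kernels: 8-channel tiles on x86 hosts, 4-channel tiles on ARM hosts.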
4913     if (is_wasm_x86) {
4914       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmsimd_x86;
4915       xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmsimd;
4916       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4917       xnn_params.f32.dwconv[0].channel_tile = 8;
4918       xnn_params.f32.dwconv[0].primary_tile = 3;
4919 
4920       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
4921       xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
4922       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4923       xnn_params.f32.dwconv[1].channel_tile = 8;
4924       xnn_params.f32.dwconv[1].primary_tile = 4;
4925 
4926       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
4927       xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
4928       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4929       xnn_params.f32.dwconv[2].channel_tile = 8;
4930       xnn_params.f32.dwconv[2].primary_tile = 9;
4931     } else {
4932       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x3__wasmsimd_arm;
4933       xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x3__wasmsimd;
4934       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4935       xnn_params.f32.dwconv[0].channel_tile = 4;
4936       xnn_params.f32.dwconv[0].primary_tile = 3;
4937 
4938       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
4939       xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
4940       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4941       xnn_params.f32.dwconv[1].channel_tile = 4;
4942       xnn_params.f32.dwconv[1].primary_tile = 4;
4943 
4944       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
4945       xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
4946       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4947       xnn_params.f32.dwconv[2].channel_tile = 4;
4948       xnn_params.f32.dwconv[2].primary_tile = 9;
4949     }
4950 
4951     xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
4952     xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
4953     xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4954     xnn_params.f32.dwconv[3].channel_tile = 4;
4955     xnn_params.f32.dwconv[3].primary_tile = 25;
4956 
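         // Average-pooling (and pixelwise/global variants) likewise select x86- or ARM-flavored WAsm SIMD kernels.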
4957     if (is_wasm_x86) {
4958       xnn_params.f32.avgpool = (struct avgpool_parameters) {
4959         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4960         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
4961         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4962         .primary_tile = 9,
4963         .incremental_tile = 8,
4964         .channel_tile = 4,
4965       };
4966       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
4967         .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4968         .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
4969         .primary_tile = 9,
4970         .incremental_tile = 8,
4971         .channel_tile = 4,
4972       };
4973       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
4974         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
4975         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
4976         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4977         .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
4978         .row_tile = 7,
4979         .channel_tile = 4,
4980       };
4981     } else {
4982       xnn_params.f32.avgpool = (struct avgpool_parameters) {
4983         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4984         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
4985         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4986         .primary_tile = 9,
4987         .incremental_tile = 8,
4988         .channel_tile = 4,
4989       };
4990       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
4991         .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4992         .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
4993         .primary_tile = 9,
4994         .incremental_tile = 8,
4995         .channel_tile = 4,
4996       };
4997       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
4998         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
4999         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
5000         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5001         .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5002         .row_tile = 7,
5003         .channel_tile = 4,
5004       };
5005     }
5006     if (is_wasm_x86) {
5007       xnn_params.f32.maxpool = (struct maxpool_parameters) {
5008         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
5009         .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5010         .mr = 9,
5011         .qr = 8,
5012       };
5013     } else {
5014       xnn_params.f32.maxpool = (struct maxpool_parameters) {
5015         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
5016         .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5017         .mr = 9,
5018         .qr = 8,
5019       };
5020     }
5021     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
5022       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
5023       .mr = 4,
5024     };
5025     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
5026       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
5027       .mr = 9,
5028     };
5029     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
5030       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
5031       .mr = 9,
5032       .qr = 8,
5033     };
5034     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
5035       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
5036       .pixel_tile = 1,
5037       .channel_tile = 8,
5038     };
5039     xnn_params.f32.abs = (struct vunary_parameters) {
5040       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8,
5041       .init.f32_abs = xnn_init_f32_abs_wasmsimd_params,
5042       .element_tile = 8,
5043     };
5044     if (is_wasm_x86) {
5045       xnn_params.f32.clamp = (struct vunary_parameters) {
5046         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_x86_x8,
5047         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5048         .element_tile = 8,
5049       };
5050     } else {
5051       xnn_params.f32.clamp = (struct vunary_parameters) {
5052         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_arm_x8,
5053         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5054         .element_tile = 8,
5055       };
5056     }
5057     if (is_wasm_x86) {
5058       xnn_params.f32.elu = (struct vunary_parameters) {
5059         .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20,
5060         .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5061         .element_tile = 20,
5062       };
5063     } else {
5064       xnn_params.f32.elu = (struct vunary_parameters) {
5065         .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20,
5066         .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5067         .element_tile = 20,
5068       };
5069     }
5070     xnn_params.f32.hswish = (struct vunary_parameters) {
5071       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasmsimd_x16,
5072       .init.f32_hswish = xnn_init_f32_hswish_wasmsimd_params,
5073       .element_tile = 16,
5074     };
5075     if (is_wasm_x86) {
5076       xnn_params.f32.lrelu = (struct vunary_parameters) {
5077         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8,
5078         .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5079         .element_tile = 8,
5080       };
5081     } else {
5082       xnn_params.f32.lrelu = (struct vunary_parameters) {
5083         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8,
5084         .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5085         .element_tile = 8,
5086       };
5087     }
5088     xnn_params.f32.neg = (struct vunary_parameters) {
5089       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8,
5090       .init.f32_neg = xnn_init_f32_neg_wasmsimd_params,
5091       .element_tile = 8,
5092     };
5093     xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasmsimd_x16;
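         // Rounding kernels use native WAsm SIMD rounding instructions when available, and addsub/cvt-based emulation otherwise.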
5094     #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 91)
5095       xnn_params.f32.rndne = (struct vunary_parameters) {
5096         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_native_x8,
5097         .element_tile = 8,
5098       };
5099       xnn_params.f32.rndz = (struct vunary_parameters) {
5100         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_native_x8,
5101         .element_tile = 8,
5102       };
5103       xnn_params.f32.rndu = (struct vunary_parameters) {
5104         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_native_x8,
5105         .element_tile = 8,
5106       };
5107       xnn_params.f32.rndd = (struct vunary_parameters) {
5108         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_native_x8,
5109         .element_tile = 8,
5110       };
5111     #else  // XNN_WASMSIMD_VERSION >= 91
5112       xnn_params.f32.rndne = (struct vunary_parameters) {
5113         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8,
5114         .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5115         .element_tile = 8,
5116       };
5117       if (is_wasm_x86) {
5118         xnn_params.f32.rndz = (struct vunary_parameters) {
5119           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8,
5120           .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5121           .element_tile = 8,
5122         };
5123       } else {
5124         xnn_params.f32.rndz = (struct vunary_parameters) {
5125           .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8,
5126           .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5127           .element_tile = 8,
5128         };
5129       }
5130       xnn_params.f32.rndu = (struct vunary_parameters) {
5131         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8,
5132         .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5133         .element_tile = 8,
5134       };
5135       xnn_params.f32.rndd = (struct vunary_parameters) {
5136         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8,
5137         .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5138         .element_tile = 8,
5139       };
5140     #endif  // XNN_WASMSIMD_VERSION >= 91
5141     xnn_params.f32.sigmoid = (struct vunary_parameters) {
5142       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16,
5143       .init.f32_sigmoid = xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params,
5144       .element_tile = 16,
5145     };
5146     xnn_params.f32.sqr = (struct vunary_parameters) {
5147       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8,
5148       .element_tile = 8,
5149     };
5150     xnn_params.f32.sqrt = (struct vunary_parameters) {
5151       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8,
5152       .element_tile = 8,
5153     };
5154     if (is_wasm_x86) {
5155       xnn_params.f32.prelu = (struct prelu_parameters) {
5156         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
5157         .row_tile = 2,
5158         .channel_tile = 8,
5159       };
5160     } else {
5161       xnn_params.f32.prelu = (struct prelu_parameters) {
5162         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
5163         .row_tile = 2,
5164         .channel_tile = 8,
5165       };
5166     }
5167     xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5168       .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2,
5169       .init = xnn_init_f32_expminus_wasmsimd_rr2_p5_params,
5170       .element_tile = 16,
5171     };
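         // raddstoreexpminusmax computes exp(x - max) element-wise, stores the results, and
         // accumulates their sum in one pass; together with rmax below it is the core of the
         // softmax operator.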
5172     if (is_wasm_x86) {
5173       xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_x86;
5174       xnn_params.f32.vadd = (struct vbinary_parameters) {
5175         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
5176         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
5177         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
5178         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
5179         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5180         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5181         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5182         .element_tile = 16,
5183       };
5184       xnn_params.f32.vdiv = (struct vbinary_parameters) {
5185         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
5186         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
5187         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
5188         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
5189         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
5190         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
5191         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5192         .element_tile = 16,
5193       };
5194       xnn_params.f32.vmax = (struct vbinary_parameters) {
5195         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
5196         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
5197         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
5198         .element_tile = 16,
5199       };
5200       xnn_params.f32.vmin = (struct vbinary_parameters) {
5201         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
5202         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
5203         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
5204         .element_tile = 16,
5205       };
5206       xnn_params.f32.vmul = (struct vbinary_parameters) {
5207         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
5208         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
5209         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
5210         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
5211         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5212         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5213         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5214         .element_tile = 16,
5215       };
5216       xnn_params.f32.vsub = (struct vbinary_parameters) {
5217         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
5218         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
5219         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
5220         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
5221         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
5222         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
5223         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5224         .element_tile = 16,
5225       };
5226     } else {
5227       xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_arm;
5228       xnn_params.f32.vadd = (struct vbinary_parameters) {
5229         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
5230         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
5231         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
5232         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
5233         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5234         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5235         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5236         .element_tile = 16,
5237       };
5238       xnn_params.f32.vdiv = (struct vbinary_parameters) {
5239         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
5240         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
5241         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
5242         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
5243         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
5244         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
5245         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5246         .element_tile = 16,
5247       };
5248       xnn_params.f32.vmax = (struct vbinary_parameters) {
5249         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
5250         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
5251         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
5252         .element_tile = 16,
5253       };
5254       xnn_params.f32.vmin = (struct vbinary_parameters) {
5255         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
5256         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
5257         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
5258         .element_tile = 16,
5259       };
5260       xnn_params.f32.vmul = (struct vbinary_parameters) {
5261         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
5262         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
5263         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
5264         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
5265         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5266         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5267         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5268         .element_tile = 16,
5269       };
5270       xnn_params.f32.vsub = (struct vbinary_parameters) {
5271         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
5272         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
5273         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
5274         .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
5275         .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
5276         .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
5277         .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5278         .element_tile = 16,
5279       };
5280     }
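         // Only the minmax (output-clamped) binary kernels differ between the x86 and ARM
         // branches above; the unclamped linear variants are the same __wasmsimd_x16 kernels
         // in both, since they have no use for f32x4 min/max.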
5281     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
5282       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
5283       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
5284       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
5285       .element_tile = 16,
5286     };
5287     if (is_wasm_x86) {
5288       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5289         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
5290         .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5291         .channel_tile = 4,
5292         .row_tile = 2,
5293       };
5294     } else {
5295       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5296         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
5297         .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5298         .channel_tile = 4,
5299         .row_tile = 2,
5300       };
5301     }
5302     #ifndef XNN_NO_NCHW_OPERATORS
5303       init_flags |= XNN_INIT_FLAG_CHW_OPT;
5304 
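           // Sparse (spmm), direct CHW convolution, and CHW pooling/resize micro-kernels are
           // registered only when NCHW operators are compiled in; XNN_INIT_FLAG_CHW_OPT is
           // checked elsewhere in XNNPACK to enable the optimized CHW inference path.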
5305       if (is_wasm_x86) {
5306         xnn_params.f32.spmm = (struct spmm_parameters) {
5307           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
5308           .mr = 32,
5309           .nr = 1,
5310         };
5311       } else {
5312         xnn_params.f32.spmm = (struct spmm_parameters) {
5313           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
5314           .mr = 32,
5315           .nr = 1,
5316         };
5317       }
5318       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
5319         .ukernel_with_symm_padding =
5320           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
5321         .output_channel_tile = 4,
5322         .output_height_tile = 2,
5323         .output_width_tile = 2,
5324       };
5325       if (is_wasm_x86) {
5326         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5327           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
5328           .output_width_tile = 4,
5329           .output_height_tile = 2,
5330         };
5331         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
5332           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
5333           .output_width_tile = 4,
5334           .output_height_tile = 1,
5335         };
5336         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5337           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
5338           .output_width_tile = 4,
5339           .output_height_tile = 3,
5340         };
5341         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5342           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
5343           .output_width_tile = 4,
5344           .output_height_tile = 1,
5345         };
5346       } else {
5347         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5348           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
5349           .output_width_tile = 4,
5350           .output_height_tile = 2,
5351         };
5352         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
5353           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
5354           .output_width_tile = 4,
5355           .output_height_tile = 1,
5356         };
5357         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5358           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
5359           .output_width_tile = 4,
5360           .output_height_tile = 3,
5361         };
5362         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5363           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
5364           .output_width_tile = 4,
5365           .output_height_tile = 1,
5366         };
5367       }
5368       if (is_wasm_x86) {
5369         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5370           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
5371           .channel_tile = 4,
5372         };
5373       } else {
5374         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5375           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
5376           .channel_tile = 4,
5377         };
5378       }
5379       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5380         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
5381         .channel_tile = 1,
5382         .pixel_tile = 8,
5383       };
5384     #endif  // XNN_NO_NCHW_OPERATORS
5385   #endif  // XNN_NO_F32_OPERATORS
5386 
5387   /*************************** VCVT WAsm SIMD micro-kernels ***************************/
5388   #ifndef XNN_NO_VCVT_OPERATORS
5389     init_flags |= XNN_INIT_FLAG_VCVT;
5390 
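         // Conversion kernel table: f16<->f32 and f32<->qs8/qu8 in both directions. The
         // _magic_ f32->quantized variants round by adding a large magic constant so the
         // integer result lands in the mantissa, instead of using a saturating convert.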
5391     xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
5392       .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
5393       .init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params,
5394       .element_tile = 16,
5395     };
5396     xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
5397       .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24,
5398       .init.f32_f16_cvt = xnn_init_f32_f16_cvt_wasmsimd_params,
5399       .element_tile = 24,
5400     };
5401     xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
5402       .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
5403       .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_wasmsimd_magic_params,
5404       .element_tile = 32,
5405     };
5406     xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
5407       .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
5408       .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_wasmsimd_magic_params,
5409       .element_tile = 32,
5410     };
5411     xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
5412       .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__wasmsimd_x32,
5413       .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_wasmsimd_params,
5414       .element_tile = 32,
5415     };
5416     xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
5417       .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32,
5418       .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_wasmsimd_params,
5419       .element_tile = 32,
5420     };
5421   #endif  // XNN_NO_VCVT_OPERATORS
5422 
5423   /**************************** X32 WAsm SIMD micro-kernels ****************************/
5424   #ifndef XNN_NO_X32_OPERATORS
5425     init_flags |= XNN_INIT_FLAG_X32;
5426 
5427     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
5428     xnn_params.x32.zip = (struct zip_parameters) {
5429       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
5430       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
5431       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
5432       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
5433     };
5434     #ifndef XNN_NO_NCHW_OPERATORS
5435       xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
5436         .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
5437         .channel_tile = 1,
5438         .pixel_tile = 1,
5439       };
5440     #endif  // XNN_NO_NCHW_OPERATORS
5441   #endif  // XNN_NO_X32_OPERATORS
5442 
5443   /**************************** XX WAsm SIMD micro-kernels ****************************/
5444   #ifndef XNN_NO_XX_OPERATORS
5445     init_flags |= XNN_INIT_FLAG_XX;
5446 
5447     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
5448     xnn_params.xx.fill = (struct fill_parameters) {
5449       .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
5450       .row_tile = 1,
5451     };
5452     xnn_params.xx.pad = (struct pad_parameters) {
5453       .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__wasmsimd,
5454       .row_tile = 1,
5455     };
5456   #endif  // XNN_NO_XX_OPERATORS
5457 
5458 #elif XNN_ARCH_WASM
5459 
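       // Plain WebAssembly (no SIMD): everything below uses scalar or __wasm built-in
       // micro-kernels. is_wasm_x86 still steers the choices: smaller GEMM tiles plus
       // "imagic" (integer magic-constant) requantization on x86/x86-64 hosts, larger tiles
       // plus "fmagic" (float magic-constant) requantization elsewhere.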
5460   /**************************** QC8 WAsm micro-kernels ****************************/
5461   #ifndef XNN_NO_QC8_OPERATORS
5462     init_flags |= XNN_INIT_FLAG_QC8;
5463 
5464     if (is_wasm_x86) {
5465       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5466       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5467       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5468       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5469       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5470       xnn_params.qc8.gemm.mr = 2;
5471       xnn_params.qc8.gemm.nr = 2;
5472     } else {
5473       xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5474       xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5475       xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5476       xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5477       xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5478       xnn_params.qc8.gemm.mr = 4;
5479       xnn_params.qc8.gemm.nr = 4;
5480     }
5481 
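         // dwconv[0] covers the 9-tap (3x3) case and dwconv[1] the 25-tap (5x5) case; on x86
         // hosts the 25-tap imagic kernel processes one channel per iteration instead of two.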
5482     if (is_wasm_x86) {
5483       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5484       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5485       xnn_params.qc8.dwconv[0].channel_tile = 2;
5486       xnn_params.qc8.dwconv[0].primary_tile = 9;
5487       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5488       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5489       xnn_params.qc8.dwconv[1].channel_tile = 1;
5490       xnn_params.qc8.dwconv[1].primary_tile = 25;
5491     } else {
5492       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5493       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5494       xnn_params.qc8.dwconv[0].channel_tile = 2;
5495       xnn_params.qc8.dwconv[0].primary_tile = 9;
5496       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5497       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5498       xnn_params.qc8.dwconv[1].channel_tile = 2;
5499       xnn_params.qc8.dwconv[1].primary_tile = 25;
5500     }
5501   #endif  // XNN_NO_QC8_OPERATORS
5502 
5503   /**************************** QS8 WAsm micro-kernels ****************************/
5504   #ifndef XNN_NO_QS8_OPERATORS
5505     init_flags |= XNN_INIT_FLAG_QS8;
5506 
5507     if (is_wasm_x86) {
5508       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5509       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5510       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5511       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5512       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5513       xnn_params.qs8.gemm.mr = 2;
5514       xnn_params.qs8.gemm.nr = 2;
5515     } else {
5516       xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5517       xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5518       xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5519       xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5520       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5521       xnn_params.qs8.gemm.mr = 4;
5522       xnn_params.qs8.gemm.nr = 4;
5523     }
5524 
5525     if (is_wasm_x86) {
5526       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5527       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5528       xnn_params.qs8.dwconv[0].channel_tile = 2;
5529       xnn_params.qs8.dwconv[0].primary_tile = 9;
5530       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5531       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5532       xnn_params.qs8.dwconv[1].channel_tile = 1;
5533       xnn_params.qs8.dwconv[1].primary_tile = 25;
5534     } else {
5535       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5536       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5537       xnn_params.qs8.dwconv[0].channel_tile = 2;
5538       xnn_params.qs8.dwconv[0].primary_tile = 9;
5539       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5540       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5541       xnn_params.qs8.dwconv[1].channel_tile = 2;
5542       xnn_params.qs8.dwconv[1].primary_tile = 25;
5543     }
5544 
5545     xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
5546       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5547       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
5548       .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5549       .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5550       .row_tile = 7,
5551       .channel_tile = 4,
5552     };
5553 
5554     xnn_params.qs8.vadd = (struct vbinary_parameters) {
5555       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
5556       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
5557       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
5558       .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
5559       .element_tile = 4,
5560     };
5561     xnn_params.qs8.vmul = (struct vbinary_parameters) {
5562       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
5563       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5564       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5565       .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
5566       .element_tile = 4,
5567     };
5568   #endif  // XNN_NO_QS8_OPERATORS
5569 
5570   /**************************** QU8 WAsm micro-kernels ****************************/
5571   #ifndef XNN_NO_QU8_OPERATORS
5572     init_flags |= XNN_INIT_FLAG_QU8;
5573 
5574     if (is_wasm_x86) {
5575       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5576       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5577       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5578       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5579       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5580       xnn_params.qu8.gemm.mr = 2;
5581       xnn_params.qu8.gemm.nr = 2;
5582     } else {
5583       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5584       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5585       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5586       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5587       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5588       xnn_params.qu8.gemm.mr = 4;
5589       xnn_params.qu8.gemm.nr = 4;
5590     }
5591 
5592     if (is_wasm_x86) {
5593       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5594       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5595       xnn_params.qu8.dwconv[0].channel_tile = 2;
5596       xnn_params.qu8.dwconv[0].primary_tile = 9;
5597       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5598       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5599       xnn_params.qu8.dwconv[1].channel_tile = 1;
5600       xnn_params.qu8.dwconv[1].primary_tile = 25;
5601     } else {
5602       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5603       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5604       xnn_params.qu8.dwconv[0].channel_tile = 2;
5605       xnn_params.qu8.dwconv[0].primary_tile = 9;
5606       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5607       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5608       xnn_params.qu8.dwconv[1].channel_tile = 2;
5609       xnn_params.qu8.dwconv[1].primary_tile = 25;
5610     }
5611 
5612     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
5613       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
5614       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
5615       .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
5616       .primary_tile = 9,
5617       .incremental_tile = 8,
5618       .channel_tile = 1,
5619     };
5620     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
5621       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5622       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
5623       .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5624       .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5625       .row_tile = 7,
5626       .channel_tile = 4,
5627     };
5628 
5629     xnn_params.qu8.vadd = (struct vbinary_parameters) {
5630       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
5631       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
5632       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
5633       .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
5634       .element_tile = 4,
5635     };
5636     xnn_params.qu8.vmul = (struct vbinary_parameters) {
5637       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
5638       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5639       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5640       .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
5641       .element_tile = 4,
5642     };
5643   #endif  // XNN_NO_QU8_OPERATORS
5644 
5645   /**************************** S8 WAsm micro-kernels ****************************/
5646   #ifndef XNN_NO_S8_OPERATORS
5647     init_flags |= XNN_INIT_FLAG_S8;
5648 
5649     xnn_params.s8.clamp = (struct vunary_parameters) {
5650       .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
5651       .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
5652       .element_tile = 4,
5653     };
5654     xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
5655       .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
5656       .pixel_tile = 1,
5657       .channel_tile = 1,
5658     };
5659     xnn_params.s8.maxpool = (struct maxpool_parameters) {
5660       .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
5661       .init.s8 = xnn_init_s8_minmax_scalar_params,
5662       .mr = 9,
5663       .qr = 8,
5664     };
5665   #endif  // XNN_NO_S8_OPERATORS
5666 
5667   /**************************** U8 WAsm micro-kernels ****************************/
5668   #ifndef XNN_NO_U8_OPERATORS
5669     init_flags |= XNN_INIT_FLAG_U8;
5670 
5671     xnn_params.u8.clamp = (struct vunary_parameters) {
5672       .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
5673       .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
5674       .element_tile = 4,
5675     };
5676     xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
5677       .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
5678       .pixel_tile = 1,
5679       .channel_tile = 1,
5680     };
5681     xnn_params.u8.maxpool = (struct maxpool_parameters) {
5682       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
5683       .init.u8 = xnn_init_u8_minmax_scalar_params,
5684       .mr = 9,
5685       .qr = 8,
5686     };
5687     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
5688     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
5689   #endif  // XNN_NO_U8_OPERATORS
5690 
5691   /**************************** X8 WAsm micro-kernels ****************************/
5692   #ifndef XNN_NO_X8_OPERATORS
5693     init_flags |= XNN_INIT_FLAG_X8;
5694 
5695     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
5696     xnn_params.x8.zip = (struct zip_parameters) {
5697       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
5698       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
5699       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
5700       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
5701     };
5702   #endif  // XNN_NO_X8_OPERATORS
5703 
5704   /**************************** F32 WAsm micro-kernels ****************************/
5705   #ifndef XNN_NO_F32_OPERATORS
5706     init_flags |= XNN_INIT_FLAG_F32;
5707 
5708     if (is_wasm_x86) {
5709       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
5710       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
5711       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5712       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
5713       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
5714       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
5715       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5716       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
5717       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
5718       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
5719       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5720       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
5721       xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
5722       xnn_params.f32.gemm.mr = 2;
5723       xnn_params.f32.gemm.nr = 4;
5724     } else {
5725       xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
5726       xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
5727       xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5728       xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
5729       xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
5730       xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
5731       xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5732       xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
5733       xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm);
5734       xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm);
5735       xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5736       xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
5737       xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
5738       xnn_params.f32.gemm.mr = 4;
5739       xnn_params.f32.gemm.nr = 4;
5740     }
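         // gemm2 is a secondary GEMM/IGEMM configuration with a narrower 4x2 tile (nr = 2),
         // kept alongside the main GEMM for cases where only two output columns are needed.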
5741     xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
5742     xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm),
5743     xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__wasm);
5744     xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm),
5745     xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
5746     xnn_params.f32.gemm2.mr = 4;
5747     xnn_params.f32.gemm2.nr = 2;
5748 
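         // The four depthwise-convolution slots below cover primary tiles of 3, 4, 9 and 25
         // taps, i.e. the 1x3, 2x2 (or 1x4), 3x3 and 5x5 filter footprints the dwconv
         // operator dispatches on; each processes one channel per iteration with 2 accumulators.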
5749     xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__wasm_acc2;
5750     xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__wasm_acc2;
5751     xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
5752     xnn_params.f32.dwconv[0].channel_tile = 1;
5753     xnn_params.f32.dwconv[0].primary_tile = 3;
5754 
5755     xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
5756     xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2;
5757     xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
5758     xnn_params.f32.dwconv[1].channel_tile = 1;
5759     xnn_params.f32.dwconv[1].primary_tile = 4;
5760 
5761     xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
5762     xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2;
5763     xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
5764     xnn_params.f32.dwconv[2].channel_tile = 1;
5765     xnn_params.f32.dwconv[2].primary_tile = 9;
5766 
5767     xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
5768     xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2;
5769     xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
5770     xnn_params.f32.dwconv[3].channel_tile = 1;
5771     xnn_params.f32.dwconv[3].primary_tile = 25;
5772 
5773     xnn_params.f32.avgpool = (struct avgpool_parameters) {
5774       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
5775       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
5776       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5777       .primary_tile = 9,
5778       .incremental_tile = 8,
5779       .channel_tile = 1,
5780     };
5781     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
5782       .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
5783       .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
5784       .primary_tile = 9,
5785       .incremental_tile = 8,
5786       .channel_tile = 1,
5787     };
5788     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
5789       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
5790       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
5791       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5792       .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5793       .row_tile = 7,
5794       .channel_tile = 1,
5795     };
5796     xnn_params.f32.maxpool = (struct maxpool_parameters) {
5797       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
5798       .init.f32 = xnn_init_f32_minmax_scalar_params,
5799       .mr = 9,
5800       .qr = 8,
5801     };
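         // ArgMax pooling table: slots [0] and [1] are single-pass kernels for windows of up
         // to 4 and 9 elements; slot [2] is the multipass kernel (first pass of 9, then
         // increments of 8) for larger windows.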
5802     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
5803       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
5804       .mr = 4,
5805     };
5806     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
5807       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
5808       .mr = 9,
5809     };
5810     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
5811       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
5812       .mr = 9,
5813       .qr = 8,
5814     };
5815     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
5816       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
5817       .pixel_tile = 1,
5818       .channel_tile = 2,
5819     };
5820     xnn_params.f32.abs = (struct vunary_parameters) {
5821       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
5822       .element_tile = 4,
5823     };
5824     xnn_params.f32.clamp = (struct vunary_parameters) {
5825       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasm_x4,
5826       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
5827       .element_tile = 4,
5828     };
5829     if (is_wasm_x86) {
5830       xnn_params.f32.hswish = (struct vunary_parameters) {
5831         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
5832         .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
5833         .element_tile = 4,
5834       };
5835     } else {
5836       xnn_params.f32.hswish = (struct vunary_parameters) {
5837         .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasm_x4,
5838         .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
5839         .element_tile = 4,
5840       };
5841     }
5842     if (is_wasm_x86) {
5843       xnn_params.f32.elu = (struct vunary_parameters) {
5844         .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2,
5845         .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
5846         .element_tile = 2,
5847       };
5848     } else {
5849       xnn_params.f32.elu = (struct vunary_parameters) {
5850         .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6,
5851         .init.f32_elu = xnn_init_f32_elu_scalar_rr2_p6_params,
5852         .element_tile = 6,
5853       };
5854     }
5855     xnn_params.f32.lrelu = (struct vunary_parameters) {
5856       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
5857       .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
5858       .element_tile = 4,
5859     };
5860     xnn_params.f32.neg = (struct vunary_parameters) {
5861       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
5862       .element_tile = 4,
5863     };
5864     if (is_wasm_x86) {
5865       xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8;
5866     } else {
5867       xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasm_x8;
5868     }
5869     xnn_params.f32.rndne = (struct vunary_parameters) {
5870       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4,
5871       .element_tile = 4,
5872     };
5873     xnn_params.f32.rndz = (struct vunary_parameters) {
5874       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4,
5875       .element_tile = 4,
5876     };
5877     xnn_params.f32.rndu = (struct vunary_parameters) {
5878       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4,
5879       .element_tile = 4,
5880     };
5881     xnn_params.f32.rndd = (struct vunary_parameters) {
5882       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4,
5883       .element_tile = 4,
5884     };
5885     xnn_params.f32.sigmoid = (struct vunary_parameters) {
5886       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
5887       .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
5888       .element_tile = 2,
5889     };
5890     xnn_params.f32.sqr = (struct vunary_parameters) {
5891       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
5892       .element_tile = 4,
5893     };
5894     xnn_params.f32.sqrt = (struct vunary_parameters) {
5895       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
5896       .element_tile = 1,
5897     };
5898     if (is_wasm_x86) {
5899       xnn_params.f32.prelu = (struct prelu_parameters) {
5900         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
5901         .row_tile = 2,
5902         .channel_tile = 4,
5903       };
5904     } else {
5905       xnn_params.f32.prelu = (struct prelu_parameters) {
5906         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
5907         .row_tile = 2,
5908         .channel_tile = 4,
5909       };
5910     }
5911     xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5912       .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
5913       .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
5914       .element_tile = 4,
5915     };
5916     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
5917     xnn_params.f32.vadd = (struct vbinary_parameters) {
5918       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
5919       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
5920       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
5921       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
5922       .element_tile = 8,
5923     };
5924     xnn_params.f32.vdiv = (struct vbinary_parameters) {
5925       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
5926       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
5927       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
5928       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
5929       .element_tile = 8,
5930     };
5931     xnn_params.f32.vmax = (struct vbinary_parameters) {
5932       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
5933       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
5934       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
5935       .element_tile = 8,
5936     };
5937     xnn_params.f32.vmin = (struct vbinary_parameters) {
5938       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
5939       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
5940       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
5941       .element_tile = 8,
5942     };
5943     xnn_params.f32.vmul = (struct vbinary_parameters) {
5944       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
5945       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
5946       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
5947       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
5948       .element_tile = 8,
5949     };
5950     xnn_params.f32.vsub = (struct vbinary_parameters) {
5951       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
5952       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
5953       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
5954       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
5955       .element_tile = 8,
5956     };
5957     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
5958       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
5959       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
5960       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
5961       .element_tile = 8,
5962     };
5963     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5964       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
5965       .init.f32 = xnn_init_f32_minmax_scalar_params,
5966       .channel_tile = 1,
5967       .row_tile = 2,
5968     };
5969     #ifndef XNN_NO_NCHW_OPERATORS
5970       init_flags |= XNN_INIT_FLAG_CHW_OPT;
5971 
5972       xnn_params.f32.spmm = (struct spmm_parameters) {
5973         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
5974         .mr = 8,
5975         .nr = 1,
5976       };
5977       xnn_params.f32.spmm2 = (struct spmm_parameters) {
5978         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
5979         .mr = 8,
5980         .nr = 2,
5981       };
5982       xnn_params.f32.spmm4 = (struct spmm_parameters) {
5983         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
5984         .mr = 8,
5985         .nr = 4,
5986       };
5987       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
5988         .ukernel_with_symm_padding =
5989           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
5990         .output_channel_tile = 4,
5991         .output_height_tile = 1,
5992         .output_width_tile = 1,
5993       };
5994       xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5995         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
5996         .output_width_tile = 1,
5997         .output_height_tile = 2,
5998       };
5999       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
6000         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
6001         .output_width_tile = 1,
6002         .output_height_tile = 1,
6003       };
6004       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
6005         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
6006         .output_width_tile = 1,
6007         .output_height_tile = 1,
6008       };
6009       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
6010         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
6011         .output_width_tile = 1,
6012         .output_height_tile = 1,
6013       };
6014       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
6015         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
6016         .channel_tile = 1,
6017       };
6018       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
6019         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
6020         .channel_tile = 1,
6021         .pixel_tile = 4,
6022       };
6023     #endif  // XNN_NO_NCHW_OPERATORS
6024   #endif  // XNN_NO_F32_OPERATORS
6025 
6026   /*************************** VCVT WAsm micro-kernels ***************************/
6027   #ifndef XNN_NO_VCVT_OPERATORS
6028     init_flags |= XNN_INIT_FLAG_VCVT;
6029 
6030     xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6031       .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x1,
6032       .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6033       .element_tile = 1,
6034     };
6035     xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6036       .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4,
6037       .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_bitcast_params,
6038       .element_tile = 4,
6039     };
6040     if (is_wasm_x86) {
6041       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6042         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x1,
6043         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
6044         .element_tile = 1,
6045       };
6046       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6047         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6048         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
6049         .element_tile = 1,
6050       };
6051     } else {
6052       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6053         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__wasm_fmagic_x4,
6054         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_fmagic_params,
6055         .element_tile = 4,
6056       };
6057       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6058         .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6059         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_fmagic_params,
6060         .element_tile = 4,
6061       };
6062     }
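         // f32->quantized conversion follows the same imagic/fmagic split as the quantized
         // GEMM/DWCONV kernels above: integer-magic scalar kernels on x86 hosts, float-magic
         // __wasm kernels elsewhere.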
6063     xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6064       .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x1,
6065       .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6066       .element_tile = 1,
6067     };
6068     xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6069       .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x1,
6070       .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6071       .element_tile = 1,
6072     };
6073   #endif  // XNN_NO_VCVT_OPERATORS
6074 
6075   /**************************** X32 WAsm micro-kernels ****************************/
6076   #ifndef XNN_NO_X32_OPERATORS
6077     init_flags |= XNN_INIT_FLAG_X32;
6078 
6079     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
6080     xnn_params.x32.zip = (struct zip_parameters) {
6081       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
6082       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
6083       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
6084       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
6085     };
6086     #ifndef XNN_NO_NCHW_OPERATORS
6087       xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
6088         .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
6089         .channel_tile = 1,
6090         .pixel_tile = 1,
6091       };
6092     #endif  // XNN_NO_NCHW_OPERATORS
6093   #endif  // XNN_NO_X32_OPERATORS
6094 
6095   /**************************** XX WAsm micro-kernels ****************************/
6096   #ifndef XNN_NO_XX_OPERATORS
6097     init_flags |= XNN_INIT_FLAG_XX;
6098 
6099     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
6100     xnn_params.xx.fill = (struct fill_parameters) {
6101       .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
6102       .row_tile = 1,
6103     };
6104     xnn_params.xx.pad = (struct pad_parameters) {
6105       .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
6106       .row_tile = 1,
6107     };
6108   #endif  // XNN_NO_XX_OPERATORS
6109 
6110 #elif XNN_ARCH_RISCV
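       // The RISC-V path registers only portable scalar micro-kernels; no RISC-V-specific
       // (e.g. vector) implementations are wired up in this table.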
6111 
6112   /************************** QC8 RISC-V micro-kernels **************************/
6113   #ifndef XNN_NO_QC8_OPERATORS
6114     init_flags |= XNN_INIT_FLAG_QC8;
6115 
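         // mr x nr (3x4 here) is the output tile produced per GEMM/IGEMM micro-kernel call;
         // the gemm1/igemm1 entries are the single-row (1x4) variants used for single-row outputs.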
6116     xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6117     xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6118     xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6119     xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6120     xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6121     xnn_params.qc8.gemm.mr = 3;
6122     xnn_params.qc8.gemm.nr = 4;
6123 
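         // dwconv[] is indexed by kernel-size class: primary_tile is the number of filter taps
         // the unipass micro-kernel handles (9 covers 3x3, 25 covers 5x5), and channel_tile is
         // the number of channels processed per iteration.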
6124     xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6125     xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6126     xnn_params.qc8.dwconv[0].channel_tile = 2;
6127     xnn_params.qc8.dwconv[0].primary_tile = 9;
6128     xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6129     xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6130     xnn_params.qc8.dwconv[1].channel_tile = 2;
6131     xnn_params.qc8.dwconv[1].primary_tile = 25;
6132   #endif  // XNN_NO_QC8_OPERATORS
6133 
6134   /************************** QS8 RISC-V micro-kernels **************************/
6135   #ifndef XNN_NO_QS8_OPERATORS
6136     init_flags |= XNN_INIT_FLAG_QS8;
6137 
6138     xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6139     xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6140     xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6141     xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6142     xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
6143     xnn_params.qs8.gemm.mr = 3;
6144     xnn_params.qs8.gemm.nr = 4;
6145 
6146     xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6147     xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
6148     xnn_params.qs8.dwconv[0].channel_tile = 2;
6149     xnn_params.qs8.dwconv[0].primary_tile = 9;
6150     xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6151     xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
6152     xnn_params.qs8.dwconv[1].channel_tile = 2;
6153     xnn_params.qs8.dwconv[1].primary_tile = 25;
6154 
6155     xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
6156       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6157       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
6158       .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
6159       .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
6160       .row_tile = 7,
6161       .channel_tile = 1,
6162     };
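         // For global average pooling, the unipass kernel (7x) handles inputs of up to
         // row_tile rows in a single pass; the multipass kernel (7p7x) accumulates 7 rows
         // at a time for taller inputs.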
6163 
6164     xnn_params.qs8.vadd = (struct vbinary_parameters) {
6165       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
6166       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
6167       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
6168       .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
6169       .element_tile = 4,
6170     };
6171     xnn_params.qs8.vmul = (struct vbinary_parameters) {
6172       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
6173       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
6174       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
6175       .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
6176       .element_tile = 4,
6177     };
6178   #endif  // XNN_NO_QS8_OPERATORS
6179 
6180   /************************** QU8 RISC-V micro-kernels **************************/
6181   #ifndef XNN_NO_QU8_OPERATORS
6182     init_flags |= XNN_INIT_FLAG_QU8;
6183 
6184     xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6185     xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6186     xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6187     xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6188     xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6189     xnn_params.qu8.gemm.mr = 3;
6190     xnn_params.qu8.gemm.nr = 4;
6191 
6192     xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6193     xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6194     xnn_params.qu8.dwconv[0].channel_tile = 2;
6195     xnn_params.qu8.dwconv[0].primary_tile = 9;
6196     xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6197     xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6198     xnn_params.qu8.dwconv[1].channel_tile = 2;
6199     xnn_params.qu8.dwconv[1].primary_tile = 25;
6200 
6201     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
6202       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
6203       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
6204       .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
6205       .primary_tile = 9,
6206       .incremental_tile = 8,
6207       .channel_tile = 1,
6208     };
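         // primary_tile/incremental_tile mirror the kernel name: the 9x unipass kernel covers
         // pooling windows of up to 9 elements, and the 9p8x multipass kernel handles 9 elements
         // first plus 8 more per pass for larger windows.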
6209     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
6210       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6211       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
6212       .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
6213       .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
6214       .row_tile = 7,
6215       .channel_tile = 1,
6216     };
6217 
6218     xnn_params.qu8.vadd = (struct vbinary_parameters) {
6219       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
6220       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
6221       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
6222       .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
6223       .element_tile = 4,
6224     };
6225     xnn_params.qu8.vmul = (struct vbinary_parameters) {
6226       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
6227       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
6228       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
6229       .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
6230       .element_tile = 4,
6231     };
6232   #endif  // XNN_NO_QU8_OPERATORS
6233 
6234   /************************** S8 RISC-V micro-kernels ***************************/
6235   #ifndef XNN_NO_S8_OPERATORS
6236     init_flags |= XNN_INIT_FLAG_S8;
6237 
6238     xnn_params.s8.clamp = (struct vunary_parameters) {
6239       .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
6240       .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
6241       .element_tile = 4,
6242     };
6243     xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
6244       .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
6245       .pixel_tile = 1,
6246       .channel_tile = 1,
6247     };
6248     xnn_params.s8.maxpool = (struct maxpool_parameters) {
6249       .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6250       .init.s8 = xnn_init_s8_minmax_scalar_params,
6251       .mr = 9,
6252       .qr = 8,
6253     };
6254   #endif  // XNN_NO_S8_OPERATORS
6255 
6256   /************************** U8 RISC-V micro-kernels ***************************/
6257   #ifndef XNN_NO_U8_OPERATORS
6258     init_flags |= XNN_INIT_FLAG_U8;
6259 
6260     xnn_params.u8.clamp = (struct vunary_parameters) {
6261       .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
6262       .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
6263       .element_tile = 4,
6264     };
6265     xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
6266       .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
6267       .pixel_tile = 1,
6268       .channel_tile = 1,
6269     };
6270     xnn_params.u8.maxpool = (struct maxpool_parameters) {
6271       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6272       .init.u8 = xnn_init_u8_minmax_scalar_params,
6273       .mr = 9,
6274       .qr = 8,
6275     };
6276     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
6277     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
6278   #endif  // XNN_NO_U8_OPERATORS
6279 
6280   /************************** X8 RISC-V micro-kernels ***************************/
6281   #ifndef XNN_NO_X8_OPERATORS
6282     init_flags |= XNN_INIT_FLAG_X8;
6283 
6284     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
6285     xnn_params.x8.zip = (struct zip_parameters) {
6286       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
6287       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
6288       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
6289       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
6290     };
6291   #endif  // XNN_NO_X8_OPERATORS
6292 
6293   /************************** F32 RISC-V micro-kernels **************************/
6294   #ifndef XNN_NO_F32_OPERATORS
6295     init_flags |= XNN_INIT_FLAG_F32;
6296 
6297     xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
6298     xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
6299     xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
6300     xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
6301     xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
6302     xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
6303     xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
6304     xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
6305     xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
6306     xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
6307     xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
6308     xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
6309     xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
6310     xnn_params.f32.gemm.mr = 4;
6311     xnn_params.f32.gemm.nr = 4;
6312 
6313     xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
6314     xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar);
6315     xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
6316     xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar);
6317     xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
6318     xnn_params.f32.gemm2.mr = 4;
6319     xnn_params.f32.gemm2.nr = 2;
6320 
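         // Each F32 depthwise entry below provides a minmax (clamped) and a linear
         // (no-activation) unipass variant; primary_tile again gives the tap count (3, 4, 9, 25).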
6321     xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
6322     xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
6323     xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
6324     xnn_params.f32.dwconv[0].channel_tile = 1;
6325     xnn_params.f32.dwconv[0].primary_tile = 3;
6326 
6327     xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
6328     xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
6329     xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
6330     xnn_params.f32.dwconv[1].channel_tile = 1;
6331     xnn_params.f32.dwconv[1].primary_tile = 4;
6332 
6333     xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
6334     xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
6335     xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
6336     xnn_params.f32.dwconv[2].channel_tile = 1;
6337     xnn_params.f32.dwconv[2].primary_tile = 9;
6338 
6339     xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
6340     xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
6341     xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
6342     xnn_params.f32.dwconv[3].channel_tile = 1;
6343     xnn_params.f32.dwconv[3].primary_tile = 25;
6344 
6345     xnn_params.f32.avgpool = (struct avgpool_parameters) {
6346       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
6347       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
6348       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6349       .primary_tile = 9,
6350       .incremental_tile = 8,
6351       .channel_tile = 1,
6352     };
6353     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
6354       .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
6355       .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
6356       .primary_tile = 9,
6357       .incremental_tile = 8,
6358       .channel_tile = 1,
6359     };
6360     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
6361       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
6362       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
6363       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6364       .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
6365       .row_tile = 7,
6366       .channel_tile = 1,
6367     };
6368     xnn_params.f32.maxpool = (struct maxpool_parameters) {
6369       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
6370       .init.f32 = xnn_init_f32_minmax_scalar_params,
6371       .mr = 9,
6372       .qr = 8,
6373     };
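         // argmaxpool[] entries are ordered by supported pooling size: .up is a unipass kernel
         // for windows of up to mr elements, and .mp is the multipass kernel that handles mr
         // elements plus qr more per pass.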
6374     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
6375       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
6376       .mr = 4,
6377     };
6378     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
6379       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
6380       .mr = 9,
6381     };
6382     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
6383       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
6384       .mr = 9,
6385       .qr = 8,
6386     };
6387     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
6388       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
6389       .pixel_tile = 1,
6390       .channel_tile = 2,
6391     };
6392     xnn_params.f32.abs = (struct vunary_parameters) {
6393       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
6394       .element_tile = 4,
6395     };
6396     xnn_params.f32.clamp = (struct vunary_parameters) {
6397       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
6398       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6399       .element_tile = 4,
6400     };
6401     xnn_params.f32.elu = (struct vunary_parameters) {
6402       .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
6403       .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
6404       .element_tile = 4,
6405     };
6406     xnn_params.f32.hswish = (struct vunary_parameters) {
6407       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
6408       .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
6409       .element_tile = 4,
6410     };
6411     xnn_params.f32.lrelu = (struct vunary_parameters) {
6412       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
6413       .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
6414       .element_tile = 4,
6415     };
6416     xnn_params.f32.neg = (struct vunary_parameters) {
6417       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
6418       .element_tile = 4,
6419     };
6420     xnn_params.f32.rndne = (struct vunary_parameters) {
6421       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
6422       .element_tile = 1,
6423     };
6424     xnn_params.f32.rndz = (struct vunary_parameters) {
6425       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
6426       .element_tile = 1,
6427     };
6428     xnn_params.f32.rndu = (struct vunary_parameters) {
6429       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
6430       .element_tile = 1,
6431     };
6432     xnn_params.f32.rndd = (struct vunary_parameters) {
6433       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
6434       .element_tile = 1,
6435     };
6436     xnn_params.f32.sigmoid = (struct vunary_parameters) {
6437       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
6438       .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
6439       .element_tile = 2,
6440     };
6441     xnn_params.f32.sqr = (struct vunary_parameters) {
6442       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
6443       .element_tile = 4,
6444     };
6445     xnn_params.f32.sqrt = (struct vunary_parameters) {
6446       .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
6447       .element_tile = 1,
6448     };
6449     xnn_params.f32.prelu = (struct prelu_parameters) {
6450       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
6451       .row_tile = 2,
6452       .channel_tile = 4,
6453     };
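         // raddstoreexpminusmax computes and stores exp(x - max) while reducing its sum,
         // the primitive used by the softmax operator.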
6454     xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
6455       .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
6456       .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
6457       .element_tile = 4,
6458     };
6459     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
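         // vbinary entries provide three flavors: op_ukernel for two full tensors, opc_ukernel
         // for a tensor combined with a scalar constant, and ropc_ukernel for the
         // reversed-operand constant case (distinct kernels such as vrdivc/vrsubc only where
         // the operation is non-commutative).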
6460     xnn_params.f32.vadd = (struct vbinary_parameters) {
6461       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
6462       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
6463       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
6464       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6465       .element_tile = 8,
6466     };
6467     xnn_params.f32.vdiv = (struct vbinary_parameters) {
6468       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
6469       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
6470       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
6471       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6472       .element_tile = 2,
6473     };
6474     xnn_params.f32.vmax = (struct vbinary_parameters) {
6475       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
6476       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6477       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6478       .element_tile = 8,
6479     };
6480     xnn_params.f32.vmin = (struct vbinary_parameters) {
6481       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
6482       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6483       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6484       .element_tile = 8,
6485     };
6486     xnn_params.f32.vmul = (struct vbinary_parameters) {
6487       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
6488       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
6489       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
6490       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6491       .element_tile = 8,
6492     };
6493     xnn_params.f32.vsub = (struct vbinary_parameters) {
6494       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
6495       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
6496       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
6497       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6498       .element_tile = 8,
6499     };
6500     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
6501       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
6502       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6503       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6504       .element_tile = 8,
6505     };
6506     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
6507       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
6508       .init.f32 = xnn_init_f32_minmax_scalar_params,
6509       .channel_tile = 1,
6510       .row_tile = 2,
6511     };
6512     #ifndef XNN_NO_NCHW_OPERATORS
6513       init_flags |= XNN_INIT_FLAG_CHW_OPT;
6514 
6515       xnn_params.f32.spmm = (struct spmm_parameters) {
6516         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
6517         .mr = 8,
6518         .nr = 1,
6519       };
6520       xnn_params.f32.spmm2 = (struct spmm_parameters) {
6521         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
6522         .mr = 8,
6523         .nr = 2,
6524       };
6525       xnn_params.f32.spmm4 = (struct spmm_parameters) {
6526         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
6527         .mr = 8,
6528         .nr = 4,
6529       };
6530       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
6531         .ukernel_with_symm_padding =
6532           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
6533         .output_channel_tile = 4,
6534         .output_height_tile = 1,
6535         .output_width_tile = 1,
6536       };
6537       xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
6538         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
6539         .output_width_tile = 1,
6540         .output_height_tile = 2,
6541       };
6542       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
6543         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
6544         .output_width_tile = 1,
6545         .output_height_tile = 1,
6546       };
6547       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
6548         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
6549         .output_width_tile = 1,
6550         .output_height_tile = 1,
6551       };
6552       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
6553         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
6554         .output_width_tile = 1,
6555         .output_height_tile = 1,
6556       };
6557       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
6558         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
6559         .channel_tile = 1,
6560       };
6561       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
6562         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
6563         .channel_tile = 1,
6564         .pixel_tile = 4,
6565       };
6566     #endif  // XNN_NO_NCHW_OPERATORS
6567   #endif  // XNN_NO_F32_OPERATORS
6568 
6569   /************************** VCVT RISC-V micro-kernels *************************/
6570   #ifndef XNN_NO_VCVT_OPERATORS
6571     init_flags |= XNN_INIT_FLAG_VCVT;
6572 
6573     xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6574       .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
6575       .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6576       .element_tile = 4,
6577     };
6578     xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6579       .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
6580       .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
6581       .element_tile = 2,
6582     };
6583     xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6584       .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x4,
6585       .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_lrintf_params,
6586       .element_tile = 4,
6587     };
6588     xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6589       .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x4,
6590       .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_lrintf_params,
6591       .element_tile = 4,
6592     };
6593     xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6594       .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
6595       .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6596       .element_tile = 4,
6597     };
6598     xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6599       .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
6600       .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6601       .element_tile = 4,
6602     };
6603   #endif  // XNN_NO_VCVT_OPERATORS
6604 
6605   /************************** X32 RISC-V micro-kernels **************************/
6606   #ifndef XNN_NO_X32_OPERATORS
6607     init_flags |= XNN_INIT_FLAG_X32;
6608 
6609     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
6610     xnn_params.x32.zip = (struct zip_parameters) {
6611       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
6612       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
6613       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
6614       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
6615     };
6616     #ifndef XNN_NO_NCHW_OPERATORS
6617       xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
6618         .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
6619         .channel_tile = 1,
6620         .pixel_tile = 1,
6621       };
6622     #endif  // XNN_NO_NCHW_OPERATORS
6623   #endif  // XNN_NO_X32_OPERATORS
6624 
6625   /************************** XX RISC-V micro-kernels ***************************/
6626   #ifndef XNN_NO_XX_OPERATORS
6627     init_flags |= XNN_INIT_FLAG_XX;
6628 
6629     xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
6630     xnn_params.xx.fill = (struct fill_parameters) {
6631       .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
6632       .row_tile = 1,
6633     };
6634     xnn_params.xx.pad = (struct pad_parameters) {
6635       .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
6636       .row_tile = 1,
6637     };
6638   #endif  // XNN_NO_XX_OPERATORS
6639 
6640 #else
6641   #error "Unsupported architecture"
6642 #endif
6643 
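       // Capture the allocator that xnn_initialize() published via init_allocator before
       // marking initialization as complete.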
6644   memcpy(&xnn_params.allocator, init_allocator, sizeof(struct xnn_allocator));
6645   xnn_params.init_flags = init_flags;
6646 }
6647 
6648 #if XNN_PLATFORM_WINDOWS
6649   static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
6650     init();
6651     return TRUE;
6652   }
6653 #endif
6654 
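     // Initialization protocol: the first caller's allocator is published with an atomic
     // compare-and-swap, init() runs exactly once via InitOnceExecuteOnce/pthread_once, and
     // success is reported only if init() set XNN_INIT_FLAG_XNNPACK. A minimal usage sketch:
     //
     //   enum xnn_status status = xnn_initialize(NULL /* default allocator */);
     //   if (status != xnn_status_success) {
     //     // unsupported hardware, or out-of-memory during cpuinfo setup
     //   }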
6655 enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
6656   #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
6657     if (!cpuinfo_initialize()) {
6658       return xnn_status_out_of_memory;
6659     }
6660   #endif  // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
6661   if (allocator == NULL) {
6662     allocator = &xnn_default_allocator;
6663   }
6664   #ifdef _MSC_VER
6665     _InterlockedCompareExchangePointer((PVOID volatile*) &init_allocator, (PVOID) allocator, NULL);
6666   #else
6667     __sync_bool_compare_and_swap(&init_allocator, NULL, allocator);
6668   #endif
6669   #if XNN_PLATFORM_WINDOWS
6670     InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
6671   #else
6672     pthread_once(&init_guard, &init);
6673   #endif
6674   if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
6675     return xnn_status_success;
6676   } else {
6677     return xnn_status_unsupported_hardware;
6678   }
6679 }
6680 
6681 enum xnn_status xnn_deinitialize(void) {
6682   #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
6683     cpuinfo_deinitialize();
6684   #endif  // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
6685   return xnn_status_success;
6686 }
6687