1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <math.h>
10 #include <stdbool.h>
11 #include <stddef.h>
12 #include <stdint.h>
13 #include <string.h>
14 
15 #ifdef _WIN32
16   #include <windows.h>
17 #else
18   #include <errno.h>
19   #include <pthread.h>
20   #include <sys/mman.h>
21   #include <unistd.h>
22 #endif
23 
24 #ifdef _MSC_VER
25   #include <intrin.h>
26 #endif
27 
28 #ifndef __EMSCRIPTEN__
29   #include <cpuinfo.h>
30 #endif
31 
32 #include <xnnpack.h>
33 #include <xnnpack/allocator.h>
34 #include <xnnpack/argmaxpool.h>
35 #include <xnnpack/avgpool.h>
36 #include <xnnpack/common.h>
37 #include <xnnpack/conv.h>
38 #include <xnnpack/dwconv.h>
39 #include <xnnpack/gavgpool.h>
40 #include <xnnpack/gemm.h>
41 #include <xnnpack/fill.h>
42 #include <xnnpack/ibilinear.h>
43 #include <xnnpack/igemm.h>
44 #include <xnnpack/log.h>
45 #include <xnnpack/lut.h>
46 #include <xnnpack/maxpool.h>
47 #include <xnnpack/pad.h>
48 #include <xnnpack/params.h>
49 #include <xnnpack/microparams-init.h>
50 #include <xnnpack/pavgpool.h>
51 #include <xnnpack/prelu.h>
52 #include <xnnpack/raddstoreexpminusmax.h>
53 #include <xnnpack/rmax.h>
54 #include <xnnpack/spmm.h>
55 #include <xnnpack/unpool.h>
56 #include <xnnpack/transpose.h>
57 #include <xnnpack/vadd.h>
58 #include <xnnpack/vbinary.h>
59 #include <xnnpack/vcvt.h>
60 #include <xnnpack/vlrelu.h>
61 #include <xnnpack/vmul.h>
62 #include <xnnpack/vmulcaddc.h>
63 #include <xnnpack/vunary.h>
64 #include <xnnpack/zip.h>
65 
// Hand-written assembly micro-kernels are enabled by default; a build may
// opt out by pre-defining XNN_ENABLE_ASSEMBLY to 0 before this point.
#ifndef XNN_ENABLE_ASSEMBLY
  #define XNN_ENABLE_ASSEMBLY 1
#endif
69 
#if XNN_PLATFORM_WINDOWS
  // One-shot initialization guard; consumed via the Win32 InitOnce API.
  static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
#else
  // One-shot initialization guard; consumed via pthread_once.
  static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
#endif
75 
// Maps a micro-kernel's MR (row tile size, 1-based) to its 0-based index in
// the per-MR kernel tables (e.g. xnn_params.qc8.gemm.minmax.gemm[...]).
// The parameter is fully parenthesized so non-trivial argument expressions
// (e.g. XNN_MR_TO_INDEX(cond ? 4 : 2)) expand correctly.
#define XNN_MR_TO_INDEX(MR) ((MR) - 1)
77 
// The build system must decide whether GEMM/IGEMM micro-kernels may be
// specialized per MR; fail the compile loudly if the flag was never set
// rather than silently picking a default.
#ifndef XNN_ENABLE_GEMM_M_SPECIALIZATION
#error "XNN_ENABLE_GEMM_M_SPECIALIZATION is not defined"
#endif
81 
// Allocator to be used during one-time initialization; NULL selects the
// default. NOTE(review): presumably stashed by the public initialization
// entry point before the once-guard fires — the writer is outside this chunk,
// so the volatile qualifier's exact synchronization role should be confirmed.
static const struct xnn_allocator* volatile init_allocator = NULL;
83 
init(void)84 static void init(void) {
85 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
86   // Unlike most other architectures, on x86/x86-64 when floating-point instructions
87   // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
88   // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
89   // of two infinities (must produce NaN per IEEE 754 standard).
90   static const volatile float inf = INFINITY;
91   const bool is_wasm_x86 = signbit(inf - inf);
92 #endif
93   uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
94 
95 #if XNN_ARCH_ARM
96   #if XNN_PLATFORM_MOBILE
97     if (!cpuinfo_has_arm_neon()) {
98       xnn_log_error("XNNPACK initialization failed: NEON is not supported");
99       return;
100     }
101   #else
102     if (!cpuinfo_has_arm_v6()) {
103       xnn_log_error("XNNPACK initialization failed: ARMv6 instructions not supported");
104       return;
105     }
106 
107     if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
108       xnn_log_error("XNNPACK initialization failed: VFP is not supported");
109       return;
110     }
111   #endif
112 
113   if (cpuinfo_has_arm_neon()) {
114     /**************************** QC8 AArch32 micro-kernels ****************************/
115     #ifndef XNN_NO_QC8_OPERATORS
116       init_flags |= XNN_INIT_FLAG_QC8;
117 
118       #if XNN_ENABLE_ASSEMBLY
119         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
120           #if XNN_ENABLE_ARM_DOTPROD
121             switch (cpuinfo_get_uarch(0)->uarch) {
122               case cpuinfo_uarch_cortex_a55:
123                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
124                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
125                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
126                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
127                 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
128                 xnn_params.qc8.gemm.mr = 4;
129                 xnn_params.qc8.gemm.nr = 8;
130                 xnn_params.qc8.gemm.log2_kr = 2;
131                 break;
132               default:
133                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
134                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
135                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
136                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
137                 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
138                 xnn_params.qc8.gemm.mr = 4;
139                 xnn_params.qc8.gemm.nr = 8;
140                 xnn_params.qc8.gemm.log2_kr = 2;
141                 break;
142             }
143           #endif  // XNN_ENABLE_ARM_DOTPROD
144         } else {
145           switch (cpuinfo_get_uarch(0)->uarch) {
146             case cpuinfo_uarch_cortex_a5:
147             case cpuinfo_uarch_cortex_a7:
148             case cpuinfo_uarch_krait:
149             case cpuinfo_uarch_kryo:
150               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
151               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
152               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
153               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
154               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
155               xnn_params.qc8.gemm.mr = 4;
156               xnn_params.qc8.gemm.nr = 8;
157               break;
158             case cpuinfo_uarch_cortex_a32:
159             case cpuinfo_uarch_cortex_a35:
160               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a35);
161               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a35);
162               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
163               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
164               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
165               xnn_params.qc8.gemm.mr = 4;
166               xnn_params.qc8.gemm.nr = 8;
167               break;
168             case cpuinfo_uarch_cortex_a53:
169             case cpuinfo_uarch_cortex_a57:
170               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53);
171               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53);
172               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35);
173               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35);
174               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
175               xnn_params.qc8.gemm.mr = 4;
176               xnn_params.qc8.gemm.nr = 8;
177               break;
178             case cpuinfo_uarch_cortex_a55r0:
179               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53);
180               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53);
181               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
182               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
183               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
184               xnn_params.qc8.gemm.mr = 4;
185               xnn_params.qc8.gemm.nr = 8;
186               break;
187             case cpuinfo_uarch_cortex_a72:
188               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
189               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
190               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
191               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
192               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
193               xnn_params.qc8.gemm.mr = 2;
194               xnn_params.qc8.gemm.nr = 8;
195               xnn_params.qc8.gemm.log2_kr = 1;
196               xnn_params.qc8.gemm.log2_sr = 2;
197               break;
198             case cpuinfo_uarch_exynos_m1:
199             case cpuinfo_uarch_exynos_m2:
200             case cpuinfo_uarch_exynos_m3:
201               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
202               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
203               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35);
204               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35);
205               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
206               xnn_params.qc8.gemm.mr = 4;
207               xnn_params.qc8.gemm.nr = 8;
208               break;
209 
210             default:
211               if (cpuinfo_has_arm_neon_v8()) {
212                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
213                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
214                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
215                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35);
216                 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
217                 xnn_params.qc8.gemm.mr = 4;
218                 xnn_params.qc8.gemm.nr = 8;
219               } else {
220                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
221                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
222                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
223                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
224                 xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
225                 xnn_params.qc8.gemm.mr = 4;
226                 xnn_params.qc8.gemm.nr = 8;
227               }
228               break;
229           }
230         }
231         #if XNN_MAX_UARCH_TYPES > 1
232         {
233           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
234           const uint32_t mr = xnn_params.qc8.gemm.mr;
235           const uint32_t nr = xnn_params.qc8.gemm.nr;
236           const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
237           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
238             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
239             if (uarch_info == NULL) {
240               /* No more microarchitectures in the system */
241               break;
242             }
243 
244             switch (uarch_info->uarch) {
245               case cpuinfo_uarch_cortex_a55:
246                 #if XNN_ENABLE_ARM_DOTPROD
247                   if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
248                     xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
249                     xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
250                     xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot;
251                     xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot;
252                   }
253                 #endif  // XNN_ENABLE_ARM_DOTPROD
254                 break;
255               case cpuinfo_uarch_cortex_a53:
256                 if (mr == 4 && nr == 8 && log2_kr == 0) {
257                   xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53;
258                   xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53;
259                   xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35;
260                   xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_prfm_cortex_a35;
261                 }
262                 break;
263               case cpuinfo_uarch_cortex_a55r0:
264                 if (mr == 4 && nr == 8 && log2_kr == 0) {
265                   xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53;
266                   xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53;
267                   xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35;
268                   xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__aarch32_neonv8_mlal_lane_cortex_a35;
269                 }
270                 break;
271 
272               default:
273                 break;
274             }
275           }
276         }
277         #endif  // XNN_MAX_UARCH_TYPES > 1
278       #else  // XNN_ENABLE_ASSEMBLY
279         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
280           #if XNN_ENABLE_ARM_DOTPROD
281             xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot);
282             xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__neondot);
283             xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
284             xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
285             xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
286             xnn_params.qc8.gemm.mr = 4;
287             xnn_params.qc8.gemm.nr = 8;
288             xnn_params.qc8.gemm.log2_kr = 2;
289           #endif  // XNN_ENABLE_ARM_DOTPROD
290         } else if (cpuinfo_has_arm_v8()) {
291           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
292           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
293           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
294           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
295           xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
296           xnn_params.qc8.gemm.mr = 2;
297           xnn_params.qc8.gemm.nr = 8;
298           xnn_params.qc8.gemm.log2_kr = 1;
299           xnn_params.qc8.gemm.log2_sr = 2;
300         } else {
301           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
302           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
303           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
304           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
305           xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
306           xnn_params.qc8.gemm.mr = 2;
307           xnn_params.qc8.gemm.nr = 8;
308           xnn_params.qc8.gemm.log2_kr = 1;
309           xnn_params.qc8.gemm.log2_sr = 2;
310         }
311       #endif  // XNN_ENABLE_ASSEMBLY
312 
313       if (cpuinfo_has_arm_neon_v8()) {
314         xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__aarch32_neonv8_mla8_cortex_a35;
315         xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
316         xnn_params.qc8.dwconv[0].channel_tile = 16;
317         xnn_params.qc8.dwconv[0].primary_tile = 3;
318         xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
319         xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
320         xnn_params.qc8.dwconv[1].channel_tile = 16;
321         xnn_params.qc8.dwconv[1].primary_tile = 9;
322         xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mla8_ld64;
323         xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
324         xnn_params.qc8.dwconv[2].channel_tile = 8;
325         xnn_params.qc8.dwconv[2].primary_tile = 25;
326       } else {
327         xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__neon_mla8_ld128;
328         xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
329         xnn_params.qc8.dwconv[0].channel_tile = 16;
330         xnn_params.qc8.dwconv[0].primary_tile = 3;
331         xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neon_mla8_ld64;
332         xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
333         xnn_params.qc8.dwconv[1].channel_tile = 16;
334         xnn_params.qc8.dwconv[1].primary_tile = 9;
335         xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mla8_ld64;
336         xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neon_params;
337         xnn_params.qc8.dwconv[2].channel_tile = 8;
338         xnn_params.qc8.dwconv[2].primary_tile = 25;
339       }
340     #endif  // XNN_NO_QC8_OPERATORS
341 
342     /**************************** QS8 AArch32 micro-kernels ****************************/
343     #ifndef XNN_NO_QS8_OPERATORS
344       init_flags |= XNN_INIT_FLAG_QS8;
345 
346       #if XNN_ENABLE_ASSEMBLY
347         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
348           #if XNN_ENABLE_ARM_DOTPROD
349             switch (cpuinfo_get_uarch(0)->uarch) {
350               case cpuinfo_uarch_cortex_a55:
351                 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
352                 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
353                 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
354                 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
355                 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
356                 xnn_params.qs8.gemm.mr = 4;
357                 xnn_params.qs8.gemm.nr = 8;
358                 xnn_params.qs8.gemm.log2_kr = 2;
359                 break;
360               default:
361                 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
362                 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
363                 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
364                 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
365                 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
366                 xnn_params.qs8.gemm.mr = 4;
367                 xnn_params.qs8.gemm.nr = 8;
368                 xnn_params.qs8.gemm.log2_kr = 2;
369                 break;
370             }
371           #endif  // XNN_ENABLE_ARM_DOTPROD
372         } else {
373           switch (cpuinfo_get_uarch(0)->uarch) {
374             case cpuinfo_uarch_cortex_a5:
375             case cpuinfo_uarch_cortex_a7:
376             case cpuinfo_uarch_krait:
377             case cpuinfo_uarch_kryo:
378               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
379               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
380               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
381               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
382               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
383               xnn_params.qs8.gemm.mr = 4;
384               xnn_params.qs8.gemm.nr = 8;
385               break;
386             case cpuinfo_uarch_cortex_a32:
387             case cpuinfo_uarch_cortex_a35:
388               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
389               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
390               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
391               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
392               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
393               xnn_params.qs8.gemm.mr = 4;
394               xnn_params.qs8.gemm.nr = 8;
395               break;
396             case cpuinfo_uarch_cortex_a53:
397             case cpuinfo_uarch_cortex_a57:
398               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
399               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
400               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              // 1-row GEMM/IGEMM fallback used for leftover rows when m is not a multiple of mr.
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
              xnn_params.qs8.gemm.mr = 4;
              xnn_params.qs8.gemm.nr = 8;
              break;
            case cpuinfo_uarch_cortex_a55r0:
              // Cortex-A55r0: Cortex-A53-scheduled assembly without software prefetch.
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
              xnn_params.qs8.gemm.mr = 4;
              xnn_params.qs8.gemm.nr = 8;
              break;
            case cpuinfo_uarch_cortex_a72:
              // Cortex-A72: NEON intrinsics kernels with c2s4 packing; log2_kr/log2_sr
              // below must match the kernels' kr=2, sr=4 layout ("2x8c2s4").
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
              xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
              xnn_params.qs8.gemm.mr = 2;
              xnn_params.qs8.gemm.nr = 8;
              xnn_params.qs8.gemm.log2_kr = 1;
              xnn_params.qs8.gemm.log2_sr = 2;
              break;
            case cpuinfo_uarch_exynos_m1:
            case cpuinfo_uarch_exynos_m2:
            case cpuinfo_uarch_exynos_m3:
              // Exynos M1-M3: LD64 assembly with software prefetch.
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
              xnn_params.qs8.gemm.mr = 4;
              xnn_params.qs8.gemm.nr = 8;
              break;
            default:
              // Unknown/other microarchitectures: generic LD64 assembly without prefetch.
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
              xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
              xnn_params.qs8.gemm.mr = 4;
              xnn_params.qs8.gemm.nr = 8;
              break;
          }
        }
        #if XNN_MAX_UARCH_TYPES > 1
        {
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
          // Per-uarch overrides only patch .function[i] of the HMP ukernel slots already
          // initialized above; the mr/nr/log2_kr guards ensure the override kernel is
          // compatible with the packing parameters chosen for the big core.
          const uint32_t mr = xnn_params.qs8.gemm.mr;
          const uint32_t nr = xnn_params.qs8.gemm.nr;
          const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
            const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
            if (uarch_info == NULL) {
              /* No more microarchitectures in the system */
              break;
            }

            switch (uarch_info->uarch) {
              case cpuinfo_uarch_cortex_a55:
                #if XNN_ENABLE_ARM_DOTPROD
                  // Only applicable when the big core selected the c4 (log2_kr == 2)
                  // dot-product packing and this core actually supports SDOT.
                  if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
                    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
                    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
                    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot;
                    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot;
                  }
                #endif  // XNN_ENABLE_ARM_DOTPROD
                break;
              case cpuinfo_uarch_cortex_a53:
                if (mr == 4 && nr == 8 && log2_kr == 0) {
                  xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
                  xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
                  xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7;
                  xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7;
                }
                break;
              case cpuinfo_uarch_cortex_a55r0:
                // Same as Cortex-A53 above but without software prefetch.
                if (mr == 4 && nr == 8 && log2_kr == 0) {
                  xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
                  xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
                  xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7;
                  xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7;
                }
                break;
              default:
                break;
            }
          }
        }
        #endif  // XNN_MAX_UARCH_TYPES > 1
      #else  // XNN_ENABLE_ASSEMBLY
        // Assembly disabled: select NEON intrinsics QS8 GEMM/IGEMM kernels instead.
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            // Dot-product kernels use c4 packing, hence log2_kr = 2 below.
            xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
            xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
            xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
            xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
            xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
            xnn_params.qs8.gemm.mr = 4;
            xnn_params.qs8.gemm.nr = 8;
            xnn_params.qs8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // Plain NEON path with c2s4 packing (kr=2, sr=4 -> log2_kr=1, log2_sr=2).
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
          xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
          xnn_params.qs8.gemm.mr = 2;
          xnn_params.qs8.gemm.nr = 8;
          xnn_params.qs8.gemm.log2_kr = 1;
          xnn_params.qs8.gemm.log2_sr = 2;
        }
      #endif  // XNN_ENABLE_ASSEMBLY
518 
      // QS8 depthwise convolution: slot 0 handles 9-tap (3x3) kernels with a
      // 16-channel tile; slot 1 handles 25-tap (5x5) kernels with an 8-channel tile.
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[0].primary_tile = 9;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
      xnn_params.qs8.dwconv[1].channel_tile = 8;
      xnn_params.qs8.dwconv[1].primary_tile = 25;

      // QS8 global average pooling: unipass handles up to 7 rows, multipass
      // (7p7x) processes 7 rows per pass for taller inputs.
      xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
        .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
        .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
        .row_tile = 7,
        .channel_tile = 8,
      };

      // QS8 element-wise addition; opc/ropc both use the vaddc kernel since
      // addition is commutative.
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
        .init.qs8_add = xnn_init_qs8_add_minmax_neon_params,
        .element_tile = 16,
      };
      // QS8 element-wise multiplication (rndnu requantization variant).
      xnn_params.qs8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
        .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
        .element_tile = 16,
      };

      // QS8 Leaky ReLU.
      xnn_params.qs8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__neon_x32,
        .init.qs8_lrelu = xnn_init_qs8_lrelu_neon_params,
        .element_tile = 32,
      };
    #endif  // XNN_NO_QS8_OPERATORS
558 
    /*************************** QU8 AArch32 micro-kernels ***************************/
    #ifndef XNN_NO_QU8_OPERATORS
      init_flags |= XNN_INIT_FLAG_QU8;

      #if XNN_ENABLE_ASSEMBLY
        // Prefer dot-product (SDOT/UDOT) kernels whenever the CPU supports them;
        // otherwise pick per-microarchitecture tuned assembly kernels below.
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            // c4 packing for dot-product kernels (log2_kr = 2).
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            xnn_params.qu8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // Dispatch on the primary (index 0) core's microarchitecture.
          switch (cpuinfo_get_uarch(0)->uarch) {
            case cpuinfo_uarch_cortex_a5:
            case cpuinfo_uarch_cortex_a7:
            case cpuinfo_uarch_krait:
            case cpuinfo_uarch_kryo:
              // In-order/older cores: Cortex-A7-scheduled assembly with prefetch.
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 8;
              break;
            case cpuinfo_uarch_cortex_a32:
            case cpuinfo_uarch_cortex_a35:
              // Cortex-A32/A35: Cortex-A7-scheduled assembly without prefetch.
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 8;
              break;
            case cpuinfo_uarch_cortex_a53:
            case cpuinfo_uarch_cortex_a57:
            case cpuinfo_uarch_cortex_a72:
              // Cortex-A53/A57/A72: Cortex-A53-scheduled assembly with prefetch.
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 8;
              break;
            case cpuinfo_uarch_cortex_a55r0:
              // Cortex-A55r0: Cortex-A53-scheduled assembly without prefetch.
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 8;
              break;
            case cpuinfo_uarch_exynos_m1:
            case cpuinfo_uarch_exynos_m2:
            case cpuinfo_uarch_exynos_m3:
              // Exynos M1-M3: LD64 assembly with prefetch.
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 8;
              break;
            default:
              // Unknown/other microarchitectures: generic LD64 assembly without prefetch.
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 8;
              break;
          }
        }
        #if XNN_MAX_UARCH_TYPES > 1
        {
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
          // Mirrors the QS8 little-core override loop above: patch .function[i] only
          // when the tuned kernel matches the big core's mr/nr/log2_kr packing.
          const uint32_t mr = xnn_params.qu8.gemm.mr;
          const uint32_t nr = xnn_params.qu8.gemm.nr;
          const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
            const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
            if (uarch_info == NULL) {
              /* No more microarchitectures in the system */
              break;
            }

            switch (uarch_info->uarch) {
              case cpuinfo_uarch_cortex_a53:
                if (mr == 4 && nr == 8 && log2_kr == 0) {
                  xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
                  xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
                  xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7;
                  xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_prfm_cortex_a7;
                }
                break;
              case cpuinfo_uarch_cortex_a55r0:
                // Same as Cortex-A53 above but without software prefetch.
                if (mr == 4 && nr == 8 && log2_kr == 0) {
                  xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
                  xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
                  xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7;
                  xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__aarch32_neon_mlal_lane_cortex_a7;
                }
                break;
              default:
                break;
            }
          }
        }
        #endif  // XNN_MAX_UARCH_TYPES > 1
      #else  // XNN_ENABLE_ASSEMBLY
        // Assembly disabled: select NEON intrinsics QU8 GEMM/IGEMM kernels instead.
        if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
          #if XNN_ENABLE_ARM_DOTPROD
            // c4 packing for dot-product kernels (log2_kr = 2).
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            xnn_params.qu8.gemm.log2_kr = 2;
          #endif  // XNN_ENABLE_ARM_DOTPROD
        } else {
          // Plain NEON mlal_lane path; no extra packing, so log2_kr/log2_sr stay 0.
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane);
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
          xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
          xnn_params.qu8.gemm.mr = 3;
          xnn_params.qu8.gemm.nr = 8;
        }
      #endif  // XNN_ENABLE_ASSEMBLY
699 
      // QU8 depthwise convolution: slot 0 = 9-tap (3x3) / 16-channel tile,
      // slot 1 = 25-tap (5x5) / 8-channel tile.
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[0].primary_tile = 9;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
      xnn_params.qu8.dwconv[1].channel_tile = 8;
      xnn_params.qu8.dwconv[1].primary_tile = 25;

      // QU8 (windowed) average pooling: unipass covers up to 9 elements,
      // multipass (9p8x) adds 8 elements per extra pass.
      xnn_params.qu8.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
        .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
        .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 8,
      };
      // QU8 global average pooling: 7 rows per pass.
      xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
        .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
        .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
        .row_tile = 7,
        .channel_tile = 8,
      };
725       xnn_params.qu8.vadd = (struct vbinary_parameters) {
726         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16,
727         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
728         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
729         .init.qu8_add = xnn_init_qu8_add_minmax_neon_params,
730         .element_tile = 8,
731       };
      // QU8 element-wise multiplication (rndnu requantization variant).
      xnn_params.qu8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
        .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
        .element_tile = 16,
      };

      // QU8 Leaky ReLU.
      xnn_params.qu8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__neon_x32,
        .init.qu8_lrelu = xnn_init_qu8_lrelu_neon_params,
        .element_tile = 32,
      };
    #endif  // XNN_NO_QU8_OPERATORS
746 
    /**************************** S8 AArch32 micro-kernels ****************************/
    #ifndef XNN_NO_S8_OPERATORS
      init_flags |= XNN_INIT_FLAG_S8;

      // Signed 8-bit clamp (min/max saturation).
      xnn_params.s8.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
        .init.s8_minmax = xnn_init_s8_minmax_neon_params,
        .element_tile = 64,
      };
      // Signed 8-bit indirect bilinear interpolation (used for resize).
      xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
      // Signed 8-bit max pooling: 9 elements in the primary pass, 8 per extra pass.
      xnn_params.s8.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
        .init.s8 = xnn_init_s8_minmax_neon_params,
        .mr = 9,
        .qr = 8,
      };
    #endif  // XNN_NO_S8_OPERATORS
768 
    /**************************** U8 AArch32 micro-kernels ****************************/
    #ifndef XNN_NO_U8_OPERATORS
      init_flags |= XNN_INIT_FLAG_U8;

      // Unsigned 8-bit clamp (min/max saturation).
      xnn_params.u8.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
        .init.u8_minmax = xnn_init_u8_minmax_neon_params,
        .element_tile = 64,
      };
      // Unsigned 8-bit indirect bilinear interpolation (used for resize).
      xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
      // Unsigned 8-bit max pooling: 9 elements primary, 8 per extra pass.
      xnn_params.u8.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
        .init.u8 = xnn_init_u8_minmax_neon_params,
        .mr = 9,
        .qr = 8,
      };
      // Running-max reduction and LUT-based normalization (scalar LUT: no NEON variant here).
      xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
      xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    #endif  // XNN_NO_U8_OPERATORS
792 
    /**************************** X8 AArch32 micro-kernels ****************************/
    #ifndef XNN_NO_X8_OPERATORS
      init_flags |= XNN_INIT_FLAG_X8;

      // Type-agnostic 8-bit kernels: lookup table, channel zip/interleave, transpose.
      xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
      xnn_params.x8.zip = (struct zip_parameters) {
        .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
        .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
        .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
        .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
      };

      xnn_params.x8.transpose = (struct transpose_parameters) {
        .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon,
        .tile_size = 32,
      };
    #endif  // XNN_NO_X8_OPERATORS
810 
    /**************************** X16 AArch32 micro-kernels ****************************/
    #ifndef XNN_NO_X16_OPERATORS
      init_flags |= XNN_INIT_FLAG_X16;

      // Type-agnostic 16-bit transpose.
      xnn_params.x16.transpose = (struct transpose_parameters) {
        .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon,
        .tile_size = 32,
      };
    #endif  // XNN_NO_X16_OPERATORS
820 
    /**************************** F32 AArch32 micro-kernels ****************************/
    #ifndef XNN_NO_F32_OPERATORS
      init_flags |= XNN_INIT_FLAG_F32;

      #if XNN_ENABLE_ASSEMBLY
        // Per-microarchitecture F32 GEMM/IGEMM selection, dispatched on the
        // primary (index 0) core.  NOTE(review): this switch continues past the
        // visible cases below.
        switch (cpuinfo_get_uarch(0)->uarch) {
          case cpuinfo_uarch_cortex_a5:
          case cpuinfo_uarch_cortex_a7:
          case cpuinfo_uarch_krait:
          case cpuinfo_uarch_kryo:
            // In-order/older cores: Cortex-A7-scheduled assembly.
            xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
            xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
            xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
            xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
            xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
            xnn_params.f32.gemm.mr = 4;
            xnn_params.f32.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a53:
            // Cortex-A53: A53-scheduled assembly with software prefetch.
            xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53);
            xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53);
            xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
            xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
            xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
            xnn_params.f32.gemm.mr = 4;
            xnn_params.f32.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a55r0:
            // Cortex-A55r0: A53-scheduled assembly without prefetch.
            xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
            xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
            xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
            xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
            xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
            xnn_params.f32.gemm.mr = 4;
            xnn_params.f32.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a32:
          case cpuinfo_uarch_cortex_a35:
          case cpuinfo_uarch_cortex_a55:
860             xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
861             xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
862             xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
863             xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
864             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
865             xnn_params.f32.gemm.mr = 4;
866             xnn_params.f32.gemm.nr = 8;
867             break;
868 
869           case cpuinfo_uarch_cortex_a57:
870           case cpuinfo_uarch_cortex_a72:
871           case cpuinfo_uarch_cortex_a73:
872             xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
873             xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
874             xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
875             xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
876             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
877             xnn_params.f32.gemm.mr = 4;
878             xnn_params.f32.gemm.nr = 8;
879             break;
880 
881           default:
882             xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
883             xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
884             xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
885             xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
886             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
887             xnn_params.f32.gemm.mr = 4;
888             xnn_params.f32.gemm.nr = 8;
889             #if XNN_ENABLE_JIT
890               xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
891               xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
892             #endif
893             break;
894         }
895         #if XNN_MAX_UARCH_TYPES > 1
896         {
897           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
898           const uint32_t mr = xnn_params.f32.gemm.mr;
899           const uint32_t nr = xnn_params.f32.gemm.nr;
900           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
901             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
902             if (uarch_info == NULL) {
903               /* No more microarchitectures in the system */
904               break;
905             }
906 
907             switch (uarch_info->uarch) {
908               case cpuinfo_uarch_cortex_a53:
909                 if (mr == 4 && nr == 8) {
910                   xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53;
911                   xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a53;
912                   xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
913                   xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
914                 }
915                 break;
916               case cpuinfo_uarch_cortex_a55r0:
917                 if (mr == 4 && nr == 8) {
918                   xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
919                   xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
920                   xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
921                   xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
922                 }
923                 break;
924               case cpuinfo_uarch_cortex_a55:
925                 if (mr == 4 && nr == 8) {
926                   xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
927                   xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
928                   xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
929                   xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
930                 }
931                 break;
932               default:
933                 break;
934             }
935           }
936         }
937         #endif  // XNN_MAX_UARCH_TYPES > 1
938       #else  // XNN_ENABLE_ASSEMBLY
939         xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
940         xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
941         xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
942         xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
943         xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
944         xnn_params.f32.gemm.mr = 4;
945         xnn_params.f32.gemm.nr = 8;
946       #endif  // XNN_ENABLE_ASSEMBLY
947       xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
948       xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
949       xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
950       xnn_params.f32.gemm2.mr = 4;
951       xnn_params.f32.gemm2.nr = 2;
952 
953       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neon;
954       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
955       xnn_params.f32.dwconv[0].channel_tile = 8,
956       xnn_params.f32.dwconv[0].primary_tile = 3,
957 
958       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neon;
959       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
960       xnn_params.f32.dwconv[1].channel_tile = 8,
961       xnn_params.f32.dwconv[1].primary_tile = 4,
962 
963       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neon;
964       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
965       xnn_params.f32.dwconv[2].channel_tile = 8;
966       xnn_params.f32.dwconv[2].primary_tile = 9;
967 
968       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neon_acc2;
969       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
970       xnn_params.f32.dwconv[3].channel_tile = 8;
971       xnn_params.f32.dwconv[3].primary_tile = 25;
972 
973       xnn_params.f32.avgpool = (struct avgpool_parameters) {
974         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
975         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
976         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
977         .primary_tile = 9,
978         .incremental_tile = 8,
979         .channel_tile = 4,
980       };
981       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
982         .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
983         .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
984         .init.f32 = xnn_init_f32_minmax_scalar_params,
985         .primary_tile = 9,
986         .incremental_tile = 8,
987         .channel_tile = 4,
988       };
989       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
990         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
991         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
992         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
993         .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
994         .row_tile = 7,
995         .channel_tile = 4,
996       };
997       xnn_params.f32.maxpool = (struct maxpool_parameters) {
998         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
999         .init.f32 = xnn_init_f32_minmax_scalar_params,
1000         .mr = 9,
1001         .qr = 8,
1002       };
1003       xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1004         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
1005         .mr = 4,
1006       };
1007       xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1008         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
1009         .mr = 9,
1010       };
1011       xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1012         .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
1013         .mr = 9,
1014         .qr = 8,
1015       };
1016       xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1017         .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
1018         .pixel_tile = 1,
1019         .channel_tile = 8,
1020       };
1021       xnn_params.f32.abs = (struct vunary_parameters) {
1022         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
1023         .element_tile = 8,
1024       };
1025       xnn_params.f32.clamp = (struct vunary_parameters) {
1026         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
1027         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1028         .element_tile = 8,
1029       };
1030       if (cpuinfo_has_arm_neon_fma()) {
1031         xnn_params.f32.elu = (struct vunary_parameters) {
1032           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8,
1033           .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_p6_params,
1034           .element_tile = 8,
1035         };
1036       } else {
1037         xnn_params.f32.elu = (struct vunary_parameters) {
1038           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8,
1039           .init.f32_elu = xnn_init_f32_elu_neon_rr2_lut16_p3_params,
1040           .element_tile = 8,
1041         };
1042       }
1043       xnn_params.f32.hswish = (struct vunary_parameters) {
1044         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
1045         .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
1046         .element_tile = 16,
1047       };
1048       xnn_params.f32.lrelu = (struct vunary_parameters) {
1049         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
1050         .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
1051         .element_tile = 8,
1052       };
1053       xnn_params.f32.neg = (struct vunary_parameters) {
1054         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
1055         .element_tile = 8,
1056       };
1057       if (cpuinfo_has_arm_neon_v8()) {
1058         xnn_params.f32.rndne = (struct vunary_parameters) {
1059           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
1060           .element_tile = 8,
1061         };
1062         xnn_params.f32.rndz = (struct vunary_parameters) {
1063           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
1064           .element_tile = 8,
1065         };
1066         xnn_params.f32.rndu = (struct vunary_parameters) {
1067           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
1068           .element_tile = 8,
1069         };
1070         xnn_params.f32.rndd = (struct vunary_parameters) {
1071           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
1072           .element_tile = 8,
1073         };
1074       } else {
1075         xnn_params.f32.rndne = (struct vunary_parameters) {
1076           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8,
1077           .element_tile = 8,
1078         };
1079         xnn_params.f32.rndz = (struct vunary_parameters) {
1080           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8,
1081           .element_tile = 8,
1082         };
1083         xnn_params.f32.rndu = (struct vunary_parameters) {
1084           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8,
1085           .element_tile = 8,
1086         };
1087         xnn_params.f32.rndd = (struct vunary_parameters) {
1088           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8,
1089           .element_tile = 8,
1090         };
1091       }
1092       xnn_params.f32.sigmoid = (struct vunary_parameters) {
1093         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8,
1094         .init.f32_sigmoid = xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params,
1095         .element_tile = 8,
1096       };
1097       xnn_params.f32.sqr = (struct vunary_parameters) {
1098         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
1099         .element_tile = 8,
1100       };
1101       xnn_params.f32.sqrt = (struct vunary_parameters) {
1102         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
1103         .element_tile = 1,
1104       };
1105       xnn_params.f32.prelu = (struct prelu_parameters) {
1106         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
1107         .row_tile = 2,
1108         .channel_tile = 8,
1109       };
1110       xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1111         .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8,
1112         .init.f32 = xnn_init_f32_expminus_neon_rr2_lut64_p2_params,
1113         .element_tile = 8,
1114       };
1115       xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__neon;
1116       xnn_params.f32.vadd = (struct vbinary_parameters) {
1117         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
1118         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1119         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1120         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1121         .element_tile = 8,
1122       };
1123       xnn_params.f32.vdiv = (struct vbinary_parameters) {
1124         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1125         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1126         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
1127         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1128         .element_tile = 2,
1129       };
1130       xnn_params.f32.vmax = (struct vbinary_parameters) {
1131         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
1132         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1133         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1134         .element_tile = 8,
1135       };
1136       xnn_params.f32.vmin = (struct vbinary_parameters) {
1137         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
1138         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1139         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1140         .element_tile = 8,
1141       };
1142       xnn_params.f32.vmul = (struct vbinary_parameters) {
1143         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
1144         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1145         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1146         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1147         .element_tile = 8,
1148       };
1149       xnn_params.f32.vsub = (struct vbinary_parameters) {
1150         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
1151         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
1152         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
1153         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1154         .element_tile = 8,
1155       };
1156       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
1157         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
1158         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1159         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1160         .element_tile = 8,
1161       };
1162       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1163         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
1164         .init.f32 = xnn_init_f32_minmax_scalar_params,
1165         .channel_tile = 4,
1166         .row_tile = 2,
1167       };
1168       #ifndef XNN_NO_NCHW_OPERATORS
1169         init_flags |= XNN_INIT_FLAG_CHW_OPT;
1170 
1171         xnn_params.f32.spmm = (struct spmm_parameters) {
1172           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
1173           .mr = 32,
1174           .nr = 1,
1175         };
1176         xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1177           .ukernel_with_symm_padding =
1178             (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
1179           .output_channel_tile = 4,
1180           .output_height_tile = 2,
1181           .output_width_tile = 2,
1182         };
1183         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1184           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
1185           .output_width_tile = 4,
1186           .output_height_tile = 2,
1187         };
1188         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1189           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
1190           .output_width_tile = 4,
1191           .output_height_tile = 1,
1192         };
1193         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1194           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
1195           .output_width_tile = 4,
1196           .output_height_tile = 1,
1197         };
1198         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1199           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
1200           .output_width_tile = 4,
1201           .output_height_tile = 1,
1202         };
1203         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1204           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
1205           .channel_tile = 4,
1206         };
1207         xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1208           .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
1209           .channel_tile = 1,
1210           .pixel_tile = 8,
1211         };
1212       #endif  // XNN_NO_NCHW_OPERATORS
1213     #endif  // XNN_NO_F32_OPERATORS
1214 
1215     /*************************** VCVT AArch32 micro-kernels ***************************/
1216     #ifndef XNN_NO_VCVT_OPERATORS
1217       init_flags |= XNN_INIT_FLAG_VCVT;
1218 
1219       if (cpuinfo_has_arm_neon_fp16()) {
1220         xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1221           .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
1222           .element_tile = 16,
1223         };
1224         xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1225           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
1226           .element_tile = 16,
1227         };
1228       } else {
1229         xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1230           .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
1231           .init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params,
1232           .element_tile = 16,
1233         };
1234         xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1235           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__neon_x8,
1236           .init.f32_f16_cvt = xnn_init_f32_f16_cvt_neon_params,
1237           .element_tile = 8,
1238         };
1239       }
1240       if (cpuinfo_has_arm_neon_v8()) {
1241         xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1242           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
1243           .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
1244           .element_tile = 32,
1245         };
1246         xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1247           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
1248           .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
1249           .element_tile = 32,
1250         };
1251       } else {
1252         xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1253           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neon_x32,
1254           .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neon_params,
1255           .element_tile = 32,
1256         };
1257         xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1258           .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neon_x32,
1259           .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neon_params,
1260           .element_tile = 32,
1261         };
1262       }
1263       xnn_params.vcvt.qs8 = (struct vunary_parameters) {
1264         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__neon_x32,
1265         .init.qs8_cvt = xnn_init_qs8_cvt_neon_params,
1266         .element_tile = 32,
1267       };
1268       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1269         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
1270         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
1271         .element_tile = 32,
1272       };
1273       xnn_params.vcvt.qu8 = (struct vunary_parameters) {
1274         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__neon_x32,
1275         .init.qu8_cvt = xnn_init_qu8_cvt_neon_params,
1276         .element_tile = 32,
1277       };
1278       xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1279         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
1280         .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
1281         .element_tile = 32,
1282       };
1283     #endif  // XNN_NO_VCVT_OPERATORS
1284 
1285     /**************************** X32 AArch32 micro-kernels ****************************/
1286     #ifndef XNN_NO_X32_OPERATORS
1287       init_flags |= XNN_INIT_FLAG_X32;
1288 
1289       xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
1290       xnn_params.x32.zip = (struct zip_parameters) {
1291         .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
1292         .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
1293         .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
1294         .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
1295       };
1296 
1297       xnn_params.x32.transpose = (struct transpose_parameters) {
1298         .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon,
1299         .tile_size = 32,
1300       };
1301     #endif  // XNN_NO_X32_OPERATORS
1302 
1303     /**************************** XX AArch32 micro-kernels ****************************/
1304     #ifndef XNN_NO_XX_OPERATORS
1305       init_flags |= XNN_INIT_FLAG_XX;
1306 
1307       xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1308       xnn_params.xx.fill = (struct fill_parameters) {
1309         .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
1310         .row_tile = 1,
1311       };
1312       xnn_params.xx.pad = (struct pad_parameters) {
1313         .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
1314         .row_tile = 1,
1315       };
1316       xnn_params.xx.transpose = (struct transpose_parameters) {
1317         .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
1318         .tile_size = 32,
1319       };
1320     #endif  // XNN_NO_XX_OPERATORS
1321 
1322   } else if (!XNN_PLATFORM_MOBILE) {
1323 
1324     /*************************** QC8 AArch32 Pre-NEON micro-kernels ***************************/
1325     #ifndef XNN_NO_QC8_OPERATORS
1326       init_flags |= XNN_INIT_FLAG_QC8;
1327 
1328       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1329       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1330       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1331       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1332       xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_armsimd32_params;
1333       xnn_params.qc8.gemm.mr = 2;
1334       xnn_params.qc8.gemm.nr = 2;
1335       xnn_params.qc8.gemm.log2_kr = 2;
1336 
1337       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x3__scalar_fmagic;
1338       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
1339       xnn_params.qc8.dwconv[0].channel_tile = 1;
1340       xnn_params.qc8.dwconv[0].primary_tile = 3;
1341       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1342       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
1343       xnn_params.qc8.dwconv[1].channel_tile = 1;
1344       xnn_params.qc8.dwconv[1].primary_tile = 9;
1345       xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1346       xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
1347       xnn_params.qc8.dwconv[2].channel_tile = 1;
1348       xnn_params.qc8.dwconv[2].primary_tile = 25;
1349     #endif  // XNN_NO_QC8_OPERATORS
1350 
1351     /*************************** QS8 AArch32 Pre-NEON micro-kernels ***************************/
1352     #ifndef XNN_NO_QS8_OPERATORS
1353       init_flags |= XNN_INIT_FLAG_QS8;
1354 
1355       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1356       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1357       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1358       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1359       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_armsimd32_params;
1360       xnn_params.qs8.gemm.mr = 2;
1361       xnn_params.qs8.gemm.nr = 2;
1362       xnn_params.qs8.gemm.log2_kr = 2;
1363 
1364       xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1365       xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
1366       xnn_params.qs8.dwconv[0].channel_tile = 1;
1367       xnn_params.qs8.dwconv[0].primary_tile = 9;
1368       xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1369       xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
1370       xnn_params.qs8.dwconv[1].channel_tile = 1;
1371       xnn_params.qs8.dwconv[1].primary_tile = 25;
1372 
1373       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1374         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1375         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1376         .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1377         .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
1378         .row_tile = 7,
1379         .channel_tile = 1,
1380       };
1381       xnn_params.qs8.vadd = (struct vbinary_parameters) {
1382         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x1,
1383         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
1384         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
1385         .init.qs8_add = xnn_init_qs8_add_minmax_scalar_params,
1386         .element_tile = 1,
1387       };
1388       xnn_params.qs8.vmul = (struct vbinary_parameters) {
1389         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
1390         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1391         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
1392         .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
1393         .element_tile = 4,
1394       };
1395 
1396       xnn_params.qs8.lrelu = (struct vunary_parameters) {
1397         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__armsimd32_x4,
1398         .init.qs8_lrelu = xnn_init_qs8_lrelu_armsimd32_params,
1399         .element_tile = 4,
1400       };
1401     #endif  // XNN_NO_QS8_OPERATORS
1402 
1403     /*************************** QU8 AArch32 Pre-NEON micro-kernels ***************************/
1404     #ifndef XNN_NO_QU8_OPERATORS
1405       init_flags |= XNN_INIT_FLAG_QU8;
1406 
1407       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1408       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2c4__armsimd32);
1409       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1410       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2c4__armsimd32);
1411       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_armsimd32_params;
1412       xnn_params.qu8.gemm.mr = 2;
1413       xnn_params.qu8.gemm.nr = 2;
1414       xnn_params.qu8.gemm.log2_kr = 2;
1415 
1416       xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
1417       xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
1418       xnn_params.qu8.dwconv[0].channel_tile = 1;
1419       xnn_params.qu8.dwconv[0].primary_tile = 9;
1420       xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
1421       xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
1422       xnn_params.qu8.dwconv[1].channel_tile = 1;
1423       xnn_params.qu8.dwconv[1].primary_tile = 25;
1424 
1425       xnn_params.qu8.avgpool = (struct avgpool_parameters) {
1426         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
1427         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
1428         .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
1429         .primary_tile = 9,
1430         .incremental_tile = 8,
1431         .channel_tile = 1,
1432       };
1433       xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
1434         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
1435         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
1436         .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1437         .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
1438         .row_tile = 7,
1439         .channel_tile = 1,
1440       };
1441       xnn_params.qu8.vadd = (struct vbinary_parameters) {
1442         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x1,
1443         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
1444         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
1445         .init.qu8_add = xnn_init_qu8_add_minmax_scalar_params,
1446         .element_tile = 1,
1447       };
1448       xnn_params.qu8.vmul = (struct vbinary_parameters) {
1449         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
1450         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1451         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
1452         .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
1453         .element_tile = 4,
1454       };
1455 
1456       xnn_params.qu8.lrelu = (struct vunary_parameters) {
1457         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__armsimd32_x4,
1458         .init.qu8_lrelu = xnn_init_qu8_lrelu_armsimd32_params,
1459         .element_tile = 4,
1460       };
1461     #endif  // XNN_NO_QU8_OPERATORS
1462 
1463     /**************************** S8 AArch32 Pre-NEON micro-kernels ****************************/
1464     #ifndef XNN_NO_S8_OPERATORS
1465       init_flags |= XNN_INIT_FLAG_S8;
1466 
1467       xnn_params.s8.clamp = (struct vunary_parameters) {
1468         .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
1469         .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
1470         .element_tile = 4,
1471       };
1472       xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
1473         .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
1474         .pixel_tile = 1,
1475         .channel_tile = 1,
1476       };
1477       xnn_params.s8.maxpool = (struct maxpool_parameters) {
1478         .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1479         .init.s8 = xnn_init_s8_minmax_scalar_params,
1480         .mr = 9,
1481         .qr = 8,
1482       };
1483     #endif  // XNN_NO_S8_OPERATORS
1484 
1485     /**************************** U8 AArch32 Pre-NEON micro-kernels ****************************/
1486     #ifndef XNN_NO_U8_OPERATORS
1487       init_flags |= XNN_INIT_FLAG_U8;
1488 
1489       xnn_params.u8.clamp = (struct vunary_parameters) {
1490         .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
1491         .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
1492         .element_tile = 4,
1493       };
1494       xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
1495         .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
1496         .pixel_tile = 1,
1497         .channel_tile = 1,
1498       };
1499       xnn_params.u8.maxpool = (struct maxpool_parameters) {
1500         .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
1501         .init.u8 = xnn_init_u8_minmax_scalar_params,
1502         .mr = 9,
1503         .qr = 8,
1504       };
1505       xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1506       xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
1507     #endif  // XNN_NO_U8_OPERATORS
1508 
1509     /**************************** X8 AArch32 Pre-NEON micro-kernels ****************************/
1510     #ifndef XNN_NO_X8_OPERATORS
1511       init_flags |= XNN_INIT_FLAG_X8;
1512 
1513       xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
1514       xnn_params.x8.zip = (struct zip_parameters) {
1515         .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
1516         .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
1517         .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
1518         .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
1519       };
1520 
1521       xnn_params.x8.transpose = (struct transpose_parameters) {
1522         .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__2x4_scalar_int,
1523         .tile_size = 32,
1524       };
1525     #endif  // XNN_NO_X8_OPERATORS
1526 
1527     /**************************** X16 AArch32 Pre-NEON micro-kernels ****************************/
1528     #ifndef XNN_NO_X16_OPERATORS
1529       init_flags |= XNN_INIT_FLAG_X16;
1530 
1531       xnn_params.x16.transpose = (struct transpose_parameters) {
1532         .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__2x4_scalar_int,
1533         .tile_size = 32,
1534       };
1535     #endif  // XNN_NO_X16_OPERATORS
1536 
1537     /**************************** F32 AArch32 Pre-NEON micro-kernels ****************************/
1538     #ifndef XNN_NO_F32_OPERATORS
1539       init_flags |= XNN_INIT_FLAG_F32;
1540 
1541       xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
1542       xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
1543       xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
1544       xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
1545       xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
1546       xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
1547       xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
1548       xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
1549       xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
1550       xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
1551       xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
1552       xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
1553       xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
1554       xnn_params.f32.gemm.mr = 4;
1555       xnn_params.f32.gemm.nr = 4;
1556 
1557       xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
1558       xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar);
1559       xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
1560       xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar);
1561       xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
1562       xnn_params.f32.gemm2.mr = 4;
1563       xnn_params.f32.gemm2.nr = 2;
1564 
1565       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
1566       xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
1567       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
1568       xnn_params.f32.dwconv[0].channel_tile = 1;
1569       xnn_params.f32.dwconv[0].primary_tile = 3;
1570 
1571       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
1572       xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
1573       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
1574       xnn_params.f32.dwconv[1].channel_tile = 1;
1575       xnn_params.f32.dwconv[1].primary_tile = 4;
1576 
1577       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
1578       xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
1579       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
1580       xnn_params.f32.dwconv[2].channel_tile = 1;
1581       xnn_params.f32.dwconv[2].primary_tile = 9;
1582 
1583       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
1584       xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
1585       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
1586       xnn_params.f32.dwconv[3].channel_tile = 1;
1587       xnn_params.f32.dwconv[3].primary_tile = 25;
1588 
1589       xnn_params.f32.avgpool = (struct avgpool_parameters) {
1590         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
1591         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
1592         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1593         .primary_tile = 9,
1594         .incremental_tile = 8,
1595         .channel_tile = 1,
1596       };
1597       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
1598         .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
1599         .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
1600         .init.f32 = xnn_init_f32_minmax_scalar_params,
1601         .primary_tile = 9,
1602         .incremental_tile = 8,
1603         .channel_tile = 1,
1604       };
1605       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
1606         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
1607         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
1608         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
1609         .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
1610         .row_tile = 7,
1611         .channel_tile = 1,
1612       };
1613       xnn_params.f32.maxpool = (struct maxpool_parameters) {
1614         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
1615         .init.f32 = xnn_init_f32_minmax_scalar_params,
1616         .mr = 9,
1617         .qr = 8,
1618       };
1619       xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1620         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
1621         .mr = 4,
1622       };
1623       xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1624         .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
1625         .mr = 9,
1626       };
1627       xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1628         .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
1629         .mr = 9,
1630         .qr = 8,
1631       };
1632       xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1633         .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
1634         .pixel_tile = 1,
1635         .channel_tile = 2,
1636       };
1637       xnn_params.f32.abs = (struct vunary_parameters) {
1638         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
1639         .element_tile = 4,
1640       };
1641       xnn_params.f32.clamp = (struct vunary_parameters) {
1642         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
1643         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1644         .element_tile = 4,
1645       };
1646       xnn_params.f32.elu = (struct vunary_parameters) {
1647         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
1648         .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
1649         .element_tile = 4,
1650       };
1651       xnn_params.f32.hswish = (struct vunary_parameters) {
1652         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
1653         .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
1654         .element_tile = 4,
1655       };
1656       xnn_params.f32.lrelu = (struct vunary_parameters) {
1657         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
1658         .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
1659         .element_tile = 4,
1660       };
1661       xnn_params.f32.neg = (struct vunary_parameters) {
1662         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
1663         .element_tile = 4,
1664       };
1665       xnn_params.f32.rndne = (struct vunary_parameters) {
1666         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
1667         .element_tile = 1,
1668       };
1669       xnn_params.f32.rndz = (struct vunary_parameters) {
1670         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
1671         .element_tile = 1,
1672       };
1673       xnn_params.f32.rndu = (struct vunary_parameters) {
1674         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
1675         .element_tile = 1,
1676       };
1677       xnn_params.f32.rndd = (struct vunary_parameters) {
1678         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
1679         .element_tile = 1,
1680       };
1681       xnn_params.f32.sigmoid = (struct vunary_parameters) {
1682         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
1683         .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
1684         .element_tile = 2,
1685       };
1686       xnn_params.f32.sqr = (struct vunary_parameters) {
1687         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
1688         .element_tile = 4,
1689       };
1690       xnn_params.f32.sqrt = (struct vunary_parameters) {
1691         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
1692         .element_tile = 1,
1693       };
1694       xnn_params.f32.prelu = (struct prelu_parameters) {
1695         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
1696         .row_tile = 4,
1697         .channel_tile = 4,
1698       };
1699       xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
1700         .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
1701         .init.f32 = xnn_init_f32_expminus_scalar_rr2_p5_params,
1702         .element_tile = 4,
1703       };
1704       xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__scalar;
1705       xnn_params.f32.vadd = (struct vbinary_parameters) {
1706         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
1707         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
1708         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
1709         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1710         .element_tile = 8,
1711       };
1712       xnn_params.f32.vdiv = (struct vbinary_parameters) {
1713         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
1714         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
1715         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
1716         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1717         .element_tile = 2,
1718       };
1719       xnn_params.f32.vmax = (struct vbinary_parameters) {
1720         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
1721         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
1722         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
1723         .element_tile = 8,
1724       };
1725       xnn_params.f32.vmin = (struct vbinary_parameters) {
1726         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
1727         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
1728         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
1729         .element_tile = 8,
1730       };
1731       xnn_params.f32.vmul = (struct vbinary_parameters) {
1732         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
1733         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
1734         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
1735         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1736         .element_tile = 8,
1737       };
1738       xnn_params.f32.vsub = (struct vbinary_parameters) {
1739         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
1740         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
1741         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
1742         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
1743         .element_tile = 8,
1744       };
1745       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
1746         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
1747         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
1748         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
1749         .element_tile = 8,
1750       };
1751       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1752         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
1753         .init.f32 = xnn_init_f32_minmax_scalar_params,
1754         .channel_tile = 1,
1755         .row_tile = 2,
1756       };
1757       #ifndef XNN_NO_NCHW_OPERATORS
1758         init_flags |= XNN_INIT_FLAG_CHW_OPT;
1759 
1760         xnn_params.f32.spmm = (struct spmm_parameters) {
1761           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
1762           .mr = 8,
1763           .nr = 1,
1764         };
1765         xnn_params.f32.spmm2 = (struct spmm_parameters) {
1766           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
1767           .mr = 8,
1768           .nr = 2,
1769         };
1770         xnn_params.f32.spmm4 = (struct spmm_parameters) {
1771           .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
1772           .mr = 8,
1773           .nr = 4,
1774         };
1775         xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1776           .ukernel_with_symm_padding =
1777             (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
1778           .output_channel_tile = 4,
1779           .output_height_tile = 1,
1780           .output_width_tile = 1,
1781         };
1782         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1783           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
1784           .output_width_tile = 1,
1785           .output_height_tile = 4,
1786         };
1787         xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1788           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
1789           .output_width_tile = 1,
1790           .output_height_tile = 2,
1791         };
1792         xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1793           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
1794           .output_width_tile = 1,
1795           .output_height_tile = 2,
1796         };
1797         xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1798           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
1799           .output_width_tile = 1,
1800           .output_height_tile = 2,
1801         };
1802         xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1803           .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
1804           .channel_tile = 1,
1805         };
1806         xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1807           .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
1808           .channel_tile = 1,
1809           .pixel_tile = 4,
1810         };
1811       #endif  // XNN_NO_NCHW_OPERATORS
1812     #endif  // XNN_NO_F32_OPERATORS
1813 
1814     /*************************** VCVT AArch32 Pre-NEON micro-kernels ***************************/
1815     #ifndef XNN_NO_VCVT_OPERATORS
1816       init_flags |= XNN_INIT_FLAG_VCVT;
1817 
1818       xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
1819         .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
1820         .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
1821         .element_tile = 4,
1822       };
1823       xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
1824         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
1825         .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
1826         .element_tile = 2,
1827       };
1828       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
1829         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x4,
1830         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
1831         .element_tile = 4,
1832       };
1833       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
1834         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x4,
1835         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
1836         .element_tile = 4,
1837       };
1838       xnn_params.vcvt.qs8 = (struct vunary_parameters) {
1839         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__armsimd32_x8,
1840         .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
1841         .element_tile = 8,
1842       };
1843       xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
1844         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
1845         .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
1846         .element_tile = 4,
1847       };
1848       xnn_params.vcvt.qu8 = (struct vunary_parameters) {
1849         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__armsimd32_x8,
1850         .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
1851         .element_tile = 8,
1852       };
1853       xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
1854         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
1855         .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
1856         .element_tile = 4,
1857       };
1858     #endif  // XNN_NO_VCVT_OPERATORS
1859 
1860     /**************************** X32 AArch32 Pre-NEON micro-kernels ****************************/
    #ifndef XNN_NO_X32_OPERATORS
      // Mark the 32-bit memory-movement (X32) operator family as initialized.
      init_flags |= XNN_INIT_FLAG_X32;

      // All X32 kernels here are portable scalar implementations (pre-NEON path).
      xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
      // Channel-interleave (zip) kernels: fixed-group variants for 2/3/4 inputs
      // plus a variable-group (xm) variant.
      xnn_params.x32.zip = (struct zip_parameters) {
        .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
        .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
        .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
        .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
      };

      // Constant-size transpose: scalar 2x4 micro-tile kernel; tile_size is the
      // blocking factor used by the operator-level transpose driver.
      xnn_params.x32.transpose = (struct transpose_parameters) {
        .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__2x4_scalar_int,
        .tile_size = 32,
      };
    #endif  // XNN_NO_X32_OPERATORS
1877 
1878     /**************************** XX AArch32 Pre-NEON micro-kernels ****************************/
    #ifndef XNN_NO_XX_OPERATORS
      // Mark the type-agnostic (XX) operator family as initialized.
      init_flags |= XNN_INIT_FLAG_XX;

      // Raw byte copy delegates to memcpy.
      xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
      // Fill: scalar kernel handling 16 bytes per iteration (per the _x16 suffix),
      // invoked one row at a time.
      xnn_params.xx.fill = (struct fill_parameters) {
        .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
        .row_tile = 1,
      };
      // Pad: scalar kernel, one row at a time.
      xnn_params.xx.pad = (struct pad_parameters) {
        .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
        .row_tile = 1,
      };
      // Variable-element-size transpose: 1x1 memcpy-based kernel; tile_size is
      // the blocking factor used by the operator-level transpose driver.
      xnn_params.xx.transpose = (struct transpose_parameters) {
        .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
        .tile_size = 32,
      };
    #endif  // XNN_NO_XX_OPERATORS
1896   }
1897 
1898 #elif XNN_ARCH_ARM64
1899 
1900   /**************************** QC8 AArch64 micro-kernels ****************************/
1901   #ifndef XNN_NO_QC8_OPERATORS
1902     init_flags |= XNN_INIT_FLAG_QC8;
1903 
1904     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1905       #if XNN_ENABLE_ASSEMBLY
1906         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
1907           #if XNN_ENABLE_ARM_DOTPROD
1908             xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1909             xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1910             xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1911             xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1912             xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1913             xnn_params.qc8.gemm.mr = 4;
1914             xnn_params.qc8.gemm.nr = 16;
1915             xnn_params.qc8.gemm.log2_kr = 2;
1916           #endif  // XNN_ENABLE_ARM_DOTPROD
1917         } else {
1918           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1919           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1920           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1921           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1922           xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1923           xnn_params.qc8.gemm.mr = 2;
1924           xnn_params.qc8.gemm.nr = 8;
1925           xnn_params.qc8.gemm.log2_kr = 3;
1926         }
1927       #else  // !XNN_ENABLE_ASSEMBLY
1928         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
1929           #if XNN_ENABLE_ARM_DOTPROD
1930             xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1931             xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1932             xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1933             xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1934             xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1935             xnn_params.qc8.gemm.mr = 4;
1936             xnn_params.qc8.gemm.nr = 16;
1937             xnn_params.qc8.gemm.log2_kr = 2;
1938           #endif  // XNN_ENABLE_ARM_DOTPROD
1939         } else {
1940           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1941           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1942           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1943           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1944           xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1945           xnn_params.qc8.gemm.mr = 2;
1946           xnn_params.qc8.gemm.nr = 8;
1947           xnn_params.qc8.gemm.log2_kr = 1;
1948           xnn_params.qc8.gemm.log2_sr = 2;
1949         }
1950       #endif  // XNN_ENABLE_ASSEMBLY
1951     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1952       #if XNN_ENABLE_ASSEMBLY
1953         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
1954           #if XNN_ENABLE_ARM_DOTPROD
1955             switch (cpuinfo_get_core(0)->uarch) {
1956               case cpuinfo_uarch_cortex_a55:
1957                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1958                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1959                 break;
1960               case cpuinfo_uarch_cortex_x1:
1961               case cpuinfo_uarch_cortex_a78:
1962                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1963                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1964                 break;
1965               default:
1966                 xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1967                 xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1968                 break;
1969             }
1970             xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1971             xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1972             xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1973             xnn_params.qc8.gemm.mr = 4;
1974             xnn_params.qc8.gemm.nr = 16;
1975             xnn_params.qc8.gemm.log2_kr = 2;
1976           #endif  // XNN_ENABLE_ARM_DOTPROD
1977         } else {
1978           switch (cpuinfo_get_core(0)->uarch) {
1979             case cpuinfo_uarch_cortex_a35:
1980             case cpuinfo_uarch_kryo:
1981               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1982               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1983               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1984               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1985               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1986               xnn_params.qc8.gemm.mr = 4;
1987               xnn_params.qc8.gemm.nr = 16;
1988               break;
1989 
1990             case cpuinfo_uarch_cortex_a53:
1991             case cpuinfo_uarch_cortex_a55r0:
1992               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1993               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1994               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1995               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1996               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
1997               xnn_params.qc8.gemm.mr = 4;
1998               xnn_params.qc8.gemm.nr = 16;
1999               break;
2000 
2001             case cpuinfo_uarch_cortex_a72:
2002             case cpuinfo_uarch_cortex_a73:
2003               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
2004               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
2005               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
2006               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
2007               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
2008               xnn_params.qc8.gemm.mr = 2;
2009               xnn_params.qc8.gemm.nr = 8;
2010               xnn_params.qc8.gemm.log2_kr = 3;
2011               break;
2012 
2013             default:
2014               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
2015               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
2016               xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
2017               xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
2018               xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
2019               xnn_params.qc8.gemm.mr = 2;
2020               xnn_params.qc8.gemm.nr = 8;
2021               xnn_params.qc8.gemm.log2_kr = 3;
2022               break;
2023           }
2024         }
        #if XNN_MAX_UARCH_TYPES > 1
        {
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
          // The big core (uarch index 0) already selected a kernel shape above;
          // little cores may only override the function pointer for the SAME
          // (mr, nr, log2_kr) shape, since packing/tiling is shared across cores.
          const uint32_t mr = xnn_params.qc8.gemm.mr;
          const uint32_t nr = xnn_params.qc8.gemm.nr;
          const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
            const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
            if (uarch_info == NULL) {
              /* No more microarchitectures in the system */
              break;
            }

            switch (uarch_info->uarch) {
              case cpuinfo_uarch_cortex_a53:
              case cpuinfo_uarch_cortex_a55r0:
                // Only substitute the Cortex-A53-tuned MLAL kernels when the big
                // core picked the matching 2x8c8 shape.
                if (mr == 2 && nr == 8 && log2_kr == 3) {
                  xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                  xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                  xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                  xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                }
                break;

              case cpuinfo_uarch_cortex_a55:
                #if XNN_ENABLE_ARM_DOTPROD
                  // Cortex-A55-tuned dot-product kernels: require the matching
                  // 4x16c4 shape AND runtime dot-product support.
                  if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
                    xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                    xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                    xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot;
                    xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot;
                  }
                #endif  // XNN_ENABLE_ARM_DOTPROD
                break;
              default:
                // Other little-core microarchitectures keep the big core's kernels.
                break;
            }
          }
        }
        #endif  // XNN_MAX_UARCH_TYPES > 1
2065       #else  // !XNN_ENABLE_ASSEMBLY
2066         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
2067           #if XNN_ENABLE_ARM_DOTPROD
2068             xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
2069             xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
2070             xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
2071             xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
2072             xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
2073             xnn_params.qc8.gemm.mr = 4;
2074             xnn_params.qc8.gemm.nr = 16;
2075             xnn_params.qc8.gemm.log2_kr = 2;
2076           #endif  // XNN_ENABLE_ARM_DOTPROD
2077         } else {
2078           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
2079           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
2080           xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
2081           xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
2082           xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
2083           xnn_params.qc8.gemm.mr = 2;
2084           xnn_params.qc8.gemm.nr = 8;
2085           xnn_params.qc8.gemm.log2_kr = 1;
2086           xnn_params.qc8.gemm.log2_sr = 2;
2087         }
2088       #endif  // XNN_ENABLE_ASSEMBLY
2089     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2090 
    // QC8 depthwise convolution kernels (NEONv8 MLA8 variants, 16 channels per
    // tile). Slot 0: 3-tap kernels; slot 1: 9-tap (e.g. 3x3); slot 2: 25-tap
    // (e.g. 5x5). primary_tile selects the slot by the filter's tap count.
    xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__neonv8_mla8_ld128;
    xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
    xnn_params.qc8.dwconv[0].channel_tile = 16;
    xnn_params.qc8.dwconv[0].primary_tile = 3;
    xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
    xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
    xnn_params.qc8.dwconv[1].channel_tile = 16;
    xnn_params.qc8.dwconv[1].primary_tile = 9;
    xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__neonv8_mla8_ld64;
    xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_neonv8_params;
    xnn_params.qc8.dwconv[2].channel_tile = 16;
    xnn_params.qc8.dwconv[2].primary_tile = 25;
2103   #endif  // XNN_NO_QC8_OPERATORS
2104 
2105   /**************************** QS8 AArch64 micro-kernels ****************************/
2106   #ifndef XNN_NO_QS8_OPERATORS
2107     init_flags |= XNN_INIT_FLAG_QS8;
2108 
2109     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2110       #if XNN_ENABLE_ASSEMBLY
2111         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
2112           #if XNN_ENABLE_ARM_DOTPROD
2113             xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2114             xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2115             xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2116             xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2117             xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2118             xnn_params.qs8.gemm.mr = 4;
2119             xnn_params.qs8.gemm.nr = 16;
2120             xnn_params.qs8.gemm.log2_kr = 2;
2121           #endif  // XNN_ENABLE_ARM_DOTPROD
2122         } else {
2123           xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2124           xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2125           xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2126           xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2127           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2128           xnn_params.qs8.gemm.mr = 2;
2129           xnn_params.qs8.gemm.nr = 8;
2130           xnn_params.qs8.gemm.log2_kr = 3;
2131         }
2132       #else  // !XNN_ENABLE_ASSEMBLY
2133         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
2134           #if XNN_ENABLE_ARM_DOTPROD
2135             xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2136             xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2137             xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2138             xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2139             xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2140             xnn_params.qs8.gemm.mr = 4;
2141             xnn_params.qs8.gemm.nr = 16;
2142             xnn_params.qs8.gemm.log2_kr = 2;
2143           #endif  // XNN_ENABLE_ARM_DOTPROD
2144         } else {
2145           xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2146           xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2147           xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2148           xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2149           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2150           xnn_params.qs8.gemm.mr = 2;
2151           xnn_params.qs8.gemm.nr = 8;
2152           xnn_params.qs8.gemm.log2_kr = 1;
2153           xnn_params.qs8.gemm.log2_sr = 2;
2154         }
2155       #endif  // XNN_ENABLE_ASSEMBLY
2156     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
2157       #if XNN_ENABLE_ASSEMBLY
2158         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
2159           #if XNN_ENABLE_ARM_DOTPROD
2160             switch (cpuinfo_get_core(0)->uarch) {
2161               case cpuinfo_uarch_cortex_a55:
2162                 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2163                 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2164                 break;
2165               case cpuinfo_uarch_cortex_x1:
2166               case cpuinfo_uarch_cortex_a78:
2167                 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2168                 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2169                 break;
2170               default:
2171                 xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
2172                 xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
2173                 break;
2174             }
2175             xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2176             xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2177             xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2178             xnn_params.qs8.gemm.mr = 4;
2179             xnn_params.qs8.gemm.nr = 16;
2180             xnn_params.qs8.gemm.log2_kr = 2;
2181           #endif  // XNN_ENABLE_ARM_DOTPROD
2182         } else {
2183           switch (cpuinfo_get_core(0)->uarch) {
2184             case cpuinfo_uarch_cortex_a35:
2185             case cpuinfo_uarch_kryo:
2186               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
2187               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
2188               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2189               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2190               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2191               xnn_params.qs8.gemm.mr = 4;
2192               xnn_params.qs8.gemm.nr = 16;
2193               break;
2194 
2195             case cpuinfo_uarch_cortex_a53:
2196             case cpuinfo_uarch_cortex_a55r0:
2197               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
2198               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
2199               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2200               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2201               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2202               xnn_params.qs8.gemm.mr = 4;
2203               xnn_params.qs8.gemm.nr = 16;
2204               break;
2205 
2206             case cpuinfo_uarch_cortex_a72:
2207             case cpuinfo_uarch_cortex_a73:
2208               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
2209               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
2210               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
2211               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
2212               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2213               xnn_params.qs8.gemm.mr = 2;
2214               xnn_params.qs8.gemm.nr = 8;
2215               xnn_params.qs8.gemm.log2_kr = 3;
2216               break;
2217 
2218             default:
2219               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2220               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2221               xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2222               xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2223               xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2224               xnn_params.qs8.gemm.mr = 2;
2225               xnn_params.qs8.gemm.nr = 8;
2226               xnn_params.qs8.gemm.log2_kr = 3;
2227               break;
2228           }
2229         }
        #if XNN_MAX_UARCH_TYPES > 1
        {
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
          // Same scheme as the QC8 fixup: little cores (uarch index >= 1) may only
          // swap in a tuned function pointer when the big core's selected
          // (mr, nr, log2_kr) shape matches, because packing/tiling is shared.
          const uint32_t mr = xnn_params.qs8.gemm.mr;
          const uint32_t nr = xnn_params.qs8.gemm.nr;
          const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
            const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
            if (uarch_info == NULL) {
              /* No more microarchitectures in the system */
              break;
            }

            switch (uarch_info->uarch) {
              case cpuinfo_uarch_cortex_a53:
              case cpuinfo_uarch_cortex_a55r0:
                // Cortex-A53-tuned MLAL kernels, only for the matching 2x8c8 shape.
                if (mr == 2 && nr == 8 && log2_kr == 3) {
                  xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                  xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                  xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                  xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
                }
                break;

              case cpuinfo_uarch_cortex_a55:
                #if XNN_ENABLE_ARM_DOTPROD
                  // Cortex-A55-tuned dot-product kernels: require the matching
                  // 4x16c4 shape AND runtime dot-product support.
                  if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
                    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot;
                    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot;
                  }
                #endif  // XNN_ENABLE_ARM_DOTPROD
                break;
              default:
                // Other little-core microarchitectures keep the big core's kernels.
                break;
            }
          }
        }
        #endif  // XNN_MAX_UARCH_TYPES > 1
2270       #else  // !XNN_ENABLE_ASSEMBLY
2271         if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
2272           #if XNN_ENABLE_ARM_DOTPROD
2273             xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2274             xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2275             xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2276             xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2277             xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2278             xnn_params.qs8.gemm.mr = 4;
2279             xnn_params.qs8.gemm.nr = 16;
2280             xnn_params.qs8.gemm.log2_kr = 2;
2281           #endif  // XNN_ENABLE_ARM_DOTPROD
2282         } else {
2283           xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2284           xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2285           xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2286           xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2287           xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2288           xnn_params.qs8.gemm.mr = 2;
2289           xnn_params.qs8.gemm.nr = 8;
2290           xnn_params.qs8.gemm.log2_kr = 1;
2291           xnn_params.qs8.gemm.log2_sr = 2;
2292         }
2293       #endif  // XNN_ENABLE_ASSEMBLY
2294     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2295 
    // QS8 depthwise convolutions: dwconv[0] handles 9-tap kernels, dwconv[1]
    // handles 25-tap kernels; both process 16 channels per iteration.
    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
    xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
    xnn_params.qs8.dwconv[0].channel_tile = 16;
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64;
    xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
    xnn_params.qs8.dwconv[1].channel_tile = 16;
    xnn_params.qs8.dwconv[1].primary_tile = 25;

    // QS8 global average pooling: 7-row unipass and 7p7x multipass NEON
    // kernels, 8 channels per tile (row_tile/channel_tile match the 7x/7p7x/c8
    // kernel-name suffixes).
    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
      .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
      .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
      .row_tile = 7,
      .channel_tile = 8,
    };
2313 
    // QS8 element-wise addition; the same per-channel (vaddc) kernel serves
    // both the opc and ropc operand orders since addition is commutative.
    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
      .init.qs8_add = xnn_init_qs8_add_minmax_neon_params,
      .element_tile = 32,
    };
    // QS8 element-wise multiplication; likewise one vmulc kernel covers both
    // per-channel operand orders (multiplication is commutative).
    xnn_params.qs8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
      .element_tile = 16,
    };

    // QS8 leaky ReLU, 32 elements per iteration.
    xnn_params.qs8.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__neon_x32,
      .init.qs8_lrelu = xnn_init_qs8_lrelu_neon_params,
      .element_tile = 32,
    };
2334   #endif  // XNN_NO_QS8_OPERATORS
2335 
2336   /**************************** QU8 AArch64 micro-kernels ****************************/
2337   #ifndef XNN_NO_QU8_OPERATORS
2338     init_flags |= XNN_INIT_FLAG_QU8;
2339 
    // QU8 GEMM/IGEMM selection when hand-written assembly kernels are built in:
    // cores with the NEON dot-product extension get 4x16c4 NEONDOT kernels
    // (with a Cortex-A55-specific variant); other cores get 4x16 NEON
    // MLAL-lane kernels chosen per microarchitecture.
    #if XNN_ENABLE_ASSEMBLY
      if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
        #if XNN_ENABLE_ARM_DOTPROD
          switch (cpuinfo_get_core(0)->uarch) {
            case cpuinfo_uarch_cortex_a55:
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 16;
              xnn_params.qu8.gemm.log2_kr = 2;
              break;
            default:
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
              xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
              xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
              xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
              xnn_params.qu8.gemm.mr = 4;
              xnn_params.qu8.gemm.nr = 16;
              xnn_params.qu8.gemm.log2_kr = 2;
              break;
          }
        #endif  // XNN_ENABLE_ARM_DOTPROD
      } else {
        // No dot-product extension: choose a 4x16 MLAL-lane variant by core type.
        switch (cpuinfo_get_core(0)->uarch) {
          case cpuinfo_uarch_cortex_a53:
          case cpuinfo_uarch_cortex_a55r0:
          case cpuinfo_uarch_kryo:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 16;
            break;

          case cpuinfo_uarch_cortex_a57:
          case cpuinfo_uarch_cortex_a72:
          case cpuinfo_uarch_cortex_a73:
          case cpuinfo_uarch_cortex_a75:
          case cpuinfo_uarch_cortex_a76:
          case cpuinfo_uarch_exynos_m1:
          case cpuinfo_uarch_exynos_m2:
          case cpuinfo_uarch_exynos_m3:
          case cpuinfo_uarch_exynos_m4:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 16;
            break;

          default:
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
            xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 16;
            break;
        }
      }
      #if XNN_MAX_UARCH_TYPES > 1
      {
        /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
        const uint32_t mr = xnn_params.qu8.gemm.mr;
        const uint32_t nr = xnn_params.qu8.gemm.nr;
        const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
        for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
          const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
          if (uarch_info == NULL) {
            /* No more microarchitectures in the system */
            break;
          }

          // Only override when the little-core kernel's tiling (mr/nr/log2_kr)
          // matches what the big core selected, so all cores share one layout.
          switch (uarch_info->uarch) {
            case cpuinfo_uarch_cortex_a53:
            case cpuinfo_uarch_cortex_a55r0:
              if (mr == 4 && nr == 16 && log2_kr == 0) {
                xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
                xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
              }
              break;

            case cpuinfo_uarch_cortex_a55:
              #if XNN_ENABLE_ARM_DOTPROD
                if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
                  xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                  xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                }
              #endif  // XNN_ENABLE_ARM_DOTPROD
              break;
            default:
              break;
          }
        }
      }
      #endif  // XNN_MAX_UARCH_TYPES > 1
    #else  // !XNN_ENABLE_ASSEMBLY
      // Assembly disabled: fall back to intrinsics-based micro-kernels.
      if (XNN_ENABLE_ARM_DOTPROD && cpuinfo_has_arm_neon_dot()) {
        #if XNN_ENABLE_ARM_DOTPROD
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
          xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
          xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
          xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
          xnn_params.qu8.gemm.mr = 4;
          xnn_params.qu8.gemm.nr = 16;
          xnn_params.qu8.gemm.log2_kr = 2;
        #endif  // XNN_ENABLE_ARM_DOTPROD
      } else {
        xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
        xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
        xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
        xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
        xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
        xnn_params.qu8.gemm.mr = 4;
        xnn_params.qu8.gemm.nr = 16;
      }
    #endif  // XNN_ENABLE_ASSEMBLY
2467 
    // QU8 depthwise convolutions: 9-tap kernels at 16 channels per iteration,
    // 25-tap kernels at 8 channels per iteration.
    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
    xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
    xnn_params.qu8.dwconv[0].channel_tile = 16;
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
    xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
    xnn_params.qu8.dwconv[1].channel_tile = 8;
    xnn_params.qu8.dwconv[1].primary_tile = 25;

    // QU8 average pooling: 9-element unipass, 9p8x multipass (9 primary plus
    // 8-element increments), 8 channels per tile.
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 8,
    };
    // QU8 global average pooling: 7-row unipass and 7p7x multipass kernels,
    // 8 channels per tile.
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
      .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
      .row_tile = 7,
      .channel_tile = 8,
    };
2493     xnn_params.qu8.vadd = (struct vbinary_parameters) {
2494       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32,
2495       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2496       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2497       .init.qu8_add = xnn_init_qu8_add_minmax_neon_params,
2498       .element_tile = 8,
2499     };
    // QU8 element-wise multiplication; one vmulc kernel covers both
    // per-channel operand orders (multiplication is commutative).
    xnn_params.qu8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
      .element_tile = 16,
    };

    // QU8 leaky ReLU, 32 elements per iteration.
    xnn_params.qu8.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__neon_x32,
      .init.qu8_lrelu = xnn_init_qu8_lrelu_neon_params,
      .element_tile = 32,
    };
2513   #endif  // XNN_NO_QU8_OPERATORS
2514 
2515   /**************************** S8 AArch64 micro-kernels ****************************/
2516   #ifndef XNN_NO_S8_OPERATORS
2517     init_flags |= XNN_INIT_FLAG_S8;
2518 
    // S8 clamp (min/max saturation), 64 elements per iteration.
    xnn_params.s8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
      .init.s8_minmax = xnn_init_s8_minmax_neon_params,
      .element_tile = 64,
    };
    // S8 indirect bilinear interpolation: one pixel, 16 channels per iteration.
    xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c16,
      .pixel_tile = 1,
      .channel_tile = 16,
    };
    // S8 max pooling: 9-element primary pass plus 8-element increments
    // (mr/qr match the kernel's 9p8x naming), 16 channels per iteration.
    xnn_params.s8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
      .init.s8 = xnn_init_s8_minmax_neon_params,
      .mr = 9,
      .qr = 8,
    };
2535   #endif  // XNN_NO_S8_OPERATORS
2536 
2537   /**************************** U8 AArch64 micro-kernels ****************************/
2538   #ifndef XNN_NO_U8_OPERATORS
2539     init_flags |= XNN_INIT_FLAG_U8;
2540 
    // U8 clamp (min/max saturation), 64 elements per iteration.
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
      .init.u8_minmax = xnn_init_u8_minmax_neon_params,
      .element_tile = 64,
    };
    // U8 indirect bilinear interpolation: one pixel, 16 channels per iteration.
    xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c16,
      .pixel_tile = 1,
      .channel_tile = 16,
    };
    // U8 max pooling: 9-element primary pass plus 8-element increments
    // (mr/qr match the kernel's 9p8x naming), 16 channels per iteration.
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
      .init.u8 = xnn_init_u8_minmax_neon_params,
      .mr = 9,
      .qr = 8,
    };
    // U8 lookup-table normalization is scalar-only; running max uses NEON.
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
2559   #endif  // XNN_NO_U8_OPERATORS
2560 
2561   /**************************** X8 AArch64 micro-kernels ****************************/
2562   #ifndef XNN_NO_X8_OPERATORS
2563     init_flags |= XNN_INIT_FLAG_X8;
2564 
    // X8 (type-agnostic 8-bit) table lookup and channel interleave (zip)
    // kernels: x2/x3/x4 interleave a fixed number of streams, xm handles a
    // variable count.
    xnn_params.x8.lut = xnn_x8_lut_ukernel__neon_tbx128x4_x64;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
    };

    // X8 transpose: 16x16-element NEON kernel, scheduled with tile_size 32.
    xnn_params.x8.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon,
      .tile_size = 32,
    };
2577   #endif  // XNN_NO_X8_OPERATORS
2578 
2579   /**************************** X16 AArch64 micro-kernels ****************************/
2580   #ifndef XNN_NO_X16_OPERATORS
2581     init_flags |= XNN_INIT_FLAG_X16;
2582 
    // X16 (type-agnostic 16-bit) transpose: 8x8-element NEON kernel,
    // scheduled with tile_size 32.
    xnn_params.x16.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon,
      .tile_size = 32,
    };
2587   #endif  // XNN_NO_X16_OPERATORS
2588 
2589   /**************************** F16 AArch64 micro-kernels ****************************/
2590   #ifndef XNN_NO_F16_OPERATORS
2591     #if XNN_ENABLE_ARM_FP16
2592       if (cpuinfo_has_arm_neon_fp16_arith()) {
2593         init_flags |= XNN_INIT_FLAG_F16 | XNN_INIT_FLAG_F16_NATIVE;
2594 
2595         #if XNN_ENABLE_ASSEMBLY
2596           switch (cpuinfo_get_core(0)->uarch) {
2597             case cpuinfo_uarch_cortex_a55:
2598               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
2599               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
2600               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2601               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2602               xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_neon_params;
2603               xnn_params.f16.gemm.mr = 6;
2604               xnn_params.f16.gemm.nr = 16;
2605               break;
2606             case cpuinfo_uarch_cortex_a55r0:
2607             case cpuinfo_uarch_cortex_a75:
2608               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0);
2609               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0);
2610               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2611               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2612               xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_neon_params;
2613               xnn_params.f16.gemm.mr = 6;
2614               xnn_params.f16.gemm.nr = 16;
2615               break;
2616             case cpuinfo_uarch_exynos_m5:
2617               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64);
2618               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld64);
2619               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2620               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2621               xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_neon_params;
2622               xnn_params.f16.gemm.mr = 4;
2623               xnn_params.f16.gemm.nr = 16;
2624               break;
2625             case cpuinfo_uarch_exynos_m4:
2626               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64);
2627               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld64);
2628               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2629               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2630               xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_neon_params;
2631               xnn_params.f16.gemm.mr = 6;
2632               xnn_params.f16.gemm.nr = 16;
2633               break;
2634             default:
2635             case cpuinfo_uarch_cortex_a76:
2636             case cpuinfo_uarch_cortex_a77:
2637             case cpuinfo_uarch_cortex_a78:
2638             case cpuinfo_uarch_cortex_x1:
2639               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
2640               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
2641               xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2642               xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld64);
2643               xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_neon_params;
2644               xnn_params.f16.gemm.mr = 6;
2645               xnn_params.f16.gemm.nr = 16;
2646               break;
2647           }
2648 
2649           #if XNN_MAX_UARCH_TYPES > 1
2650           {
2651             /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2652             const uint32_t mr = xnn_params.f16.gemm.mr;
2653             const uint32_t nr = xnn_params.f16.gemm.nr;
2654             for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2655               const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2656               if (uarch_info == NULL) {
2657                 /* No more microarchitectures in the system */
2658                 break;
2659               }
2660 
2661               switch (uarch_info->uarch) {
2662                 case cpuinfo_uarch_cortex_a55:
2663                   if (mr == 6 && nr == 16) {
2664                     xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
2665                     xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
2666                   }
2667                   break;
2668                 case cpuinfo_uarch_cortex_a55r0:
2669                 case cpuinfo_uarch_cortex_a75:
2670                   if (mr == 6 && nr == 16) {
2671                     xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0;
2672                     xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55r0;
2673                   }
2674                   break;
2675                 default:
2676                   break;
2677               }
2678             }
2679           }
2680           #endif  // XNN_MAX_UARCH_TYPES > 1
2681         #else  // XNN_ENABLE_ASSEMBLY
2682           xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2683           xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2684           xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2685           xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2686           xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_neon_params;
2687           xnn_params.f16.gemm.mr = 6;
2688           xnn_params.f16.gemm.nr = 16;
2689         #endif  // XNN_ENABLE_ASSEMBLY
2690 
        // F16 depthwise convolution table: one unipass NEON-FP16-arith kernel per
        // supported tap count (primary_tile = 3, 4, 9, 25).
        xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x3__neonfp16arith;
        xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_neon_params;
        xnn_params.f16.dwconv[0].channel_tile = 16;
        xnn_params.f16.dwconv[0].primary_tile = 3;

        xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
        xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_neon_params;
        xnn_params.f16.dwconv[1].channel_tile = 16;
        xnn_params.f16.dwconv[1].primary_tile = 4;

        xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
        xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_neon_params;
        xnn_params.f16.dwconv[2].channel_tile = 16;
        xnn_params.f16.dwconv[2].primary_tile = 9;

        // The 25-tap variant narrows to an 8-channel tile and uses 2 accumulators (acc2).
        xnn_params.f16.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
        xnn_params.f16.dwconv[3].init.f16 = xnn_init_f16_minmax_neon_params;
        xnn_params.f16.dwconv[3].channel_tile = 8;
        xnn_params.f16.dwconv[3].primary_tile = 25;
2710 
        // F16 pooling micro-kernels: average pooling and padded average pooling use a
        // 9-element unipass kernel with a 9+8 multipass variant; all use an 8-channel tile.
        xnn_params.f16.avgpool = (struct avgpool_parameters) {
          .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f16_avgpool_minmax_ukernel_9x__neonfp16arith_c8,
          .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f16_avgpool_minmax_ukernel_9p8x__neonfp16arith_c8,
          .init.f16 = xnn_init_f16_scaleminmax_neon_params,
          .primary_tile = 9,
          .incremental_tile = 8,
          .channel_tile = 8,
        };
        xnn_params.f16.pavgpool = (struct pavgpool_parameters) {
          .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f16_pavgpool_minmax_ukernel_9x__neonfp16arith_c8,
          .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f16_pavgpool_minmax_ukernel_9p8x__neonfp16arith_c8,
          .init.f16 = xnn_init_f16_minmax_neon_params,
          .primary_tile = 9,
          .incremental_tile = 8,
          .channel_tile = 8,
        };
        // Global average pooling: 7-row tile with an update hook for re-initializing
        // scale/min/max parameters between passes.
        xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
          .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
          .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
          .init.f16 = xnn_init_f16_scaleminmax_neon_params,
          .update.f16 = xnn_update_f16_scaleminmax_neon_params,
          .row_tile = 7,
          .channel_tile = 8,
        };

        // Max pooling: single 9p8x kernel (mr = 9 primary, qr = 8 incremental elements).
        xnn_params.f16.maxpool = (struct maxpool_parameters) {
          .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8,
          .init.f16 = xnn_init_f16_minmax_neon_params,
          .mr = 9,
          .qr = 8,
        };
        // Indirect bilinear resize: processes 1 pixel x 8 channels per iteration.
        xnn_params.f16.ibilinear = (struct ibilinear_parameters) {
          .ukernel = (xnn_ibilinear_ukernel_function) xnn_f16_ibilinear_ukernel__neonfp16arith_c8,
          .pixel_tile = 1,
          .channel_tile = 8,
        };

        // PReLU: 2 rows x 16 channels per iteration.
        xnn_params.f16.prelu = (struct prelu_parameters) {
          .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__neonfp16arith_2x16,
          .row_tile = 2,
          .channel_tile = 16,
        };

        // Softmax building block: reduce-add of exp(x - max) over 40-element tiles.
        xnn_params.f16.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
          .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f16_raddstoreexpminusmax_ukernel__neonfp16arith_rr2_p2_x40,
          .init.f16 = xnn_init_f16_expminus_neonfp16arith_rr2_p2_params,
          .element_tile = 40,
        };
        // Running-max reduction (used together with raddstoreexpminusmax).
        xnn_params.f16.rmax = (xnn_rmax_ukernel_function) xnn_f16_rmax_ukernel__neonfp16arith;
2760 
        // F16 element-wise binary operations. Each entry provides the tensor-tensor
        // kernel (op), the tensor-scalar kernel (opc), and the reversed tensor-scalar
        // kernel (ropc); for commutative ops, opc and ropc share the same kernel.
        xnn_params.f16.vadd = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_neon_params,
          .element_tile = 16,
        };
        // Division is non-commutative: ropc uses the dedicated "rdiv" kernel.
        xnn_params.f16.vdiv = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vdiv_minmax_ukernel__neonfp16arith_x8,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vdivc_minmax_ukernel__neonfp16arith_x8,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vrdivc_minmax_ukernel__neonfp16arith_x8,
          .init.f16_minmax = xnn_init_f16_minmax_neon_params,
          .element_tile = 8,
        };
        // vmax/vmin set no .init: these micro-kernels take no runtime parameters here.
        xnn_params.f16.vmax = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmax_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmaxc_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmaxc_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.vmin = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmin_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vminc_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vminc_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.vmul = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_neon_params,
          .element_tile = 16,
        };
        // Subtraction is non-commutative: ropc uses the dedicated "rsub" kernel.
        xnn_params.f16.vsub = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsub_minmax_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsubc_minmax_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vrsubc_minmax_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_neon_params,
          .element_tile = 16,
        };
        xnn_params.f16.vsqrdiff = (struct vbinary_parameters) {
          .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiff_ukernel__neonfp16arith_x16,
          .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_x16,
          .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiffc_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_neon_params,
          .element_tile = 16,
        };
2808 
        // Fused multiply-add-by-channel (x*scale + bias): 8 channels x 2 rows per pass.
        xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
          .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
          .init.f16 = xnn_init_f16_minmax_neon_params,
          .channel_tile = 8,
          .row_tile = 2,
        };

        // F16 element-wise unary operations; entries without .init take no runtime params.
        xnn_params.f16.abs = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vabs_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.clamp = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vclamp_ukernel__neonfp16arith_x16,
          .init.f16_minmax = xnn_init_f16_minmax_neon_params,
          .element_tile = 16,
        };
        xnn_params.f16.elu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_velu_ukernel__neonfp16arith_rr1_p3_x16,
          .init.f16_elu = xnn_init_f16_elu_neonfp16arith_rr1_p3_params,
          .element_tile = 16,
        };
        xnn_params.f16.hswish = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vhswish_ukernel__neonfp16arith_x16,
          .init.f16_hswish = xnn_init_f16_hswish_neon_params,
          .element_tile = 16,
        };
        xnn_params.f16.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vlrelu_ukernel__neonfp16arith_x16,
          .init.f16_lrelu = xnn_init_f16_lrelu_neon_params,
          .element_tile = 16,
        };
        xnn_params.f16.neg = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vneg_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        // Rounding variants: ne = to-nearest-even, z = toward zero, u = up, d = down.
        xnn_params.f16.rndne = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndne_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.rndz = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndz_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.rndu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndu_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.rndd = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndd_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.sigmoid = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsigmoid_ukernel__neonfp16arith_rr2_p2_nr1fma_x40,
          .init.f16_sigmoid = xnn_init_f16_sigmoid_neonfp16arith_rr2_p2_params,
          .element_tile = 40,
        };
        xnn_params.f16.sqr = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsqr_ukernel__neonfp16arith_x16,
          .element_tile = 16,
        };
        xnn_params.f16.sqrt = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsqrt_ukernel__neonfp16arith_sqrt_x8,
          .element_tile = 8,
        };
2873 
        #ifndef XNN_NO_NCHW_OPERATORS
          // F16 micro-kernels for CHW (channels-first) layouts; advertise their
          // availability via the CHW-optimization init flag.
          init_flags |= XNN_INIT_FLAG_CHW_OPT;

          // Sparse matrix-dense matrix multiplication (32x1 tiles).
          xnn_params.f16.spmm = (struct spmm_parameters) {
            .ukernel = (xnn_spmm_ukernel_function) xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith,
            .mr = 32,
            .nr = 1,
          };
          // HWC-input to CHW-output 3x3 stride-2 convolution (symmetric padding only).
          xnn_params.f16.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
            .ukernel_with_symm_padding =
              (xnn_conv_hwc2chw_ukernel_function) xnn_f16_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfp16arith_2x2,
            .output_channel_tile = 4,
            .output_height_tile = 2,
            .output_width_tile = 2,
          };
          // CHW depthwise convolutions, one entry per (kernel size, stride) combination.
          xnn_params.f16.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
            .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_2x8,
            .output_width_tile = 8,
            .output_height_tile = 2,
          };
          xnn_params.f16.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
            .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f16_dwconv2d_chw_ukernel_3x3s2p1__neonfp16arith_1x4,
            .output_width_tile = 4,
            .output_height_tile = 1,
          };
          xnn_params.f16.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
            .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f16_dwconv2d_chw_ukernel_5x5p2__neonfp16arith_1x4,
            .output_width_tile = 4,
            .output_height_tile = 1,
          };
          xnn_params.f16.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
            .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f16_dwconv2d_chw_ukernel_5x5s2p2__neonfp16arith_1x4,
            .output_width_tile = 4,
            .output_height_tile = 1,
          };
          xnn_params.f16.gavgpool_cw = (struct gavgpool_cw_parameters) {
            .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f16_gavgpool_cw_ukernel__neonfp16arith_x4,
            .channel_tile = 4,
          };
          xnn_params.f16.ibilinear_chw = (struct ibilinear_chw_parameters) {
            .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f16_ibilinear_chw_ukernel__neonfp16arith_p8,
            .channel_tile = 1,
            .pixel_tile = 8,
          };
        #endif  // XNN_NO_NCHW_OPERATORS
      }
    #endif  // XNN_ENABLE_ARM_FP16
  #endif  // XNN_NO_F16_OPERATORS
2922 
2923   /**************************** F32 AArch64 micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    #if XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      // Pick F32 GEMM/IGEMM assembly micro-kernels tuned for the microarchitecture of
      // core 0 (the big core). Each case sets the main-tile kernel, the 1-row edge
      // kernel, the shared parameter-init function, and the tile geometry (mr/nr).
      switch (cpuinfo_get_core(0)->uarch) {
        case cpuinfo_uarch_cortex_a72:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a57:
        case cpuinfo_uarch_cortex_a75:
        case cpuinfo_uarch_cortex_a76:
        case cpuinfo_uarch_exynos_m3:
        case cpuinfo_uarch_exynos_m4:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
          // With M-specialization, a 4-row kernel is also registered for medium batches.
          #if XNN_ENABLE_GEMM_M_SPECIALIZATION
            xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
            xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
          #endif
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          // JIT builds also register code generators for the same kernel shapes.
          #if XNN_ENABLE_JIT
            xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75);
            xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_upto6x8__aarch64_neonfma_prfm_cortex_a75);
            xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
            xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          #endif
          break;
        case cpuinfo_uarch_exynos_m1:
        case cpuinfo_uarch_exynos_m2:
          // s4 kernels shift A by 4 elements (log2_sr = 2) — note the extra geometry field.
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          xnn_params.f32.gemm.log2_sr = 2;
          break;
        case cpuinfo_uarch_cortex_a53:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a55r0:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a35:
        case cpuinfo_uarch_cortex_a55:
        case cpuinfo_uarch_kryo:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a73:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a77:
        case cpuinfo_uarch_exynos_m5:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
        case cpuinfo_uarch_cortex_a78:
        case cpuinfo_uarch_cortex_x1:
        default:
          // Unrecognized / modern big cores: ld128 6x8 assembly with a portable 1x8 edge kernel.
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 6;
          xnn_params.f32.gemm.nr = 8;
          #if XNN_ENABLE_JIT
            xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
            xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_ld128);
          #endif
          break;
      }
      #if XNN_MAX_UARCH_TYPES > 1
      {
        /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
        // Overrides are applied only when the little core's tuned kernels match the
        // big-core tile geometry (mr/nr/log2_sr), so all clusters agree on packing.
        const uint32_t mr = xnn_params.f32.gemm.mr;
        const uint32_t nr = xnn_params.f32.gemm.nr;
        const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
        for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
          const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
          if (uarch_info == NULL) {
            /* No more microarchitectures in the system */
            break;
          }

          switch (uarch_info->uarch) {
            case cpuinfo_uarch_cortex_a53:
              if (mr == 6 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a53;
                // NOTE(review): the 1x8 GEMM below is the non-prfm variant while the
                // matching 1x8 IGEMM uses the prfm variant — confirm the asymmetry is intentional.
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53;
              } else if (mr == 4 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a53;
                // NOTE(review): same non-prfm GEMM / prfm IGEMM asymmetry as above — verify.
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a53;
              }
              break;
            case cpuinfo_uarch_cortex_a55r0:
              if (mr == 6 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
              } else if (mr == 4 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
              }
              break;
            case cpuinfo_uarch_cortex_a55:
              if (mr == 6 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
              } else if (mr == 4 && nr == 8 && log2_sr == 0) {
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
                xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
                xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)].function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
              }
              break;
            default:
              break;
          }
        }
      }
      #endif  // XNN_MAX_UARCH_TYPES > 1
      // Secondary F32 GEMM configuration with a narrow 4x2 tile (Cortex-A75-tuned,
      // prefetching assembly kernels).
      xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75);
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;
3100 
    #else  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      // iOS/Mac or no-uarch-dispatch builds: use a single fixed kernel selection
      // instead of the runtime cpuinfo-based switch above.
      #if XNN_ENABLE_ASSEMBLY
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
        xnn_params.f32.gemm.mr = 6;
        xnn_params.f32.gemm.nr = 8;

        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__aarch64_neonfma_prfm_cortex_a75);
        xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
        xnn_params.f32.gemm2.mr = 4;
        xnn_params.f32.gemm2.nr = 2;
      #else  // !XNN_ENABLE_ASSEMBLY
        // Pure-intrinsics fallback (NEON-FMA lane kernels).
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(6)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
        xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
        xnn_params.f32.gemm.mr = 6;
        xnn_params.f32.gemm.nr = 8;

        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
        xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
        xnn_params.f32.gemm2.mr = 4;
3129         xnn_params.f32.gemm2.nr = 2;
3130       #endif  // XNN_ENABLE_ASSEMBLY
3131     #endif  // XNN_ENABLE_ASSEMBLY && !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
3132 
3133     xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neonfma;
3134     xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
3135     xnn_params.f32.dwconv[0].channel_tile = 8;
3136     xnn_params.f32.dwconv[0].primary_tile = 3;
3137 
3138     xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
3139     xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
3140     xnn_params.f32.dwconv[1].channel_tile = 8;
3141     xnn_params.f32.dwconv[1].primary_tile = 4;
3142 
3143     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
3144       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
3145       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
3146       xnn_params.f32.dwconv[2].channel_tile = 8;
3147       xnn_params.f32.dwconv[2].primary_tile = 9;
3148     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
3149       switch (cpuinfo_get_core(0)->uarch) {
3150         case cpuinfo_uarch_kryo:
3151           xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
3152           xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
3153           xnn_params.f32.dwconv[2].channel_tile = 8;
3154           xnn_params.f32.dwconv[2].primary_tile = 9;
3155           break;
3156         #if XNN_ENABLE_ASSEMBLY
3157           case cpuinfo_uarch_cortex_a53:
3158           case cpuinfo_uarch_cortex_a55r0:
3159           case cpuinfo_uarch_cortex_a55:
3160             xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
3161             xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
3162             xnn_params.f32.dwconv[2].channel_tile = 4;
3163             xnn_params.f32.dwconv[2].primary_tile = 9;
3164             break;
3165         #endif  // XNN_ENABLE_ASSEMBLY
3166         default:
3167           xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
3168           xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
3169           xnn_params.f32.dwconv[2].channel_tile = 8;
3170           xnn_params.f32.dwconv[2].primary_tile = 9;
3171           break;
3172       }
3173     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
3174 
3175     xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neonfma_acc2;
3176     xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
3177     xnn_params.f32.dwconv[3].channel_tile = 8;
3178     xnn_params.f32.dwconv[3].primary_tile = 25;
3179 
3180     xnn_params.f32.avgpool = (struct avgpool_parameters) {
3181       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
3182       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
3183       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
3184       .primary_tile = 9,
3185       .incremental_tile = 8,
3186       .channel_tile = 4,
3187     };
3188     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
3189       .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
3190       .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
3191       .init.f32 = xnn_init_f32_minmax_scalar_params,
3192       .primary_tile = 9,
3193       .incremental_tile = 8,
3194       .channel_tile = 4,
3195     };
3196     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
3197       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
3198       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
3199       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
3200       .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
3201       .row_tile = 7,
3202       .channel_tile = 4,
3203     };
3204     xnn_params.f32.maxpool = (struct maxpool_parameters) {
3205       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
3206       .init.f32 = xnn_init_f32_minmax_scalar_params,
3207       .mr = 9,
3208       .qr = 8,
3209     };
3210     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
3211       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
3212       .mr = 4,
3213     };
3214     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
3215       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
3216       .mr = 9,
3217     };
3218     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
3219       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
3220       .mr = 9,
3221       .qr = 8,
3222     };
3223     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
3224       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
3225       .pixel_tile = 1,
3226       .channel_tile = 8,
3227     };
3228     xnn_params.f32.abs = (struct vunary_parameters) {
3229       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
3230       .element_tile = 8,
3231     };
3232     xnn_params.f32.clamp = (struct vunary_parameters) {
3233       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
3234       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3235       .element_tile = 8,
3236     };
3237     xnn_params.f32.elu = (struct vunary_parameters) {
3238       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16,
3239       .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
3240       .element_tile = 16,
3241     };
3242     xnn_params.f32.hswish = (struct vunary_parameters) {
3243       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
3244       .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
3245       .element_tile = 16,
3246     };
3247     xnn_params.f32.lrelu = (struct vunary_parameters) {
3248       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
3249       .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
3250       .element_tile = 8,
3251     };
3252     xnn_params.f32.neg = (struct vunary_parameters) {
3253       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
3254       .element_tile = 8,
3255     };
3256     xnn_params.f32.rndne = (struct vunary_parameters) {
3257       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
3258       .element_tile = 8,
3259     };
3260     xnn_params.f32.rndz = (struct vunary_parameters) {
3261       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
3262       .element_tile = 8,
3263     };
3264     xnn_params.f32.rndu = (struct vunary_parameters) {
3265       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
3266       .element_tile = 8,
3267     };
3268     xnn_params.f32.rndd = (struct vunary_parameters) {
3269       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
3270       .element_tile = 8,
3271     };
3272     xnn_params.f32.sigmoid = (struct vunary_parameters) {
3273       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16,
3274       .init.f32_sigmoid = xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params,
3275       .element_tile = 16,
3276     };
3277     xnn_params.f32.sqr = (struct vunary_parameters) {
3278       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
3279       .element_tile = 8,
3280     };
3281     xnn_params.f32.sqrt = (struct vunary_parameters) {
3282       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4,
3283       .element_tile = 4,
3284     };
3285     xnn_params.f32.prelu = (struct prelu_parameters) {
3286       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
3287       .row_tile = 2,
3288       .channel_tile = 8,
3289     };
3290     xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
3291       .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16,
3292       .init.f32 = xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params,
3293       .element_tile = 16,
3294     };
3295     xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__neon;
3296     xnn_params.f32.vadd = (struct vbinary_parameters) {
3297       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
3298       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
3299       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
3300       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3301       .element_tile = 8,
3302     };
3303     xnn_params.f32.vdiv = (struct vbinary_parameters) {
3304       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
3305       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
3306       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
3307       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3308       .element_tile = 8,
3309     };
3310     xnn_params.f32.vmax = (struct vbinary_parameters) {
3311       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
3312       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
3313       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
3314       .element_tile = 8,
3315     };
3316     xnn_params.f32.vmin = (struct vbinary_parameters) {
3317       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
3318       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
3319       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
3320       .element_tile = 8,
3321     };
3322     xnn_params.f32.vmul = (struct vbinary_parameters) {
3323       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
3324       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
3325       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
3326       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3327       .element_tile = 8,
3328     };
3329     xnn_params.f32.vsub = (struct vbinary_parameters) {
3330       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
3331       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
3332       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
3333       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3334       .element_tile = 8,
3335     };
3336     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
3337       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
3338       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
3339       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
3340       .element_tile = 8,
3341     };
3342     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
3343       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
3344       .init.f32 = xnn_init_f32_minmax_scalar_params,
3345       .channel_tile = 4,
3346       .row_tile = 2,
3347     };
3348     #ifndef XNN_NO_NCHW_OPERATORS
3349       init_flags |= XNN_INIT_FLAG_CHW_OPT;
3350 
3351       xnn_params.f32.spmm = (struct spmm_parameters) {
3352         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
3353         .mr = 32,
3354         .nr = 1,
3355       };
3356       xnn_params.f32.spmm2 = (struct spmm_parameters) {
3357         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
3358         .mr = 32,
3359         .nr = 2,
3360       };
3361       xnn_params.f32.spmm4 = (struct spmm_parameters) {
3362         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
3363         .mr = 32,
3364         .nr = 4,
3365       };
3366       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
3367         .ukernel_with_symm_padding =
3368           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
3369         .output_channel_tile = 4,
3370         .output_height_tile = 2,
3371         .output_width_tile = 2,
3372       };
3373       xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
3374         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
3375         .output_width_tile = 4,
3376         .output_height_tile = 3,
3377       };
3378       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
3379         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
3380         .output_width_tile = 4,
3381         .output_height_tile = 2,
3382       };
3383       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
3384         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
3385         .output_width_tile = 4,
3386         .output_height_tile = 4,
3387       };
3388       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
3389         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
3390         .output_width_tile = 4,
3391         .output_height_tile = 1,
3392       };
3393       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
3394         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
3395         .channel_tile = 4,
3396       };
3397       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
3398         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
3399         .channel_tile = 1,
3400         .pixel_tile = 8,
3401       };
3402     #endif  // XNN_NO_NCHW_OPERATORS
3403   #endif  // XNN_NO_F32_OPERATORS
3404 
3405   /*************************** VCVT AArch64 micro-kernels ***************************/
3406   #ifndef XNN_NO_VCVT_OPERATORS
3407     init_flags |= XNN_INIT_FLAG_VCVT;
3408 
3409     xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
3410       .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
3411       .element_tile = 16,
3412     };
3413     xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
3414       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
3415       .element_tile = 16,
3416     };
3417     xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
3418       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
3419       .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
3420       .element_tile = 32,
3421     };
3422     xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
3423       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
3424       .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
3425       .element_tile = 32,
3426     };
3427     xnn_params.vcvt.qs8 = (struct vunary_parameters) {
3428       .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__neon_x32,
3429       .init.qs8_cvt = xnn_init_qs8_cvt_neon_params,
3430       .element_tile = 32,
3431     };
3432     xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
3433       .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
3434       .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
3435       .element_tile = 32,
3436     };
3437     xnn_params.vcvt.qu8 = (struct vunary_parameters) {
3438       .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__neon_x32,
3439       .init.qu8_cvt = xnn_init_qu8_cvt_neon_params,
3440       .element_tile = 32,
3441     };
3442     xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
3443       .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
3444       .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
3445       .element_tile = 32,
3446     };
3447   #endif  // XNN_NO_VCVT_OPERATORS
3448 
3449   /**************************** X32 AArch64 micro-kernels ****************************/
3450   #ifndef XNN_NO_X32_OPERATORS
3451     init_flags |= XNN_INIT_FLAG_X32;
3452 
3453     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
3454     xnn_params.x32.zip = (struct zip_parameters) {
3455       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
3456       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
3457       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
3458       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
3459     };
3460 
3461     xnn_params.x32.transpose = (struct transpose_parameters) {
3462       .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__4x4_aarch64_neon_tbl,
3463       .tile_size = 32,
3464     };
3465   #endif  // XNN_NO_X32_OPERATORS
3466 
3467   /**************************** XX AArch64 micro-kernels ****************************/
3468   #ifndef XNN_NO_XX_OPERATORS
3469     init_flags |= XNN_INIT_FLAG_XX;
3470 
3471     xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
3472     xnn_params.xx.fill = (struct fill_parameters) {
3473       .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
3474       .row_tile = 1,
3475     };
3476     xnn_params.xx.pad = (struct pad_parameters) {
3477       .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
3478       .row_tile = 1,
3479     };
3480     xnn_params.xx.transpose = (struct transpose_parameters) {
3481       .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
3482       .tile_size = 32,
3483     };
3484   #endif  // XNN_NO_XX_OPERATORS
3485 
3486 #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3487   if (!cpuinfo_has_x86_sse2()) {
3488     xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
3489     return;
3490   }
3491 
3492   /**************************** QC8 x86 micro-kernels ****************************/
3493   #ifndef XNN_NO_QC8_OPERATORS
3494     init_flags |= XNN_INIT_FLAG_QC8;
3495 
3496     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3497       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3498       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3499       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3500       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3501       xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
3502       xnn_params.qc8.gemm.mr = 4;
3503       xnn_params.qc8.gemm.nr = 16;
3504       xnn_params.qc8.gemm.log2_kr = 3;
3505     } else if (cpuinfo_has_x86_xop()) {
3506       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3507       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3508       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3509       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3510       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3511       xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
3512       xnn_params.qc8.gemm.mr = 2;
3513       xnn_params.qc8.gemm.nr = 4;
3514       xnn_params.qc8.gemm.log2_kr = 3;
3515     } else if (cpuinfo_has_x86_avx2()) {
3516       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3517       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3518       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3519       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3520       xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
3521       xnn_params.qc8.gemm.mr = 3;
3522       xnn_params.qc8.gemm.nr = 8;
3523       xnn_params.qc8.gemm.log2_kr = 3;
3524     } else if (cpuinfo_has_x86_avx()) {
3525       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3526       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3527       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3528       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3529       xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
3530       xnn_params.qc8.gemm.mr = 2;
3531       xnn_params.qc8.gemm.nr = 4;
3532       xnn_params.qc8.gemm.log2_kr = 3;
3533     } else if (cpuinfo_has_x86_sse4_1()) {
3534       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3535       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3536       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3537       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3538       xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
3539       xnn_params.qc8.gemm.mr = 3;
3540       xnn_params.qc8.gemm.nr = 4;
3541       xnn_params.qc8.gemm.log2_kr = 3;
3542     } else {
3543       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3544       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3545       xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3546       xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3547       xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
3548       xnn_params.qc8.gemm.mr = 3;
3549       xnn_params.qc8.gemm.nr = 4;
3550       xnn_params.qc8.gemm.log2_kr = 3;
3551     }
3552 
3553     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3554       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x3__avx512skx_mul32;
3555       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
3556       xnn_params.qc8.dwconv[0].channel_tile = 32;
3557       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3558       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
3559       xnn_params.qc8.dwconv[1].channel_tile = 32;
3560       xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3561       xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx512_params;
3562       xnn_params.qc8.dwconv[2].channel_tile = 32;
3563     } else if (cpuinfo_has_x86_xop()) {
3564       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3565       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__xop_mul16_add16;
3566       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
3567       xnn_params.qc8.dwconv[0].channel_tile = 16;
3568       xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
3569       xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
3570       xnn_params.qc8.dwconv[1].channel_tile = 16;
3571       xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
3572       xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
3573       xnn_params.qc8.dwconv[2].channel_tile = 16;
3574     } else if (cpuinfo_has_x86_avx2()) {
3575       xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__avx2_mul32;
3576       xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
3577       xnn_params.qc8.dwconv[0].channel_tile = 16;
      // NOTE(review): continuation of the QC8 depthwise-conv ISA-selection chain;
      // the opening (AVX2) branch begins above this excerpt. AVX2: 16-channel mul32 kernels.
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
      xnn_params.qc8.dwconv[1].channel_tile = 16;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_avx2_params;
      xnn_params.qc8.dwconv[2].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx()) {
      // AVX: 16-channel mul16+add16 kernels; quantization params shared with the SSE4 initializer.
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__avx_mul16_add16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[0].channel_tile = 16;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[1].channel_tile = 16;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[2].channel_tile = 16;
    } else if (cpuinfo_has_x86_sse4_1()) {
      // SSE4.1: 8-channel mul16 kernels.
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__sse41_mul16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[0].channel_tile = 8;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[1].channel_tile = 8;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse4_params;
      xnn_params.qc8.dwconv[2].channel_tile = 8;
    } else if (cpuinfo_has_x86_sse2()) {
      // SSE2 baseline: 8-channel mul16 kernels.
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x3__sse2_mul16;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
      xnn_params.qc8.dwconv[0].channel_tile = 8;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
      xnn_params.qc8.dwconv[1].channel_tile = 8;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_sse2_params;
      xnn_params.qc8.dwconv[2].channel_tile = 8;
    }
    // Primary tiles match the up*x3 / up*x9 / up*x25 microkernels selected above.
    xnn_params.qc8.dwconv[0].primary_tile = 3;
    xnn_params.qc8.dwconv[1].primary_tile = 9;
    xnn_params.qc8.dwconv[2].primary_tile = 25;
  #endif  // XNN_NO_QC8_OPERATORS
3619 
3620   /**************************** QS8 x86 micro-kernels ****************************/
3621   #ifndef XNN_NO_QS8_OPERATORS
3622     init_flags |= XNN_INIT_FLAG_QS8;
3623 
3624     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3625       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3626       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3627       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3628       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3629       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
3630       xnn_params.qs8.gemm.mr = 4;
3631       xnn_params.qs8.gemm.nr = 16;
3632       xnn_params.qs8.gemm.log2_kr = 3;
3633     } else if (cpuinfo_has_x86_xop()) {
3634       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3635       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3636       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3637       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3638       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3639       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3640       xnn_params.qs8.gemm.mr = 2;
3641       xnn_params.qs8.gemm.nr = 4;
3642       xnn_params.qs8.gemm.log2_kr = 3;
3643     } else if (cpuinfo_has_x86_avx2()) {
3644       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3645       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3646       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3647       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3648       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
3649       xnn_params.qs8.gemm.mr = 3;
3650       xnn_params.qs8.gemm.nr = 8;
3651       xnn_params.qs8.gemm.log2_kr = 3;
3652     } else if (cpuinfo_has_x86_avx()) {
3653       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3654       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3655       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3656       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3657       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3658       xnn_params.qs8.gemm.mr = 2;
3659       xnn_params.qs8.gemm.nr = 4;
3660       xnn_params.qs8.gemm.log2_kr = 3;
3661     } else if (cpuinfo_has_x86_sse4_1()) {
3662       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3663       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3664       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3665       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3666       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3667       xnn_params.qs8.gemm.mr = 3;
3668       xnn_params.qs8.gemm.nr = 4;
3669       xnn_params.qs8.gemm.log2_kr = 3;
3670     } else {
3671       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3672       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3673       xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3674       xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3675       xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
3676       xnn_params.qs8.gemm.mr = 3;
3677       xnn_params.qs8.gemm.nr = 4;
3678       xnn_params.qs8.gemm.log2_kr = 3;
3679     }
3680 
    // QS8 depthwise convolution: dwconv[0] is the 9-element primary tile, dwconv[1]
    // the 25-element primary tile (matching the up*x9 / up*x25 kernel names).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      // AVX512 (SKX-class): 32-channel mul32 kernels.
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
      xnn_params.qs8.dwconv[0].channel_tile = 32;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
      xnn_params.qs8.dwconv[1].channel_tile = 32;
    } else if (cpuinfo_has_x86_xop()) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx2()) {
      // AVX2: 16-channel mul32 kernels.
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
      xnn_params.qs8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx()) {
      // AVX: 16-channel mul16+add16 kernels; SSE4 parameter initializer.
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[0].channel_tile = 16;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_sse4_1()) {
      // SSE4.1: 8-channel mul16+add16 kernels.
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[0].channel_tile = 8;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
      xnn_params.qs8.dwconv[1].channel_tile = 8;
    } else if (cpuinfo_has_x86_sse2()) {
      // SSE2 baseline: 8-channel mul16+add16 kernels.
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
      xnn_params.qs8.dwconv[0].channel_tile = 8;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
      xnn_params.qs8.dwconv[1].channel_tile = 8;
    }
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].primary_tile = 25;
3727 
3728     if (cpuinfo_has_x86_sse4_1()) {
3729       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
3730         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3731         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
3732         .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse4_params,
3733         .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse4_params,
3734         .row_tile = 7,
3735         .channel_tile = 8,
3736       };
3737     } else {
3738       xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
3739         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3740         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
3741         .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse2_params,
3742         .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse2_params,
3743         .row_tile = 7,
3744         .channel_tile = 8,
3745       };
3746     }
3747 
    // QS8 elementwise add: ISA-priority chain (AVX512 SKX > XOP > AVX2 > AVX > SSE4.1 > SSE2).
    // Each branch installs the two-operand kernel plus the operand-constant variants
    // (opc/ropc share one kernel since addition is commutative).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .init.qs8_add = xnn_init_qs8_add_minmax_avx512_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_xop()) {
      // XOP checked before AVX2 elsewhere in this file for AMD Excavator; same ordering here.
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
        .init.qs8_add = xnn_init_qs8_add_minmax_sse4_mul32_params,
        .element_tile = 8,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
        .init.qs8_add = xnn_init_qs8_add_minmax_avx2_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
        .init.qs8_add = xnn_init_qs8_add_minmax_sse4_mul32_params,
        .element_tile = 8,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
        .init.qs8_add = xnn_init_qs8_add_minmax_sse4_mul16_params,
        .element_tile = 8,
      };
    } else {
      // SSE2 baseline.
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
        .init.qs8_add = xnn_init_qs8_add_minmax_sse2_params,
        .element_tile = 8,
      };
    }
3797     if (cpuinfo_has_x86_avx()) {
3798       xnn_params.qs8.vmul = (struct vbinary_parameters) {
3799         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3800         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3801         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3802         .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3803         .element_tile = 16,
3804       };
3805     } else if (cpuinfo_has_x86_sse4_1()) {
3806       xnn_params.qs8.vmul = (struct vbinary_parameters) {
3807         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3808         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3809         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3810         .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3811         .element_tile = 16,
3812       };
3813     } else {
3814       xnn_params.qs8.vmul = (struct vbinary_parameters) {
3815         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3816         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3817         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3818         .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse2_params,
3819         .element_tile = 8,
3820       };
3821     }
3822 
3823     if (cpuinfo_has_x86_avx2()) {
3824       xnn_params.qs8.lrelu = (struct vunary_parameters) {
3825         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__avx2_x32,
3826         .init.qs8_lrelu = xnn_init_qs8_lrelu_avx2_params,
3827         .element_tile = 32,
3828       };
3829     } else if (cpuinfo_has_x86_avx()) {
3830       xnn_params.qs8.lrelu = (struct vunary_parameters) {
3831         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__avx_x32,
3832         .init.qs8_lrelu = xnn_init_qs8_lrelu_avx_params,
3833         .element_tile = 32,
3834       };
3835     } else if (cpuinfo_has_x86_sse4_1()) {
3836       xnn_params.qs8.lrelu = (struct vunary_parameters) {
3837         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__sse41_x32,
3838         .init.qs8_lrelu = xnn_init_qs8_lrelu_sse2_params,
3839         .element_tile = 32,
3840       };
3841     } else if (cpuinfo_has_x86_sse4_1()) {
3842       xnn_params.qs8.lrelu = (struct vunary_parameters) {
3843         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__ssse3_x32,
3844         .init.qs8_lrelu = xnn_init_qs8_lrelu_sse2_params,
3845         .element_tile = 32,
3846       };
3847     } else {
3848       xnn_params.qs8.lrelu = (struct vunary_parameters) {
3849         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__sse2_x32,
3850         .init.qs8_lrelu = xnn_init_qs8_lrelu_sse2_params,
3851         .element_tile = 32,
3852       };
3853     }
3854   #endif  // XNN_NO_QS8_OPERATORS
3855 
3856   /**************************** QU8 x86 micro-kernels ****************************/
3857   #ifndef XNN_NO_QU8_OPERATORS
3858     init_flags |= XNN_INIT_FLAG_QU8;
3859 
3860     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3861       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3862       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3863       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3864       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3865       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3866       xnn_params.qu8.gemm.mr = 4;
3867       xnn_params.qu8.gemm.nr = 16;
3868       xnn_params.qu8.gemm.log2_kr = 3;
3869     } else if (cpuinfo_has_x86_xop()) {
3870       // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3871       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3872       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3873       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3874       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3875       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3876       xnn_params.qu8.gemm.mr = 2;
3877       xnn_params.qu8.gemm.nr = 4;
3878       xnn_params.qu8.gemm.log2_kr = 3;
3879     } else if (cpuinfo_has_x86_avx2()) {
3880       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3881       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3882       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3883       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3884       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3885       xnn_params.qu8.gemm.mr = 3;
3886       xnn_params.qu8.gemm.nr = 8;
3887       xnn_params.qu8.gemm.log2_kr = 3;
3888     } else if (cpuinfo_has_x86_avx()) {
3889       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3890       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3891       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3892       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3893       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3894       xnn_params.qu8.gemm.mr = 2;
3895       xnn_params.qu8.gemm.nr = 4;
3896       xnn_params.qu8.gemm.log2_kr = 3;
3897     } else if (cpuinfo_has_x86_sse4_1()) {
3898       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3899       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3900       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3901       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3902       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3903       xnn_params.qu8.gemm.mr = 3;
3904       xnn_params.qu8.gemm.nr = 4;
3905       xnn_params.qu8.gemm.log2_kr = 3;
3906     } else {
3907       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3908       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3909       xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3910       xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3911       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3912       xnn_params.qu8.gemm.mr = 3;
3913       xnn_params.qu8.gemm.nr = 4;
3914       xnn_params.qu8.gemm.log2_kr = 3;
3915     }
3916 
    // QU8 depthwise convolution: dwconv[0] is the 9-element primary tile, dwconv[1]
    // the 25-element primary tile (matching the up*x9 / up*x25 kernel names).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      // AVX512 (SKX-class): 32-channel mul32 kernels.
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
      xnn_params.qu8.dwconv[0].channel_tile = 32;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
      xnn_params.qu8.dwconv[1].channel_tile = 32;
    } else if (cpuinfo_has_x86_xop()) {
      // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx2()) {
      // AVX2: 16-channel mul32 kernels.
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_avx()) {
      // AVX: 16-channel mul16 kernels.
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 16;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 16;
    } else if (cpuinfo_has_x86_sse4_1()) {
      // SSE4.1: 8-channel mul16 kernels.
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 8;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 8;
    } else if (cpuinfo_has_x86_sse2()) {
      // SSE2 baseline: 8-channel mul16 kernels.
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[0].channel_tile = 8;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
      xnn_params.qu8.dwconv[1].channel_tile = 8;
    }
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].primary_tile = 25;
3963 
3964     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
3965       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
3966       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
3967       .init.qu8 = xnn_init_qu8_avgpool_minmax_sse2_params,
3968       .primary_tile = 9,
3969       .incremental_tile = 8,
3970       .channel_tile = 8,
3971     };
3972     if (cpuinfo_has_x86_sse4_1()) {
3973       xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3974         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3975         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
3976         .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params,
3977         .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params,
3978         .row_tile = 7,
3979         .channel_tile = 8,
3980       };
3981     } else {
3982       xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3983         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3984         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
3985         .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
3986         .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params,
3987         .row_tile = 7,
3988         .channel_tile = 8,
3989       };
3990     }
3991 
    // QU8 element-wise addition: pick the widest vector ISA the CPU supports.
    // Because addition is commutative, the reversed operand-with-constant slot
    // (ropc) reuses the same kernel as the operand-with-constant slot (opc).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      // AVX512 F+BW+DQ+VL (Skylake-X feature set); gated off on mobile builds.
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
        .init.qu8_add = xnn_init_qu8_add_minmax_avx512_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_xop()) {
      // AMD XOP: mul32 kernels with the SSE4 parameter layout.
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
        .init.qu8_add = xnn_init_qu8_add_minmax_sse4_params,
        .element_tile = 8,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
        .init.qu8_add = xnn_init_qu8_add_minmax_avx2_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      // AVX mul32 kernel; shares the SSE4 parameter initializer.
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
        .init.qu8_add = xnn_init_qu8_add_minmax_sse4_params,
        .element_tile = 8,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      // SSE4.1 mul16 kernel; shares the SSE2 parameter initializer.
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
        .init.qu8_add = xnn_init_qu8_add_minmax_sse2_params,
        .element_tile = 8,
      };
    } else {
      // Baseline: SSE2 is assumed present on all supported x86 targets.
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
        .init.qu8_add = xnn_init_qu8_add_minmax_sse2_params,
        .element_tile = 8,
      };
    }
    // QU8 element-wise multiplication (fp32 requantization). All three
    // branches use the SSE2 parameter initializer; only the kernel (AVX /
    // SSE4.1 / SSE2) and the element tile differ. Multiplication is
    // commutative, so ropc reuses the opc kernel.
    if (cpuinfo_has_x86_avx()) {
      xnn_params.qu8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
        .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.qu8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
        .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
        .element_tile = 16,
      };
    } else {
      // SSE2 fallback processes 8 elements per iteration instead of 16.
      xnn_params.qu8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
        .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
        .element_tile = 8,
      };
    }
4066 
4067     if (cpuinfo_has_x86_avx2()) {
4068       xnn_params.qu8.lrelu = (struct vunary_parameters) {
4069         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__avx2_x32,
4070         .init.qu8_lrelu = xnn_init_qu8_lrelu_avx2_params,
4071         .element_tile = 32,
4072       };
4073     } else if (cpuinfo_has_x86_avx()) {
4074       xnn_params.qu8.lrelu = (struct vunary_parameters) {
4075         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__avx_x32,
4076         .init.qu8_lrelu = xnn_init_qu8_lrelu_avx_params,
4077         .element_tile = 32,
4078       };
4079     } else if (cpuinfo_has_x86_sse4_1()) {
4080       xnn_params.qu8.lrelu = (struct vunary_parameters) {
4081         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__sse41_x32,
4082         .init.qu8_lrelu = xnn_init_qu8_lrelu_sse2_params,
4083         .element_tile = 32,
4084       };
4085     } else if (cpuinfo_has_x86_sse4_1()) {
4086       xnn_params.qu8.lrelu = (struct vunary_parameters) {
4087         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__ssse3_x32,
4088         .init.qu8_lrelu = xnn_init_qu8_lrelu_sse2_params,
4089         .element_tile = 32,
4090       };
4091     } else {
4092       xnn_params.qu8.lrelu = (struct vunary_parameters) {
4093         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__sse2_x32,
4094         .init.qu8_lrelu = xnn_init_qu8_lrelu_sse2_params,
4095         .element_tile = 32,
4096       };
4097     }
4098   #endif  // XNN_NO_QU8_OPERATORS
4099 
4100   /**************************** U8 x86 micro-kernels ****************************/
  #ifndef XNN_NO_S8_OPERATORS
    init_flags |= XNN_INIT_FLAG_S8;

    // S8 clamp, indirect bilinear resize, and max-pooling: SSE4.1 kernels
    // when available, SSE2 fallbacks otherwise. Note the SSE2 ibilinear
    // fallback narrows the channel tile from 16 to 8.
    if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.s8.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__sse41_x64,
        .init.s8_minmax = xnn_init_s8_minmax_sse4_params,
        .element_tile = 64,
      };
      xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse41_c16,
        .pixel_tile = 1,
        .channel_tile = 16,
      };
      xnn_params.s8.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16,
        .init.s8 = xnn_init_s8_minmax_sse4_params,
        .mr = 9,  // primary pass covers 9 pooling elements
        .qr = 8,  // each additional pass covers 8 more
      };
    } else {
      xnn_params.s8.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__sse2_x64,
        .init.s8_minmax = xnn_init_s8_minmax_sse2_params,
        .element_tile = 64,
      };
      xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse2_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
      xnn_params.s8.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16,
        .init.s8 = xnn_init_s8_minmax_sse2_params,
        .mr = 9,
        .qr = 8,
      };
    }
  #endif  // XNN_NO_S8_OPERATORS
4140 
4141   /**************************** U8 x86 micro-kernels ****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    // U8 clamp and maxpool are SSE2-only; only ibilinear has an SSE4.1
    // specialization (wider channel tile).
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__sse2_x64,
      .init.u8_minmax = xnn_init_u8_minmax_sse2_params,
      .element_tile = 64,
    };
    if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse41_c16,
        .pixel_tile = 1,
        .channel_tile = 16,
      };
    } else {
      xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse2_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
    }
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
      .init.u8 = xnn_init_u8_minmax_sse2_params,
      .mr = 9,  // primary pass covers 9 pooling elements
      .qr = 8,  // each additional pass covers 8 more
    };
    // LUT normalization has no x86-vectorized variant; use the scalar kernel.
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
  #endif  // XNN_NO_U8_OPERATORS
4172 
4173   /**************************** X8 x86 micro-kernels ****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    // 8-bit table lookup: wider ISAs process more bytes per iteration.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.x8.lut = xnn_x8_lut_ukernel__avx512skx_vpshufb_x64;
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.x8.lut = xnn_x8_lut_ukernel__avx2_x128;
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.x8.lut = xnn_x8_lut_ukernel__avx_x64;
    } else {
      // Note: SSSE3 version is usually slower than scalar
      xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
    }
    // Channel interleaving (zip) is SSE2-only.
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
    };

    xnn_params.x8.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X8_OPERATORS
4199 
4200 
4201   /**************************** X16 x86 micro-kernels ****************************/
  #ifndef XNN_NO_X16_OPERATORS
    init_flags |= XNN_INIT_FLAG_X16;

    // 16-bit transpose: SSE2 8x8 register-reuse kernel.
    xnn_params.x16.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__8x8_reuse_multi_sse2,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X16_OPERATORS
4210 
4211   /**************************** F16 x86 micro-kernels ****************************/
  #ifndef XNN_NO_F16_OPERATORS
    // F16 support is gated on AVX2 (and excluded on mobile platforms). The
    // kernels below use F16C/FMA3 instructions — NOTE(review): this assumes
    // every AVX2 CPU also has F16C and FMA3; confirm against cpuinfo behavior.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
      init_flags |= XNN_INIT_FLAG_F16;

      // GEMM/IGEMM: 4x16 AVX2 broadcast kernels plus 1-row remainder variants.
      xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
      xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
      xnn_params.f16.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
      xnn_params.f16.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
      xnn_params.f16.gemm.init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.gemm.mr = 4;
      xnn_params.f16.gemm.nr = 16;

      // Depthwise convolutions, indexed by kernel-size slot:
      // [0]=3, [1]=4, [2]=9 (3x3), [3]=25 (5x5, narrower 8-channel tile).
      xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x3__fma3;
      xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.dwconv[0].channel_tile = 16;
      xnn_params.f16.dwconv[0].primary_tile = 3;

      xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__fma3;
      xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.dwconv[1].channel_tile = 16;
      xnn_params.f16.dwconv[1].primary_tile = 4;

      xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__fma3;
      xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.dwconv[2].channel_tile = 16;
      xnn_params.f16.dwconv[2].primary_tile = 9;

      xnn_params.f16.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2;
      xnn_params.f16.dwconv[3].init.f16 = xnn_init_f16_minmax_avx_params;
      xnn_params.f16.dwconv[3].channel_tile = 8;
      xnn_params.f16.dwconv[3].primary_tile = 25;

      // Pooling: 9-element primary pass, 8-element incremental passes.
      xnn_params.f16.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f16_avgpool_minmax_ukernel_9x__f16c_c8,
        .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f16_avgpool_minmax_ukernel_9p8x__f16c_c8,
        .init.f16 = xnn_init_f16_scaleminmax_avx_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 8,
      };
      xnn_params.f16.pavgpool = (struct pavgpool_parameters) {
        .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f16_pavgpool_minmax_ukernel_9x__avx2_c8,
        .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f16_pavgpool_minmax_ukernel_9p8x__avx2_c8,
        .init.f16 = xnn_init_f16_minmax_avx_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 8,
      };
      xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
        .init.f16 = xnn_init_f16_scaleminmax_avx_params,
        .update.f16 = xnn_update_f16_scaleminmax_avx_params,
        .row_tile = 7,
        .channel_tile = 8,
      };

      xnn_params.f16.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8,
        .init.f16 = xnn_init_f16_minmax_avx_params,
        .mr = 9,
        .qr = 8,
      };
      xnn_params.f16.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_f16_ibilinear_ukernel__fma3_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };

      xnn_params.f16.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__f16c_2x16,
        .row_tile = 2,
        .channel_tile = 16,
      };

      // Softmax building blocks.
      xnn_params.f16.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
        .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f16_raddstoreexpminusmax_ukernel__avx2_rr1_p2_x40,
        .init.f16 = xnn_init_f16_expminus_avx2_rr1_p2_params,
        .element_tile = 40,
      };
      xnn_params.f16.rmax = (xnn_rmax_ukernel_function) xnn_f16_rmax_ukernel__f16c;

      // Binary element-wise operators. For non-commutative ops (div, sub) the
      // reversed-operand slot gets a dedicated vr*c kernel; commutative ops
      // reuse the opc kernel.
      xnn_params.f16.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__f16c_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
        .init.f16_minmax = xnn_init_f16_minmax_avx_params,
        .element_tile = 16,
      };
      xnn_params.f16.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vdiv_minmax_ukernel__f16c_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vdivc_minmax_ukernel__f16c_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vrdivc_minmax_ukernel__f16c_x8,
        .init.f16_minmax = xnn_init_f16_minmax_avx_params,
        .element_tile = 8,
      };
      xnn_params.f16.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmax_ukernel__f16c_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmaxc_ukernel__f16c_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmaxc_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmin_ukernel__f16c_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vminc_ukernel__f16c_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vminc_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__f16c_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
        .init.f16_minmax = xnn_init_f16_minmax_avx_params,
        .element_tile = 16,
      };
      xnn_params.f16.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsub_minmax_ukernel__f16c_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsubc_minmax_ukernel__f16c_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vrsubc_minmax_ukernel__f16c_x16,
        .init.f16_minmax = xnn_init_f16_minmax_avx_params,
        .element_tile = 16,
      };
      xnn_params.f16.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiff_ukernel__f16c_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiffc_ukernel__f16c_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vsqrdiffc_ukernel__f16c_x16,
        .init.f16_minmax = xnn_init_f16_minmax_avx_params,
        .element_tile = 16,
      };

      xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
        .init.f16 = xnn_init_f16_minmax_avx_params,
        .channel_tile = 8,
        .row_tile = 2,
      };

      // Unary element-wise operators. abs/neg only flip the sign bit, so
      // plain SSE2 integer kernels suffice for them.
      xnn_params.f16.abs = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vabs_ukernel__sse2_x16,
        .init.f16_abs = xnn_init_f16_abs_sse_params,
        .element_tile = 16,
      };
      xnn_params.f16.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vclamp_ukernel__f16c_x16,
        .init.f16_minmax = xnn_init_f16_minmax_avx_params,
        .element_tile = 16,
      };
      xnn_params.f16.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_velu_ukernel__avx2_rr1_p3_x16,
        .init.f16_elu = xnn_init_f16_elu_avx2_rr1_p3_params,
        .element_tile = 16,
      };
      xnn_params.f16.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vhswish_ukernel__f16c_x16,
        .init.f16_hswish = xnn_init_f16_hswish_avx_params,
        .element_tile = 16,
      };
      xnn_params.f16.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vlrelu_ukernel__f16c_x16,
        .init.f16_lrelu = xnn_init_f16_lrelu_avx_params,
        .element_tile = 16,
      };
      xnn_params.f16.neg = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vneg_ukernel__sse2_x16,
        .init.f16_neg = xnn_init_f16_neg_sse_params,
        .element_tile = 16,
      };
      // Rounding variants (nearest-even, toward-zero, up, down) need no
      // parameter initializer.
      xnn_params.f16.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndne_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndz_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndu_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vrndd_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsigmoid_ukernel__avx2_rr1_p2_rcp_x32,
        .init.f16_sigmoid = xnn_init_f16_sigmoid_avx2_rr1_p2_params,
        .element_tile = 32,
      };
      xnn_params.f16.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsqr_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.f16.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_vsqrt_ukernel__f16c_sqrt_x8,
        .element_tile = 8,
      };
    }
  #endif  // XNN_NO_F16_OPERATORS
4410 
4411   /**************************** F32 x86 micro-kernels ****************************/
4412   #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    // F32 GEMM/IGEMM selection: AVX512F > FMA3 (uarch-tuned) > AVX > SSE.
    // Each branch installs a full-MR kernel plus a 1-row remainder kernel.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(7)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 7;
      xnn_params.f32.gemm.nr = 16;
    } else if (cpuinfo_has_x86_fma3()) {
      // NOTE(review): assumes cpuinfo_get_core(0) is non-NULL here — verify
      // that cpuinfo initialization is guaranteed before this point.
      switch (cpuinfo_get_core(0)->uarch) {
        case cpuinfo_uarch_zen:
        case cpuinfo_uarch_dhyana:
          // AMD Zen/Hygon Dhyana prefer the 4x16 shuffled (s4) variant.
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 16;
          xnn_params.f32.gemm.log2_sr = 2;  // s4 shuffle: stride of 4 elements
          break;
        default:
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
          xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
          xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
          xnn_params.f32.gemm.mr = 5;
          xnn_params.f32.gemm.nr = 16;
          break;
      }
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.gemm.mr = 5;
      xnn_params.f32.gemm.nr = 16;
    } else {
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_sse_params;
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 8;
    }
    // Secondary GEMM (narrow NR=2, channel-blocked KR=4): SSE-only.
    xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
    xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
    xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_sse_params;
    xnn_params.f32.gemm2.mr = 4;
    xnn_params.f32.gemm2.nr = 2;
    xnn_params.f32.gemm2.log2_kr = 2;
4469 
    // F32 depthwise convolutions, indexed by kernel-size slot:
    // [0]=3, [1]=4, [2]=9 (3x3), [3]=25 (5x5). ISA preference:
    // AVX512F > FMA3 > AVX > SSE; the FMA3/AVX 5x5 kernels narrow the
    // channel tile from 16 to 8.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx512f;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[0].channel_tile = 16;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[1].channel_tile = 16;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[2].channel_tile = 16;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[3].channel_tile = 16;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    } else if (cpuinfo_has_x86_fma3()) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__fma3;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[0].channel_tile = 16;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[1].channel_tile = 16;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[2].channel_tile = 16;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[3].channel_tile = 8;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[0].channel_tile = 16;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[1].channel_tile = 16;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[2].channel_tile = 16;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
      xnn_params.f32.dwconv[3].channel_tile = 8;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    } else {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__sse;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_sse_params;
      xnn_params.f32.dwconv[0].channel_tile = 8;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_sse_params;
      xnn_params.f32.dwconv[1].channel_tile = 8;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_sse_params;
      xnn_params.f32.dwconv[2].channel_tile = 8;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_sse_params;
      xnn_params.f32.dwconv[3].channel_tile = 8;
      xnn_params.f32.dwconv[3].primary_tile = 25;
    }
4551     xnn_params.f32.avgpool = (struct avgpool_parameters) {
4552       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
4553       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
4554       .init.f32 = xnn_init_f32_scaleminmax_sse_params,
4555       .primary_tile = 9,
4556       .incremental_tile = 8,
4557       .channel_tile = 4,
4558     };
4559     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
4560       .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
4561       .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
4562       .init.f32 = xnn_init_f32_minmax_sse_params,
4563       .primary_tile = 9,
4564       .incremental_tile = 8,
4565       .channel_tile = 4,
4566     };
4567     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
4568       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
4569       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
4570       .init.f32 = xnn_init_f32_scaleminmax_sse_params,
4571       .update.f32 = xnn_update_f32_scaleminmax_sse_params,
4572       .row_tile = 7,
4573       .channel_tile = 4,
4574     };
4575     xnn_params.f32.maxpool = (struct maxpool_parameters) {
4576       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
4577       .init.f32 = xnn_init_f32_minmax_sse_params,
4578       .mr = 9,
4579       .qr = 8,
4580     };
4581     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
4582       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
4583       .mr = 4,
4584     };
4585     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
4586       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
4587       .mr = 9,
4588     };
4589     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
4590       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
4591       .mr = 9,
4592       .qr = 8,
4593     };
4594     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
4595       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
4596       .pixel_tile = 1,
4597       .channel_tile = 8,
4598     };
4599     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4600       xnn_params.f32.abs = (struct vunary_parameters) {
4601         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16,
4602         .init.f32_abs = xnn_init_f32_abs_avx512_params,
4603         .element_tile = 16,
4604       };
4605     } else if (cpuinfo_has_x86_avx()) {
4606       xnn_params.f32.abs = (struct vunary_parameters) {
4607         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__avx_x16,
4608         .init.f32_abs = xnn_init_f32_abs_avx_params,
4609         .element_tile = 16,
4610       };
4611     } else {
4612       xnn_params.f32.abs = (struct vunary_parameters) {
4613         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__sse_x8,
4614         .init.f32_abs = xnn_init_f32_abs_sse_params,
4615         .element_tile = 8,
4616       };
4617     }
4618     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4619       xnn_params.f32.clamp = (struct vunary_parameters) {
4620         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__avx512f_x16,
4621         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4622         .element_tile = 16,
4623       };
4624     } else if (cpuinfo_has_x86_avx()) {
4625       xnn_params.f32.clamp = (struct vunary_parameters) {
4626         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__avx_x16,
4627         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4628         .element_tile = 16,
4629       };
4630     } else {
4631       xnn_params.f32.clamp = (struct vunary_parameters) {
4632         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__sse_x8,
4633         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
4634         .element_tile = 8,
4635       };
4636     }
4637     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4638       xnn_params.f32.elu = (struct vunary_parameters) {
4639         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64,
4640         .init.f32_elu = xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
4641         .element_tile = 64,
4642       };
4643     } else if (cpuinfo_has_x86_avx2()) {
4644       xnn_params.f32.elu = (struct vunary_parameters) {
4645         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56,
4646         .init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
4647         .element_tile = 56,
4648       };
4649     } else if (cpuinfo_has_x86_avx()) {
4650       xnn_params.f32.elu = (struct vunary_parameters) {
4651         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32,
4652         .init.f32_elu = xnn_init_f32_elu_avx_rr2_lut4_p4_params,
4653         .element_tile = 32,
4654       };
4655     } else {
4656       xnn_params.f32.elu = (struct vunary_parameters) {
4657         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12,
4658         .init.f32_elu = xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
4659         .element_tile = 12,
4660       };
4661     }
4662     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4663       xnn_params.f32.hswish = (struct vunary_parameters) {
4664         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__avx512f_x16,
4665         .init.f32_hswish = xnn_init_f32_hswish_avx512_params,
4666         .element_tile = 16,
4667       };
4668     } else if (cpuinfo_has_x86_fma3()) {
4669       xnn_params.f32.hswish = (struct vunary_parameters) {
4670         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__fma3_x16,
4671         .init.f32_hswish = xnn_init_f32_hswish_avx_params,
4672         .element_tile = 16,
4673       };
4674     } else if (cpuinfo_has_x86_avx()) {
4675       xnn_params.f32.hswish = (struct vunary_parameters) {
4676         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__avx_x16,
4677         .init.f32_hswish = xnn_init_f32_hswish_avx_params,
4678         .element_tile = 16,
4679       };
4680     } else {
4681       xnn_params.f32.hswish = (struct vunary_parameters) {
4682         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__sse_x8,
4683         .init.f32_hswish = xnn_init_f32_hswish_sse_params,
4684         .element_tile = 8,
4685       };
4686     }
4687     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4688       xnn_params.f32.lrelu = (struct vunary_parameters) {
4689         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16,
4690         .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
4691         .element_tile = 16,
4692       };
4693     } else if (cpuinfo_has_x86_avx()) {
4694       xnn_params.f32.lrelu = (struct vunary_parameters) {
4695         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16,
4696         .init.f32_lrelu = xnn_init_f32_lrelu_avx_params,
4697         .element_tile = 16,
4698       };
4699     } else if (cpuinfo_has_x86_sse4_1()) {
4700       xnn_params.f32.lrelu = (struct vunary_parameters) {
4701         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8,
4702         .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
4703         .element_tile = 8,
4704       };
4705     } else {
4706       xnn_params.f32.lrelu = (struct vunary_parameters) {
4707         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8,
4708         .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
4709         .element_tile = 8,
4710       };
4711     }
4712     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4713       xnn_params.f32.neg = (struct vunary_parameters) {
4714         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16,
4715         .init.f32_neg = xnn_init_f32_neg_avx512_params,
4716         .element_tile = 16,
4717       };
4718     } else if (cpuinfo_has_x86_avx()) {
4719       xnn_params.f32.neg = (struct vunary_parameters) {
4720         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__avx_x16,
4721         .init.f32_neg = xnn_init_f32_neg_avx_params,
4722         .element_tile = 16,
4723       };
4724     } else {
4725       xnn_params.f32.neg = (struct vunary_parameters) {
4726         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__sse_x8,
4727         .init.f32_neg = xnn_init_f32_neg_sse_params,
4728         .element_tile = 8,
4729       };
4730     }
4731     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4732       xnn_params.f32.rndne = (struct vunary_parameters) {
4733         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16,
4734         .element_tile = 16,
4735       };
4736       xnn_params.f32.rndz = (struct vunary_parameters) {
4737         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16,
4738         .element_tile = 16,
4739       };
4740       xnn_params.f32.rndu = (struct vunary_parameters) {
4741         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16,
4742         .element_tile = 16,
4743       };
4744       xnn_params.f32.rndd = (struct vunary_parameters) {
4745         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16,
4746         .element_tile = 16,
4747       };
4748     } else if (cpuinfo_has_x86_avx()) {
4749       xnn_params.f32.rndne = (struct vunary_parameters) {
4750         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16,
4751         .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4752         .element_tile = 16,
4753       };
4754       xnn_params.f32.rndz = (struct vunary_parameters) {
4755         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16,
4756         .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4757         .element_tile = 16,
4758       };
4759       xnn_params.f32.rndu = (struct vunary_parameters) {
4760         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16,
4761         .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4762         .element_tile = 16,
4763       };
4764       xnn_params.f32.rndd = (struct vunary_parameters) {
4765         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16,
4766         .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4767         .element_tile = 16,
4768       };
4769     } else if (cpuinfo_has_x86_sse4_1()) {
4770       xnn_params.f32.rndne = (struct vunary_parameters) {
4771         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8,
4772         .element_tile = 8,
4773       };
4774       xnn_params.f32.rndz = (struct vunary_parameters) {
4775         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8,
4776         .element_tile = 8,
4777       };
4778       xnn_params.f32.rndu = (struct vunary_parameters) {
4779         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8,
4780         .element_tile = 8,
4781       };
4782       xnn_params.f32.rndd = (struct vunary_parameters) {
4783         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8,
4784         .element_tile = 8,
4785       };
4786     } else {
4787       xnn_params.f32.rndne = (struct vunary_parameters) {
4788         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8,
4789         .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4790         .element_tile = 8,
4791       };
4792       xnn_params.f32.rndz = (struct vunary_parameters) {
4793         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8,
4794         .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4795         .element_tile = 8,
4796       };
4797       xnn_params.f32.rndu = (struct vunary_parameters) {
4798         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8,
4799         .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4800         .element_tile = 8,
4801       };
4802       xnn_params.f32.rndd = (struct vunary_parameters) {
4803         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8,
4804         .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4805         .element_tile = 8,
4806       };
4807     }
4808     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4809       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4810         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64,
4811         .init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params,
4812         .element_tile = 64,
4813       };
4814     } else if (cpuinfo_has_x86_avx2()) {
4815       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4816         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40,
4817         .init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params,
4818         .element_tile = 40,
4819       };
4820     } else if (cpuinfo_has_x86_avx()) {
4821       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4822         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40,
4823         .init.f32_sigmoid = xnn_init_f32_sigmoid_avx_rr2_p5_params,
4824         .element_tile = 40,
4825       };
4826     } else if (cpuinfo_has_x86_sse4_1()) {
4827       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4828         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8,
4829         .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4830         .element_tile = 8,
4831       };
4832     } else {
4833       xnn_params.f32.sigmoid = (struct vunary_parameters) {
4834         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_x8,
4835         .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4836         .element_tile = 8,
4837       };
4838     }
4839     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4840       xnn_params.f32.sqr = (struct vunary_parameters) {
4841         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16,
4842         .element_tile = 16,
4843       };
4844     } else if (cpuinfo_has_x86_avx()) {
4845       xnn_params.f32.sqr = (struct vunary_parameters) {
4846         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16,
4847         .init.f32_default = xnn_init_f32_default_avx_params,
4848         .element_tile = 16,
4849       };
4850     } else {
4851       xnn_params.f32.sqr = (struct vunary_parameters) {
4852         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8,
4853         .element_tile = 8,
4854       };
4855     }
4856     if (cpuinfo_has_x86_avx()) {
4857       xnn_params.f32.sqrt = (struct vunary_parameters) {
4858         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
4859         .init.f32_sqrt = xnn_init_f32_sqrt_avx_params,
4860         .element_tile = 8,
4861       };
4862     } else {
4863       xnn_params.f32.sqrt = (struct vunary_parameters) {
4864         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4,
4865         .element_tile = 4,
4866       };
4867     }
4868     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4869       xnn_params.f32.prelu = (struct prelu_parameters) {
4870         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
4871         .row_tile = 2,
4872         .channel_tile = 16,
4873       };
4874     } else if (cpuinfo_has_x86_avx()) {
4875       xnn_params.f32.prelu = (struct prelu_parameters) {
4876         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
4877         .row_tile = 2,
4878         .channel_tile = 16,
4879       };
4880     } else if (cpuinfo_has_x86_sse4_1()) {
4881       xnn_params.f32.prelu = (struct prelu_parameters) {
4882         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
4883         .row_tile = 2,
4884         .channel_tile = 8,
4885       };
4886     } else {
4887       xnn_params.f32.prelu = (struct prelu_parameters) {
4888         .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
4889         .row_tile = 2,
4890         .channel_tile = 8,
4891       };
4892     }
4893     xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
4894       .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2,
4895       .init.f32 = xnn_init_f32_expminus_sse2_rr2_p5_params,
4896       .element_tile = 20,
4897     };
4898     xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__sse;
4899     if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4900       xnn_params.f32.vadd = (struct vbinary_parameters) {
4901         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
4902         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
4903         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
4904         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4905         .element_tile = 32,
4906       };
4907       xnn_params.f32.vdiv = (struct vbinary_parameters) {
4908         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
4909         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
4910         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
4911         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4912         .element_tile = 32,
4913       };
4914       xnn_params.f32.vmax = (struct vbinary_parameters) {
4915         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
4916         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
4917         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
4918         .element_tile = 32,
4919       };
4920       xnn_params.f32.vmin = (struct vbinary_parameters) {
4921         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
4922         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
4923         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
4924         .element_tile = 32,
4925       };
4926       xnn_params.f32.vmul = (struct vbinary_parameters) {
4927         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
4928         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
4929         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
4930         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4931         .element_tile = 32,
4932       };
4933       xnn_params.f32.vsub = (struct vbinary_parameters) {
4934         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
4935         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
4936         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
4937         .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4938         .element_tile = 32,
4939       };
4940       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
4941         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
4942         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
4943         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
4944         .element_tile = 32,
4945       };
4946     } else if (cpuinfo_has_x86_avx()) {
4947       xnn_params.f32.vadd = (struct vbinary_parameters) {
4948         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
4949         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
4950         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
4951         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4952         .element_tile = 16,
4953       };
4954       xnn_params.f32.vdiv = (struct vbinary_parameters) {
4955         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
4956         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
4957         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
4958         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4959         .element_tile = 16,
4960       };
4961       xnn_params.f32.vmax = (struct vbinary_parameters) {
4962         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
4963         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
4964         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
4965         .init.f32_default = xnn_init_f32_default_avx_params,
4966         .element_tile = 16,
4967       };
4968       xnn_params.f32.vmin = (struct vbinary_parameters) {
4969         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
4970         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
4971         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
4972         .init.f32_default = xnn_init_f32_default_avx_params,
4973         .element_tile = 16,
4974       };
4975       xnn_params.f32.vmul = (struct vbinary_parameters) {
4976         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
4977         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
4978         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
4979         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4980         .element_tile = 16,
4981       };
4982       xnn_params.f32.vsub = (struct vbinary_parameters) {
4983         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
4984         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
4985         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
4986         .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4987         .element_tile = 16,
4988       };
4989       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
4990         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
4991         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
4992         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
4993         .init.f32_default = xnn_init_f32_default_avx_params,
4994         .element_tile = 16,
4995       };
4996     } else {
4997       xnn_params.f32.vadd = (struct vbinary_parameters) {
4998         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
4999         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
5000         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
5001         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
5002         .element_tile = 8,
5003       };
5004       xnn_params.f32.vdiv = (struct vbinary_parameters) {
5005         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
5006         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
5007         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
5008         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
5009         .element_tile = 8,
5010       };
5011       xnn_params.f32.vmax = (struct vbinary_parameters) {
5012         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
5013         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
5014         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
5015         .element_tile = 8,
5016       };
5017       xnn_params.f32.vmin = (struct vbinary_parameters) {
5018         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
5019         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
5020         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
5021         .element_tile = 8,
5022       };
5023       xnn_params.f32.vmul = (struct vbinary_parameters) {
5024         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
5025         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
5026         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
5027         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
5028         .element_tile = 8,
5029       };
5030       xnn_params.f32.vsub = (struct vbinary_parameters) {
5031         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
5032         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
5033         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
5034         .init.f32_minmax = xnn_init_f32_minmax_sse_params,
5035         .element_tile = 8,
5036       };
5037       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
5038         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
5039         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
5040         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
5041         .element_tile = 8,
5042       };
5043     }
5044     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5045       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
5046       .init.f32 = xnn_init_f32_minmax_sse_params,
5047       .channel_tile = 4,
5048       .row_tile = 2,
5049     };
5050     #ifndef XNN_NO_NCHW_OPERATORS
5051       // Sparse microkernels on x86 currently target only SSE, and on processors
5052       // with AVX ISA dense inference is expected to be faster than sparse.
5053       if (!cpuinfo_has_x86_avx()) {
5054         init_flags |= XNN_INIT_FLAG_CHW_OPT;
5055       }
5056 
5057       xnn_params.f32.spmm = (struct spmm_parameters) {
5058         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
5059         .mr = 32,
5060         .nr = 1,
5061       };
5062       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
5063         .ukernel_with_symm_padding =
5064           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
5065         .output_channel_tile = 4,
5066         .output_height_tile = 2,
5067         .output_width_tile = 2,
5068       };
5069       if (cpuinfo_has_x86_ssse3()) {
5070         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5071           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
5072           .output_width_tile = 4,
5073           .output_height_tile = 2,
5074         };
5075       } else {
5076         xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5077           .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
5078           .output_width_tile = 4,
5079           .output_height_tile = 2,
5080         };
5081       }
5082       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
5083         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
5084         .output_width_tile = 4,
5085         .output_height_tile = 1,
5086       };
5087       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5088         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
5089         .output_width_tile = 4,
5090         .output_height_tile = 4,
5091       };
5092       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5093         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
5094         .output_width_tile = 4,
5095         .output_height_tile = 2,
5096       };
5097       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5098         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
5099         .channel_tile = 4,
5100       };
5101       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5102         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__sse_p8,
5103         .channel_tile = 1,
5104         .pixel_tile = 8,
5105       };
5106     #endif  // XNN_NO_NCHW_OPERATORS
5107   #endif  // XNN_NO_F32_OPERATORS
5108 
  /*************************** VCVT x86 micro-kernels ***************************/
  #ifndef XNN_NO_VCVT_OPERATORS
    init_flags |= XNN_INIT_FLAG_VCVT;

    // F16 <-> F32 conversion kernels: each if/else ladder below selects the
    // widest ISA level the running CPU supports, falling back to the x86-64
    // SSE2 baseline. Entries without an .init member need no runtime
    // parameter-struct setup.
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx512skx_x16,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_f16c()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__f16c_x16,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx_x24,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 24,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse41_x8,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 8,
      };
    } else {
      // SSE2 baseline: always available on x86-64.
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 16,
      };
    }
    // F32 -> QS8 (signed 8-bit) quantization kernels.
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx512skx_x128,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params,
        .element_tile = 128,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx2_x64,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params,
        .element_tile = 64,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse2_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // F32 -> QU8 (unsigned 8-bit) quantization kernels. Note: no SSE4.1 tier
    // exists for this conversion, so pre-AVX CPUs use the SSE2 kernel.
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx512skx_x128,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params,
        .element_tile = 128,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx2_x64,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params,
        .element_tile = 64,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx_x32,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // QS8/QU8 re-quantization (8-bit -> 8-bit with new scale/zero-point).
    if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__avx2_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_avx2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__avx2_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_avx2_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_avx()) {
      // The AVX and SSE4.1 kernels reuse the SSSE3 parameter initializers.
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__avx_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_ssse3_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__avx_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_ssse3_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__sse41_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_ssse3_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__sse41_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_ssse3_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_ssse3()) {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__ssse3_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_ssse3_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__ssse3_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_ssse3_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__sse2_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_sse2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__sse2_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // QS8/QU8 -> F32 dequantization kernels.
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx512skx_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx512skx_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse4_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse41_x16,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse4_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse2_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse2_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse2_params,
        .element_tile = 32,
      };
    }
  #endif  // XNN_NO_VCVT_OPERATORS
5335 
  /**************************** X32 x86 micro-kernels ****************************/
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;

    // Type-agnostic kernels for 32-bit elements: unpooling, channel
    // interleaving (zip), and transposition. All use the SSE2 baseline.
    xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
      .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
      .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
      .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
    };

    xnn_params.x32.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__4x4_sse,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X32_OPERATORS
5353 
  /**************************** XX x86 micro-kernels ****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    // Element-type-agnostic (raw byte) kernels: copy, fill, pad, and a
    // variable-element-size transpose that falls back to memcpy.
    xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
    xnn_params.xx.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
      .row_tile = 1,
    };
    xnn_params.xx.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__sse2,
      .row_tile = 1,
    };
    xnn_params.xx.transpose = (struct transpose_parameters) {
      .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
      .tile_size = 32,
    };
  #endif  // XNN_NO_XX_OPERATORS
5372 
5373 #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
5374 
5375   /**************************** QC8 WAsm SIMD micro-kernels****************************/
5376   #ifndef XNN_NO_QS8_OPERATORS
5377     init_flags |= XNN_INIT_FLAG_QC8;
5378 
5379     xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5380     xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
5381     xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5382     xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
5383     xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
5384     xnn_params.qc8.gemm.mr = 4;
5385     xnn_params.qc8.gemm.nr = 4;
5386     xnn_params.qc8.gemm.log2_kr = 1;
5387     xnn_params.qc8.gemm.log2_sr = 2;
5388 
5389     xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__wasmsimd_mul16_add16;
5390     xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
5391     xnn_params.qc8.dwconv[0].channel_tile = 16;
5392     xnn_params.qc8.dwconv[0].primary_tile = 3;
5393     xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
5394     xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
5395     xnn_params.qc8.dwconv[1].channel_tile = 16;
5396     xnn_params.qc8.dwconv[1].primary_tile = 9;
5397     xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
5398     xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_wasmsimd_params;
5399     xnn_params.qc8.dwconv[2].channel_tile = 16;
5400     xnn_params.qc8.dwconv[2].primary_tile = 25;
5401   #endif  // XNN_NO_QC8_OPERATORS
5402 
  /**************************** QS8 WAsm SIMD micro-kernels ****************************/
  #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    // GEMM/IGEMM: 4x4 tile (plus a 1-row variant), c2s4 packing per the
    // kernel names, matching log2_kr = 1 and log2_sr = 2.
    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qs8.gemm.mr = 4;
    xnn_params.qs8.gemm.nr = 4;
    xnn_params.qs8.gemm.log2_kr = 1;
    xnn_params.qs8.gemm.log2_sr = 2;

    // Unipass depthwise convolutions for 9- and 25-tap filters, 16 channels
    // per pass.
    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
    xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qs8.dwconv[0].channel_tile = 16;
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
    xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qs8.dwconv[1].channel_tile = 16;
    xnn_params.qs8.dwconv[1].primary_tile = 25;

    // Global average pooling: unipass for up to 7 rows, multipass (7+7xN)
    // beyond that.
    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
      .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params,
      .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params,
      .row_tile = 7,
      .channel_tile = 16,
    };

    // Elementwise binary ops; the "c" variants take one scalar operand, and
    // addition is commutative so ropc reuses the opc kernel.
    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
      .init.qs8_add = xnn_init_qs8_add_minmax_wasmsimd_params,
      .element_tile = 32,
    };
    xnn_params.qs8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_wasmsimd_params,
      .element_tile = 8,
    };

    // Leaky ReLU: pick the x86- or ARM-tuned variant of the (relaxed) SIMD
    // kernel depending on the host flavor detected at startup.
    #if XNN_ARCH_WASMRELAXEDSIMD
      if (is_wasm_x86) {
        xnn_params.qs8.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32,
          .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_x86_params,
          .element_tile = 32,
        };
      } else {
        xnn_params.qs8.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32,
          .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_arm_params,
          .element_tile = 32,
        };
      }
    #else
      if (is_wasm_x86) {
        xnn_params.qs8.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__wasmsimd_x86_x16,
          .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_x86_params,
          .element_tile = 16,
        };
      } else {
        xnn_params.qs8.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__wasmsimd_arm_x32,
          .init.qs8_lrelu = xnn_init_qs8_lrelu_wasmsimd_arm_params,
          .element_tile = 32,
        };
      }
    #endif
  #endif  // XNN_NO_QS8_OPERATORS
5480 
  /**************************** QU8 WAsm SIMD micro-kernels ****************************/
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    // GEMM/IGEMM: 4x4 tile (plus a 1-row variant), c2s4 packing per the
    // kernel names, matching log2_kr = 1 and log2_sr = 2.
    xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
    xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qu8.gemm.mr = 4;
    xnn_params.qu8.gemm.nr = 4;
    xnn_params.qu8.gemm.log2_kr = 1;
    xnn_params.qu8.gemm.log2_sr = 2;

    // Unipass depthwise convolutions for 9- and 25-tap filters, 8 channels
    // per pass.
    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16;
    xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qu8.dwconv[0].channel_tile = 8;
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16;
    xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qu8.dwconv[1].channel_tile = 8;
    xnn_params.qu8.dwconv[1].primary_tile = 25;

    // Local average pooling uses scalar kernels (no WAsm SIMD variant here).
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    // Global average pooling: unipass for up to 7 rows, multipass beyond.
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params,
      .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params,
      .row_tile = 7,
      .channel_tile = 16,
    };

    // Elementwise binary ops; addition is commutative so ropc reuses the opc
    // (scalar-operand) kernel.
    xnn_params.qu8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
      .init.qu8_add = xnn_init_qu8_add_minmax_wasmsimd_params,
      .element_tile = 32,
    };
    xnn_params.qu8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_wasmsimd_params,
      .element_tile = 8,
    };

    // Leaky ReLU: pick the x86- or ARM-tuned variant of the (relaxed) SIMD
    // kernel depending on the host flavor detected at startup.
    #if XNN_ARCH_WASMRELAXEDSIMD
      if (is_wasm_x86) {
        xnn_params.qu8.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_x86_x32,
          .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_x86_params,
          .element_tile = 32,
        };
      } else {
        xnn_params.qu8.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__wasmrelaxedsimd_arm_x32,
          .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_arm_params,
          .element_tile = 32,
        };
      }
    #else
      if (is_wasm_x86) {
        xnn_params.qu8.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__wasmsimd_x86_x16,
          .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_x86_params,
          .element_tile = 16,
        };
      } else {
        xnn_params.qu8.lrelu = (struct vunary_parameters) {
          .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__wasmsimd_arm_x32,
          .init.qu8_lrelu = xnn_init_qu8_lrelu_wasmsimd_arm_params,
          .element_tile = 32,
        };
      }
    #endif
  #endif  // XNN_NO_QU8_OPERATORS
5566 
  /**************************** S8 WAsm SIMD micro-kernels ****************************/
  #ifndef XNN_NO_S8_OPERATORS
    init_flags |= XNN_INIT_FLAG_S8;

    // Signed 8-bit elementwise clamp, indirect bilinear resize, and max
    // pooling (mr = 9 primary taps, qr = 8 incremental taps).
    xnn_params.s8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__wasmsimd_x64,
      .init.s8_minmax = xnn_init_s8_minmax_wasmsimd_params,
      .element_tile = 64,
    };
    xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    xnn_params.s8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
      .init.s8 = xnn_init_s8_minmax_wasmsimd_params,
      .mr = 9,
      .qr = 8,
    };
  #endif  // XNN_NO_S8_OPERATORS
5588 
  /**************************** U8 WAsm SIMD micro-kernels ****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    // Unsigned 8-bit elementwise clamp, indirect bilinear resize, and max
    // pooling (mr = 9 primary taps, qr = 8 incremental taps).
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__wasmsimd_x64,
      .init.u8_minmax = xnn_init_u8_minmax_wasmsimd_params,
      .element_tile = 64,
    };
    xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
      .init.u8 = xnn_init_u8_minmax_wasmsimd_params,
      .mr = 9,
      .qr = 8,
    };
    // LUT normalization and row-max reduction have scalar-only kernels here.
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
  #endif  // XNN_NO_U8_OPERATORS
5612 
  /**************************** X8 WAsm SIMD micro-kernels ****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    // Type-agnostic byte kernels: table lookup, channel interleaving (zip),
    // and transposition — all scalar implementations on this target.
    xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
    };

    xnn_params.x8.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X8_OPERATORS
5630 
  /**************************** X16 WAsm SIMD micro-kernels ****************************/
  #ifndef XNN_NO_X16_OPERATORS
    init_flags |= XNN_INIT_FLAG_X16;

    // 16-bit type-agnostic transpose (scalar implementation on this target).
    xnn_params.x16.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X16_OPERATORS
5640 
5641   /**************************** F32 WAsm SIMD micro-kernels****************************/
5642   #ifndef XNN_NO_F32_OPERATORS
5643     init_flags |= XNN_INIT_FLAG_F32;
5644 
5645     if (is_wasm_x86) {
5646       #if XNN_ARCH_WASMRELAXEDSIMD
5647         xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
        // f32 GEMM/IGEMM micro-kernel table, mr-up-to-4 variant. This continues a
        // branch whose opening `if` and `#if XNN_ARCH_WASMRELAXEDSIMD` are above this
        // excerpt (presumably the is_wasm_x86 path -- NOTE(review): confirm against
        // the full file). Relaxed-SIMD builds get FMA "loadsplat" kernels; three
        // activation flavors are registered: minmax (clamped), relu, and linear.
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_loadsplat);
      #else
        // Plain WAsm SIMD fallback: minmax kernels are the x86-tuned "splat"
        // variants; relu/linear share generic wasmsimd "splat" kernels.
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
      #endif
      // GEMM tile geometry for this branch: 4 rows x 8 columns.
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 8;

      // Secondary GEMM (gemm2) for narrow outputs: 4x2 tile with kr=4
      // (log2_kr = 2), i.e. the "4x2c4" kernels below.
      #if XNN_ARCH_WASMRELAXEDSIMD
        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
      #else
        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
        xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
        xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
      #endif
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;
      xnn_params.f32.gemm2.log2_kr = 2;
    } else {
      // Other-branch GEMM config: taller 5x8 tile ("splat" kernels). On plain
      // WAsm SIMD the minmax kernels are the ARM-tuned variants.
      #if XNN_ARCH_WASMRELAXEDSIMD
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmrelaxedsimd_fma_splat);
      #else
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
        xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
        xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(5)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
        xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
      #endif
      // GEMM tile geometry for this branch: 5 rows x 8 columns.
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.gemm.mr = 5;
      xnn_params.f32.gemm.nr = 8;

      // gemm2: same 4x2c4 geometry as the other branch; only the minmax
      // kernels differ (ARM-tuned here vs. x86-tuned above).
      #if XNN_ARCH_WASMRELAXEDSIMD
        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
        xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmrelaxedsimd_fma);
      #else
        xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
        xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
        xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
        xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
      #endif
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;
      xnn_params.f32.gemm2.log2_kr = 2;
    }
5740 
5741     #if XNN_ARCH_WASMRELAXEDSIMD
5742       xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmrelaxedsimd_fma;
5743       xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmrelaxedsimd_fma;
5744       xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5745       xnn_params.f32.dwconv[0].channel_tile = 8;
5746       xnn_params.f32.dwconv[0].primary_tile = 3;
5747 
5748       xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmrelaxedsimd_fma;
5749       xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmrelaxedsimd_fma;
5750       xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5751       xnn_params.f32.dwconv[1].channel_tile = 8;
5752       xnn_params.f32.dwconv[1].primary_tile = 4;
5753 
5754       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmrelaxedsimd_fma;
5755       xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmrelaxedsimd_fma;
5756       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5757       xnn_params.f32.dwconv[2].channel_tile = 8;
5758       xnn_params.f32.dwconv[2].primary_tile = 9;
5759     #else
5760       if (is_wasm_x86) {
5761         xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmsimd_x86;
5762         xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmsimd;
5763         xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5764         xnn_params.f32.dwconv[0].channel_tile = 8;
5765         xnn_params.f32.dwconv[0].primary_tile = 3;
5766 
5767         xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
5768         xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
5769         xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5770         xnn_params.f32.dwconv[1].channel_tile = 8;
5771         xnn_params.f32.dwconv[1].primary_tile = 4;
5772 
5773         xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
5774         xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
5775         xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5776         xnn_params.f32.dwconv[2].channel_tile = 8;
5777         xnn_params.f32.dwconv[2].primary_tile = 9;
5778       } else {
5779         xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x3__wasmsimd_arm;
5780         xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x3__wasmsimd;
5781         xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5782         xnn_params.f32.dwconv[0].channel_tile = 4;
5783         xnn_params.f32.dwconv[0].primary_tile = 3;
5784 
5785         xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
5786         xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
5787         xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5788         xnn_params.f32.dwconv[1].channel_tile = 4;
5789         xnn_params.f32.dwconv[1].primary_tile = 4;
5790 
5791         xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
5792         xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
5793         xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5794         xnn_params.f32.dwconv[2].channel_tile = 4;
5795         xnn_params.f32.dwconv[2].primary_tile = 9;
5796       }
5797     #endif
5798 
5799     #if XNN_ARCH_WASMRELAXEDSIMD
5800       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__wasmrelaxedsimd_fma;
5801       xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x25__wasmrelaxedsimd_fma;
5802       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5803       xnn_params.f32.dwconv[3].channel_tile = 8;
5804       xnn_params.f32.dwconv[3].primary_tile = 25;
5805     #else
5806       xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
5807       xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
5808       xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
5809       xnn_params.f32.dwconv[3].channel_tile = 4;
5810       xnn_params.f32.dwconv[3].primary_tile = 25;
5811     #endif
5812 
5813     if (is_wasm_x86) {
5814       xnn_params.f32.avgpool = (struct avgpool_parameters) {
5815         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
5816         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
5817         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5818         .primary_tile = 9,
5819         .incremental_tile = 8,
5820         .channel_tile = 4,
5821       };
5822       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
5823         .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
5824         .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
5825         .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5826         .primary_tile = 9,
5827         .incremental_tile = 8,
5828         .channel_tile = 4,
5829       };
5830       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
5831         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
5832         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
5833         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5834         .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5835         .row_tile = 7,
5836         .channel_tile = 4,
5837       };
5838     } else {
5839       xnn_params.f32.avgpool = (struct avgpool_parameters) {
5840         .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
5841         .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
5842         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5843         .primary_tile = 9,
5844         .incremental_tile = 8,
5845         .channel_tile = 4,
5846       };
5847       xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
5848         .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
5849         .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
5850         .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5851         .primary_tile = 9,
5852         .incremental_tile = 8,
5853         .channel_tile = 4,
5854       };
5855       xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
5856         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
5857         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
5858         .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5859         .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5860         .row_tile = 7,
5861         .channel_tile = 4,
5862       };
5863     }
5864     if (is_wasm_x86) {
5865       xnn_params.f32.maxpool = (struct maxpool_parameters) {
5866         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
5867         .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5868         .mr = 9,
5869         .qr = 8,
5870       };
5871     } else {
5872       xnn_params.f32.maxpool = (struct maxpool_parameters) {
5873         .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
5874         .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5875         .mr = 9,
5876         .qr = 8,
5877       };
5878     }
    // Argmax pooling: slots [0] and [1] are unipass kernels for pooling windows
    // of up to 4 and up to 9 elements; slot [2] is the multipass kernel
    // (9-element primary + 8-element incremental tile) for larger windows.
    // No x86/ARM split -- the generic wasmsimd kernels serve both.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
      .mr = 9,
      .qr = 8,
    };
    // Indirect bilinear interpolation (used by resize/transposed-conv paths):
    // one pixel at a time, 8 channels per iteration.
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // f32 element-wise unary operator table. Kernels whose performance depends
    // on min/max-style select instructions get an x86/ARM split; the rest use
    // the generic wasmsimd kernel. element_tile is the per-iteration element
    // count hint.
    // NOTE(review): the abs/neg/sqr entries pair an `_x8` kernel with
    // element_tile = 16, unlike every other entry where the suffix and tile
    // match -- confirm whether the tile or the kernel variant is intended.
    xnn_params.f32.abs = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8,
      .init.f32_abs = xnn_init_f32_abs_wasmsimd_params,
      .element_tile = 16,
    };
    if (is_wasm_x86) {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_x86_x8,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_arm_x8,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 8,
      };
    }
    // ELU: rr2_p6 = round-to-nearest with 2-term range reduction + degree-6
    // polynomial approximation, 20 elements per iteration.
    if (is_wasm_x86) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20,
        .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
        .element_tile = 20,
      };
    } else {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20,
        .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
        .element_tile = 20,
      };
    }
    xnn_params.f32.hswish = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__wasmsimd_x16,
      .init.f32_hswish = xnn_init_f32_hswish_wasmsimd_params,
      .element_tile = 16,
    };
    // Leaky ReLU: the x86 path favors the min/max-based formulation, the ARM
    // path the bitselect-based one.
    if (is_wasm_x86) {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8,
        .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8,
        .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
        .element_tile = 8,
      };
    }
    xnn_params.f32.neg = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8,
      .init.f32_neg = xnn_init_f32_neg_wasmsimd_params,
      .element_tile = 16,
    };
    xnn_params.f32.relu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrelu_ukernel__wasmsimd_x16,
      .element_tile = 16,
    };
    // Rounding family: nearest-even / toward-zero / up (ceil) / down (floor).
    // No init params -- these kernels are parameterless.
    xnn_params.f32.rndne = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_x8,
      .element_tile = 8,
    };
    xnn_params.f32.rndz = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_x8,
      .element_tile = 8,
    };
    xnn_params.f32.rndu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_x8,
      .element_tile = 8,
    };
    xnn_params.f32.rndd = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_x8,
      .element_tile = 8,
    };
    // Sigmoid: rr2_p5_div = 2-term range reduction, degree-5 polynomial,
    // division-based reconstruction.
    xnn_params.f32.sigmoid = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16,
      .init.f32_sigmoid = xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params,
      .element_tile = 16,
    };
    xnn_params.f32.sqr = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8,
      .element_tile = 16,
    };
    xnn_params.f32.sqrt = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8,
      .element_tile = 8,
    };
    // PReLU: 2 rows x 8 channels per iteration; same minmax-vs-bitselect
    // split as leaky ReLU above.
    if (is_wasm_x86) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    }
    // Softmax building block: fused reduce-add + store of exp(x - max), using
    // 2-term range reduction, degree-5 polynomial, 16 elements / 2 accumulators.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2,
      .init.f32 = xnn_init_f32_expminus_wasmsimd_rr2_p5_params,
      .element_tile = 16,
    };
    // f32 element-wise binary operator table plus rmax reduction. Both branches
    // are structurally identical; only the minmax kernels differ (x86- vs.
    // ARM-tuned), while the unclamped `linear` kernels are the shared generic
    // wasmsimd variants. For each op: op = tensor(x)tensor, opc = tensor(x)const,
    // ropc = reversed operand order with const (relevant for non-commutative
    // div/sub, where vrdivc/vrsubc are used). vmax/vmin register only minmax
    // kernels and no init params -- max/min need no clamping parameters.
    if (is_wasm_x86) {
      xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__wasmsimd_x86;
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__wasmsimd_arm;
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
        .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
        .element_tile = 16,
      };
    }
    // Squared difference (a-b)^2: commutative, so vsqrdiffc serves both
    // constant-operand directions; the result needs no extra clamping, so no
    // params-init function is registered.
    xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
      .element_tile = 16,
    };
    // Clamped per-channel multiply-add (scale * x + bias).  The _x86/_arm
    // suffix selects the WAsm SIMD kernel variant tuned for the detected
    // runtime flavor (is_wasm_x86), not the host ISA; both variants process
    // 4 channels x 2 rows per iteration.
    if (is_wasm_x86) {
      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
        .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
        .channel_tile = 4,
        .row_tile = 2,
      };
    } else {
      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
        .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
        .channel_tile = 4,
        .row_tile = 2,
      };
    }
    // NCHW (channels-first) fast-path micro-kernels for F32.
    #ifndef XNN_NO_NCHW_OPERATORS
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse (matrix) x dense (matrix) multiplication, 32x1 tile; the
      // x86- or arm-tuned WAsm SIMD variant is selected by runtime flavor.
      if (is_wasm_x86) {
        xnn_params.f32.spmm = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
          .mr = 32,
          .nr = 1,
        };
      } else {
        xnn_params.f32.spmm = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
          .mr = 32,
          .nr = 1,
        };
      }
      // Direct 3x3 stride-2 convolution converting HWC input to CHW output
      // (symmetric padding only): 4 output channels x 2x2 output pixels per
      // iteration.
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // CHW depthwise convolutions (3x3, 3x3 stride 2, 5x5, 5x5 stride 2),
      // selected per runtime flavor.  The loadsplat/splat and accN suffixes
      // name internal kernel strategies (weight loading / number of partial
      // accumulators); output tiles differ per variant as recorded below.
      if (is_wasm_x86) {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
        xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
        xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
          .output_width_tile = 4,
          .output_height_tile = 3,
        };
        xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
      } else {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
        xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
        xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
          .output_width_tile = 4,
          .output_height_tile = 3,
        };
        xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
      }
      // Global average pooling over spatial dims, channels-first layout,
      // 4 channels per tile.
      if (is_wasm_x86) {
        xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
          .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
          .channel_tile = 4,
        };
      } else {
        xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
          .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
          .channel_tile = 4,
        };
      }
      // Indirect bilinear interpolation (resize) on CHW layout: one channel
      // at a time, 8 pixels per tile.
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
        .channel_tile = 1,
        .pixel_tile = 8,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
6215   #endif  // XNN_NO_F32_OPERATORS
6216 
6217   /*************************** VCVT WAsm SIMD micro-kernels***************************/
  #ifndef XNN_NO_VCVT_OPERATORS
    init_flags |= XNN_INIT_FLAG_VCVT;

    // Half precision -> single precision, int16-arithmetic variant,
    // 16 elements per tile.
    xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
      .init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params,
      .element_tile = 16,
    };
    // Single precision -> half precision, 24 elements per tile.
    xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24,
      .init.f32_f16_cvt = xnn_init_f32_f16_cvt_wasmsimd_params,
      .element_tile = 24,
    };
    // Float -> quantized int8/uint8 ("magic"-constant rounding variant, per
    // the kernel name), 32 elements per tile.
    xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
      .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_wasmsimd_magic_params,
      .element_tile = 32,
    };
    xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
      .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_wasmsimd_magic_params,
      .element_tile = 32,
    };
    // qs8 -> qs8 requantization: prefer the Relaxed SIMD kernel when the
    // build targets it (init params are shared with the plain SIMD kernel).
    #if XNN_ARCH_WASMRELAXEDSIMD
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__wasmrelaxedsimd_x32,
        .init.qs8_cvt = xnn_init_qs8_cvt_wasmsimd_params,
        .element_tile = 32,
      };
    #else
      xnn_params.vcvt.qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__wasmsimd_x16,
        .init.qs8_cvt = xnn_init_qs8_cvt_wasmsimd_params,
        .element_tile = 16,
      };
    #endif
    // Quantized int8 -> float (dequantization), 32 elements per tile.
    xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__wasmsimd_x32,
      .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_wasmsimd_params,
      .element_tile = 32,
    };
    // qu8 -> qu8 requantization: same Relaxed-SIMD preference as qs8 above.
    #if XNN_ARCH_WASMRELAXEDSIMD
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__wasmrelaxedsimd_x32,
        .init.qu8_cvt = xnn_init_qu8_cvt_wasmsimd_params,
        .element_tile = 32,
      };
    #else
      xnn_params.vcvt.qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__wasmsimd_x16,
        .init.qu8_cvt = xnn_init_qu8_cvt_wasmsimd_params,
        .element_tile = 16,
      };
    #endif
    // Quantized uint8 -> float (dequantization), 32 elements per tile.
    xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32,
      .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_wasmsimd_params,
      .element_tile = 32,
    };
  #endif  // XNN_NO_VCVT_OPERATORS
6278 
6279   /**************************** X32 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;

    // Type-agnostic 32-bit element kernels: max-unpooling scatter, and
    // channel interleaving (zip) for 2/3/4 or a variable number of streams.
    xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
      .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
      .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
      .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
    };

    // 32-bit transposition: scalar 2x4 micro-kernel (no SIMD variant here),
    // blocked at tile_size 32.
    xnn_params.x32.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X32_OPERATORS
6296 
6297   /**************************** XX WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    // Element-size-agnostic (byte-oriented) kernels: raw copy via memcpy,
    // constant fill (64 bytes per tile), padding, and a memcpy-based
    // variable-size transpose fallback.
    xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
    xnn_params.xx.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
      .row_tile = 1,
    };
    xnn_params.xx.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__wasmsimd,
      .row_tile = 1,
    };
    xnn_params.xx.transpose = (struct transpose_parameters) {
      .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
      .tile_size = 32,
    };
  #endif
6315 
6316 #elif XNN_ARCH_WASM
6317 
6318   /**************************** QC8 WAsm micro-kernels****************************/
  #ifndef XNN_NO_QC8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QC8;

    // GEMM/IGEMM for per-channel-quantized int8 (scalar WAsm build).
    // x86-flavored runtimes get 2x2 scalar kernels with "imagic"
    // requantization; other runtimes get 4x4 __wasm kernels with "fmagic"
    // requantization.  A 1-row (MR=1) variant is installed alongside the
    // full-MR kernel in both branches.
    if (is_wasm_x86) {
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qc8.gemm.mr = 2;
      xnn_params.qc8.gemm.nr = 2;
    } else {
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.gemm.mr = 4;
      xnn_params.qc8.gemm.nr = 4;
    }

    // Depthwise convolutions, single-pass (unipass) kernels for 3-, 9- and
    // 25-tap filters; channel_tile = channels processed per iteration.  The
    // imagic/fmagic split mirrors the GEMM selection above.  Note the x86
    // branch uses a 1-channel tile for the 25-tap kernel, the other branch
    // uses 2 channels throughout.
    if (is_wasm_x86) {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x3__scalar_imagic;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qc8.dwconv[0].channel_tile = 2;
      xnn_params.qc8.dwconv[0].primary_tile = 3;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qc8.dwconv[1].channel_tile = 2;
      xnn_params.qc8.dwconv[1].primary_tile = 9;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qc8.dwconv[2].channel_tile = 1;
      xnn_params.qc8.dwconv[2].primary_tile = 25;
    } else {
      xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x3__wasm_fmagic;
      xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.dwconv[0].channel_tile = 2;
      xnn_params.qc8.dwconv[0].primary_tile = 3;
      xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
      xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.dwconv[1].channel_tile = 2;
      xnn_params.qc8.dwconv[1].primary_tile = 9;
      xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
      xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qc8.dwconv[2].channel_tile = 2;
      xnn_params.qc8.dwconv[2].primary_tile = 25;
    }
  #endif  // XNN_NO_QC8_OPERATORS
6368 
6369   /**************************** QS8 WAsm micro-kernels****************************/
  #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    // GEMM/IGEMM for per-tensor-quantized int8: same runtime-flavor split as
    // the QC8 section -- 2x2 scalar imagic kernels for x86-flavored runtimes,
    // 4x4 __wasm fmagic kernels otherwise, each with an MR=1 companion.
    if (is_wasm_x86) {
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qs8.gemm.mr = 2;
      xnn_params.qs8.gemm.nr = 2;
    } else {
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qs8.gemm.mr = 4;
      xnn_params.qs8.gemm.nr = 4;
    }

    // Depthwise convolutions: 9- and 25-tap unipass kernels (no 3-tap entry
    // in the QS8 table, unlike QC8).
    if (is_wasm_x86) {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qs8.dwconv[0].channel_tile = 2;
      xnn_params.qs8.dwconv[0].primary_tile = 9;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qs8.dwconv[1].channel_tile = 1;
      xnn_params.qs8.dwconv[1].primary_tile = 25;
    } else {
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qs8.dwconv[0].channel_tile = 2;
      xnn_params.qs8.dwconv[0].primary_tile = 9;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qs8.dwconv[1].channel_tile = 2;
      xnn_params.qs8.dwconv[1].primary_tile = 25;
    }

    // Global average pooling: 7-row unipass and 7+7-row multipass scalar
    // kernels, 4 channels per tile, with matching init/update param helpers.
    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
      .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
      .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
      .row_tile = 7,
      .channel_tile = 4,
    };

    // Quantized element-wise add/multiply, scalar kernels, 4 elements per
    // tile; addition and multiplication are commutative, so the same *c
    // kernel serves opc and ropc.
    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
      .init.qs8_add = xnn_init_qs8_add_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.qs8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
      .element_tile = 4,
    };

    // Leaky ReLU on quantized int8: select-based scalar variant for
    // x86-flavored runtimes, and/xor-based variant otherwise (each with its
    // matching params-init function).
    if (is_wasm_x86) {
      xnn_params.qs8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__scalar_select_x4,
        .init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_select_params,
        .element_tile = 4,
      };
    } else {
      xnn_params.qs8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__scalar_andxor_x4,
        .init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_andxor_params,
        .element_tile = 4,
      };
    }
  #endif  // XNN_NO_QS8_OPERATORS
6449 
6450   /**************************** QU8 WAsm micro-kernels****************************/
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    // GEMM/IGEMM for quantized uint8: same runtime-flavor split as the
    // QS8/QC8 sections -- 2x2 scalar imagic kernels vs. 4x4 __wasm fmagic
    // kernels, each with an MR=1 companion.
    if (is_wasm_x86) {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qu8.gemm.mr = 2;
      xnn_params.qu8.gemm.nr = 2;
    } else {
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
      xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qu8.gemm.mr = 4;
      xnn_params.qu8.gemm.nr = 4;
    }

    // Depthwise convolutions: 9- and 25-tap unipass kernels, imagic/fmagic
    // split as above.
    if (is_wasm_x86) {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qu8.dwconv[0].channel_tile = 2;
      xnn_params.qu8.dwconv[0].primary_tile = 9;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
      xnn_params.qu8.dwconv[1].channel_tile = 1;
      xnn_params.qu8.dwconv[1].primary_tile = 25;
    } else {
      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qu8.dwconv[0].channel_tile = 2;
      xnn_params.qu8.dwconv[0].primary_tile = 9;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qu8.dwconv[1].channel_tile = 2;
      xnn_params.qu8.dwconv[1].primary_tile = 25;
    }

    // Local average pooling: 9-element primary pass plus 8-element
    // incremental passes, 1 channel per tile (QU8 has this entry; QS8 above
    // does not).
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    // Global average pooling: 7-row unipass and 7+7-row multipass kernels,
    // 4 channels per tile.
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
      .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
      .row_tile = 7,
      .channel_tile = 4,
    };

    // Quantized element-wise add/multiply, scalar kernels, 4 elements per
    // tile; the shared *c kernel serves both opc and ropc (commutative ops).
    xnn_params.qu8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
      .init.qu8_add = xnn_init_qu8_add_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.qu8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
      .element_tile = 4,
    };

    // Leaky ReLU: select-based vs. and/xor-based scalar variant per runtime
    // flavor, mirroring the QS8 selection.
    if (is_wasm_x86) {
      xnn_params.qu8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__scalar_select_x4,
        .init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_select_params,
        .element_tile = 4,
      };
    } else {
      xnn_params.qu8.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__scalar_andxor_x4,
        .init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_andxor_params,
        .element_tile = 4,
      };
    }
  #endif  // XNN_NO_QU8_OPERATORS
6538 
6539   /**************************** S8 WAsm micro-kernels****************************/
  #ifndef XNN_NO_S8_OPERATORS
    init_flags |= XNN_INIT_FLAG_S8;

    // Signed 8-bit kernels (scalar): clamp, indirect bilinear resize, and
    // max pooling (9-element primary pass + 8-element incremental passes,
    // recorded in the mr/qr fields).
    xnn_params.s8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
      .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
      .pixel_tile = 1,
      .channel_tile = 1,
    };
    xnn_params.s8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
      .init.s8 = xnn_init_s8_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
  #endif  // XNN_NO_S8_OPERATORS
6560 
6561   /**************************** U8 WAsm micro-kernels****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    // Unsigned 8-bit kernels (scalar): clamp, indirect bilinear resize, max
    // pooling (9+8 passes), plus a 32-entry lookup-table normalization and a
    // running-max reduction kernel.
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
      .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
      .pixel_tile = 1,
      .channel_tile = 1,
    };
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
      .init.u8 = xnn_init_u8_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
  #endif  // XNN_NO_U8_OPERATORS
6584 
6585   /**************************** X8 WAsm micro-kernels****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    // Type-agnostic 8-bit element kernels: lookup-table application and
    // channel interleaving (zip) for 2/3/4 or a variable number of streams.
    xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
    };

    // 8-bit transposition: scalar 2x4 micro-kernel, blocked at tile_size 32.
    xnn_params.x8.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X8_OPERATORS
6602 
6603   /**************************** X16 WAsm micro-kernels****************************/
  #ifndef XNN_NO_X16_OPERATORS
    init_flags |= XNN_INIT_FLAG_X16;

    // 16-bit transposition: scalar 2x4 micro-kernel, blocked at tile_size 32
    // (the only X16 entry registered for this target).
    xnn_params.x16.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X16_OPERATORS
6612 
  /**************************** F32 WAsm micro-kernels****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    // GEMM/IGEMM kernel selection. On x86-flavored WAsm engines a smaller mr=2
    // row tile is registered; other engines get the mr=4 WAsm kernels. Each
    // branch installs a main tile plus a 1-row fallback for remainder rows.
    if (is_wasm_x86) {
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
      xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
      xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
      xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(2)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
      xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 2;
      xnn_params.f32.gemm.nr = 4;
    } else {
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
      xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
      xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
      xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
      xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
      xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 4;
    }
    // Secondary GEMM with nr=2, used for operators that need a narrow N tile.
    xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
    xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm);
    xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
    xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar);
    xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.gemm2.mr = 4;
    xnn_params.f32.gemm2.nr = 2;
6657 
    // Depthwise convolution kernels, one slot per supported kernel size
    // (primary_tile = number of taps: 3, 4, 9, 25). Each slot carries a
    // clamped (minmax) and an unclamped (linear) variant.
    xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__wasm_acc2;
    xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
    xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[0].channel_tile = 1;
    xnn_params.f32.dwconv[0].primary_tile = 3;

    xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
    xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
    xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[1].channel_tile = 1;
    xnn_params.f32.dwconv[1].primary_tile = 4;

    xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
    xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
    xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[2].channel_tile = 1;
    xnn_params.f32.dwconv[2].primary_tile = 9;

    xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
    xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
    xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[3].channel_tile = 1;
    xnn_params.f32.dwconv[3].primary_tile = 25;
6681 
    // Pooling kernels. "unipass" handles windows up to primary_tile elements;
    // "multipass" accumulates primary_tile + k*incremental_tile for larger windows.
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
      .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
      .row_tile = 7,
      .channel_tile = 1,
    };
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
    // ArgMax pooling: slots 0/1 are unipass variants for windows up to 4 and 9
    // elements; slot 2 is the multipass variant for larger windows.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    // Indirect bilinear resize (NHWC).
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
      .pixel_tile = 1,
      .channel_tile = 2,
    };
    // Elementwise unary kernels. Where an is_wasm_x86 branch exists, the
    // x86-flavored engine gets a scalar variant and other engines get the
    // WAsm-tuned variant of the same operation.
    xnn_params.f32.abs = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__wasm_x4,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 4,
    };
    if (is_wasm_x86) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
    } else {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__wasm_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
    }
    // ELU: note the two branches use different approximations (LUT-based vs
    // polynomial), so their init functions differ as well.
    if (is_wasm_x86) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
        .element_tile = 2,
      };
    } else {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_p6_params,
        .element_tile = 6,
      };
    }
    xnn_params.f32.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
      .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
      .element_tile = 4,
    };
    xnn_params.f32.neg = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
      .element_tile = 4,
    };
    if (is_wasm_x86) {
      xnn_params.f32.relu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.relu = (struct vunary_parameters) {
        .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrelu_ukernel__wasm_x8,
        .element_tile = 8,
      };
    }
    // Rounding kernels (to-nearest-even, toward-zero, up, down) via libm calls.
    xnn_params.f32.rndne = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndz = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndd = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.sigmoid = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
      .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
      .element_tile = 2,
    };
    xnn_params.f32.sqr = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.sqrt = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
      .element_tile = 1,
    };
    if (is_wasm_x86) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
        .row_tile = 2,
        .channel_tile = 4,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
        .row_tile = 2,
        .channel_tile = 4,
      };
    }
    // Softmax building blocks: row-wise exp-and-accumulate, and row max.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
      .init.f32 = xnn_init_f32_expminus_scalar_rr2_p5_params,
      .element_tile = 4,
    };
    xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__scalar;
    // Elementwise binary kernels: op (tensor-tensor), opc (tensor-scalar), and
    // ropc (scalar-tensor, i.e. operands reversed). For commutative ops the
    // opc and ropc entries point at the same kernel.
    xnn_params.f32.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vdiv = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vmax = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmin = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsub = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .element_tile = 8,
    };
    // Fused multiply-add with per-channel constants (x*scale + bias, clamped).
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .channel_tile = 1,
      .row_tile = 2,
    };
    // NCHW (channels-first) operator kernels: sparse GEMM, HWC->CHW input
    // convolution, CHW depthwise convolutions, and CHW pooling/resize.
    #ifndef XNN_NO_NCHW_OPERATORS
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiplication; spmm2/spmm4 handle output
      // channel blocks of 2 and 4.
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
        .mr = 8,
        .nr = 1,
      };
      xnn_params.f32.spmm2 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
        .mr = 8,
        .nr = 2,
      };
      xnn_params.f32.spmm4 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
        .mr = 8,
        .nr = 4,
      };
      // 3x3 stride-2 convolution that also converts the HWC input to CHW layout.
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
        .output_channel_tile = 4,
        .output_height_tile = 1,
        .output_width_tile = 1,
      };
      // CHW depthwise convolutions for the common 3x3/5x5 kernels, stride 1 and 2.
      xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 2,
      };
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
        .output_width_tile = 1,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
        .output_width_tile = 1,
        .output_height_tile = 1,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
        .channel_tile = 1,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
        .channel_tile = 1,
        .pixel_tile = 4,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
6941 
6942   /*************************** VCVT WAsm micro-kernels***************************/
6943   #ifndef XNN_NO_VCVT_OPERATORS
6944     init_flags |= XNN_INIT_FLAG_VCVT;
6945 
6946     xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6947       .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x1,
6948       .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6949       .element_tile = 1,
6950     };
6951     xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6952       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4,
6953       .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_bitcast_params,
6954       .element_tile = 4,
6955     };
6956     if (is_wasm_x86) {
6957       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6958         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6959         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
6960         .element_tile = 1,
6961       };
6962       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6963         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6964         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
6965         .element_tile = 1,
6966       };
6967     } else {
6968       xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6969         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6970         .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_fmagic_params,
6971         .element_tile = 4,
6972       };
6973       xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6974         .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6975         .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_fmagic_params,
6976         .element_tile = 4,
6977       };
6978     }
6979     if (is_wasm_x86) {
6980       xnn_params.vcvt.qs8 = (struct vunary_parameters) {
6981         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__scalar_x1,
6982         .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
6983         .element_tile = 1,
6984       };
6985     } else {
6986       xnn_params.vcvt.qs8 = (struct vunary_parameters) {
6987         .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__scalar_x4,
6988         .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
6989         .element_tile = 4,
6990       };
6991     }
6992     xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6993       .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x1,
6994       .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6995       .element_tile = 1,
6996     };
6997     if (is_wasm_x86) {
6998       xnn_params.vcvt.qu8 = (struct vunary_parameters) {
6999         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__scalar_x1,
7000         .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
7001         .element_tile = 1,
7002       };
7003     } else {
7004       xnn_params.vcvt.qu8 = (struct vunary_parameters) {
7005         .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__scalar_x4,
7006         .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
7007         .element_tile = 4,
7008       };
7009     }
7010     xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
7011       .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x1,
7012       .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
7013       .element_tile = 1,
7014     };
7015   #endif  // XNN_NO_VCVT_OPERATORS
7016 
  /**************************** X32 WAsm micro-kernels****************************/
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;

    // Max-unpooling and channel-interleave (zip) kernels for 32-bit elements.
    xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
    };

    // Constant-size transpose of 32-bit elements; the kernel works on 2x4 blocks.
    xnn_params.x32.transpose = (struct transpose_parameters) {
      .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__2x4_scalar_int,
      .tile_size = 32,
    };
  #endif  // XNN_NO_X32_OPERATORS
7034 
  /**************************** XX WAsm micro-kernels****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    // Type-agnostic kernels operating on raw bytes: copy, fill, pad, transpose.
    xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
    xnn_params.xx.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
      .row_tile = 1,
    };
    xnn_params.xx.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
      .row_tile = 1,
    };
    xnn_params.xx.transpose = (struct transpose_parameters) {
      .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
      .tile_size = 32,
    };
  #endif  // XNN_NO_XX_OPERATORS
7053 
7054 #elif XNN_ARCH_RISCV
7055 
  /************************** QC8 RISC-V micro-kernels **************************/
  #ifndef XNN_NO_QC8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QC8;

    // Per-channel-quantized GEMM/IGEMM: 3x4 main tile plus a 1x4 fallback for
    // remainder rows, all scalar with lrintf-based requantization.
    xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
    xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
    xnn_params.qc8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
    xnn_params.qc8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
    xnn_params.qc8.gemm.init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
    xnn_params.qc8.gemm.mr = 3;
    xnn_params.qc8.gemm.nr = 4;

    // Depthwise convolution slots by kernel size (3, 9, 25 taps).
    xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x3__scalar_lrintf;
    xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
    xnn_params.qc8.dwconv[0].channel_tile = 2;
    xnn_params.qc8.dwconv[0].primary_tile = 3;
    xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
    xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
    xnn_params.qc8.dwconv[1].channel_tile = 2;
    xnn_params.qc8.dwconv[1].primary_tile = 9;
    xnn_params.qc8.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
    xnn_params.qc8.dwconv[2].init.qc8 = xnn_init_qc8_conv_minmax_fp32_scalar_lrintf_params;
    xnn_params.qc8.dwconv[2].channel_tile = 2;
    xnn_params.qc8.dwconv[2].primary_tile = 25;
  #endif  // XNN_NO_QC8_OPERATORS
7081 
  /************************** QS8 RISC-V micro-kernels **************************/
  #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    // Signed-8-bit quantized GEMM/IGEMM: 3x4 main tile plus a 1x4 fallback,
    // scalar with lrintf-based requantization.
    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
    xnn_params.qs8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
    xnn_params.qs8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
    xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
    xnn_params.qs8.gemm.mr = 3;
    xnn_params.qs8.gemm.nr = 4;

    // Depthwise convolution slots by kernel size (9 and 25 taps).
    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
    xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
    xnn_params.qs8.dwconv[0].channel_tile = 2;
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
    xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
    xnn_params.qs8.dwconv[1].channel_tile = 2;
    xnn_params.qs8.dwconv[1].primary_tile = 25;

    // Global average pooling: 7-row unipass with a 7+7k multipass fallback.
    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
      .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
      .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
      .row_tile = 7,
      .channel_tile = 1,
    };

    // Elementwise binary kernels: op (tensor-tensor), opc (tensor-scalar),
    // ropc (scalar-tensor); add and multiply are commutative, so opc == ropc.
    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
      .init.qs8_add = xnn_init_qs8_add_minmax_scalar_params,
      .element_tile = 4,
    };
    xnn_params.qs8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
      .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
      .element_tile = 4,
    };

    xnn_params.qs8.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vlrelu_ukernel__scalar_andxor_x4,
      .init.qs8_lrelu = xnn_init_qs8_lrelu_scalar_andxor_params,
      .element_tile = 4,
    };
  #endif  // XNN_NO_QS8_OPERATORS
7133 
7134   /************************** QU8 RISC-V micro-kernels **************************/
7135   #ifndef XNN_NO_QU8_OPERATORS
7136     init_flags |= XNN_INIT_FLAG_QU8;
7137 
7138     xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
7139     xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(3)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
7140     xnn_params.qu8.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
7141     xnn_params.qu8.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
7142     xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
7143     xnn_params.qu8.gemm.mr = 3;
7144     xnn_params.qu8.gemm.nr = 4;
7145 
7146     xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
7147     xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
7148     xnn_params.qu8.dwconv[0].channel_tile = 2;
7149     xnn_params.qu8.dwconv[0].primary_tile = 9;
7150     xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
7151     xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
7152     xnn_params.qu8.dwconv[1].channel_tile = 2;
7153     xnn_params.qu8.dwconv[1].primary_tile = 25;
7154 
7155     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
7156       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
7157       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
7158       .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
7159       .primary_tile = 9,
7160       .incremental_tile = 8,
7161       .channel_tile = 1,
7162     };
7163     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
7164       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
7165       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
7166       .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
7167       .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
7168       .row_tile = 7,
7169       .channel_tile = 1,
7170     };
7171 
7172     xnn_params.qu8.vadd = (struct vbinary_parameters) {
7173       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
7174       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
7175       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
7176       .init.qu8_add = xnn_init_qu8_add_minmax_scalar_params,
7177       .element_tile = 4,
7178     };
7179     xnn_params.qu8.vmul = (struct vbinary_parameters) {
7180       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
7181       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
7182       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
7183       .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
7184       .element_tile = 4,
7185     };
7186 
7187     xnn_params.qu8.lrelu = (struct vunary_parameters) {
7188       .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vlrelu_ukernel__scalar_andxor_x4,
7189       .init.qu8_lrelu = xnn_init_qu8_lrelu_scalar_andxor_params,
7190       .element_tile = 4,
7191     };
7192   #endif  // XNN_NO_QU8_OPERATORS
7193 
7194   /************************** S8 RISC-V micro-kernels ***************************/
7195   #ifndef XNN_NO_S8_OPERATORS
7196     init_flags |= XNN_INIT_FLAG_S8;
7197 
7198     xnn_params.s8.clamp = (struct vunary_parameters) {
7199       .ukernel = (xnn_vunary_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
7200       .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
7201       .element_tile = 4,
7202     };
7203     xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
7204       .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
7205       .pixel_tile = 1,
7206       .channel_tile = 1,
7207     };
7208     xnn_params.s8.maxpool = (struct maxpool_parameters) {
7209       .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
7210       .init.s8 = xnn_init_s8_minmax_scalar_params,
7211       .mr = 9,
7212       .qr = 8,
7213     };
7214   #endif  // XNN_NO_S8_OPERATORS
7215 
7216   /************************** U8 RISC-V micro-kernels ***************************/
7217   #ifndef XNN_NO_U8_OPERATORS
7218     init_flags |= XNN_INIT_FLAG_U8;
7219 
7220     xnn_params.u8.clamp = (struct vunary_parameters) {
7221       .ukernel = (xnn_vunary_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
7222       .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
7223       .element_tile = 4,
7224     };
7225     xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
7226       .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
7227       .pixel_tile = 1,
7228       .channel_tile = 1,
7229     };
7230     xnn_params.u8.maxpool = (struct maxpool_parameters) {
7231       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
7232       .init.u8 = xnn_init_u8_minmax_scalar_params,
7233       .mr = 9,
7234       .qr = 8,
7235     };
7236     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
7237     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
7238   #endif  // XNN_NO_U8_OPERATORS
7239 
7240   /************************** X8 RISC-V micro-kernels ***************************/
7241   #ifndef XNN_NO_X8_OPERATORS
7242     init_flags |= XNN_INIT_FLAG_X8;
7243 
7244     xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
7245     xnn_params.x8.zip = (struct zip_parameters) {
7246       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
7247       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
7248       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
7249       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
7250     };
7251 
7252     xnn_params.x8.transpose = (struct transpose_parameters) {
7253       .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x8_transposec_ukernel__2x4_scalar_int,
7254       .tile_size = 32,
7255     };
7256   #endif  // XNN_NO_X8_OPERATORS
7257 
7258   /************************** X16 RISC-V micro-kernels ***************************/
7259   #ifndef XNN_NO_X16_OPERATORS
7260     init_flags |= XNN_INIT_FLAG_X16;
7261 
7262     xnn_params.x16.transpose = (struct transpose_parameters) {
7263       .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x16_transposec_ukernel__2x4_scalar_int,
7264       .tile_size = 32,
7265     };
7266   #endif  // XNN_NO_X16_OPERATORS
7267 
7268   /************************** F32 RISC-V micro-kernels **************************/
7269   #ifndef XNN_NO_F32_OPERATORS
7270     init_flags |= XNN_INIT_FLAG_F32;
7271 
7272     xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
7273     xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
7274     xnn_params.f32.gemm.minmax.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
7275     xnn_params.f32.gemm.minmax.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
7276     xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
7277     xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
7278     xnn_params.f32.gemm.relu.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
7279     xnn_params.f32.gemm.relu.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
7280     xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
7281     xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
7282     xnn_params.f32.gemm.linear.gemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
7283     xnn_params.f32.gemm.linear.igemm[XNN_MR_TO_INDEX(1)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
7284     xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
7285     xnn_params.f32.gemm.mr = 4;
7286     xnn_params.f32.gemm.nr = 4;
7287 
7288     xnn_params.f32.gemm2.minmax.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
7289     xnn_params.f32.gemm2.minmax.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar);
7290     xnn_params.f32.gemm2.linear.gemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
7291     xnn_params.f32.gemm2.linear.igemm[XNN_MR_TO_INDEX(4)] = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar);
7292     xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
7293     xnn_params.f32.gemm2.mr = 4;
7294     xnn_params.f32.gemm2.nr = 2;
7295 
7296     xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
7297     xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
7298     xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
7299     xnn_params.f32.dwconv[0].channel_tile = 1;
7300     xnn_params.f32.dwconv[0].primary_tile = 3;
7301 
7302     xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
7303     xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
7304     xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
7305     xnn_params.f32.dwconv[1].channel_tile = 1;
7306     xnn_params.f32.dwconv[1].primary_tile = 4;
7307 
7308     xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
7309     xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
7310     xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
7311     xnn_params.f32.dwconv[2].channel_tile = 1;
7312     xnn_params.f32.dwconv[2].primary_tile = 9;
7313 
7314     xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
7315     xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
7316     xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
7317     xnn_params.f32.dwconv[3].channel_tile = 1;
7318     xnn_params.f32.dwconv[3].primary_tile = 25;
7319 
7320     xnn_params.f32.avgpool = (struct avgpool_parameters) {
7321       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
7322       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
7323       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
7324       .primary_tile = 9,
7325       .incremental_tile = 8,
7326       .channel_tile = 1,
7327     };
7328     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
7329       .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
7330       .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
7331       .init.f32 = xnn_init_f32_minmax_scalar_params,
7332       .primary_tile = 9,
7333       .incremental_tile = 8,
7334       .channel_tile = 1,
7335     };
7336     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
7337       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
7338       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
7339       .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
7340       .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
7341       .row_tile = 7,
7342       .channel_tile = 1,
7343     };
7344     xnn_params.f32.maxpool = (struct maxpool_parameters) {
7345       .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
7346       .init.f32 = xnn_init_f32_minmax_scalar_params,
7347       .mr = 9,
7348       .qr = 8,
7349     };
7350     xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
7351       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
7352       .mr = 4,
7353     };
7354     xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
7355       .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
7356       .mr = 9,
7357     };
7358     xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
7359       .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
7360       .mr = 9,
7361       .qr = 8,
7362     };
7363     xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
7364       .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
7365       .pixel_tile = 1,
7366       .channel_tile = 2,
7367     };
7368     xnn_params.f32.abs = (struct vunary_parameters) {
7369       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
7370       .element_tile = 4,
7371     };
7372     xnn_params.f32.clamp = (struct vunary_parameters) {
7373       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
7374       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7375       .element_tile = 4,
7376     };
7377     xnn_params.f32.elu = (struct vunary_parameters) {
7378       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
7379       .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
7380       .element_tile = 4,
7381     };
7382     xnn_params.f32.hswish = (struct vunary_parameters) {
7383       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
7384       .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
7385       .element_tile = 4,
7386     };
7387     xnn_params.f32.lrelu = (struct vunary_parameters) {
7388       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
7389       .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
7390       .element_tile = 4,
7391     };
7392     xnn_params.f32.neg = (struct vunary_parameters) {
7393       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
7394       .element_tile = 4,
7395     };
7396     xnn_params.f32.rndne = (struct vunary_parameters) {
7397       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
7398       .element_tile = 1,
7399     };
7400     xnn_params.f32.rndz = (struct vunary_parameters) {
7401       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
7402       .element_tile = 1,
7403     };
7404     xnn_params.f32.rndu = (struct vunary_parameters) {
7405       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
7406       .element_tile = 1,
7407     };
7408     xnn_params.f32.rndd = (struct vunary_parameters) {
7409       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
7410       .element_tile = 1,
7411     };
7412     xnn_params.f32.sigmoid = (struct vunary_parameters) {
7413       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
7414       .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
7415       .element_tile = 2,
7416     };
7417     xnn_params.f32.sqr = (struct vunary_parameters) {
7418       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
7419       .element_tile = 4,
7420     };
7421     xnn_params.f32.sqrt = (struct vunary_parameters) {
7422       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
7423       .element_tile = 1,
7424     };
7425     xnn_params.f32.prelu = (struct prelu_parameters) {
7426       .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
7427       .row_tile = 4,
7428       .channel_tile = 4,
7429     };
7430     xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
7431       .ukernel = (xnn_raddstoreexpminusmax_ukernel_function) xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
7432       .init.f32 = xnn_init_f32_expminus_scalar_rr2_p5_params,
7433       .element_tile = 4,
7434     };
7435     xnn_params.f32.rmax = (xnn_rmax_ukernel_function) xnn_f32_rmax_ukernel__scalar;
7436     xnn_params.f32.vadd = (struct vbinary_parameters) {
7437       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
7438       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
7439       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
7440       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7441       .element_tile = 8,
7442     };
7443     xnn_params.f32.vdiv = (struct vbinary_parameters) {
7444       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
7445       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
7446       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
7447       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7448       .element_tile = 2,
7449     };
7450     xnn_params.f32.vmax = (struct vbinary_parameters) {
7451       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
7452       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
7453       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
7454       .element_tile = 8,
7455     };
7456     xnn_params.f32.vmin = (struct vbinary_parameters) {
7457       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
7458       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
7459       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
7460       .element_tile = 8,
7461     };
7462     xnn_params.f32.vmul = (struct vbinary_parameters) {
7463       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
7464       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
7465       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
7466       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7467       .element_tile = 8,
7468     };
7469     xnn_params.f32.vsub = (struct vbinary_parameters) {
7470       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
7471       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
7472       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
7473       .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
7474       .element_tile = 8,
7475     };
7476     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
7477       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
7478       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
7479       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
7480       .element_tile = 8,
7481     };
7482     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
7483       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
7484       .init.f32 = xnn_init_f32_minmax_scalar_params,
7485       .channel_tile = 1,
7486       .row_tile = 2,
7487     };
7488     #ifndef XNN_NO_NCHW_OPERATORS
7489       init_flags |= XNN_INIT_FLAG_CHW_OPT;
7490 
7491       xnn_params.f32.spmm = (struct spmm_parameters) {
7492         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
7493         .mr = 8,
7494         .nr = 1,
7495       };
7496       xnn_params.f32.spmm2 = (struct spmm_parameters) {
7497         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
7498         .mr = 8,
7499         .nr = 2,
7500       };
7501       xnn_params.f32.spmm4 = (struct spmm_parameters) {
7502         .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
7503         .mr = 8,
7504         .nr = 4,
7505       };
7506       xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
7507         .ukernel_with_symm_padding =
7508           (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
7509         .output_channel_tile = 4,
7510         .output_height_tile = 1,
7511         .output_width_tile = 1,
7512       };
7513       xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
7514         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
7515         .output_width_tile = 1,
7516         .output_height_tile = 2,
7517       };
7518       xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
7519         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
7520         .output_width_tile = 1,
7521         .output_height_tile = 1,
7522       };
7523       xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
7524         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
7525         .output_width_tile = 1,
7526         .output_height_tile = 1,
7527       };
7528       xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
7529         .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
7530         .output_width_tile = 1,
7531         .output_height_tile = 1,
7532       };
7533       xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
7534         .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
7535         .channel_tile = 1,
7536       };
7537       xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
7538         .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
7539         .channel_tile = 1,
7540         .pixel_tile = 4,
7541       };
7542     #endif  // XNN_NO_NCHW_OPERATORS
7543   #endif  // XNN_NO_F32_OPERATORS
7544 
7545   /************************** VCVT RISC-V micro-kernels *************************/
7546   #ifndef XNN_NO_VCVT_OPERATORS
7547     init_flags |= XNN_INIT_FLAG_VCVT;
7548 
7549     xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
7550       .ukernel = (xnn_vunary_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
7551       .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
7552       .element_tile = 4,
7553     };
7554     xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
7555       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
7556       .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
7557       .element_tile = 2,
7558     };
7559     xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
7560       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x4,
7561       .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_lrintf_params,
7562       .element_tile = 4,
7563     };
7564     xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
7565       .ukernel = (xnn_vunary_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x4,
7566       .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_lrintf_params,
7567       .element_tile = 4,
7568     };
7569     xnn_params.vcvt.qs8 = (struct vunary_parameters) {
7570       .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_vcvt_ukernel__scalar_x4,
7571       .init.qs8_cvt = xnn_init_qs8_cvt_scalar_params,
7572       .element_tile = 4,
7573     };
7574     xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
7575       .ukernel = (xnn_vunary_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
7576       .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
7577       .element_tile = 4,
7578     };
7579     xnn_params.vcvt.qu8 = (struct vunary_parameters) {
7580       .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_vcvt_ukernel__scalar_x4,
7581       .init.qu8_cvt = xnn_init_qu8_cvt_scalar_params,
7582       .element_tile = 4,
7583     };
7584     xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
7585       .ukernel = (xnn_vunary_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
7586       .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
7587       .element_tile = 4,
7588     };
7589   #endif  // XNN_NO_VCVT_OPERATORS
7590 
7591   /************************** X32 RISC-V micro-kernels **************************/
7592   #ifndef XNN_NO_X32_OPERATORS
7593     init_flags |= XNN_INIT_FLAG_X32;
7594 
7595     xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
7596     xnn_params.x32.zip = (struct zip_parameters) {
7597       .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
7598       .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
7599       .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
7600       .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
7601     };
7602 
7603     xnn_params.x32.transpose = (struct transpose_parameters) {
7604       .const_size_ukernel = (xnn_transposec_ukernel_function) xnn_x32_transposec_ukernel__2x4_scalar_int,
7605       .tile_size = 32,
7606     };
7607   #endif  // XNN_NO_X32_OPERATORS
7608 
7609   /************************** XX RISC-V micro-kernels ***************************/
7610   #ifndef XNN_NO_XX_OPERATORS
7611     init_flags |= XNN_INIT_FLAG_XX;
7612 
7613     xnn_params.xx.copy = (xnn_vunary_ukernel_function) xnn_xx_copy_ukernel__memcpy;
7614     xnn_params.xx.fill = (struct fill_parameters) {
7615       .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
7616       .row_tile = 1,
7617     };
7618     xnn_params.xx.pad = (struct pad_parameters) {
7619       .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
7620       .row_tile = 1,
7621     };
7622     xnn_params.xx.transpose = (struct transpose_parameters) {
7623       .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_memcpy,
7624       .tile_size = 32,
7625     };
7626   #endif  // XNN_NO_XX_OPERATORS
7627 
7628 #else
7629   #error "Unsupported architecture"
7630 #endif
7631 
7632   // Get page size.
7633   #if XNN_PLATFORM_WINDOWS
7634     SYSTEM_INFO sysinfo;
7635     GetSystemInfo(&sysinfo);
7636     xnn_params.page_size = sysinfo.dwPageSize;
7637   #else
7638     const long res = sysconf(_SC_PAGESIZE);
7639     if (res == -1) {
7640       xnn_log_error("failed to get page size, error code: %d", errno);
7641       return;
7642     }
7643     xnn_params.page_size = res;
7644   #endif
7645 
7646   memcpy(&xnn_params.allocator, init_allocator, sizeof(struct xnn_allocator));
7647   xnn_params.init_flags = init_flags;
7648 }
7649 
7650 #if XNN_PLATFORM_WINDOWS
  // Adapter so the argument-less init() routine can be driven by Windows'
  // one-time-initialization API: InitOnceExecuteOnce requires a callback with
  // exactly this PINIT_ONCE_FN signature. All three parameters are unused.
  static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
    init();
    return TRUE;  // TRUE reports successful initialization to InitOnceExecuteOnce.
  }
7655 #endif
7656 
// Initializes XNNPACK exactly once, selecting micro-kernels for the current
// hardware. Thread-safe: concurrent callers are serialized by the one-time
// guard. `allocator` may be NULL, in which case the built-in default
// allocator is used; only the first successfully-published allocator takes
// effect. Returns xnn_status_success on success, xnn_status_out_of_memory if
// cpuinfo fails to initialize, or xnn_status_unsupported_hardware if init()
// did not enable XNNPACK for this platform.
//
// NOTE(review): statement order matters here — the allocator must be
// published (CAS below) before the once-guard runs init(), because init()
// copies init_allocator into xnn_params. Keep the sequence intact.
enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
  #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
    // cpuinfo is used to detect CPU features for kernel selection on
    // non-web, non-RISC-V targets.
    if (!cpuinfo_initialize()) {
      return xnn_status_out_of_memory;
    }
  #endif  // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
  if (allocator == NULL) {
    // Fall back to the library's default (malloc-based) allocator.
    allocator = &xnn_default_allocator;
  }
  // Atomically publish the allocator only if none has been set yet: the
  // first caller wins; later calls with a different allocator are ignored.
  #ifdef _MSC_VER
    _InterlockedCompareExchangePointer((PVOID volatile*) &init_allocator, (PVOID) allocator, NULL);
  #else
    __sync_bool_compare_and_swap(&init_allocator, NULL, allocator);
  #endif
  // Run init() at most once across all threads, via the platform's
  // one-time-initialization primitive.
  #if XNN_PLATFORM_WINDOWS
    InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
  #else
    pthread_once(&init_guard, &init);
  #endif
  // init() sets XNN_INIT_FLAG_XNNPACK in init_flags when it succeeds;
  // its absence means this hardware/configuration is unsupported.
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
    return xnn_status_success;
  } else {
    return xnn_status_unsupported_hardware;
  }
}
7682 
// Releases resources acquired during xnn_initialize (currently only the
// cpuinfo library's state, on targets where it was initialized). Does not
// reset xnn_params or the one-time guard. Always returns xnn_status_success.
enum xnn_status xnn_deinitialize(void) {
  #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
    // Balance the cpuinfo_initialize() call made in xnn_initialize.
    cpuinfo_deinitialize();
  #endif  // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
  return xnn_status_success;
}
7689