1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <math.h>
10 #include <stdbool.h>
11 #include <stddef.h>
12 #include <stdint.h>
13 #include <string.h>
14
15 #ifdef _WIN32
16 #include <windows.h>
17 #else
18 #include <pthread.h>
19 #endif
20
21 #ifdef _MSC_VER
22 #include <intrin.h>
23 #endif
24
25 #ifndef __EMSCRIPTEN__
26 #include <cpuinfo.h>
27 #endif
28
29 #include <xnnpack.h>
30 #include <xnnpack/allocator.h>
31 #include <xnnpack/argmaxpool.h>
32 #include <xnnpack/avgpool.h>
33 #include <xnnpack/common.h>
34 #include <xnnpack/conv.h>
35 #include <xnnpack/dwconv.h>
36 #include <xnnpack/depthtospace.h>
37 #include <xnnpack/gavgpool.h>
38 #include <xnnpack/gemm.h>
39 #include <xnnpack/fill.h>
40 #include <xnnpack/ibilinear.h>
41 #include <xnnpack/igemm.h>
42 #include <xnnpack/log.h>
43 #include <xnnpack/lut.h>
44 #include <xnnpack/maxpool.h>
45 #include <xnnpack/pad.h>
46 #include <xnnpack/params.h>
47 #include <xnnpack/params-init.h>
48 #include <xnnpack/pavgpool.h>
49 #include <xnnpack/prelu.h>
50 #include <xnnpack/raddstoreexpminusmax.h>
51 #include <xnnpack/rmax.h>
52 #include <xnnpack/spmm.h>
53 #include <xnnpack/unpool.h>
54 #include <xnnpack/vaddsub.h>
55 #include <xnnpack/vbinary.h>
56 #include <xnnpack/vcvt.h>
57 #include <xnnpack/vmul.h>
58 #include <xnnpack/vmulcaddc.h>
59 #include <xnnpack/vunary.h>
60 #include <xnnpack/zip.h>
61
// Assembly micro-kernels are enabled by default; a build system may pre-define
// XNN_ENABLE_ASSEMBLY=0 to restrict dispatch to C/intrinsics implementations.
#ifndef XNN_ENABLE_ASSEMBLY
#define XNN_ENABLE_ASSEMBLY 1
#endif
65
// One-shot guard ensuring the library initialization routine runs exactly once,
// even under concurrent callers: INIT_ONCE on Windows, pthread_once elsewhere.
#if XNN_PLATFORM_WINDOWS
static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
#else
static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
#endif
71
72 static const struct xnn_allocator* volatile init_allocator = NULL;
73
// Global dispatch table of selected micro-kernels and operator parameters.
// init_flags == 0 marks the library as not yet initialized; init() populates
// the table and publishes the supported-feature flags.
struct xnn_parameters xnn_params = {
  .init_flags = 0
};
77
init(void)78 static void init(void) {
79 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
80 // Unlike most other architectures, on x86/x86-64 when floating-point instructions
81 // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
82 // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
83 // of two infinities (must produce NaN per IEEE 754 standard).
84 static const volatile float inf = INFINITY;
85 const bool is_wasm_x86 = signbit(inf - inf);
86 #endif
87 uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
88
89 #if XNN_ARCH_ARM
90 #if XNN_PLATFORM_MOBILE
91 if (!cpuinfo_has_arm_neon()) {
92 xnn_log_error("XNNPACK initialization failed: NEON is not supported");
93 return;
94 }
95 #else
96 if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
97 xnn_log_error("XNNPACK initialization failed: VFP is not supported");
98 return;
99 }
100 #endif
101
102 if (cpuinfo_has_arm_neon()) {
103 /**************************** QC8 AArch32 micro-kernels ****************************/
104 #ifndef XNN_NO_QC8_OPERATORS
105 init_flags |= XNN_INIT_FLAG_QC8;
106
107 #if XNN_ENABLE_ASSEMBLY
108 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
109 switch (cpuinfo_get_uarch(0)->uarch) {
110 case cpuinfo_uarch_cortex_a55:
111 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
112 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55);
113 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
114 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
115 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
116 xnn_params.qc8.gemm.mr = 4;
117 xnn_params.qc8.gemm.nr = 8;
118 xnn_params.qc8.gemm.log2_kr = 2;
119 break;
120 default:
121 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
122 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_ld64);
123 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
124 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
125 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
126 xnn_params.qc8.gemm.mr = 4;
127 xnn_params.qc8.gemm.nr = 8;
128 xnn_params.qc8.gemm.log2_kr = 2;
129 break;
130 }
131 } else {
132 switch (cpuinfo_get_uarch(0)->uarch) {
133 case cpuinfo_uarch_cortex_a7:
134 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
135 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
136 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
137 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
138 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
139 xnn_params.qc8.gemm.mr = 4;
140 xnn_params.qc8.gemm.nr = 8;
141 break;
142 case cpuinfo_uarch_cortex_a35:
143 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
144 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
145 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
146 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
147 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
148 xnn_params.qc8.gemm.mr = 4;
149 xnn_params.qc8.gemm.nr = 8;
150 break;
151 case cpuinfo_uarch_cortex_a53:
152 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53);
153 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
154 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
155 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
156 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
157 xnn_params.qc8.gemm.mr = 4;
158 xnn_params.qc8.gemm.nr = 8;
159 break;
160 case cpuinfo_uarch_cortex_a55r0:
161 case cpuinfo_uarch_kryo:
162 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53);
163 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
164 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
165 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
166 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
167 xnn_params.qc8.gemm.mr = 4;
168 xnn_params.qc8.gemm.nr = 8;
169 break;
170 case cpuinfo_uarch_cortex_a72:
171 case cpuinfo_uarch_exynos_m1:
172 case cpuinfo_uarch_exynos_m2:
173 case cpuinfo_uarch_exynos_m3:
174 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
175 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64);
176 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
177 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
178 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
179 xnn_params.qc8.gemm.mr = 4;
180 xnn_params.qc8.gemm.nr = 8;
181 break;
182
183 default:
184 if (cpuinfo_has_arm_neon_v8()) {
185 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
186 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64);
187 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
188 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane);
189 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
190 xnn_params.qc8.gemm.mr = 4;
191 xnn_params.qc8.gemm.nr = 8;
192 } else {
193 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
194 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
195 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
196 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane);
197 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
198 xnn_params.qc8.gemm.mr = 4;
199 xnn_params.qc8.gemm.nr = 8;
200 }
201 break;
202 }
203 }
204 #if XNN_MAX_UARCH_TYPES > 1
205 {
206 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
207 const uint32_t mr = xnn_params.qc8.gemm.mr;
208 const uint32_t nr = xnn_params.qc8.gemm.nr;
209 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
210 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
211 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
212 if (uarch_info == NULL) {
213 /* No more microarchitectures in the system */
214 break;
215 }
216
217 switch (uarch_info->uarch) {
218 case cpuinfo_uarch_cortex_a55:
219 if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
220 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
221 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__aarch32_neondot_cortex_a55;
222 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot;
223 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot;
224 }
225 break;
226 case cpuinfo_uarch_cortex_a53:
227 if (mr == 4 && nr == 8 && log2_kr == 0) {
228 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_cortex_a53;
229 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_prfm_ld64;
230 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
231 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
232 }
233 break;
234 case cpuinfo_uarch_cortex_a55r0:
235 if (mr == 4 && nr == 8 && log2_kr == 0) {
236 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_cortex_a53;
237 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8__aarch32_neonv8_mlal_lane_ld64;
238 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
239 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8__neonv8_mlal_lane;
240 }
241 break;
242
243 default:
244 break;
245 }
246 }
247 }
248 #endif // XNN_MAX_UARCH_TYPES > 1
249 #else // XNN_ENABLE_ASSEMBLY
250 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
251 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x8c4__neondot);
252 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x8c4__neondot);
253 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c4__neondot);
254 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c4__neondot);
255 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
256 xnn_params.qc8.gemm.mr = 4;
257 xnn_params.qc8.gemm.nr = 8;
258 xnn_params.qc8.gemm.log2_kr = 2;
259 } else if (cpuinfo_has_arm_v8()) {
260 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
261 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
262 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
263 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
264 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
265 xnn_params.qc8.gemm.mr = 2;
266 xnn_params.qc8.gemm.nr = 8;
267 xnn_params.qc8.gemm.log2_kr = 1;
268 xnn_params.qc8.gemm.log2_sr = 2;
269 } else {
270 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
271 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neon_mlal);
272 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
273 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neon_mlal);
274 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neon_params;
275 xnn_params.qc8.gemm.mr = 2;
276 xnn_params.qc8.gemm.nr = 8;
277 xnn_params.qc8.gemm.log2_kr = 1;
278 xnn_params.qc8.gemm.log2_sr = 2;
279 }
280 #endif // XNN_ENABLE_ASSEMBLY
281
282 if (cpuinfo_has_arm_neon_v8()) {
283 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
284 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
285 xnn_params.qc8.dwconv[0].channel_tile = 16;
286 xnn_params.qc8.dwconv[0].primary_tile = 9;
287 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neonv8_mla8_ld64;
288 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
289 xnn_params.qc8.dwconv[1].channel_tile = 8;
290 xnn_params.qc8.dwconv[1].primary_tile = 25;
291 } else {
292 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neon_mla8_ld64;
293 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neon_params;
294 xnn_params.qc8.dwconv[0].channel_tile = 16;
295 xnn_params.qc8.dwconv[0].primary_tile = 9;
296 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__neon_mla8_ld64;
297 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neon_params;
298 xnn_params.qc8.dwconv[1].channel_tile = 8;
299 xnn_params.qc8.dwconv[1].primary_tile = 25;
300 }
301 #endif // XNN_NO_QC8_OPERATORS
302
303 /**************************** QS8 AArch32 micro-kernels ****************************/
304 #ifndef XNN_NO_QS8_OPERATORS
305 init_flags |= XNN_INIT_FLAG_QS8;
306
307 #if XNN_ENABLE_ASSEMBLY
308 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
309 switch (cpuinfo_get_uarch(0)->uarch) {
310 case cpuinfo_uarch_cortex_a55:
311 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
312 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55);
313 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
314 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
315 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
316 xnn_params.qs8.gemm.mr = 4;
317 xnn_params.qs8.gemm.nr = 8;
318 xnn_params.qs8.gemm.log2_kr = 2;
319 break;
320 default:
321 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
322 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64);
323 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
324 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
325 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
326 xnn_params.qs8.gemm.mr = 4;
327 xnn_params.qs8.gemm.nr = 8;
328 xnn_params.qs8.gemm.log2_kr = 2;
329 break;
330 }
331 } else {
332 switch (cpuinfo_get_uarch(0)->uarch) {
333 case cpuinfo_uarch_cortex_a7:
334 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
335 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
336 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
337 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
338 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
339 xnn_params.qs8.gemm.mr = 4;
340 xnn_params.qs8.gemm.nr = 8;
341 break;
342 case cpuinfo_uarch_cortex_a35:
343 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
344 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
345 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
346 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
347 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
348 xnn_params.qs8.gemm.mr = 4;
349 xnn_params.qs8.gemm.nr = 8;
350 break;
351 case cpuinfo_uarch_cortex_a53:
352 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
353 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
354 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
355 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
356 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
357 xnn_params.qs8.gemm.mr = 4;
358 xnn_params.qs8.gemm.nr = 8;
359 break;
360 case cpuinfo_uarch_cortex_a55r0:
361 case cpuinfo_uarch_kryo:
362 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
363 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
364 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
365 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
366 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
367 xnn_params.qs8.gemm.mr = 4;
368 xnn_params.qs8.gemm.nr = 8;
369 break;
370 case cpuinfo_uarch_cortex_a72:
371 case cpuinfo_uarch_exynos_m1:
372 case cpuinfo_uarch_exynos_m2:
373 case cpuinfo_uarch_exynos_m3:
374 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
375 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
376 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
377 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
378 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
379 xnn_params.qs8.gemm.mr = 4;
380 xnn_params.qs8.gemm.nr = 8;
381 break;
382 default:
383 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
384 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
385 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
386 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
387 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
388 xnn_params.qs8.gemm.mr = 4;
389 xnn_params.qs8.gemm.nr = 8;
390 break;
391 }
392 }
393 #if XNN_MAX_UARCH_TYPES > 1
394 {
395 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
396 const uint32_t mr = xnn_params.qs8.gemm.mr;
397 const uint32_t nr = xnn_params.qs8.gemm.nr;
398 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
399 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
400 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
401 if (uarch_info == NULL) {
402 /* No more microarchitectures in the system */
403 break;
404 }
405
406 switch (uarch_info->uarch) {
407 case cpuinfo_uarch_cortex_a55:
408 if (mr == 4 && nr == 8 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
409 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
410 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55;
411 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot;
412 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot;
413 }
414 break;
415 case cpuinfo_uarch_cortex_a53:
416 if (mr == 4 && nr == 8 && log2_kr == 0) {
417 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
418 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64;
419 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
420 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
421 }
422 break;
423 case cpuinfo_uarch_cortex_a55r0:
424 if (mr == 4 && nr == 8 && log2_kr == 0) {
425 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
426 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64;
427 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
428 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
429 }
430 break;
431 default:
432 break;
433 }
434 }
435 }
436 #endif // XNN_MAX_UARCH_TYPES > 1
437 #else // XNN_ENABLE_ASSEMBLY
438 if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
439 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
440 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
441 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
442 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
443 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
444 xnn_params.qs8.gemm.mr = 4;
445 xnn_params.qs8.gemm.nr = 8;
446 xnn_params.qs8.gemm.log2_kr = 2;
447 } else {
448 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
449 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
450 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
451 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
452 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
453 xnn_params.qs8.gemm.mr = 2;
454 xnn_params.qs8.gemm.nr = 8;
455 xnn_params.qs8.gemm.log2_kr = 1;
456 xnn_params.qs8.gemm.log2_sr = 2;
457 }
458 #endif // XNN_ENABLE_ASSEMBLY
459
460 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
461 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
462 xnn_params.qs8.dwconv[0].channel_tile = 16;
463 xnn_params.qs8.dwconv[0].primary_tile = 9;
464 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mla8_ld64;
465 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
466 xnn_params.qs8.dwconv[1].channel_tile = 8;
467 xnn_params.qs8.dwconv[1].primary_tile = 25;
468
469 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
470 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
471 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
472 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
473 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
474 .row_tile = 7,
475 .channel_tile = 8,
476 };
477
478 xnn_params.qs8.vadd = (struct vbinary_parameters) {
479 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x16,
480 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
481 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16,
482 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
483 .element_tile = 16,
484 };
485 xnn_params.qs8.vmul = (struct vbinary_parameters) {
486 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
487 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
488 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
489 .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
490 .element_tile = 16,
491 };
492 #endif // XNN_NO_QS8_OPERATORS
493
494 /*************************** QU8 AArch32 micro-kernels ***************************/
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    #if XNN_ENABLE_ASSEMBLY
      // Prefer dot-product micro-kernels (4x8, 4-element K reduction) when the CPU
      // has the NEON DOT extension; the check is skipped on iOS
      // (NOTE(review): presumably a platform limitation — confirm rationale upstream).
      if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
        xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
        xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
        xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
        xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
        xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
        xnn_params.qu8.gemm.mr = 4;
        xnn_params.qu8.gemm.nr = 8;
        xnn_params.qu8.gemm.log2_kr = 2;
      } else {
        // Otherwise pick a NEON MLAL-lane assembly variant tuned for the primary
        // (index 0) core's micro-architecture.  All variants share a 4x8 tile and
        // the same 1x8 single-row kernels; only the 4x8 scheduling/prefetch differs.
        switch (cpuinfo_get_uarch(0)->uarch) {
          case cpuinfo_uarch_cortex_a7:
            xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
            xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a35:
            // Cortex-A35 uses the Cortex-A7-scheduled kernel without prefetch.
            xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7);
            xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
            xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a53:
            xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53);
            xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
            xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a55r0:
          case cpuinfo_uarch_kryo:
            xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53);
            xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
            xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a72:
          case cpuinfo_uarch_exynos_m1:
          case cpuinfo_uarch_exynos_m2:
          case cpuinfo_uarch_exynos_m3:
            xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
            xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64);
            xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
          default:
            // Unknown cores get the generic ld64-scheduled kernel without prefetch.
            xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
            xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64);
            xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane);
            xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
            xnn_params.qu8.gemm.mr = 4;
            xnn_params.qu8.gemm.nr = 8;
            break;
        }
      }
      #if XNN_MAX_UARCH_TYPES > 1
        {
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
          // Overrides are only applied when the big-core tile shape (mr/nr/log2_kr)
          // matches, so all clusters run kernels with identical packing layout.
          const uint32_t mr = xnn_params.qu8.gemm.mr;
          const uint32_t nr = xnn_params.qu8.gemm.nr;
          const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
            const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
            if (uarch_info == NULL) {
              /* No more microarchitectures in the system */
              break;
            }

            switch (uarch_info->uarch) {
              case cpuinfo_uarch_cortex_a53:
                if (mr == 4 && nr == 8 && log2_kr == 0) {
                  xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53;
                  xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64;
                  xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
                  xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
                }
                break;
              case cpuinfo_uarch_cortex_a55r0:
                if (mr == 4 && nr == 8 && log2_kr == 0) {
                  xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53;
                  xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64;
                  xnn_params.qu8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
                  xnn_params.qu8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane;
                }
                break;
              default:
                break;
            }
          }
        }
      #endif  // XNN_MAX_UARCH_TYPES > 1
    #else  // XNN_ENABLE_ASSEMBLY
      // No assembly available: use NEON-intrinsics micro-kernels.  The neondot
      // branch mirrors the assembly path above.
      if (!XNN_PLATFORM_IOS && cpuinfo_has_arm_neon_dot()) {
        xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot);
        xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot);
        xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot);
        xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot);
        xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
        xnn_params.qu8.gemm.mr = 4;
        xnn_params.qu8.gemm.nr = 8;
        xnn_params.qu8.gemm.log2_kr = 2;
      } else {
        // Generic NEON MLAL fallback: smaller 2x8 tile with c2s4 packing
        // (log2_kr = 1, log2_sr = 2).
        xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
        xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
        xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
        xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
        xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
        xnn_params.qu8.gemm.mr = 2;
        xnn_params.qu8.gemm.nr = 8;
        xnn_params.qu8.gemm.log2_kr = 1;
        xnn_params.qu8.gemm.log2_sr = 2;
      }
    #endif  // XNN_ENABLE_ASSEMBLY
628
    // QU8 depthwise convolutions: 9-tap (3x3) at 16 channels/tile, 25-tap (5x5)
    // at 8 channels/tile, both rndnu-requantized NEON.
    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
    xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
    xnn_params.qu8.dwconv[0].channel_tile = 16;
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
    xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
    xnn_params.qu8.dwconv[1].channel_tile = 8;
    xnn_params.qu8.dwconv[1].primary_tile = 25;

    // Local average pooling: unipass covers up to 9 elements, multipass adds
    // 8 elements per increment.
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 8,
    };
    // Global average pooling: 7 rows per pass, rndnu requantization.
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
      .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
      .row_tile = 7,
      .channel_tile = 8,
    };
    // NOTE(review): element_tile is 8 here although the selected ukernels are
    // the __neon_ld64_x16 variants (16 elements per tile), and the analogous
    // qs8.vadd entry uses 16 — verify against upstream whether this should be 16.
    xnn_params.qu8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16,
      .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
      .element_tile = 8,
    };
    // QU8 multiply: same vmulc kernel reused for opc/ropc since multiply commutes.
    xnn_params.qu8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
      .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
      .element_tile = 16,
    };
  #endif  // XNN_NO_QU8_OPERATORS
669
670 /**************************** S8 AArch32 micro-kernels ****************************/
  #ifndef XNN_NO_S8_OPERATORS
    init_flags |= XNN_INIT_FLAG_S8;

    // Signed 8-bit clamp (min/max saturation), 64 elements per tile.
    xnn_params.s8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
      .init.s8_minmax = xnn_init_s8_minmax_neon_params,
      .element_tile = 64,
    };
    // Indirect bilinear interpolation (resize), 8 channels per tile.
    xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // Max pooling: 9 elements first pass (mr), 8 per subsequent pass (qr).
    xnn_params.s8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
      .init.s8 = xnn_init_s8_minmax_neon_params,
      .mr = 9,
      .qr = 8,
    };
  #endif  // XNN_NO_S8_OPERATORS
691
692 /**************************** U8 AArch32 micro-kernels ****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    // Unsigned 8-bit clamp, 64 elements per tile.
    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
      .init.u8_minmax = xnn_init_u8_minmax_neon_params,
      .element_tile = 64,
    };
    // Indirect bilinear interpolation, 8 channels per tile.
    xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // Max pooling: 9 elements first pass (mr), 8 per subsequent pass (qr).
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
      .init.u8 = xnn_init_u8_minmax_neon_params,
      .mr = 9,
      .qr = 8,
    };
    // Reduction max and LUT-based normalization (lut32norm has no NEON variant).
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
  #endif  // XNN_NO_U8_OPERATORS
715
716 /**************************** X8 AArch32 micro-kernels ****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    // Type-agnostic 8-bit lookup table (scalar) and channel-interleaving (zip)
    // micro-kernels: fixed fan-in of 2/3/4 plus a variable-fan-in (xm) variant.
    xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
    };
  #endif  // XNN_NO_X8_OPERATORS
728
729 /**************************** F32 AArch32 micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    #if XNN_ENABLE_ASSEMBLY
      // Select hand-written AArch32 NEON assembly GEMM/IGEMM micro-kernels tuned
      // for the primary (index 0) core's micro-architecture.  All variants share
      // a 4x8 tile, the same 1x8 single-row kernels, and scalar min/max params.
      switch (cpuinfo_get_uarch(0)->uarch) {
        case cpuinfo_uarch_cortex_a5:
        case cpuinfo_uarch_cortex_a7:
          xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
          xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
          xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;

        case cpuinfo_uarch_cortex_a53:
        case cpuinfo_uarch_cortex_a55r0:
          xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
          xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
          xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;

        case cpuinfo_uarch_cortex_a35:
        case cpuinfo_uarch_cortex_a55:
          xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
          xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
          xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;

        case cpuinfo_uarch_cortex_a57:
        case cpuinfo_uarch_cortex_a72:
        case cpuinfo_uarch_cortex_a73:
          // Big out-of-order cores get the Cortex-A75-scheduled kernel with prefetch.
          xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_prfm_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;

        case cpuinfo_uarch_krait:
        default:
          xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
          xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
          xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
          xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
          xnn_params.f32.gemm.mr = 4;
          xnn_params.f32.gemm.nr = 8;
          break;
      }
      #if XNN_MAX_UARCH_TYPES > 1
        {
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
          // Overrides are applied only when the big-core tile shape (mr/nr/log2_sr)
          // matches, so all clusters agree on the packed-weight layout.
          const uint32_t mr = xnn_params.f32.gemm.mr;
          const uint32_t nr = xnn_params.f32.gemm.nr;
          const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
            const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
            if (uarch_info == NULL) {
              /* No more microarchitectures in the system */
              break;
            }

            switch (uarch_info->uarch) {
              case cpuinfo_uarch_cortex_a53:
              case cpuinfo_uarch_cortex_a55r0:
                if (mr == 4 && nr == 8 && log2_sr == 0) {
                  xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
                  xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
                  xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
                  xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
                }
                break;
              case cpuinfo_uarch_cortex_a55:
                if (mr == 4 && nr == 8 && log2_sr == 0) {
                  xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
                  xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
                  xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
                  xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
                }
                break;
              default:
                break;
            }
          }
        }
      #endif  // XNN_MAX_UARCH_TYPES > 1
    #else  // XNN_ENABLE_ASSEMBLY
      // No assembly available: generic NEON-intrinsics 4x8 GEMM/IGEMM.
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 8;
    #endif  // XNN_ENABLE_ASSEMBLY
    // Secondary narrow GEMM (4x2 tile), used for operators with few output channels.
    xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
    xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
    xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.gemm2.mr = 4;
    xnn_params.f32.gemm2.nr = 2;
842
843 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neon;
844 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
845 xnn_params.f32.dwconv[0].channel_tile = 8,
846 xnn_params.f32.dwconv[0].primary_tile = 3,
847
848 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neon;
849 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
850 xnn_params.f32.dwconv[1].channel_tile = 8,
851 xnn_params.f32.dwconv[1].primary_tile = 4,
852
853 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neon;
854 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
855 xnn_params.f32.dwconv[2].channel_tile = 8;
856 xnn_params.f32.dwconv[2].primary_tile = 9;
857
858 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neon_acc2;
859 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
860 xnn_params.f32.dwconv[3].channel_tile = 8;
861 xnn_params.f32.dwconv[3].primary_tile = 25;
862
    // Local average pooling: 9 elements unipass, +8 per multipass increment.
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 4,
    };
    // Pixelwise (variable-divisor) average pooling; no init/update params needed.
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
      .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 4,
    };
    // Global average pooling: 7 rows per pass.
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
      .row_tile = 7,
      .channel_tile = 4,
    };
    // Max pooling: 9 elements first pass (mr), 8 per subsequent pass (qr).
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
    // Argmax pooling variants indexed by pooling-window size: unipass for up to
    // 4 and up to 9 elements, multipass (9+8) for larger windows.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
      .mr = 9,
      .qr = 8,
    };
    // Indirect bilinear interpolation (resize), 8 channels per tile.
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // Element-wise absolute value; parameter-free.
    xnn_params.f32.abs = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
      .element_tile = 8,
    };
    // Element-wise clamp (min/max).
    xnn_params.f32.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    // ELU: a fused-multiply-add polynomial variant when NEON FMA is available,
    // otherwise a LUT-based NEON variant.
    if (cpuinfo_has_arm_neon_fma()) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8,
        .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_p6_params,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8,
        .init.f32_elu = xnn_init_f32_elu_neon_rr2_lut16_p3_params,
        .element_tile = 8,
      };
    }
    xnn_params.f32.hswish = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
      .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
      .element_tile = 16,
    };
    xnn_params.f32.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
      .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.neg = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
      .element_tile = 8,
    };
    // Rounding kernels: ARMv8 NEON provides direct rounding instructions;
    // otherwise fall back to the plain NEON emulation.
    if (cpuinfo_has_arm_neon_v8()) {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
        .element_tile = 8,
      };
    } else {
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8,
        .element_tile = 8,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8,
        .element_tile = 8,
      };
    }
    xnn_params.f32.sigmoid = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8,
      .init.f32_sigmoid = xnn_init_f32_sigmoid_neon_rr2_lut64_p2_params,
      .element_tile = 8,
    };
    xnn_params.f32.sqr = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
      .element_tile = 8,
    };
    // Square root falls back to scalar sqrt (one element per tile).
    xnn_params.f32.sqrt = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
      .element_tile = 1,
    };
    xnn_params.f32.prelu = (struct prelu_parameters) {
      .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
      .row_tile = 2,
      .channel_tile = 8,
    };
    // Fused exp(x - max) + running-sum reduction used by softmax.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neon_rr2_lut64_p2_x8,
      .init = xnn_init_f32_expminus_neon_rr2_lut64_p2_params,
      .element_tile = 8,
    };
    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
    // Binary element-wise operators.  For commutative ops (add, mul, max, min,
    // squared-difference) the same constant-operand kernel serves opc and ropc;
    // non-commutative ops (div, sub) have dedicated reversed (r*c) kernels.
    xnn_params.f32.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    // Division uses scalar micro-kernels (AArch32 NEON has no vector FP divide).
    xnn_params.f32.vdiv = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 2,
    };
    xnn_params.f32.vmax = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmin = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsub = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
      .element_tile = 8,
    };
    // Fused multiply-by-constant + add-constant (used by channelwise affine ops).
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .channel_tile = 4,
      .row_tile = 2,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      // CHW (channels-first) layout micro-kernels for sparse-inference models.
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiplication (32-row output tile).
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
        .mr = 32,
        .nr = 1,
      };
      // HWC->CHW 3x3 stride-2 input convolution (3 input channels, symmetric padding).
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // CHW depthwise convolutions for the common kernel/stride combinations.
      xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
        .output_width_tile = 4,
        .output_height_tile = 2,
      };
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
        .output_width_tile = 4,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
        .output_width_tile = 4,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
        .output_width_tile = 4,
        .output_height_tile = 1,
      };
      // CHW global average pooling and bilinear resize.
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
        .channel_tile = 4,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
        .channel_tile = 1,
        .pixel_tile = 8,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
1103
    /*************************** VCVT AArch32 micro-kernels ***************************/
    #ifndef XNN_NO_VCVT_OPERATORS
      init_flags |= XNN_INIT_FLAG_VCVT;

      // F16<->F32 conversion: use hardware FP16 conversion when NEON-FP16 is available;
      // otherwise fall back to NEON integer-manipulation kernels that need init params.
      if (cpuinfo_has_arm_neon_fp16()) {
        xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
          .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
          .element_tile = 16,
        };
        xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
          .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
          .element_tile = 16,
        };
      } else {
        xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
          .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16,
          .init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params,
          .element_tile = 16,
        };
        xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
          .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neon_x8,
          .init.f32_f16_cvt = xnn_init_f32_f16_cvt_neon_params,
          .element_tile = 8,
        };
      }
      // F32 quantization: ARMv8 NEON kernels when available (with matching neonv8 init),
      // plain NEON otherwise.
      if (cpuinfo_has_arm_neon_v8()) {
        xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
          .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
          .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
          .element_tile = 32,
        };
        xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
          .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
          .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
          .element_tile = 32,
        };
      } else {
        xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
          .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neon_x32,
          .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neon_params,
          .element_tile = 32,
        };
        xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
          .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neon_x32,
          .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neon_params,
          .element_tile = 32,
        };
      }
      // Dequantization (QS8/QU8 -> F32) only needs baseline NEON.
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
        .element_tile = 32,
      };
    #endif  // XNN_NO_VCVT_OPERATORS
1163
    /**************************** X32 AArch32 micro-kernels ****************************/
    #ifndef XNN_NO_X32_OPERATORS
      init_flags |= XNN_INIT_FLAG_X32;

      // Type-agnostic 32-bit-element operators (unpooling and channel interleave/zip).
      xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
      xnn_params.x32.zip = (struct zip_parameters) {
        .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
        .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
        .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
        .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
      };
    #ifndef XNN_NO_NCHW_OPERATORS
      // Depth-to-space has no NEON variant here; the scalar kernel is used even on NEON.
      xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
        .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
        .channel_tile = 1,
        .pixel_tile = 1,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
    #endif  // XNN_NO_X32_OPERATORS

    /**************************** XX AArch32 micro-kernels ****************************/
    #ifndef XNN_NO_XX_OPERATORS
      init_flags |= XNN_INIT_FLAG_XX;

      // Untyped byte-level operators: copy is plain memcpy; fill and pad use NEON.
      xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
      xnn_params.xx.fill = (struct fill_parameters) {
        .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
        .row_tile = 1,
      };
      xnn_params.xx.pad = (struct pad_parameters) {
        .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
        .row_tile = 1,
      };
    #endif  // XNN_NO_XX_OPERATORS
1198
  // Fallback branch: ARM without NEON on non-mobile platforms — portable scalar kernels.
  } else if (!XNN_PLATFORM_MOBILE) {

    /*************************** QS8 AArch32 Pre-NEON micro-kernels ***************************/
    #ifndef XNN_NO_QS8_OPERATORS
      init_flags |= XNN_INIT_FLAG_QS8;

      // Scalar 2x2 (I)GEMM; the 1x2 variants handle the remainder rows.
      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qs8.gemm.mr = 2;
      xnn_params.qs8.gemm.nr = 2;

      // Depthwise conv: slot 0 covers 3x3 (9-tap), slot 1 covers 5x5 (25-tap) kernels.
      xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
      xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qs8.dwconv[0].channel_tile = 1;
      xnn_params.qs8.dwconv[0].primary_tile = 9;
      xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
      xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qs8.dwconv[1].channel_tile = 1;
      xnn_params.qs8.dwconv[1].primary_tile = 25;

      // Global average pooling: unipass for <=7 rows, multipass (7+7 per pass) beyond.
      xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
        .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
        .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
        .row_tile = 7,
        .channel_tile = 1,
      };
      // Addition is commutative, so ropc reuses the vaddc kernel.
      xnn_params.qs8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x1,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x1,
        .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
        .element_tile = 1,
      };
      xnn_params.qs8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
        .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
        .element_tile = 4,
      };
    #endif  // XNN_NO_QS8_OPERATORS
1245
    /*************************** QU8 AArch32 Pre-NEON micro-kernels ***************************/
    #ifndef XNN_NO_QU8_OPERATORS
      init_flags |= XNN_INIT_FLAG_QU8;

      // Mirrors the QS8 setup above for unsigned 8-bit quantization.
      xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
      xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic);
      xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
      xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qu8.gemm.mr = 2;
      xnn_params.qu8.gemm.nr = 2;

      xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x9__scalar_fmagic;
      xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qu8.dwconv[0].channel_tile = 1;
      xnn_params.qu8.dwconv[0].primary_tile = 9;
      xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_fmagic;
      xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
      xnn_params.qu8.dwconv[1].channel_tile = 1;
      xnn_params.qu8.dwconv[1].primary_tile = 25;

      // QU8 additionally provides windowed average pooling (9 primary + 8 incremental taps).
      xnn_params.qu8.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
        .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
        .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 1,
      };
      xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
        .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
        .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
        .row_tile = 7,
        .channel_tile = 1,
      };
      // Addition is commutative, so ropc reuses the vaddc kernel.
      xnn_params.qu8.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x1,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x1,
        .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
        .element_tile = 1,
      };
      xnn_params.qu8.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
        .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
        .element_tile = 4,
      };
    #endif  // XNN_NO_QU8_OPERATORS
1298
    /**************************** S8 AArch32 Pre-NEON micro-kernels ****************************/
    #ifndef XNN_NO_S8_OPERATORS
      init_flags |= XNN_INIT_FLAG_S8;

      // Signed 8-bit clamp, bilinear resize, and max pooling (9 primary + 8 incremental taps).
      xnn_params.s8.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
        .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
        .element_tile = 4,
      };
      xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
        .pixel_tile = 1,
        .channel_tile = 1,
      };
      xnn_params.s8.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
        .init.s8 = xnn_init_s8_minmax_scalar_params,
        .mr = 9,
        .qr = 8,
      };
    #endif  // XNN_NO_S8_OPERATORS

    /**************************** U8 AArch32 Pre-NEON micro-kernels ****************************/
    #ifndef XNN_NO_U8_OPERATORS
      init_flags |= XNN_INIT_FLAG_U8;

      // Unsigned 8-bit counterparts of the S8 operators, plus LUT-normalize and reduce-max.
      xnn_params.u8.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
        .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
        .element_tile = 4,
      };
      xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
        .pixel_tile = 1,
        .channel_tile = 1,
      };
      xnn_params.u8.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
        .init.u8 = xnn_init_u8_minmax_scalar_params,
        .mr = 9,
        .qr = 8,
      };
      xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
      xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
    #endif  // XNN_NO_U8_OPERATORS

    /**************************** X8 AArch32 Pre-NEON micro-kernels ****************************/
    #ifndef XNN_NO_X8_OPERATORS
      init_flags |= XNN_INIT_FLAG_X8;

      // Type-agnostic byte operators: table lookup and channel interleave/zip.
      xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
      xnn_params.x8.zip = (struct zip_parameters) {
        .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
        .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
        .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
        .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
      };
    #endif  // XNN_NO_X8_OPERATORS
1357
    /**************************** F32 AArch32 Pre-NEON micro-kernels ****************************/
    #ifndef XNN_NO_F32_OPERATORS
      init_flags |= XNN_INIT_FLAG_F32;

      // Scalar 4x4 (I)GEMM in three activation flavors: clamped (minmax), ReLU, and
      // linear (no activation). The 1x4 variants handle remainder rows.
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
      xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
      xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
      xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
      xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
      xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
      xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
      xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 4;
1378 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
1379 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
1380 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
1381 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
1382 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
1383 xnn_params.f32.gemm2.mr = 4;
1384 xnn_params.f32.gemm2.nr = 2;
1385
      // Scalar depthwise convolutions: slots 0..3 cover 3-, 4-, 9- (3x3) and 25-tap (5x5)
      // kernels, each with a clamped (minmax) and an unclamped (linear) unipass variant.
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
      xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
      xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[0].channel_tile = 1;
      xnn_params.f32.dwconv[0].primary_tile = 3;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
      xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
      xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[1].channel_tile = 1;
      xnn_params.f32.dwconv[1].primary_tile = 4;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
      xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
      xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[2].channel_tile = 1;
      xnn_params.f32.dwconv[2].primary_tile = 9;

      xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
      xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
      xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
      xnn_params.f32.dwconv[3].channel_tile = 1;
      xnn_params.f32.dwconv[3].primary_tile = 25;
1409
      // Average pooling (9 primary + 8 incremental taps per pass).
      xnn_params.f32.avgpool = (struct avgpool_parameters) {
        .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
        .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 1,
      };
      // Pixelwise average pooling (per-pixel scale); no init function is used here.
      xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
        .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
        .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
        .primary_tile = 9,
        .incremental_tile = 8,
        .channel_tile = 1,
      };
      // Global average pooling: unipass for <=7 rows, 7+7-row multipass beyond.
      xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
        .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
        .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
        .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
        .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
        .row_tile = 7,
        .channel_tile = 1,
      };
      xnn_params.f32.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .mr = 9,
        .qr = 8,
      };
      // Argmax pooling variants, selected by pooling-window size: 4-tap and 9-tap unipass,
      // plus a 9+8-tap multipass for larger windows.
      xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
        .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
        .mr = 4,
      };
      xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
        .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
        .mr = 9,
      };
      xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
        .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
        .mr = 9,
        .qr = 8,
      };
      xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
        .pixel_tile = 1,
        .channel_tile = 2,
      };
      // Element-wise unary operators. Kernels without an .init entry take no runtime params.
      xnn_params.f32.abs = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
        .element_tile = 4,
      };
      xnn_params.f32.clamp = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 4,
      };
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
        .element_tile = 4,
      };
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
      xnn_params.f32.lrelu = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
        .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
        .element_tile = 4,
      };
      xnn_params.f32.neg = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
        .element_tile = 4,
      };
      // Rounding modes (nearest-even, toward-zero, up, down) via libm, one element at a time.
      xnn_params.f32.rndne = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
        .element_tile = 1,
      };
      xnn_params.f32.rndz = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
        .element_tile = 1,
      };
      xnn_params.f32.rndu = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
        .element_tile = 1,
      };
      xnn_params.f32.rndd = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
        .element_tile = 1,
      };
      xnn_params.f32.sigmoid = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
        .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
        .element_tile = 2,
      };
      xnn_params.f32.sqr = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
        .element_tile = 4,
      };
      xnn_params.f32.sqrt = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
        .element_tile = 1,
      };
      // PReLU processes 2 rows x 4 channels per kernel invocation (see kernel name _2x4).
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
        .row_tile = 4,
        .channel_tile = 4,
      };
      // Softmax building blocks: fused exp(x - max) + store + reduce-add, and reduce-max.
      xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
        .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
        .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
        .element_tile = 4,
      };
      xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
      // Element-wise binary operators. For commutative ops (add, mul, max, min, sqrdiff)
      // the reversed-operand (ropc) slot reuses the constant-operand (opc) kernel; only
      // non-commutative ops (sub, div) have dedicated vr*c kernels.
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 8,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 2,
      };
      // max/min need no clamping params, so no .init entry.
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 8,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
        .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
        .element_tile = 8,
      };
      xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
        .init.f32 = xnn_init_f32_minmax_scalar_params,
        .channel_tile = 1,
        .row_tile = 2,
      };
    // NCHW (CHW-layout) operators, scalar variants.
    #ifndef XNN_NO_NCHW_OPERATORS
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiply in 8x1, 8x2 and 8x4 tile shapes.
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
        .mr = 8,
        .nr = 1,
      };
      xnn_params.f32.spmm2 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
        .mr = 8,
        .nr = 2,
      };
      xnn_params.f32.spmm4 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
        .mr = 8,
        .nr = 4,
      };
      // HWC->CHW 3x3 stride-2 convolution with 3 input channels (typical first layer).
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
        .output_channel_tile = 4,
        .output_height_tile = 1,
        .output_width_tile = 1,
      };
      // CHW depthwise convolutions for 3x3/5x5 kernels at stride 1 and 2.
      xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
        .output_width_tile = 1,
        .output_height_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 2,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 2,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 2,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
        .channel_tile = 1,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
        .channel_tile = 1,
        .pixel_tile = 4,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
    #endif  // XNN_NO_F32_OPERATORS
1633
    /*************************** VCVT AArch32 Pre-NEON micro-kernels ***************************/
    #ifndef XNN_NO_VCVT_OPERATORS
      init_flags |= XNN_INIT_FLAG_VCVT;

      // Scalar conversion kernels: F16<->F32, F32<->QS8/QU8 (quantize/dequantize).
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
        .element_tile = 4,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
        .element_tile = 2,
      };
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_imagic_x4,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
        .element_tile = 4,
      };
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x4,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
        .element_tile = 4,
      };
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
        .element_tile = 4,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
        .element_tile = 4,
      };
    #endif  // XNN_NO_VCVT_OPERATORS
1669
1670 /**************************** X32 AArch32 Pre-NEON micro-kernels ****************************/
1671 #ifndef XNN_NO_X32_OPERATORS
1672 init_flags |= XNN_INIT_FLAG_X32;
1673
1674 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
1675 xnn_params.x32.zip = (struct zip_parameters) {
1676 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
1677 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
1678 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
1679 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
1680 };
1681 #ifndef XNN_NO_NCHW_OPERATORS
1682 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1683 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
1684 .channel_tile = 1,
1685 .pixel_tile = 1,
1686 };
1687 #endif // XNN_NO_NCHW_OPERATORS
1688 #endif // XNN_NO_X32_OPERATORS
1689
1690 /**************************** XX AArch32 Pre-NEON micro-kernels ****************************/
1691 #ifndef XNN_NO_XX_OPERATORS
1692 init_flags |= XNN_INIT_FLAG_XX;
1693
1694 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1695 xnn_params.xx.fill = (struct fill_parameters) {
1696 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
1697 .row_tile = 1,
1698 };
1699 xnn_params.xx.pad = (struct pad_parameters) {
1700 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
1701 .row_tile = 1,
1702 };
1703 #endif // XNN_NO_XX_OPERATORS
1704 }
1705
1706 #elif XNN_ARCH_ARM64
1707
1708 /**************************** QC8 AArch64 micro-kernels ****************************/
1709 #ifndef XNN_NO_QC8_OPERATORS
1710 init_flags |= XNN_INIT_FLAG_QC8;
1711
1712 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1713 #if XNN_ENABLE_ASSEMBLY
1714 if (cpuinfo_has_arm_neon_dot()) {
1715 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1716 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1717 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1718 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1719 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1720 xnn_params.qc8.gemm.mr = 4;
1721 xnn_params.qc8.gemm.nr = 16;
1722 xnn_params.qc8.gemm.log2_kr = 2;
1723 } else {
1724 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1725 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1726 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1727 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1728 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1729 xnn_params.qc8.gemm.mr = 2;
1730 xnn_params.qc8.gemm.nr = 8;
1731 xnn_params.qc8.gemm.log2_kr = 3;
1732 }
1733 #else // !XNN_ENABLE_ASSEMBLY
1734 if (cpuinfo_has_arm_neon_dot()) {
1735 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1736 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1737 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1738 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1739 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1740 xnn_params.qc8.gemm.mr = 4;
1741 xnn_params.qc8.gemm.nr = 16;
1742 xnn_params.qc8.gemm.log2_kr = 2;
1743 } else {
1744 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1745 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1746 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1747 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1748 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1749 xnn_params.qc8.gemm.mr = 2;
1750 xnn_params.qc8.gemm.nr = 8;
1751 xnn_params.qc8.gemm.log2_kr = 1;
1752 xnn_params.qc8.gemm.log2_sr = 2;
1753 }
1754 #endif // XNN_ENABLE_ASSEMBLY
1755 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1756 #if XNN_ENABLE_ASSEMBLY
1757 if (cpuinfo_has_arm_neon_dot()) {
1758 switch (cpuinfo_get_core(0)->uarch) {
1759 case cpuinfo_uarch_cortex_a55:
1760 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1761 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1762 break;
1763 case cpuinfo_uarch_cortex_x1:
1764 case cpuinfo_uarch_cortex_a78:
1765 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1766 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128);
1767 break;
1768 default:
1769 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1770 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64);
1771 break;
1772 }
1773 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1774 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1775 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1776 xnn_params.qc8.gemm.mr = 4;
1777 xnn_params.qc8.gemm.nr = 16;
1778 xnn_params.qc8.gemm.log2_kr = 2;
1779 } else {
1780 switch (cpuinfo_get_core(0)->uarch) {
1781 case cpuinfo_uarch_cortex_a35:
1782 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1783 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1784 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1785 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1786 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1787 xnn_params.qc8.gemm.mr = 4;
1788 xnn_params.qc8.gemm.nr = 16;
1789 break;
1790
1791 case cpuinfo_uarch_cortex_a53:
1792 case cpuinfo_uarch_cortex_a55r0:
1793 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1794 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1795 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1796 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane);
1797 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1798 xnn_params.qc8.gemm.mr = 4;
1799 xnn_params.qc8.gemm.nr = 16;
1800 break;
1801
1802 case cpuinfo_uarch_cortex_a72:
1803 case cpuinfo_uarch_cortex_a73:
1804 case cpuinfo_uarch_kryo:
1805 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1806 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1807 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1808 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1809 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1810 xnn_params.qc8.gemm.mr = 2;
1811 xnn_params.qc8.gemm.nr = 8;
1812 xnn_params.qc8.gemm.log2_kr = 3;
1813 break;
1814
1815 default:
1816 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1817 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal);
1818 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1819 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal);
1820 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1821 xnn_params.qc8.gemm.mr = 2;
1822 xnn_params.qc8.gemm.nr = 8;
1823 xnn_params.qc8.gemm.log2_kr = 3;
1824 break;
1825 }
1826 }
1827 #if XNN_MAX_UARCH_TYPES > 1
1828 {
1829 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1830 const uint32_t mr = xnn_params.qc8.gemm.mr;
1831 const uint32_t nr = xnn_params.qc8.gemm.nr;
1832 const uint32_t log2_kr = xnn_params.qc8.gemm.log2_kr;
1833 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1834 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1835 if (uarch_info == NULL) {
1836 /* No more microarchitectures in the system */
1837 break;
1838 }
1839
1840 switch (uarch_info->uarch) {
1841 case cpuinfo_uarch_cortex_a53:
1842 case cpuinfo_uarch_cortex_a55r0:
1843 if (mr == 2 && nr == 8 && log2_kr == 3) {
1844 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1845 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1846 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1847 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
1848 }
1849 break;
1850
1851 case cpuinfo_uarch_cortex_a55:
1852 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
1853 xnn_params.qc8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1854 xnn_params.qc8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55;
1855 xnn_params.qc8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot;
1856 xnn_params.qc8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot;
1857 }
1858 break;
1859 default:
1860 break;
1861 }
1862 }
1863 }
1864 #endif // XNN_MAX_UARCH_TYPES > 1
1865 #else // !XNN_ENABLE_ASSEMBLY
1866 if (cpuinfo_has_arm_neon_dot()) {
1867 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__neondot);
1868 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__neondot);
1869 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__neondot);
1870 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c4__neondot);
1871 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1872 xnn_params.qc8.gemm.mr = 4;
1873 xnn_params.qc8.gemm.nr = 16;
1874 xnn_params.qc8.gemm.log2_kr = 2;
1875 } else {
1876 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1877 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x8c2s4__neonv8_mlal);
1878 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1879 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c2s4__neonv8_mlal);
1880 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1881 xnn_params.qc8.gemm.mr = 2;
1882 xnn_params.qc8.gemm.nr = 8;
1883 xnn_params.qc8.gemm.log2_kr = 1;
1884 xnn_params.qc8.gemm.log2_sr = 2;
1885 }
1886 #endif // XNN_ENABLE_ASSEMBLY
1887 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1888
1889 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__neonv8_mla8_ld64;
1890 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1891 xnn_params.qc8.dwconv[0].channel_tile = 16;
1892 xnn_params.qc8.dwconv[0].primary_tile = 9;
1893 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__neonv8_mla8_ld64;
1894 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_neonv8_params;
1895 xnn_params.qc8.dwconv[1].channel_tile = 16;
1896 xnn_params.qc8.dwconv[1].primary_tile = 25;
1897 #endif // XNN_NO_QC8_OPERATORS
1898
1899 /**************************** QS8 AArch64 micro-kernels ****************************/
1900 #ifndef XNN_NO_QS8_OPERATORS
1901 init_flags |= XNN_INIT_FLAG_QS8;
1902
1903 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1904 #if XNN_ENABLE_ASSEMBLY
1905 if (cpuinfo_has_arm_neon_dot()) {
1906 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1907 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1908 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1909 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1910 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1911 xnn_params.qs8.gemm.mr = 4;
1912 xnn_params.qs8.gemm.nr = 16;
1913 xnn_params.qs8.gemm.log2_kr = 2;
1914 } else {
1915 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1916 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
1917 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1918 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
1919 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1920 xnn_params.qs8.gemm.mr = 2;
1921 xnn_params.qs8.gemm.nr = 8;
1922 xnn_params.qs8.gemm.log2_kr = 3;
1923 }
1924 #else // !XNN_ENABLE_ASSEMBLY
1925 if (cpuinfo_has_arm_neon_dot()) {
1926 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
1927 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1928 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
1929 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1930 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1931 xnn_params.qs8.gemm.mr = 4;
1932 xnn_params.qs8.gemm.nr = 16;
1933 xnn_params.qs8.gemm.log2_kr = 2;
1934 } else {
1935 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1936 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
1937 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1938 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
1939 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1940 xnn_params.qs8.gemm.mr = 2;
1941 xnn_params.qs8.gemm.nr = 8;
1942 xnn_params.qs8.gemm.log2_kr = 1;
1943 xnn_params.qs8.gemm.log2_sr = 2;
1944 }
1945 #endif // XNN_ENABLE_ASSEMBLY
1946 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1947 #if XNN_ENABLE_ASSEMBLY
1948 if (cpuinfo_has_arm_neon_dot()) {
1949 switch (cpuinfo_get_core(0)->uarch) {
1950 case cpuinfo_uarch_cortex_a55:
1951 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1952 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
1953 break;
1954 case cpuinfo_uarch_cortex_x1:
1955 case cpuinfo_uarch_cortex_a78:
1956 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1957 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
1958 break;
1959 default:
1960 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
1961 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64);
1962 break;
1963 }
1964 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
1965 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
1966 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1967 xnn_params.qs8.gemm.mr = 4;
1968 xnn_params.qs8.gemm.nr = 16;
1969 xnn_params.qs8.gemm.log2_kr = 2;
1970 } else {
1971 switch (cpuinfo_get_core(0)->uarch) {
1972 case cpuinfo_uarch_cortex_a35:
1973 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1974 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64);
1975 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1976 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1977 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1978 xnn_params.qs8.gemm.mr = 4;
1979 xnn_params.qs8.gemm.nr = 16;
1980 break;
1981
1982 case cpuinfo_uarch_cortex_a53:
1983 case cpuinfo_uarch_cortex_a55r0:
1984 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1985 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53);
1986 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1987 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
1988 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
1989 xnn_params.qs8.gemm.mr = 4;
1990 xnn_params.qs8.gemm.nr = 16;
1991 break;
1992
1993 case cpuinfo_uarch_cortex_a72:
1994 case cpuinfo_uarch_cortex_a73:
1995 case cpuinfo_uarch_kryo:
1996 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1997 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm);
1998 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
1999 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm);
2000 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2001 xnn_params.qs8.gemm.mr = 2;
2002 xnn_params.qs8.gemm.nr = 8;
2003 xnn_params.qs8.gemm.log2_kr = 3;
2004 break;
2005
2006 default:
2007 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2008 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal);
2009 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2010 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal);
2011 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2012 xnn_params.qs8.gemm.mr = 2;
2013 xnn_params.qs8.gemm.nr = 8;
2014 xnn_params.qs8.gemm.log2_kr = 3;
2015 break;
2016 }
2017 }
2018 #if XNN_MAX_UARCH_TYPES > 1
2019 {
2020 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2021 const uint32_t mr = xnn_params.qs8.gemm.mr;
2022 const uint32_t nr = xnn_params.qs8.gemm.nr;
2023 const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
2024 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2025 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2026 if (uarch_info == NULL) {
2027 /* No more microarchitectures in the system */
2028 break;
2029 }
2030
2031 switch (uarch_info->uarch) {
2032 case cpuinfo_uarch_cortex_a53:
2033 case cpuinfo_uarch_cortex_a55r0:
2034 if (mr == 2 && nr == 8 && log2_kr == 3) {
2035 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2036 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2037 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2038 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53;
2039 }
2040 break;
2041
2042 case cpuinfo_uarch_cortex_a55:
2043 if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
2044 xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2045 xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2046 xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot;
2047 xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot;
2048 }
2049 break;
2050 default:
2051 break;
2052 }
2053 }
2054 }
2055 #endif // XNN_MAX_UARCH_TYPES > 1
2056 #else // !XNN_ENABLE_ASSEMBLY
2057 if (cpuinfo_has_arm_neon_dot()) {
2058 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2059 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2060 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2061 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2062 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2063 xnn_params.qs8.gemm.mr = 4;
2064 xnn_params.qs8.gemm.nr = 16;
2065 xnn_params.qs8.gemm.log2_kr = 2;
2066 } else {
2067 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2068 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal);
2069 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2070 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal);
2071 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2072 xnn_params.qs8.gemm.mr = 2;
2073 xnn_params.qs8.gemm.nr = 8;
2074 xnn_params.qs8.gemm.log2_kr = 1;
2075 xnn_params.qs8.gemm.log2_sr = 2;
2076 }
2077 #endif // XNN_ENABLE_ASSEMBLY
2078 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2079
2080 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mla8_ld64;
2081 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2082 xnn_params.qs8.dwconv[0].channel_tile = 16;
2083 xnn_params.qs8.dwconv[0].primary_tile = 9;
2084 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_rndnu_ukernel_up16x25__neon_mla8_ld64;
2085 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_rndnu_neon_params;
2086 xnn_params.qs8.dwconv[1].channel_tile = 16;
2087 xnn_params.qs8.dwconv[1].primary_tile = 25;
2088
2089 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
2090 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2091 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2092 .init.qs8 = xnn_init_qs8_avgpool_minmax_rndnu_neon_params,
2093 .update.qs8 = xnn_update_qs8_avgpool_minmax_rndnu_neon_params,
2094 .row_tile = 7,
2095 .channel_tile = 8,
2096 };
2097
2098 xnn_params.qs8.vadd = (struct vbinary_parameters) {
2099 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32,
2100 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
2101 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32,
2102 .init.qs8_addsub = xnn_init_qs8_add_minmax_neon_params,
2103 .element_tile = 32,
2104 };
2105 xnn_params.qs8.vmul = (struct vbinary_parameters) {
2106 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
2107 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2108 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2109 .init.qs8_mul = xnn_init_qs8_mul_minmax_rndnu_neon_params,
2110 .element_tile = 16,
2111 };
2112 #endif // XNN_NO_QS8_OPERATORS
2113
2114 /**************************** QU8 AArch64 micro-kernels ****************************/
2115 #ifndef XNN_NO_QU8_OPERATORS
2116 init_flags |= XNN_INIT_FLAG_QU8;
2117
2118 #if XNN_ENABLE_ASSEMBLY
2119 if (cpuinfo_has_arm_neon_dot()) {
2120 switch (cpuinfo_get_core(0)->uarch) {
2121 case cpuinfo_uarch_cortex_a55:
2122 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2123 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55);
2124 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2125 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2126 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2127 xnn_params.qu8.gemm.mr = 4;
2128 xnn_params.qu8.gemm.nr = 16;
2129 xnn_params.qu8.gemm.log2_kr = 2;
2130 break;
2131 default:
2132 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2133 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128);
2134 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2135 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2136 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2137 xnn_params.qu8.gemm.mr = 4;
2138 xnn_params.qu8.gemm.nr = 16;
2139 xnn_params.qu8.gemm.log2_kr = 2;
2140 break;
2141 }
2142     } else {
       /* No dot-product extension: select QU8 GEMM/IGEMM MLAL-lane kernels by the
        * micro-architecture of core 0 (the big core). All selections share the same
        * 4x16 tile and the 1x16 NEON MLAL-lane kernels for the single-row case;
        * log2_kr stays at its zero default from the static xnn_params initializer. */
2143       switch (cpuinfo_get_core(0)->uarch) {
2144         case cpuinfo_uarch_cortex_a53:
2145         case cpuinfo_uarch_cortex_a55r0:
             /* In-order cores: A53-scheduled kernels with prefetch. */
2146           xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
2147           xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
2148           xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2149           xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2150           xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2151           xnn_params.qu8.gemm.mr = 4;
2152           xnn_params.qu8.gemm.nr = 16;
2153           break;
2154
2155         case cpuinfo_uarch_cortex_a57:
2156         case cpuinfo_uarch_cortex_a72:
2157         case cpuinfo_uarch_cortex_a73:
2158         case cpuinfo_uarch_cortex_a75:
2159         case cpuinfo_uarch_cortex_a76:
2160         case cpuinfo_uarch_exynos_m1:
2161         case cpuinfo_uarch_exynos_m2:
2162         case cpuinfo_uarch_exynos_m3:
2163         case cpuinfo_uarch_exynos_m4:
             /* Out-of-order cores: A75-scheduled kernels with prefetch. */
2164           xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
2165           xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
2166           xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2167           xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2168           xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2169           xnn_params.qu8.gemm.mr = 4;
2170           xnn_params.qu8.gemm.nr = 16;
2171           break;
2172
2173         case cpuinfo_uarch_kryo:
             /* Kryo: portable (non-assembly) NEON MLAL-lane intrinsics kernels. */
2174           xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2175           xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2176           xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2177           xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2178           xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2179           xnn_params.qu8.gemm.mr = 4;
2180           xnn_params.qu8.gemm.nr = 16;
2181           break;
2182
2183         default:
             /* Unknown uarch: A75-scheduled assembly without prefetch as a safe default. */
2184           xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
2185           xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75);
2186           xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2187           xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2188           xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2189           xnn_params.qu8.gemm.mr = 4;
2190           xnn_params.qu8.gemm.nr = 16;
2191           break;
2192       }
2193     }
2194     #if XNN_MAX_UARCH_TYPES > 1
2195       {
2196         /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
           /* The guards on mr/nr/log2_kr ensure the little-core override is only
            * installed when it shares the big-core kernel's packing layout, so both
            * cluster variants can consume the same packed weights. */
2197         const uint32_t mr = xnn_params.qu8.gemm.mr;
2198         const uint32_t nr = xnn_params.qu8.gemm.nr;
2199         const uint32_t log2_kr = xnn_params.qu8.gemm.log2_kr;
           /* Slot 0 was filled above for the big core; start at the second uarch. */
2200         for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2201           const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2202           if (uarch_info == NULL) {
2203             /* No more microarchitectures in the system */
2204             break;
2205           }
2206
2207           switch (uarch_info->uarch) {
2208             case cpuinfo_uarch_cortex_a53:
2209             case cpuinfo_uarch_cortex_a55r0:
2210               if (mr == 4 && nr == 16 && log2_kr == 0) {
2211                 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
2212                 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53;
2213               }
2214               break;
2215
2216             case cpuinfo_uarch_cortex_a55:
                 /* A55-tuned NEONDOT kernel requires both matching c4 packing and
                  * the dot-product extension on this cluster. */
2217               if (mr == 4 && nr == 16 && log2_kr == 2 && cpuinfo_has_arm_neon_dot()) {
2218                 xnn_params.qu8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2219                 xnn_params.qu8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55;
2220               }
2221               break;
2222             default:
2223               break;
2224           }
2225         }
2226       }
2227     #endif  // XNN_MAX_UARCH_TYPES > 1
2228   #else  // !XNN_ENABLE_ASSEMBLY
     /* Assembly disabled: choose between intrinsics-only NEONDOT (c4-packed) and
      * NEON MLAL-lane QU8 GEMM kernels based on the dot-product extension. */
2229     if (cpuinfo_has_arm_neon_dot()) {
2230       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot);
2231       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot);
2232       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot);
2233       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot);
2234       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2235       xnn_params.qu8.gemm.mr = 4;
2236       xnn_params.qu8.gemm.nr = 16;
2237       xnn_params.qu8.gemm.log2_kr = 2;
2238     } else {
2239       xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2240       xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane);
2241       xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2242       xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane);
2243       xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2244       xnn_params.qu8.gemm.mr = 4;
2245       xnn_params.qu8.gemm.nr = 16;
2246     }
2247   #endif  // XNN_ENABLE_ASSEMBLY
2248
     /* QU8 depthwise convolution: dwconv[0] covers 3x3 (9-tap) kernels at 16
      * channels/iteration, dwconv[1] covers 5x5 (25-tap) kernels at 8 channels. */
2249     xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up16x9__neon_mul8;
2250     xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2251     xnn_params.qu8.dwconv[0].channel_tile = 16;
2252     xnn_params.qu8.dwconv[0].primary_tile = 9;
2253     xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_rndnu_ukernel_up8x25__neon_mul8;
2254     xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_rndnu_neon_params;
2255     xnn_params.qu8.dwconv[1].channel_tile = 8;
2256     xnn_params.qu8.dwconv[1].primary_tile = 25;
2257
     /* QU8 average pooling: unipass handles up to 9 rows, multipass adds 8 rows
      * per increment for larger pooling windows. */
2258     xnn_params.qu8.avgpool = (struct avgpool_parameters) {
2259       .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
2260       .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
2261       .init.qu8 = xnn_init_qu8_avgpool_minmax_neon_params,
2262       .primary_tile = 9,
2263       .incremental_tile = 8,
2264       .channel_tile = 8,
2265     };
     /* QU8 global average pooling (7-row tiles, 8-channel vectors). */
2266     xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
2267       .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7x__neon_c8,
2268       .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_rndnu_ukernel_7p7x__neon_c8,
2269       .init.qu8 = xnn_init_qu8_avgpool_minmax_rndnu_neon_params,
2270       .update.qu8 = xnn_update_qu8_avgpool_minmax_rndnu_neon_params,
2271       .row_tile = 7,
2272       .channel_tile = 8,
2273     };
     /* QU8 elementwise add: addition commutes, so the reversed operand-with-constant
      * kernel (ropc) reuses the opc kernel. */
2274     xnn_params.qu8.vadd = (struct vbinary_parameters) {
2275       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon_ld64_x32,
2276       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2277       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x32,
2278       .init.qu8_addsub = xnn_init_qu8_add_minmax_neon_params,
2279       .element_tile = 8,
2280     };
     /* QU8 elementwise multiply: likewise commutative, ropc == opc. */
2281     xnn_params.qu8.vmul = (struct vbinary_parameters) {
2282       .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_rndnu_ukernel__neon_ld64_x16,
2283       .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2284       .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_rndnu_ukernel__neon_ld64_x16,
2285       .init.qu8_mul = xnn_init_qu8_mul_minmax_rndnu_neon_params,
2286       .element_tile = 16,
2287     };
2288   #endif  // XNN_NO_QU8_OPERATORS
2289
2290   /**************************** S8 AArch64 micro-kernels ****************************/
2291   #ifndef XNN_NO_S8_OPERATORS
2292     init_flags |= XNN_INIT_FLAG_S8;
2293
     /* Signed 8-bit clamp: processes 64 elements per NEON kernel invocation. */
2294     xnn_params.s8.clamp = (struct vunary_parameters) {
2295       .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__neon_x64,
2296       .init.s8_minmax = xnn_init_s8_minmax_neon_params,
2297       .element_tile = 64,
2298     };
     /* Indirect bilinear interpolation, one pixel / 16 channels per iteration. */
2299     xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
2300       .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__neon_c16,
2301       .pixel_tile = 1,
2302       .channel_tile = 16,
2303     };
     /* Max pooling: primary pass over 9 rows (mr), 8 extra rows per repeat (qr). */
2304     xnn_params.s8.maxpool = (struct maxpool_parameters) {
2305       .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__neon_c16,
2306       .init.s8 = xnn_init_s8_minmax_neon_params,
2307       .mr = 9,
2308       .qr = 8,
2309     };
2310   #endif  // XNN_NO_S8_OPERATORS
2311
2312   /**************************** U8 AArch64 micro-kernels ****************************/
2313   #ifndef XNN_NO_U8_OPERATORS
2314     init_flags |= XNN_INIT_FLAG_U8;
2315
     /* Unsigned 8-bit kernels mirror the S8 section above: clamp (64 elements),
      * bilinear (16 channels), and 9p8x max pooling (16 channels). */
2316     xnn_params.u8.clamp = (struct vunary_parameters) {
2317       .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__neon_x64,
2318       .init.u8_minmax = xnn_init_u8_minmax_neon_params,
2319       .element_tile = 64,
2320     };
2321     xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
2322       .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__neon_c16,
2323       .pixel_tile = 1,
2324       .channel_tile = 16,
2325     };
2326     xnn_params.u8.maxpool = (struct maxpool_parameters) {
2327       .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
2328       .init.u8 = xnn_init_u8_minmax_neon_params,
2329       .mr = 9,
2330       .qr = 8,
2331     };
     /* lut32norm has no NEON implementation here; the scalar kernel is used. */
2332     xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
2333     xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
2334   #endif  // XNN_NO_U8_OPERATORS
2335
2336   /**************************** X8 AArch64 micro-kernels ****************************/
2337   #ifndef XNN_NO_X8_OPERATORS
2338     init_flags |= XNN_INIT_FLAG_X8;
2339
     /* Type-agnostic 8-bit kernels: table lookup (TBX-based) and channel
      * interleave (zip) for 2, 3, 4, or a variable number (xm) of streams. */
2340     xnn_params.x8.lut = xnn_x8_lut_ukernel__neon_tbx128x4_x64;
2341     xnn_params.x8.zip = (struct zip_parameters) {
2342       .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
2343       .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
2344       .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
2345       .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
2346     };
2347   #endif  // XNN_NO_X8_OPERATORS
2348
2349   /**************************** F16 AArch64 micro-kernels ****************************/
2350   #ifndef XNN_NO_F16_OPERATORS
     /* F16 operators are only enabled when the CPU supports half-precision
      * arithmetic (ARMv8.2 FP16); otherwise XNN_INIT_FLAG_F16 stays clear. */
2351     if (cpuinfo_has_arm_neon_fp16_arith()) {
2352       init_flags |= XNN_INIT_FLAG_F16;
2353       xnn_params.f16.gemm.mr = 6;
2354       xnn_params.f16.gemm.nr = 16;
2355
2356       #if XNN_ENABLE_ASSEMBLY
         /* 6x16 F16 GEMM, scheduled per big-core uarch (ld32 is the generic form). */
2357         switch (cpuinfo_get_core(0)->uarch) {
2358           case cpuinfo_uarch_cortex_a55:
2359             xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55);
2360             break;
2361
2362           case cpuinfo_uarch_cortex_a75:
2363           case cpuinfo_uarch_cortex_x1:
2364             xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75);
2365             break;
2366
2367           default:
2368             xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
2369             break;
2370         }
2371         xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
2372
2373         #if XNN_MAX_UARCH_TYPES > 1
2374         {
2375           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
2376           const uint32_t mr = xnn_params.f16.gemm.mr;
2377           const uint32_t nr = xnn_params.f16.gemm.nr;
             /* Slot 0 belongs to the big core configured above. */
2378           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2379             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2380             if (uarch_info == NULL) {
2381               /* No more microarchitectures in the system */
2382               break;
2383             }
2384
2385             switch (uarch_info->uarch) {
2386               case cpuinfo_uarch_cortex_a55:
2387                 if (mr == 6 && nr == 16) {
2388                   xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a55;
2389                 }
2390                 break;
2391
2392               case cpuinfo_uarch_cortex_a55r0:
                   /* A55r0 gets the intrinsics ld64 kernel rather than the
                    * A55-tuned assembly. */
2393                 if (mr == 6 && nr == 16) {
2394                   xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64;
2395                 }
2396                 break;
2397
2398               /* Cortex A75 is the medium core Exynos 9820 (M4) */
2399               case cpuinfo_uarch_cortex_a75:
2400                 if (mr == 6 && nr == 16) {
2401                   xnn_params.f16.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_cortex_a75;
2402                 }
2403                 break;
2404
2405               default:
2406                 break;
2407             }
2408           }
2409         }
2410         #endif  // XNN_MAX_UARCH_TYPES > 1
2411       #else  // XNN_ENABLE_ASSEMBLY
         /* No assembly: intrinsics ld64 kernels for both multi-row and 1-row cases. */
2412         xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2413         xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2414       #endif  // XNN_ENABLE_ASSEMBLY
       /* IGEMM always uses the intrinsics ld64 kernels on this path. */
2415       xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2416       xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2417       xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_neon_params;
2418
       /* F16 depthwise convolution: 4-, 9- and 25-tap variants (dwconv[0..2]). */
2419       xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
2420       xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_neon_params;
2421       xnn_params.f16.dwconv[0].channel_tile = 16;
2422       xnn_params.f16.dwconv[0].primary_tile = 4;
2423
2424       xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
2425       xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_neon_params;
2426       xnn_params.f16.dwconv[1].channel_tile = 16;
2427       xnn_params.f16.dwconv[1].primary_tile = 9;
2428
2429       xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
2430       xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_neon_params;
2431       xnn_params.f16.dwconv[2].channel_tile = 8;
2432       xnn_params.f16.dwconv[2].primary_tile = 25;
2433
       /* F16 global average pooling: 7-row tiles, 8 half-precision channels. */
2434       xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
2435         .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
2436         .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
2437         .init.f16 = xnn_init_f16_scaleminmax_neon_params,
2438         .update.f16 = xnn_update_f16_scaleminmax_neon_params,
2439         .row_tile = 7,
2440         .channel_tile = 8,
2441       };
2442
       /* F16 max pooling: 9 primary rows, 8 additional rows per repeat. */
2443       xnn_params.f16.maxpool = (struct maxpool_parameters) {
2444         .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__neonfp16arith_c8,
2445         .init.f16 = xnn_init_f16_minmax_neon_params,
2446         .mr = 9,
2447         .qr = 8,
2448       };
2449
2450       xnn_params.f16.prelu = (struct prelu_parameters) {
2451         .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__neonfp16arith_2x16,
2452         .row_tile = 2,
2453         .channel_tile = 16,
2454       };
2455
       /* F16 add/multiply: commutative, so ropc reuses the opc kernel. */
2456       xnn_params.f16.vadd = (struct vbinary_parameters) {
2457         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
2458         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
2459         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
2460         .init.f16_minmax = xnn_init_f16_minmax_neon_params,
2461         .element_tile = 16,
2462       };
2463       xnn_params.f16.vmul = (struct vbinary_parameters) {
2464         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
2465         .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
2466         .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
2467         .init.f16_minmax = xnn_init_f16_minmax_neon_params,
2468         .element_tile = 16,
2469       };
2470       xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
2471         .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
2472         .init.f16 = xnn_init_f16_minmax_neon_params,
2473         .channel_tile = 8,
2474         .row_tile = 2,
2475       };
2476
2477       xnn_params.f16.hswish = (struct vunary_parameters) {
2478         .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__neonfp16arith_x16,
2479         .init.f16_hswish = xnn_init_f16_hswish_neon_params,
2480         .element_tile = 16,
2481       };
2482     }
2483   #endif  // XNN_NO_F16_OPERATORS
2484
2485   /**************************** F32 AArch64 micro-kernels ****************************/
2486   #ifndef XNN_NO_F32_OPERATORS
2487     init_flags |= XNN_INIT_FLAG_F32;
2488
     /* On iOS/macOS a single 6x8 configuration is used without per-uarch
      * dispatch (no big.LITTLE uarch enumeration on these platforms). */
2489     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2490       #if XNN_ENABLE_ASSEMBLY
2491         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2492         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2493         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2494         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2495         xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2496         xnn_params.f32.gemm.mr = 6;
2497         xnn_params.f32.gemm.nr = 8;
2498       #else  // !XNN_ENABLE_ASSEMBLY
2499         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2500         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2501         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2502         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2503         xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2504         xnn_params.f32.gemm.mr = 6;
2505         xnn_params.f32.gemm.nr = 8;
2506       #endif  // XNN_ENABLE_ASSEMBLY
2507     #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
2508       #if XNN_ENABLE_ASSEMBLY
         /* F32 GEMM/IGEMM selection by big-core uarch. Tile geometry varies per
          * case (mr 4 or 6, nr 8; Exynos M1/M2 additionally use s4 shuffle
          * packing via log2_sr); the HMP fixup below keys off these fields. */
2509         switch (cpuinfo_get_core(0)->uarch) {
2510           case cpuinfo_uarch_cortex_a57:
2511             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2512             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
2513             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2514             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2515             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2516             xnn_params.f32.gemm.mr = 6;
2517             xnn_params.f32.gemm.nr = 8;
2518             break;
2519           case cpuinfo_uarch_cortex_a72:
               /* A72 prefers the smaller 4x8 tile (with prefetch). */
2520             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2521             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_prfm_cortex_a75);
2522             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2523             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2524             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2525             xnn_params.f32.gemm.mr = 4;
2526             xnn_params.f32.gemm.nr = 8;
2527             break;
2528           case cpuinfo_uarch_cortex_a75:
2529           case cpuinfo_uarch_cortex_a76:
2530           case cpuinfo_uarch_exynos_m3:
2531           case cpuinfo_uarch_exynos_m4:
2532             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2533             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2534             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2535             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2536             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2537             xnn_params.f32.gemm.mr = 6;
2538             xnn_params.f32.gemm.nr = 8;
               /* When JIT is available, register matching code generators too. */
2539             #if XNN_ENABLE_JIT
2540               xnn_params.f32.gemm.generator.gemm = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2541               xnn_params.f32.gemm.generator.igemm = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_6x8__aarch64_neonfma_prfm_cortex_a75);
2542               xnn_params.f32.gemm.generator.gemm1 = xnn_init_hmp_gemm_codegen(xnn_generate_f32_gemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2543               xnn_params.f32.gemm.generator.igemm1 = xnn_init_hmp_igemm_codegen(xnn_generate_f32_igemm_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2544             #endif
2545             break;
2546           case cpuinfo_uarch_exynos_m1:
2547           case cpuinfo_uarch_exynos_m2:
               /* Exynos M1/M2: shuffled (s4) intrinsics kernels; note log2_sr = 2. */
2548             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
2549             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
2550             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
2551             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
2552             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2553             xnn_params.f32.gemm.mr = 6;
2554             xnn_params.f32.gemm.nr = 8;
2555             xnn_params.f32.gemm.log2_sr = 2;
2556             break;
2557           case cpuinfo_uarch_cortex_a53:
2558           case cpuinfo_uarch_cortex_a55r0:
2559             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2560             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
2561             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2562             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2563             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2564             xnn_params.f32.gemm.mr = 6;
2565             xnn_params.f32.gemm.nr = 8;
2566             break;
2567           case cpuinfo_uarch_cortex_a35:
2568           case cpuinfo_uarch_cortex_a55:
2569             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2570             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
2571             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2572             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
2573             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2574             xnn_params.f32.gemm.mr = 6;
2575             xnn_params.f32.gemm.nr = 8;
2576             break;
2577           case cpuinfo_uarch_cortex_a73:
2578             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
2579             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
2580             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2581             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_prfm_cortex_a75);
2582             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2583             xnn_params.f32.gemm.mr = 6;
2584             xnn_params.f32.gemm.nr = 8;
2585             break;
2586           case cpuinfo_uarch_cortex_a77:
2587           case cpuinfo_uarch_exynos_m5:
2588           case cpuinfo_uarch_kryo:
2589             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2590             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
2591             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2592             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
2593             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2594             xnn_params.f32.gemm.mr = 4;
2595             xnn_params.f32.gemm.nr = 8;
2596             break;
2597           case cpuinfo_uarch_cortex_a78:
2598           case cpuinfo_uarch_cortex_x1:
2599           default:
2600             xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
2601             xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_ld128);
2602             xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2603             xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2604             xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2605             xnn_params.f32.gemm.mr = 6;
2606             xnn_params.f32.gemm.nr = 8;
2607             break;
2608         }
2609         #if XNN_MAX_UARCH_TYPES > 1
2610         {
2611           /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
             /* Overrides are only installed when the little-core kernel shares the
              * big core's tile geometry and packing (mr/nr/log2_sr), so packed
              * weights remain valid across clusters. */
2612           const uint32_t mr = xnn_params.f32.gemm.mr;
2613           const uint32_t nr = xnn_params.f32.gemm.nr;
2614           const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
2615           for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
2616             const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
2617             if (uarch_info == NULL) {
2618               /* No more microarchitectures in the system */
2619               break;
2620             }
2621
2622             switch (uarch_info->uarch) {
2623               case cpuinfo_uarch_cortex_a53:
2624               case cpuinfo_uarch_cortex_a55r0:
2625                 if (mr == 6 && nr == 8 && log2_sr == 0) {
2626                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2627                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
2628                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2629                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2630                 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
2631                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2632                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
2633                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2634                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2635                 }
2636                 break;
2637               case cpuinfo_uarch_cortex_a55:
2638                 if (mr == 6 && nr == 8 && log2_sr == 0) {
2639                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2640                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
2641                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2642                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2643                 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
2644                   xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2645                   xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
2646                   xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2647                   xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
2648                 }
2649                 break;
2650               default:
2651                 break;
2652             }
2653           }
2654         }
2655         #endif  // XNN_MAX_UARCH_TYPES > 1
2656       #else  // !XNN_ENABLE_ASSEMBLY
         /* No assembly: intrinsics NEONFMA lane kernels for all F32 GEMM slots. */
2657         xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2658         xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
2659         xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2660         xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
2661         xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
2662         xnn_params.f32.gemm.mr = 6;
2663         xnn_params.f32.gemm.nr = 8;
2664       #endif  // XNN_ENABLE_ASSEMBLY
2665     #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
     /* Secondary GEMM with a narrow 4x2 tile, shared across all uarches. */
2666     xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
2667     xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
2668     xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
2669     xnn_params.f32.gemm2.mr = 4;
2670     xnn_params.f32.gemm2.nr = 2;
2671
     /* F32 depthwise convolution: 3- and 4-tap variants at 8 channels/iteration. */
2672     xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__neonfma;
2673     xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
2674     xnn_params.f32.dwconv[0].channel_tile = 8;
2675     xnn_params.f32.dwconv[0].primary_tile = 3;
2676
2677     xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
2678     xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
2679     xnn_params.f32.dwconv[1].channel_tile = 8;
2680     xnn_params.f32.dwconv[1].primary_tile = 4;
2681
     /* The 9-tap variant is fixed on iOS/macOS; other platforms pick it per uarch
      * in the switch that follows. */
2682     #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2683       xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2684       xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2685       xnn_params.f32.dwconv[2].channel_tile = 8;
2686       xnn_params.f32.dwconv[2].primary_tile = 9;
2687 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
2688 switch (cpuinfo_get_core(0)->uarch) {
2689 case cpuinfo_uarch_kryo:
2690 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2691 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2692 xnn_params.f32.dwconv[2].channel_tile = 8;
2693 xnn_params.f32.dwconv[2].primary_tile = 9;
2694 break;
2695 #if XNN_ENABLE_ASSEMBLY
2696 case cpuinfo_uarch_cortex_a53:
2697 case cpuinfo_uarch_cortex_a55r0:
2698 case cpuinfo_uarch_cortex_a55:
2699 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
2700 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2701 xnn_params.f32.dwconv[2].channel_tile = 4;
2702 xnn_params.f32.dwconv[2].primary_tile = 9;
2703 break;
2704 #endif // XNN_ENABLE_ASSEMBLY
2705 default:
2706 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
2707 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
2708 xnn_params.f32.dwconv[2].channel_tile = 8;
2709 xnn_params.f32.dwconv[2].primary_tile = 9;
2710 break;
2711 }
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
2713
2714 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__neonfma_acc2;
2715 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
2716 xnn_params.f32.dwconv[3].channel_tile = 8;
2717 xnn_params.f32.dwconv[3].primary_tile = 25;
2718
2719 xnn_params.f32.avgpool = (struct avgpool_parameters) {
2720 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
2721 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
2722 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2723 .primary_tile = 9,
2724 .incremental_tile = 8,
2725 .channel_tile = 4,
2726 };
2727 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
2728 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
2729 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
2730 .primary_tile = 9,
2731 .incremental_tile = 8,
2732 .channel_tile = 4,
2733 };
2734 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
2735 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
2736 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
2737 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
2738 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
2739 .row_tile = 7,
2740 .channel_tile = 4,
2741 };
2742 xnn_params.f32.maxpool = (struct maxpool_parameters) {
2743 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
2744 .init.f32 = xnn_init_f32_minmax_scalar_params,
2745 .mr = 9,
2746 .qr = 8,
2747 };
2748 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
2749 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
2750 .mr = 4,
2751 };
2752 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
2753 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
2754 .mr = 9,
2755 };
2756 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
2757 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
2758 .mr = 9,
2759 .qr = 8,
2760 };
2761 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
2762 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
2763 .pixel_tile = 1,
2764 .channel_tile = 8,
2765 };
2766 xnn_params.f32.abs = (struct vunary_parameters) {
2767 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8,
2768 .element_tile = 8,
2769 };
2770 xnn_params.f32.clamp = (struct vunary_parameters) {
2771 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__neon_x8,
2772 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2773 .element_tile = 8,
2774 };
2775 xnn_params.f32.elu = (struct vunary_parameters) {
2776 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16,
2777 .init.f32_elu = xnn_init_f32_elu_neonfma_rr1_lut16_p3_params,
2778 .element_tile = 16,
2779 };
2780 xnn_params.f32.hswish = (struct vunary_parameters) {
2781 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__neon_x16,
2782 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
2783 .element_tile = 16,
2784 };
2785 xnn_params.f32.lrelu = (struct vunary_parameters) {
2786 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8,
2787 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
2788 .element_tile = 8,
2789 };
2790 xnn_params.f32.neg = (struct vunary_parameters) {
2791 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8,
2792 .element_tile = 8,
2793 };
2794 xnn_params.f32.rndne = (struct vunary_parameters) {
2795 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8,
2796 .element_tile = 8,
2797 };
2798 xnn_params.f32.rndz = (struct vunary_parameters) {
2799 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8,
2800 .element_tile = 8,
2801 };
2802 xnn_params.f32.rndu = (struct vunary_parameters) {
2803 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8,
2804 .element_tile = 8,
2805 };
2806 xnn_params.f32.rndd = (struct vunary_parameters) {
2807 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8,
2808 .element_tile = 8,
2809 };
2810 xnn_params.f32.sigmoid = (struct vunary_parameters) {
2811 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16,
2812 .init.f32_sigmoid = xnn_init_f32_sigmoid_neonfma_rr1_lut64_p2_params,
2813 .element_tile = 16,
2814 };
2815 xnn_params.f32.sqr = (struct vunary_parameters) {
2816 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8,
2817 .element_tile = 8,
2818 };
2819 xnn_params.f32.sqrt = (struct vunary_parameters) {
2820 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4,
2821 .element_tile = 4,
2822 };
2823 xnn_params.f32.prelu = (struct prelu_parameters) {
2824 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
2825 .row_tile = 2,
2826 .channel_tile = 8,
2827 };
2828 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
2829 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_rr1_lut64_p2_x16,
2830 .init = xnn_init_f32_expminus_neonfma_rr1_lut64_p2_params,
2831 .element_tile = 16,
2832 };
2833 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
2834 xnn_params.f32.vadd = (struct vbinary_parameters) {
2835 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
2836 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
2837 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
2838 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2839 .element_tile = 8,
2840 };
2841 xnn_params.f32.vdiv = (struct vbinary_parameters) {
2842 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
2843 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
2844 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
2845 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2846 .element_tile = 8,
2847 };
2848 xnn_params.f32.vmax = (struct vbinary_parameters) {
2849 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
2850 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
2851 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
2852 .element_tile = 8,
2853 };
2854 xnn_params.f32.vmin = (struct vbinary_parameters) {
2855 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
2856 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
2857 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
2858 .element_tile = 8,
2859 };
2860 xnn_params.f32.vmul = (struct vbinary_parameters) {
2861 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
2862 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
2863 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
2864 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2865 .element_tile = 8,
2866 };
2867 xnn_params.f32.vsub = (struct vbinary_parameters) {
2868 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
2869 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
2870 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
2871 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
2872 .element_tile = 8,
2873 };
2874 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
2875 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
2876 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
2877 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
2878 .element_tile = 8,
2879 };
2880 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
2881 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
2882 .init.f32 = xnn_init_f32_minmax_scalar_params,
2883 .channel_tile = 4,
2884 .row_tile = 2,
2885 };
2886 #ifndef XNN_NO_NCHW_OPERATORS
2887 init_flags |= XNN_INIT_FLAG_CHW_OPT;
2888
2889 xnn_params.f32.spmm = (struct spmm_parameters) {
2890 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
2891 .mr = 32,
2892 .nr = 1,
2893 };
2894 xnn_params.f32.spmm2 = (struct spmm_parameters) {
2895 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
2896 .mr = 32,
2897 .nr = 2,
2898 };
2899 xnn_params.f32.spmm4 = (struct spmm_parameters) {
2900 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
2901 .mr = 32,
2902 .nr = 4,
2903 };
2904 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
2905 .ukernel_with_symm_padding =
2906 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
2907 .output_channel_tile = 4,
2908 .output_height_tile = 2,
2909 .output_width_tile = 2,
2910 };
2911 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2912 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
2913 .output_width_tile = 4,
2914 .output_height_tile = 3,
2915 };
2916 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
2917 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
2918 .output_width_tile = 4,
2919 .output_height_tile = 2,
2920 };
2921 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
2922 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
2923 .output_width_tile = 4,
2924 .output_height_tile = 4,
2925 };
2926 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2927 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
2928 .output_width_tile = 4,
2929 .output_height_tile = 1,
2930 };
2931 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2932 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
2933 .channel_tile = 4,
2934 };
2935 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
2936 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
2937 .channel_tile = 1,
2938 .pixel_tile = 8,
2939 };
2940 #endif // XNN_NO_NCHW_OPERATORS
2941 #endif // XNN_NO_F32_OPERATORS
2942
2943 /*************************** VCVT AArch64 micro-kernels ***************************/
2944 #ifndef XNN_NO_VCVT_OPERATORS
2945 init_flags |= XNN_INIT_FLAG_VCVT;
2946
2947 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
2948 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16,
2949 .element_tile = 16,
2950 };
2951 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
2952 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16,
2953 .element_tile = 16,
2954 };
2955 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
2956 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__neonv8_x32,
2957 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_neonv8_params,
2958 .element_tile = 32,
2959 };
2960 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
2961 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
2962 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_neonv8_params,
2963 .element_tile = 32,
2964 };
2965 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
2966 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__neon_x32,
2967 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_neon_params,
2968 .element_tile = 32,
2969 };
2970 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
2971 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__neon_x32,
2972 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_neon_params,
2973 .element_tile = 32,
2974 };
2975 #endif // XNN_NO_VCVT_OPERATORS
2976
2977 /**************************** X32 AArch64 micro-kernels ****************************/
2978 #ifndef XNN_NO_X32_OPERATORS
2979 init_flags |= XNN_INIT_FLAG_X32;
2980
2981 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
2982 xnn_params.x32.zip = (struct zip_parameters) {
2983 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
2984 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
2985 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
2986 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
2987 };
2988 #ifndef XNN_NO_NCHW_OPERATORS
2989 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2990 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
2991 .channel_tile = 1,
2992 .pixel_tile = 1,
2993 };
2994 #endif // XNN_NO_NCHW_OPERATORS
2995 #endif // XNN_NO_X32_OPERATORS
2996
2997 /**************************** XX AArch64 micro-kernels ****************************/
2998 #ifndef XNN_NO_XX_OPERATORS
2999 init_flags |= XNN_INIT_FLAG_XX;
3000
3001 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
3002 xnn_params.xx.fill = (struct fill_parameters) {
3003 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__neon_x64,
3004 .row_tile = 1,
3005 };
3006 xnn_params.xx.pad = (struct pad_parameters) {
3007 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__neon,
3008 .row_tile = 1,
3009 };
  #endif  // XNN_NO_XX_OPERATORS
3011
3012 #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
3013 if (!cpuinfo_has_x86_sse2()) {
3014 xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
3015 return;
3016 }
3017
3018 /**************************** QC8 x86 micro-kernels ****************************/
3019 #ifndef XNN_NO_QC8_OPERATORS
3020 init_flags |= XNN_INIT_FLAG_QC8;
3021
3022 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3023 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3024 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3025 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3026 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3027 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx512_params;
3028 xnn_params.qc8.gemm.mr = 4;
3029 xnn_params.qc8.gemm.nr = 16;
3030 xnn_params.qc8.gemm.log2_kr = 3;
3031 } else if (cpuinfo_has_x86_xop()) {
3032 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3033 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3034 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3035 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3036 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3037 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3038 xnn_params.qc8.gemm.mr = 2;
3039 xnn_params.qc8.gemm.nr = 4;
3040 xnn_params.qc8.gemm.log2_kr = 3;
3041 } else if (cpuinfo_has_x86_avx2()) {
3042 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3043 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3044 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3045 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3046 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_avx2_params;
3047 xnn_params.qc8.gemm.mr = 3;
3048 xnn_params.qc8.gemm.nr = 8;
3049 xnn_params.qc8.gemm.log2_kr = 3;
3050 } else if (cpuinfo_has_x86_avx()) {
3051 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3052 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3053 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3054 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3055 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3056 xnn_params.qc8.gemm.mr = 2;
3057 xnn_params.qc8.gemm.nr = 4;
3058 xnn_params.qc8.gemm.log2_kr = 3;
3059 } else if (cpuinfo_has_x86_sse4_1()) {
3060 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3061 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3062 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3063 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3064 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse4_params;
3065 xnn_params.qc8.gemm.mr = 3;
3066 xnn_params.qc8.gemm.nr = 4;
3067 xnn_params.qc8.gemm.log2_kr = 3;
3068 } else {
3069 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3070 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3071 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3072 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3073 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_sse2_params;
3074 xnn_params.qc8.gemm.mr = 3;
3075 xnn_params.qc8.gemm.nr = 4;
3076 xnn_params.qc8.gemm.log2_kr = 3;
3077 }
3078
3079 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3080 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3081 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx512_params;
3082 xnn_params.qc8.dwconv[0].channel_tile = 32;
3083 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3084 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx512_params;
3085 xnn_params.qc8.dwconv[1].channel_tile = 32;
3086 } else if (cpuinfo_has_x86_xop()) {
3087 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3088 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
3089 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3090 xnn_params.qc8.dwconv[0].channel_tile = 16;
3091 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
3092 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3093 xnn_params.qc8.dwconv[1].channel_tile = 16;
3094 } else if (cpuinfo_has_x86_avx2()) {
3095 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3096 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_avx2_params;
3097 xnn_params.qc8.dwconv[0].channel_tile = 16;
3098 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3099 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_avx2_params;
3100 xnn_params.qc8.dwconv[1].channel_tile = 16;
3101 } else if (cpuinfo_has_x86_avx()) {
3102 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
3103 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3104 xnn_params.qc8.dwconv[0].channel_tile = 16;
3105 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
3106 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3107 xnn_params.qc8.dwconv[1].channel_tile = 16;
3108 } else if (cpuinfo_has_x86_sse4_1()) {
3109 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
3110 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3111 xnn_params.qc8.dwconv[0].channel_tile = 8;
3112 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
3113 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse4_params;
3114 xnn_params.qc8.dwconv[1].channel_tile = 8;
3115 } else if (cpuinfo_has_x86_sse2()) {
3116 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3117 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_sse2_params;
3118 xnn_params.qc8.dwconv[0].channel_tile = 8;
3119 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3120 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_sse2_params;
3121 xnn_params.qc8.dwconv[1].channel_tile = 8;
3122 }
3123 xnn_params.qc8.dwconv[0].primary_tile = 9;
3124 xnn_params.qc8.dwconv[1].primary_tile = 25;
3125 #endif // XNN_NO_QC8_OPERATORS
3126
3127 /**************************** QS8 x86 micro-kernels ****************************/
3128 #ifndef XNN_NO_QS8_OPERATORS
3129 init_flags |= XNN_INIT_FLAG_QS8;
3130
3131 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3132 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3133 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3134 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3135 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3136 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
3137 xnn_params.qs8.gemm.mr = 4;
3138 xnn_params.qs8.gemm.nr = 16;
3139 xnn_params.qs8.gemm.log2_kr = 3;
3140 } else if (cpuinfo_has_x86_xop()) {
3141 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3142 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3143 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3144 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3145 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3146 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3147 xnn_params.qs8.gemm.mr = 2;
3148 xnn_params.qs8.gemm.nr = 4;
3149 xnn_params.qs8.gemm.log2_kr = 3;
3150 } else if (cpuinfo_has_x86_avx2()) {
3151 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3152 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3153 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3154 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3155 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
3156 xnn_params.qs8.gemm.mr = 3;
3157 xnn_params.qs8.gemm.nr = 8;
3158 xnn_params.qs8.gemm.log2_kr = 3;
3159 } else if (cpuinfo_has_x86_avx()) {
3160 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3161 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3162 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3163 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3164 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3165 xnn_params.qs8.gemm.mr = 2;
3166 xnn_params.qs8.gemm.nr = 4;
3167 xnn_params.qs8.gemm.log2_kr = 3;
3168 } else if (cpuinfo_has_x86_sse4_1()) {
3169 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3170 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3171 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3172 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3173 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3174 xnn_params.qs8.gemm.mr = 3;
3175 xnn_params.qs8.gemm.nr = 4;
3176 xnn_params.qs8.gemm.log2_kr = 3;
3177 } else {
3178 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3179 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3180 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3181 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3182 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
3183 xnn_params.qs8.gemm.mr = 3;
3184 xnn_params.qs8.gemm.nr = 4;
3185 xnn_params.qs8.gemm.log2_kr = 3;
3186 }
3187
3188 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3189 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3190 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
3191 xnn_params.qs8.dwconv[0].channel_tile = 32;
3192 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3193 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx512_params;
3194 xnn_params.qs8.dwconv[1].channel_tile = 32;
3195 } else if (cpuinfo_has_x86_xop()) {
3196 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3197 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul16_add16;
3198 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3199 xnn_params.qs8.dwconv[0].channel_tile = 16;
3200 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul16_add16;
3201 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3202 xnn_params.qs8.dwconv[1].channel_tile = 16;
3203 } else if (cpuinfo_has_x86_avx2()) {
3204 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3205 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
3206 xnn_params.qs8.dwconv[0].channel_tile = 16;
3207 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3208 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_avx2_params;
3209 xnn_params.qs8.dwconv[1].channel_tile = 16;
3210 } else if (cpuinfo_has_x86_avx()) {
3211 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16;
3212 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3213 xnn_params.qs8.dwconv[0].channel_tile = 16;
3214 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16;
3215 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3216 xnn_params.qs8.dwconv[1].channel_tile = 16;
3217 } else if (cpuinfo_has_x86_sse4_1()) {
3218 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16_add16;
3219 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3220 xnn_params.qs8.dwconv[0].channel_tile = 8;
3221 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16_add16;
3222 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse4_params;
3223 xnn_params.qs8.dwconv[1].channel_tile = 8;
3224 } else if (cpuinfo_has_x86_sse2()) {
3225 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16_add16;
3226 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
3227 xnn_params.qs8.dwconv[0].channel_tile = 8;
3228 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16_add16;
3229 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_sse2_params;
3230 xnn_params.qs8.dwconv[1].channel_tile = 8;
3231 }
3232 xnn_params.qs8.dwconv[0].primary_tile = 9;
3233 xnn_params.qs8.dwconv[1].primary_tile = 25;
3234
3235 if (cpuinfo_has_x86_sse4_1()) {
3236 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
3237 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3238 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
3239 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse4_params,
3240 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse4_params,
3241 .row_tile = 7,
3242 .channel_tile = 8,
3243 };
3244 } else {
3245 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
3246 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3247 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
3248 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_sse2_params,
3249 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_sse2_params,
3250 .row_tile = 7,
3251 .channel_tile = 8,
3252 };
3253 }
3254
3255 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3256 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3257 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
3258 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3259 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3260 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx512_params,
3261 .element_tile = 16,
3262 };
3263 } else if (cpuinfo_has_x86_xop()) {
3264 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3265 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
3266 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3267 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3268 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
3269 .element_tile = 8,
3270 };
3271 } else if (cpuinfo_has_x86_avx2()) {
3272 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3273 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
3274 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3275 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3276 .init.qs8_addsub = xnn_init_qs8_add_minmax_avx2_params,
3277 .element_tile = 16,
3278 };
3279 } else if (cpuinfo_has_x86_avx()) {
3280 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3281 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
3282 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3283 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3284 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul32_params,
3285 .element_tile = 8,
3286 };
3287 } else if (cpuinfo_has_x86_sse4_1()) {
3288 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3289 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
3290 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3291 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3292 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse4_mul16_params,
3293 .element_tile = 8,
3294 };
3295 } else {
3296 xnn_params.qs8.vadd = (struct vbinary_parameters) {
3297 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
3298 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3299 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3300 .init.qs8_addsub = xnn_init_qs8_add_minmax_sse2_params,
3301 .element_tile = 8,
3302 };
3303 }
3304 if (cpuinfo_has_x86_avx()) {
3305 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3306 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3307 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3308 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3309 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3310 .element_tile = 16,
3311 };
3312 } else if (cpuinfo_has_x86_sse4_1()) {
3313 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3314 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3315 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3316 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3317 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse4_params,
3318 .element_tile = 16,
3319 };
3320 } else {
3321 xnn_params.qs8.vmul = (struct vbinary_parameters) {
3322 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3323 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3324 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3325 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_sse2_params,
3326 .element_tile = 8,
3327 };
3328 }
3329 #endif // XNN_NO_QS8_OPERATORS
3330
3331 /**************************** QU8 x86 micro-kernels ****************************/
3332 #ifndef XNN_NO_QU8_OPERATORS
3333 init_flags |= XNN_INIT_FLAG_QU8;
3334
3335 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3336 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3337 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx);
3338 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3339 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx);
3340 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3341 xnn_params.qu8.gemm.mr = 4;
3342 xnn_params.qu8.gemm.nr = 16;
3343 xnn_params.qu8.gemm.log2_kr = 3;
3344 } else if (cpuinfo_has_x86_xop()) {
3345 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3346 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3347 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64);
3348 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3349 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64);
3350 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3351 xnn_params.qu8.gemm.mr = 2;
3352 xnn_params.qu8.gemm.nr = 4;
3353 xnn_params.qu8.gemm.log2_kr = 3;
3354 } else if (cpuinfo_has_x86_avx2()) {
3355 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2);
3356 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2);
3357 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2);
3358 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2);
3359 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3360 xnn_params.qu8.gemm.mr = 3;
3361 xnn_params.qu8.gemm.nr = 8;
3362 xnn_params.qu8.gemm.log2_kr = 3;
3363 } else if (cpuinfo_has_x86_avx()) {
3364 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3365 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128);
3366 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3367 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128);
3368 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3369 xnn_params.qu8.gemm.mr = 2;
3370 xnn_params.qu8.gemm.nr = 4;
3371 xnn_params.qu8.gemm.log2_kr = 3;
3372 } else if (cpuinfo_has_x86_sse4_1()) {
3373 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3374 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64);
3375 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3376 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64);
3377 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3378 xnn_params.qu8.gemm.mr = 3;
3379 xnn_params.qu8.gemm.nr = 4;
3380 xnn_params.qu8.gemm.log2_kr = 3;
3381 } else {
3382 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3383 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64);
3384 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3385 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64);
3386 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3387 xnn_params.qu8.gemm.mr = 3;
3388 xnn_params.qu8.gemm.nr = 4;
3389 xnn_params.qu8.gemm.log2_kr = 3;
3390 }
3391
3392 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3393 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x9__avx512skx_mul32;
3394 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3395 xnn_params.qu8.dwconv[0].channel_tile = 32;
3396 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up32x25__avx512skx_mul32;
3397 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx512_params;
3398 xnn_params.qu8.dwconv[1].channel_tile = 32;
3399 } else if (cpuinfo_has_x86_xop()) {
3400 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
3401 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__xop_mul32;
3402 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3403 xnn_params.qu8.dwconv[0].channel_tile = 16;
3404 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__xop_mul32;
3405 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3406 xnn_params.qu8.dwconv[1].channel_tile = 16;
3407 } else if (cpuinfo_has_x86_avx2()) {
3408 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx2_mul32;
3409 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3410 xnn_params.qu8.dwconv[0].channel_tile = 16;
3411 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx2_mul32;
3412 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_avx2_params;
3413 xnn_params.qu8.dwconv[1].channel_tile = 16;
3414 } else if (cpuinfo_has_x86_avx()) {
3415 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16;
3416 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3417 xnn_params.qu8.dwconv[0].channel_tile = 16;
3418 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16;
3419 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3420 xnn_params.qu8.dwconv[1].channel_tile = 16;
3421 } else if (cpuinfo_has_x86_sse4_1()) {
3422 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse41_mul16;
3423 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3424 xnn_params.qu8.dwconv[0].channel_tile = 8;
3425 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse41_mul16;
3426 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3427 xnn_params.qu8.dwconv[1].channel_tile = 8;
3428 } else if (cpuinfo_has_x86_sse2()) {
3429 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__sse2_mul16;
3430 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3431 xnn_params.qu8.dwconv[0].channel_tile = 8;
3432 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__sse2_mul16;
3433 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_sse2_params;
3434 xnn_params.qu8.dwconv[1].channel_tile = 8;
3435 }
3436 xnn_params.qu8.dwconv[0].primary_tile = 9;
3437 xnn_params.qu8.dwconv[1].primary_tile = 25;
3438
3439 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
3440 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
3441 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
3442 .init.qu8 = xnn_init_qu8_avgpool_minmax_sse2_params,
3443 .primary_tile = 9,
3444 .incremental_tile = 8,
3445 .channel_tile = 8,
3446 };
3447 if (cpuinfo_has_x86_sse4_1()) {
3448 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3449 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse41_c8,
3450 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse41_c8,
3451 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse4_params,
3452 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse4_params,
3453 .row_tile = 7,
3454 .channel_tile = 8,
3455 };
3456 } else {
3457 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
3458 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__sse2_c8,
3459 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__sse2_c8,
3460 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_sse2_params,
3461 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_sse2_params,
3462 .row_tile = 7,
3463 .channel_tile = 8,
3464 };
3465 }
3466
3467 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3468 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3469 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx512skx_mul32_ld128_x16,
3470 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3471 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx512skx_mul32_ld128_x16,
3472 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx512_params,
3473 .element_tile = 16,
3474 };
3475 } else if (cpuinfo_has_x86_xop()) {
3476 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3477 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
3478 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3479 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
3480 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
3481 .element_tile = 8,
3482 };
3483 } else if (cpuinfo_has_x86_avx2()) {
3484 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3485 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx2_mul32_ld64_x16,
3486 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3487 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx2_mul32_ld64_x16,
3488 .init.qu8_addsub = xnn_init_qu8_add_minmax_avx2_params,
3489 .element_tile = 16,
3490 };
3491 } else if (cpuinfo_has_x86_avx()) {
3492 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3493 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8,
3494 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3495 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8,
3496 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse4_params,
3497 .element_tile = 8,
3498 };
3499 } else if (cpuinfo_has_x86_sse4_1()) {
3500 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3501 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
3502 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3503 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
3504 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
3505 .element_tile = 8,
3506 };
3507 } else {
3508 xnn_params.qu8.vadd = (struct vbinary_parameters) {
3509 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
3510 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3511 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
3512 .init.qu8_addsub = xnn_init_qu8_add_minmax_sse2_params,
3513 .element_tile = 8,
3514 };
3515 }
3516 if (cpuinfo_has_x86_avx()) {
3517 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3518 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3519 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3520 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16,
3521 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3522 .element_tile = 16,
3523 };
3524 } else if (cpuinfo_has_x86_sse4_1()) {
3525 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3526 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3527 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3528 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse41_mul16_ld64_x16,
3529 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3530 .element_tile = 16,
3531 };
3532 } else {
3533 xnn_params.qu8.vmul = (struct vbinary_parameters) {
3534 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3535 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3536 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__sse2_mul16_ld64_x8,
3537 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_sse2_params,
3538 .element_tile = 8,
3539 };
3540 }
3541 #endif // XNN_NO_QU8_OPERATORS
3542
3543  /**************************** S8 x86 micro-kernels ****************************/
3544 #ifndef XNN_NO_S8_OPERATORS
3545 init_flags |= XNN_INIT_FLAG_S8;
3546
3547 if (cpuinfo_has_x86_sse4_1()) {
3548 xnn_params.s8.clamp = (struct vunary_parameters) {
3549 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse41_x64,
3550 .init.s8_minmax = xnn_init_s8_minmax_sse4_params,
3551 .element_tile = 64,
3552 };
3553 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3554 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse41_c16,
3555 .pixel_tile = 1,
3556 .channel_tile = 16,
3557 };
3558 xnn_params.s8.maxpool = (struct maxpool_parameters) {
3559 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse41_c16,
3560 .init.s8 = xnn_init_s8_minmax_sse4_params,
3561 .mr = 9,
3562 .qr = 8,
3563 };
3564 } else {
3565 xnn_params.s8.clamp = (struct vunary_parameters) {
3566 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__sse2_x64,
3567 .init.s8_minmax = xnn_init_s8_minmax_sse2_params,
3568 .element_tile = 64,
3569 };
3570 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
3571 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__sse2_c8,
3572 .pixel_tile = 1,
3573 .channel_tile = 8,
3574 };
3575 xnn_params.s8.maxpool = (struct maxpool_parameters) {
3576 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__sse2_c16,
3577 .init.s8 = xnn_init_s8_minmax_sse2_params,
3578 .mr = 9,
3579 .qr = 8,
3580 };
3581 }
3582 #endif // XNN_NO_S8_OPERATORS
3583
3584 /**************************** U8 x86 micro-kernels ****************************/
3585 #ifndef XNN_NO_U8_OPERATORS
3586 init_flags |= XNN_INIT_FLAG_U8;
3587
3588 xnn_params.u8.clamp = (struct vunary_parameters) {
3589 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__sse2_x64,
3590 .init.u8_minmax = xnn_init_u8_minmax_sse2_params,
3591 .element_tile = 64,
3592 };
3593 if (cpuinfo_has_x86_sse4_1()) {
3594 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3595 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse41_c16,
3596 .pixel_tile = 1,
3597 .channel_tile = 16,
3598 };
3599 } else {
3600 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
3601 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__sse2_c8,
3602 .pixel_tile = 1,
3603 .channel_tile = 8,
3604 };
3605 }
3606 xnn_params.u8.maxpool = (struct maxpool_parameters) {
3607 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
3608 .init.u8 = xnn_init_u8_minmax_sse2_params,
3609 .mr = 9,
3610 .qr = 8,
3611 };
3612 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
3613 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
3614 #endif // XNN_NO_U8_OPERATORS
3615
3616 /**************************** X8 x86 micro-kernels ****************************/
3617 #ifndef XNN_NO_X8_OPERATORS
3618 init_flags |= XNN_INIT_FLAG_X8;
3619
3620 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
3621 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx512skx_vpshufb_x64;
3622 } else if (cpuinfo_has_x86_avx2()) {
3623 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx2_x128;
3624 } else if (cpuinfo_has_x86_avx()) {
3625 xnn_params.x8.lut = xnn_x8_lut_ukernel__avx_x64;
3626 } else {
3627 // Note: SSSE3 version is usually slower than scalar
3628 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
3629 }
3630 xnn_params.x8.zip = (struct zip_parameters) {
3631 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
3632 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
3633 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
3634 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
3635 };
3636 #endif // XNN_NO_X8_OPERATORS
3637
3638 /**************************** F16 x86 micro-kernels ****************************/
3639 #ifndef XNN_NO_F16_OPERATORS
3640 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
3641 init_flags |= XNN_INIT_FLAG_F16;
3642
3643 xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_4x16__avx2_broadcast);
3644 xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_4x16__avx2_broadcast);
3645 xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__avx2_broadcast);
3646 xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__avx2_broadcast);
3647 xnn_params.f16.gemm.init.f16 = xnn_init_f16_scaleminmax_avx_params;
3648 xnn_params.f16.gemm.mr = 4;
3649 xnn_params.f16.gemm.nr = 16;
3650
3651 xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__fma3;
3652 xnn_params.f16.dwconv[0].init.f16 = xnn_init_f16_minmax_avx_params;
3653 xnn_params.f16.dwconv[0].channel_tile = 16;
3654 xnn_params.f16.dwconv[0].primary_tile = 4;
3655
3656 xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__fma3;
3657 xnn_params.f16.dwconv[1].init.f16 = xnn_init_f16_minmax_avx_params;
3658 xnn_params.f16.dwconv[1].channel_tile = 16;
3659 xnn_params.f16.dwconv[1].primary_tile = 9;
3660
3661 xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__fma3_acc2;
3662 xnn_params.f16.dwconv[2].init.f16 = xnn_init_f16_minmax_avx_params;
3663 xnn_params.f16.dwconv[2].channel_tile = 8;
3664 xnn_params.f16.dwconv[2].primary_tile = 25;
3665
3666 xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
3667 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8,
3668 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8,
3669 .init.f16 = xnn_init_f16_scaleminmax_avx_params,
3670 .update.f16 = xnn_update_f16_scaleminmax_avx_params,
3671 .row_tile = 7,
3672 .channel_tile = 8,
3673 };
3674
3675 xnn_params.f16.maxpool = (struct maxpool_parameters) {
3676 .ukernel = (xnn_maxpool_ukernel_function) xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8,
3677 .init.f16 = xnn_init_f16_minmax_avx_params,
3678 .mr = 9,
3679 .qr = 8,
3680 };
3681
3682 xnn_params.f16.prelu = (struct prelu_parameters) {
3683 .ukernel = (xnn_prelu_ukernel_function) xnn_f16_prelu_ukernel__f16c_2x16,
3684 .row_tile = 2,
3685 .channel_tile = 16,
3686 };
3687
3688 xnn_params.f16.vadd = (struct vbinary_parameters) {
3689 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__f16c_x16,
3690 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3691 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__f16c_x16,
3692 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3693 .element_tile = 16,
3694 };
3695 xnn_params.f16.vmul = (struct vbinary_parameters) {
3696 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__f16c_x16,
3697 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3698 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__f16c_x16,
3699 .init.f16_minmax = xnn_init_f16_minmax_avx_params,
3700 .element_tile = 16,
3701 };
3702 xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
3703 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__fma3_2x,
3704 .init.f16 = xnn_init_f16_minmax_avx_params,
3705 .channel_tile = 8,
3706 .row_tile = 2,
3707 };
3708 xnn_params.f16.hswish = (struct vunary_parameters) {
3709 .ukernel = (xnn_univector_ukernel_function) xnn_f16_vhswish_ukernel__f16c_x16,
3710 .init.f16_hswish = xnn_init_f16_hswish_avx_params,
3711 .element_tile = 16,
3712 };
3713 }
3714 #endif // XNN_NO_F16_OPERATORS
3715
3716 /**************************** F32 x86 micro-kernels ****************************/
3717 #ifndef XNN_NO_F32_OPERATORS
3718 init_flags |= XNN_INIT_FLAG_F32;
3719
3720 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3721 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
3722 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
3723 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
3724 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
3725 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
3726 xnn_params.f32.gemm.mr = 7;
3727 xnn_params.f32.gemm.nr = 16;
3728 } else if (cpuinfo_has_x86_fma3()) {
3729 switch (cpuinfo_get_core(0)->uarch) {
3730 case cpuinfo_uarch_zen:
3731 case cpuinfo_uarch_dhyana:
3732 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
3733 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
3734 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
3735 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
3736 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
3737 xnn_params.f32.gemm.mr = 4;
3738 xnn_params.f32.gemm.nr = 16;
3739 xnn_params.f32.gemm.log2_sr = 2;
3740 break;
3741 default:
3742 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
3743 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
3744 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
3745 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
3746 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
3747 xnn_params.f32.gemm.mr = 5;
3748 xnn_params.f32.gemm.nr = 16;
3749 break;
3750 }
3751 } else if (cpuinfo_has_x86_avx()) {
3752 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
3753 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
3754 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
3755 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
3756 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_avx_params;
3757 xnn_params.f32.gemm.mr = 5;
3758 xnn_params.f32.gemm.nr = 16;
3759 } else {
3760 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
3761 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
3762 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
3763 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
3764 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_sse_params;
3765 xnn_params.f32.gemm.mr = 4;
3766 xnn_params.f32.gemm.nr = 8;
3767 }
3768 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
3769 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
3770 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_sse_params;
3771 xnn_params.f32.gemm2.mr = 4;
3772 xnn_params.f32.gemm2.nr = 2;
3773 xnn_params.f32.gemm2.log2_kr = 2;
3774
3775 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3776 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx512f;
3777 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
3778 xnn_params.f32.dwconv[0].channel_tile = 16;
3779 xnn_params.f32.dwconv[0].primary_tile = 3;
3780
3781 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
3782 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
3783 xnn_params.f32.dwconv[1].channel_tile = 16;
3784 xnn_params.f32.dwconv[1].primary_tile = 4;
3785
3786 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
3787 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
3788 xnn_params.f32.dwconv[2].channel_tile = 16;
3789 xnn_params.f32.dwconv[2].primary_tile = 9;
3790
3791 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
3792 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
3793 xnn_params.f32.dwconv[3].channel_tile = 16;
3794 xnn_params.f32.dwconv[3].primary_tile = 25;
3795 } else if (cpuinfo_has_x86_fma3()) {
3796 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__fma3;
3797 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
3798 xnn_params.f32.dwconv[0].channel_tile = 16;
3799 xnn_params.f32.dwconv[0].primary_tile = 3;
3800
3801 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
3802 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
3803 xnn_params.f32.dwconv[1].channel_tile = 16;
3804 xnn_params.f32.dwconv[1].primary_tile = 4;
3805
3806 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
3807 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
3808 xnn_params.f32.dwconv[2].channel_tile = 16;
3809 xnn_params.f32.dwconv[2].primary_tile = 9;
3810
3811 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
3812 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3813 xnn_params.f32.dwconv[3].channel_tile = 8;
3814 xnn_params.f32.dwconv[3].primary_tile = 25;
3815 } else if (cpuinfo_has_x86_avx()) {
3816 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x3__avx;
3817 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_avx_params;
3818 xnn_params.f32.dwconv[0].channel_tile = 16;
3819 xnn_params.f32.dwconv[0].primary_tile = 3;
3820
3821 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
3822 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_avx_params;
3823 xnn_params.f32.dwconv[1].channel_tile = 16;
3824 xnn_params.f32.dwconv[1].primary_tile = 4;
3825
3826 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
3827 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_avx_params;
3828 xnn_params.f32.dwconv[2].channel_tile = 16;
3829 xnn_params.f32.dwconv[2].primary_tile = 9;
3830
3831 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
3832 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_avx_params;
3833 xnn_params.f32.dwconv[3].channel_tile = 8;
3834 xnn_params.f32.dwconv[3].primary_tile = 25;
3835 } else {
3836 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__sse;
3837 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_sse_params;
3838 xnn_params.f32.dwconv[0].channel_tile = 8;
3839 xnn_params.f32.dwconv[0].primary_tile = 3;
3840
3841 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
3842 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_sse_params;
3843 xnn_params.f32.dwconv[1].channel_tile = 8;
3844 xnn_params.f32.dwconv[1].primary_tile = 4;
3845
3846 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
3847 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_sse_params;
3848 xnn_params.f32.dwconv[2].channel_tile = 8;
3849 xnn_params.f32.dwconv[2].primary_tile = 9;
3850
3851 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
3852 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_sse_params;
3853 xnn_params.f32.dwconv[3].channel_tile = 8;
3854 xnn_params.f32.dwconv[3].primary_tile = 25;
3855 }
3856 xnn_params.f32.avgpool = (struct avgpool_parameters) {
3857 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
3858 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
3859 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
3860 .primary_tile = 9,
3861 .incremental_tile = 8,
3862 .channel_tile = 4,
3863 };
3864 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
3865 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
3866 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
3867 .primary_tile = 9,
3868 .incremental_tile = 8,
3869 .channel_tile = 4,
3870 };
3871 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
3872 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
3873 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
3874 .init.f32 = xnn_init_f32_scaleminmax_sse_params,
3875 .update.f32 = xnn_update_f32_scaleminmax_sse_params,
3876 .row_tile = 7,
3877 .channel_tile = 4,
3878 };
3879 xnn_params.f32.maxpool = (struct maxpool_parameters) {
3880 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
3881 .init.f32 = xnn_init_f32_minmax_sse_params,
3882 .mr = 9,
3883 .qr = 8,
3884 };
3885 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
3886 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
3887 .mr = 4,
3888 };
3889 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
3890 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
3891 .mr = 9,
3892 };
3893 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
3894 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
3895 .mr = 9,
3896 .qr = 8,
3897 };
3898 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
3899 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
3900 .pixel_tile = 1,
3901 .channel_tile = 8,
3902 };
3903 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3904 xnn_params.f32.abs = (struct vunary_parameters) {
3905 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16,
3906 .init.f32_abs = xnn_init_f32_abs_avx512_params,
3907 .element_tile = 16,
3908 };
3909 } else if (cpuinfo_has_x86_avx()) {
3910 xnn_params.f32.abs = (struct vunary_parameters) {
3911 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx_x16,
3912 .init.f32_abs = xnn_init_f32_abs_avx_params,
3913 .element_tile = 16,
3914 };
3915 } else {
3916 xnn_params.f32.abs = (struct vunary_parameters) {
3917 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__sse_x8,
3918 .init.f32_abs = xnn_init_f32_abs_sse_params,
3919 .element_tile = 8,
3920 };
3921 }
3922 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3923 xnn_params.f32.clamp = (struct vunary_parameters) {
3924 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx512f_x16,
3925 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
3926 .element_tile = 16,
3927 };
3928 } else if (cpuinfo_has_x86_avx()) {
3929 xnn_params.f32.clamp = (struct vunary_parameters) {
3930 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__avx_x16,
3931 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
3932 .element_tile = 16,
3933 };
3934 } else {
3935 xnn_params.f32.clamp = (struct vunary_parameters) {
3936 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__sse_x8,
3937 .init.f32_minmax = xnn_init_f32_minmax_sse_params,
3938 .element_tile = 8,
3939 };
3940 }
3941 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3942 xnn_params.f32.elu = (struct vunary_parameters) {
3943 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64,
3944 .init.f32_elu = xnn_init_f32_elu_avx512_rr1_lut16_p3_params,
3945 .element_tile = 64,
3946 };
3947 } else if (cpuinfo_has_x86_avx2()) {
3948 xnn_params.f32.elu = (struct vunary_parameters) {
3949 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56,
3950 .init.f32_elu = xnn_init_f32_elu_avx2_rr1_lut4_p4_params,
3951 .element_tile = 56,
3952 };
3953 } else if (cpuinfo_has_x86_avx()) {
3954 xnn_params.f32.elu = (struct vunary_parameters) {
3955 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32,
3956 .init.f32_elu = xnn_init_f32_elu_avx_rr2_lut4_p4_params,
3957 .element_tile = 32,
3958 };
3959 } else {
3960 xnn_params.f32.elu = (struct vunary_parameters) {
3961 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12,
3962 .init.f32_elu = xnn_init_f32_elu_sse2_rr2_lut16_p3_params,
3963 .element_tile = 12,
3964 };
3965 }
3966 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3967 xnn_params.f32.hswish = (struct vunary_parameters) {
3968 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx512f_x16,
3969 .init.f32_hswish = xnn_init_f32_hswish_avx512_params,
3970 .element_tile = 16,
3971 };
3972 } else if (cpuinfo_has_x86_fma3()) {
3973 xnn_params.f32.hswish = (struct vunary_parameters) {
3974 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__fma3_x16,
3975 .init.f32_hswish = xnn_init_f32_hswish_avx_params,
3976 .element_tile = 16,
3977 };
3978 } else if (cpuinfo_has_x86_avx()) {
3979 xnn_params.f32.hswish = (struct vunary_parameters) {
3980 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__avx_x16,
3981 .init.f32_hswish = xnn_init_f32_hswish_avx_params,
3982 .element_tile = 16,
3983 };
3984 } else {
3985 xnn_params.f32.hswish = (struct vunary_parameters) {
3986 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__sse_x8,
3987 .init.f32_hswish = xnn_init_f32_hswish_sse_params,
3988 .element_tile = 8,
3989 };
3990 }
3991 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
3992 xnn_params.f32.lrelu = (struct vunary_parameters) {
3993 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16,
3994 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
3995 .element_tile = 16,
3996 };
3997 } else if (cpuinfo_has_x86_avx()) {
3998 xnn_params.f32.lrelu = (struct vunary_parameters) {
3999 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16,
4000 .init.f32_lrelu = xnn_init_f32_lrelu_avx_params,
4001 .element_tile = 16,
4002 };
4003 } else if (cpuinfo_has_x86_sse4_1()) {
4004 xnn_params.f32.lrelu = (struct vunary_parameters) {
4005 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8,
4006 .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
4007 .element_tile = 8,
4008 };
4009 } else {
4010 xnn_params.f32.lrelu = (struct vunary_parameters) {
4011 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8,
4012 .init.f32_lrelu = xnn_init_f32_lrelu_sse_params,
4013 .element_tile = 8,
4014 };
4015 }
4016 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4017 xnn_params.f32.neg = (struct vunary_parameters) {
4018 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16,
4019 .init.f32_neg = xnn_init_f32_neg_avx512_params,
4020 .element_tile = 16,
4021 };
4022 } else if (cpuinfo_has_x86_avx()) {
4023 xnn_params.f32.neg = (struct vunary_parameters) {
4024 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx_x16,
4025 .init.f32_neg = xnn_init_f32_neg_avx_params,
4026 .element_tile = 16,
4027 };
4028 } else {
4029 xnn_params.f32.neg = (struct vunary_parameters) {
4030 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__sse_x8,
4031 .init.f32_neg = xnn_init_f32_neg_sse_params,
4032 .element_tile = 8,
4033 };
4034 }
4035 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4036 xnn_params.f32.rndne = (struct vunary_parameters) {
4037 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16,
4038 .element_tile = 16,
4039 };
4040 xnn_params.f32.rndz = (struct vunary_parameters) {
4041 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16,
4042 .element_tile = 16,
4043 };
4044 xnn_params.f32.rndu = (struct vunary_parameters) {
4045 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16,
4046 .element_tile = 16,
4047 };
4048 xnn_params.f32.rndd = (struct vunary_parameters) {
4049 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16,
4050 .element_tile = 16,
4051 };
4052 } else if (cpuinfo_has_x86_avx()) {
4053 xnn_params.f32.rndne = (struct vunary_parameters) {
4054 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16,
4055 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4056 .element_tile = 16,
4057 };
4058 xnn_params.f32.rndz = (struct vunary_parameters) {
4059 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16,
4060 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4061 .element_tile = 16,
4062 };
4063 xnn_params.f32.rndu = (struct vunary_parameters) {
4064 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16,
4065 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4066 .element_tile = 16,
4067 };
4068 xnn_params.f32.rndd = (struct vunary_parameters) {
4069 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16,
4070 .init.f32_rnd = xnn_init_f32_rnd_avx_params,
4071 .element_tile = 16,
4072 };
4073 } else if (cpuinfo_has_x86_sse4_1()) {
4074 xnn_params.f32.rndne = (struct vunary_parameters) {
4075 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8,
4076 .element_tile = 8,
4077 };
4078 xnn_params.f32.rndz = (struct vunary_parameters) {
4079 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8,
4080 .element_tile = 8,
4081 };
4082 xnn_params.f32.rndu = (struct vunary_parameters) {
4083 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8,
4084 .element_tile = 8,
4085 };
4086 xnn_params.f32.rndd = (struct vunary_parameters) {
4087 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8,
4088 .element_tile = 8,
4089 };
4090 } else {
4091 xnn_params.f32.rndne = (struct vunary_parameters) {
4092 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8,
4093 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4094 .element_tile = 8,
4095 };
4096 xnn_params.f32.rndz = (struct vunary_parameters) {
4097 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8,
4098 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4099 .element_tile = 8,
4100 };
4101 xnn_params.f32.rndu = (struct vunary_parameters) {
4102 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8,
4103 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4104 .element_tile = 8,
4105 };
4106 xnn_params.f32.rndd = (struct vunary_parameters) {
4107 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8,
4108 .init.f32_rnd = xnn_init_f32_rnd_sse2_params,
4109 .element_tile = 8,
4110 };
4111 }
4112 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4113 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4114 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64,
4115 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx512_rr2_lut32_p2_params,
4116 .element_tile = 64,
4117 };
4118 } else if (cpuinfo_has_x86_avx2()) {
4119 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4120 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx2_rr1_p5_div_x40,
4121 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx2_rr1_p5_params,
4122 .element_tile = 40,
4123 };
4124 } else if (cpuinfo_has_x86_avx()) {
4125 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4126 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40,
4127 .init.f32_sigmoid = xnn_init_f32_sigmoid_avx_rr2_p5_params,
4128 .element_tile = 40,
4129 };
4130 } else if (cpuinfo_has_x86_sse4_1()) {
4131 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4132 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse41_rr2_lut64_p2_div_x8,
4133 .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4134 .element_tile = 8,
4135 };
4136 } else {
4137 xnn_params.f32.sigmoid = (struct vunary_parameters) {
4138 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__sse2_rr2_lut64_p2_div_x8,
4139 .init.f32_sigmoid = xnn_init_f32_sigmoid_sse2_rr2_lut64_p2_params,
4140 .element_tile = 8,
4141 };
4142 }
4143 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4144 xnn_params.f32.sqr = (struct vunary_parameters) {
4145 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16,
4146 .element_tile = 16,
4147 };
4148 } else if (cpuinfo_has_x86_avx()) {
4149 xnn_params.f32.sqr = (struct vunary_parameters) {
4150 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16,
4151 .init.f32_default = xnn_init_f32_default_avx_params,
4152 .element_tile = 16,
4153 };
4154 } else {
4155 xnn_params.f32.sqr = (struct vunary_parameters) {
4156 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8,
4157 .element_tile = 8,
4158 };
4159 }
4160 if (cpuinfo_has_x86_avx()) {
4161 xnn_params.f32.sqrt = (struct vunary_parameters) {
4162 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8,
4163 .init.f32_sqrt = xnn_init_f32_sqrt_avx_params,
4164 .element_tile = 8,
4165 };
4166 } else {
4167 xnn_params.f32.sqrt = (struct vunary_parameters) {
4168 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4,
4169 .element_tile = 4,
4170 };
4171 }
4172 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4173 xnn_params.f32.prelu = (struct prelu_parameters) {
4174 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
4175 .row_tile = 2,
4176 .channel_tile = 16,
4177 };
4178 } else if (cpuinfo_has_x86_avx()) {
4179 xnn_params.f32.prelu = (struct prelu_parameters) {
4180 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
4181 .row_tile = 2,
4182 .channel_tile = 16,
4183 };
4184 } else if (cpuinfo_has_x86_sse4_1()) {
4185 xnn_params.f32.prelu = (struct prelu_parameters) {
4186 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
4187 .row_tile = 2,
4188 .channel_tile = 8,
4189 };
4190 } else {
4191 xnn_params.f32.prelu = (struct prelu_parameters) {
4192 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
4193 .row_tile = 2,
4194 .channel_tile = 8,
4195 };
4196 }
4197 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
4198 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__sse2_rr2_p5_x20_acc2,
4199 .init = xnn_init_f32_expminus_sse2_rr2_p5_params,
4200 .element_tile = 20,
4201 };
4202 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
4203 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
4204 xnn_params.f32.vadd = (struct vbinary_parameters) {
4205 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
4206 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
4207 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
4208 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4209 .element_tile = 32,
4210 };
4211 xnn_params.f32.vdiv = (struct vbinary_parameters) {
4212 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
4213 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
4214 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
4215 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4216 .element_tile = 32,
4217 };
4218 xnn_params.f32.vmax = (struct vbinary_parameters) {
4219 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
4220 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
4221 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
4222 .element_tile = 32,
4223 };
4224 xnn_params.f32.vmin = (struct vbinary_parameters) {
4225 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
4226 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
4227 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
4228 .element_tile = 32,
4229 };
4230 xnn_params.f32.vmul = (struct vbinary_parameters) {
4231 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
4232 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
4233 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
4234 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4235 .element_tile = 32,
4236 };
4237 xnn_params.f32.vsub = (struct vbinary_parameters) {
4238 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
4239 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
4240 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
4241 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
4242 .element_tile = 32,
4243 };
4244 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
4245 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
4246 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
4247 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
4248 .element_tile = 32,
4249 };
4250 } else if (cpuinfo_has_x86_avx()) {
4251 xnn_params.f32.vadd = (struct vbinary_parameters) {
4252 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
4253 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
4254 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
4255 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4256 .element_tile = 16,
4257 };
4258 xnn_params.f32.vdiv = (struct vbinary_parameters) {
4259 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
4260 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
4261 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
4262 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4263 .element_tile = 16,
4264 };
4265 xnn_params.f32.vmax = (struct vbinary_parameters) {
4266 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
4267 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
4268 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
4269 .init.f32_default = xnn_init_f32_default_avx_params,
4270 .element_tile = 16,
4271 };
4272 xnn_params.f32.vmin = (struct vbinary_parameters) {
4273 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
4274 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
4275 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
4276 .init.f32_default = xnn_init_f32_default_avx_params,
4277 .element_tile = 16,
4278 };
4279 xnn_params.f32.vmul = (struct vbinary_parameters) {
4280 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
4281 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
4282 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
4283 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4284 .element_tile = 16,
4285 };
4286 xnn_params.f32.vsub = (struct vbinary_parameters) {
4287 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
4288 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
4289 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
4290 .init.f32_minmax = xnn_init_f32_minmax_avx_params,
4291 .element_tile = 16,
4292 };
4293 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
4294 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
4295 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
4296 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
4297 .init.f32_default = xnn_init_f32_default_avx_params,
4298 .element_tile = 16,
4299 };
    } else {
      // SSE fallback f32 vbinary micro-kernels: 8 elements per iteration.
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
        // Division is not commutative: the reversed variant uses vrdivc.
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
      // NOTE(review): unlike the AVX path, vmax/vmin/vsqrdiff below set no
      // .init function — presumably the SSE kernels take no parameter
      // struct; confirm against the kernel declarations.
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
        .init.f32_minmax = xnn_init_f32_minmax_sse_params,
        .element_tile = 8,
      };
      xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
        .element_tile = 8,
      };
    }
    // Fused multiply-add-with-per-channel-constants kernel (SSE baseline).
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
      .init.f32 = xnn_init_f32_minmax_sse_params,
      .channel_tile = 4,
      .row_tile = 2,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      // Sparse microkernels on x86 currently target only SSE, and on processors
      // with AVX ISA dense inference is expected to be faster than sparse.
      if (!cpuinfo_has_x86_avx()) {
        init_flags |= XNN_INIT_FLAG_CHW_OPT;
      }

      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
        .mr = 32,
        .nr = 1,
      };
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // 3x3 CHW depthwise convolution: prefer the SSSE3 variant when available.
      if (cpuinfo_has_x86_ssse3()) {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
      } else {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
      }
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
        .output_width_tile = 4,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
        .output_width_tile = 4,
        .output_height_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
        .output_width_tile = 4,
        .output_height_tile = 2,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
        .channel_tile = 4,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__sse_p8,
        .channel_tile = 1,
        .pixel_tile = 8,
      };
    #endif // XNN_NO_NCHW_OPERATORS
  #endif // XNN_NO_F32_OPERATORS
4412
  /*************************** VCVT x86 micro-kernels ***************************/
  #ifndef XNN_NO_VCVT_OPERATORS
    init_flags |= XNN_INIT_FLAG_VCVT;

    // f16<->f32 conversion: dispatch from widest available ISA down to SSE2.
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      // AVX512 SKX-level kernels; no init function is set for these two.
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx512skx_x16,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_f16c()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__f16c_x16,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_f16c_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx_x24,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 24,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse41_x8,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 8,
      };
    } else {
      // SSE2 baseline (always available on x86-64).
      xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32,
        .init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16,
        .init.f32_f16_cvt = xnn_init_f32_f16_cvt_sse2_params,
        .element_tile = 16,
      };
    }
    // f32 -> qs8 quantization.
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx512skx_x128,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx512_params,
        .element_tile = 128,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx2_x64,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx2_params,
        .element_tile = 64,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__avx_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_avx_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse41_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse4_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__sse2_x32,
        .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // f32 -> qu8 quantization (note: no SSE4.1 tier for this conversion).
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx512skx_x128,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx512_params,
        .element_tile = 128,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx2_x64,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx2_params,
        .element_tile = 64,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__avx_x32,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_avx_params,
        .element_tile = 32,
      };
    } else {
      xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__sse2_x32,
        .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_sse2_params,
        .element_tile = 32,
      };
    }
    // qs8/qu8 -> f32 dequantization.
    if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx512skx_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx512_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx512skx_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx512_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_avx2()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx2_x16,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx2_x16,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
        .element_tile = 16,
      };
    } else if (cpuinfo_has_x86_avx()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__avx_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_avx_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__avx_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_avx_params,
        .element_tile = 32,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse41_x16,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse4_params,
        .element_tile = 16,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse41_x16,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse4_params,
        .element_tile = 16,
      };
    } else {
      xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__sse2_x32,
        .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_sse2_params,
        .element_tile = 32,
      };
      xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__sse2_x32,
        .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_sse2_params,
        .element_tile = 32,
      };
    }
  #endif // XNN_NO_VCVT_OPERATORS
4583
  /**************************** X32 x86 micro-kernels ****************************/
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;

    // Type-agnostic 32-bit kernels: unpooling and channel zip/unzip (SSE2 baseline).
    xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
      .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
      .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
      .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      // NOTE(review): only a scalar depth-to-space kernel is wired up here —
      // presumably no SIMD variant existed for x86 at the time; confirm.
      xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
        .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
        .channel_tile = 1,
        .pixel_tile = 1,
      };
    #endif // XNN_NO_NCHW_OPERATORS
  #endif // XNN_NO_X32_OPERATORS
4603
  /**************************** XX x86 micro-kernels ****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    // Byte-oriented utility kernels: copy, fill, and pad.
    xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
    xnn_params.xx.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__sse2_x64,
      .row_tile = 1,
    };
    xnn_params.xx.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__sse2,
      .row_tile = 1,
    };
  #endif // XNN_NO_XX_OPERATORS
4618
4619 #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
4620
4621 /**************************** QC8 WAsm SIMD micro-kernels****************************/
4622 #ifndef XNN_NO_QS8_OPERATORS
4623 init_flags |= XNN_INIT_FLAG_QC8;
4624
4625 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
4626 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4627 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
4628 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4629 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
4630 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4631 xnn_params.qc8.gemm.mr = 4;
4632 xnn_params.qc8.gemm.nr = 4;
4633 xnn_params.qc8.gemm.log2_kr = 1;
4634 xnn_params.qc8.gemm.log2_sr = 2;
4635 #else
4636 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4637 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
4638 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4639 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
4640 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4641 xnn_params.qc8.gemm.mr = 3;
4642 xnn_params.qc8.gemm.nr = 4;
4643 xnn_params.qc8.gemm.log2_kr = 3;
4644 #endif
4645
4646 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
4647 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4648 xnn_params.qc8.dwconv[0].channel_tile = 16;
4649 xnn_params.qc8.dwconv[0].primary_tile = 9;
4650 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
4651 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_wasmsimd_params;
4652 xnn_params.qc8.dwconv[1].channel_tile = 16;
4653 xnn_params.qc8.dwconv[1].primary_tile = 25;
4654 #endif // XNN_NO_QC8_OPERATORS
4655
  /**************************** QS8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
      // Newer WAsm SIMD: 4x4c2s4 GEMM/IGEMM using dot16x2.
      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
      xnn_params.qs8.gemm.mr = 4;
      xnn_params.qs8.gemm.nr = 4;
      xnn_params.qs8.gemm.log2_kr = 1;
      xnn_params.qs8.gemm.log2_sr = 2;
    #else // XNN_WASMSIMD_VERSION >= 88
      // Fallback: 3x4c8 GEMM/IGEMM using mul16.
      xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
      xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64);
      xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
      xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul16_ld64);
      xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
      xnn_params.qs8.gemm.mr = 3;
      xnn_params.qs8.gemm.nr = 4;
      xnn_params.qs8.gemm.log2_kr = 3;
    #endif // XNN_WASMSIMD_VERSION >= 88

    // Depthwise convolutions: dwconv[0] handles 9-tap, dwconv[1] 25-tap filters.
    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__wasmsimd_mul16_add16;
    xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qs8.dwconv[0].channel_tile = 16;
    xnn_params.qs8.dwconv[0].primary_tile = 9;
    xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__wasmsimd_mul16_add16;
    xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qs8.dwconv[1].channel_tile = 16;
    xnn_params.qs8.dwconv[1].primary_tile = 25;

    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
      .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_wasmsimd_params,
      .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_wasmsimd_params,
      .row_tile = 7,
      .channel_tile = 16,
    };

    // Elementwise add/mul; addition is commutative so ropc reuses opc.
    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x32,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x32,
      .init.qs8_addsub = xnn_init_qs8_add_minmax_wasmsimd_params,
      .element_tile = 32,
    };
    xnn_params.qs8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_wasmsimd_params,
      .element_tile = 8,
    };
  #endif // XNN_NO_QS8_OPERATORS
4714
  /**************************** QU8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
      // Newer WAsm SIMD: 4x4c2s4 GEMM/IGEMM using dot16x2.
      xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
      xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128);
      xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
      xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
      xnn_params.qu8.gemm.mr = 4;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 1;
      xnn_params.qu8.gemm.log2_sr = 2;
    #else // XNN_WASMSIMD_VERSION >= 88
      // Fallback: 3x4c8 GEMM/IGEMM using mul32 (note: QS8/QC8 fallbacks use mul16).
      xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
      xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64);
      xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
      xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64);
      xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
      xnn_params.qu8.gemm.mr = 3;
      xnn_params.qu8.gemm.nr = 4;
      xnn_params.qu8.gemm.log2_kr = 3;
    #endif // XNN_WASMSIMD_VERSION >= 88

    // Depthwise convolutions: dwconv[0] handles 9-tap, dwconv[1] 25-tap filters.
    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x9__wasmsimd_mul16;
    xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qu8.dwconv[0].channel_tile = 8;
    xnn_params.qu8.dwconv[0].primary_tile = 9;
    xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up8x25__wasmsimd_mul16;
    xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_wasmsimd_params;
    xnn_params.qu8.dwconv[1].channel_tile = 8;
    xnn_params.qu8.dwconv[1].primary_tile = 25;

    // NOTE(review): local average pooling uses scalar kernels here while
    // global average pooling below uses wasmsimd — presumably no SIMD
    // qu8 avgpool kernel was available; confirm.
    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__wasmsimd_c16,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__wasmsimd_c16,
      .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_wasmsimd_params,
      .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_wasmsimd_params,
      .row_tile = 7,
      .channel_tile = 16,
    };

    // Elementwise add/mul; addition is commutative so ropc reuses opc.
    xnn_params.qu8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__wasmsimd_x32,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__wasmsimd_x32,
      .init.qu8_addsub = xnn_init_qu8_add_minmax_wasmsimd_params,
      .element_tile = 32,
    };
    xnn_params.qu8.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__wasmsimd_mul32_ld64_x8,
      .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_wasmsimd_params,
      .element_tile = 8,
    };
  #endif // XNN_NO_QU8_OPERATORS
4781
  /**************************** S8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_S8_OPERATORS
    init_flags |= XNN_INIT_FLAG_S8;

    xnn_params.s8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__wasmsimd_x64,
      .init.s8_minmax = xnn_init_s8_minmax_wasmsimd_params,
      .element_tile = 64,
    };
    // Bilinear interpolation: dot16x2 variant on newer WAsm SIMD, mul32 otherwise.
    #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
      xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
    #else // XNN_WASMSIMD_VERSION >= 88
      xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__wasmsimd_mul32_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
    #endif // XNN_WASMSIMD_VERSION >= 88
    xnn_params.s8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
      .init.s8 = xnn_init_s8_minmax_wasmsimd_params,
      .mr = 9,
      .qr = 8,
    };
  #endif // XNN_NO_S8_OPERATORS
4811
  /**************************** U8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    xnn_params.u8.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__wasmsimd_x64,
      .init.u8_minmax = xnn_init_u8_minmax_wasmsimd_params,
      .element_tile = 64,
    };
    // Bilinear interpolation: dot16x2 variant on newer WAsm SIMD, mul32 otherwise.
    #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 88)
      xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_dot16x2_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
    #else // XNN_WASMSIMD_VERSION >= 88
      xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
        .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__wasmsimd_mul32_c8,
        .pixel_tile = 1,
        .channel_tile = 8,
      };
    #endif // XNN_WASMSIMD_VERSION >= 88
    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__wasmsimd_c16,
      .init.u8 = xnn_init_u8_minmax_wasmsimd_params,
      .mr = 9,
      .qr = 8,
    };
    // LUT-normalization and running-max kernels are scalar only here.
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
  #endif // XNN_NO_U8_OPERATORS
4843
  /**************************** X8 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    // Type-agnostic 8-bit kernels: table lookup and channel zip (scalar only here).
    xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
    };
  #endif // XNN_NO_X8_OPERATORS
4856
  /**************************** F32 WAsm SIMD micro-kernels****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    // GEMM tile selection depends on the host the WAsm engine runs on:
    // x86-tuned 4x8 kernels vs. ARM-tuned 5x8 kernels below.
    if (is_wasm_x86) {
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
      // ReLU and linear (no-activation) variants share one generic implementation.
      xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 8;

      // Secondary GEMM (4x2c4) used for narrow-output cases.
      xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
      xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
      xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
      xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
      xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;
      xnn_params.f32.gemm2.log2_kr = 2;
    } else {
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
      xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
4898 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
4899 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4900 xnn_params.f32.gemm.mr = 5;
4901 xnn_params.f32.gemm.nr = 8;
4902
4903 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4904 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
4905 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
4906 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
4907 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4908 xnn_params.f32.gemm2.mr = 4;
4909 xnn_params.f32.gemm2.nr = 2;
4910 xnn_params.f32.gemm2.log2_kr = 2;
4911 }
4912
4913 if (is_wasm_x86) {
4914 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x3__wasmsimd_x86;
4915 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x3__wasmsimd;
4916 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4917 xnn_params.f32.dwconv[0].channel_tile = 8;
4918 xnn_params.f32.dwconv[0].primary_tile = 3;
4919
4920 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
4921 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
4922 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4923 xnn_params.f32.dwconv[1].channel_tile = 8;
4924 xnn_params.f32.dwconv[1].primary_tile = 4;
4925
4926 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
4927 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
4928 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4929 xnn_params.f32.dwconv[2].channel_tile = 8;
4930 xnn_params.f32.dwconv[2].primary_tile = 9;
4931 } else {
4932 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x3__wasmsimd_arm;
4933 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x3__wasmsimd;
4934 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4935 xnn_params.f32.dwconv[0].channel_tile = 4;
4936 xnn_params.f32.dwconv[0].primary_tile = 3;
4937
4938 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
4939 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
4940 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4941 xnn_params.f32.dwconv[1].channel_tile = 4;
4942 xnn_params.f32.dwconv[1].primary_tile = 4;
4943
4944 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
4945 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
4946 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4947 xnn_params.f32.dwconv[2].channel_tile = 4;
4948 xnn_params.f32.dwconv[2].primary_tile = 9;
4949 }
4950
4951 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
4952 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
4953 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_wasmsimd_params;
4954 xnn_params.f32.dwconv[3].channel_tile = 4;
4955 xnn_params.f32.dwconv[3].primary_tile = 25;
4956
4957 if (is_wasm_x86) {
4958 xnn_params.f32.avgpool = (struct avgpool_parameters) {
4959 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4960 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
4961 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4962 .primary_tile = 9,
4963 .incremental_tile = 8,
4964 .channel_tile = 4,
4965 };
4966 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
4967 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
4968 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
4969 .primary_tile = 9,
4970 .incremental_tile = 8,
4971 .channel_tile = 4,
4972 };
4973 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
4974 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
4975 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
4976 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4977 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
4978 .row_tile = 7,
4979 .channel_tile = 4,
4980 };
4981 } else {
4982 xnn_params.f32.avgpool = (struct avgpool_parameters) {
4983 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4984 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
4985 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
4986 .primary_tile = 9,
4987 .incremental_tile = 8,
4988 .channel_tile = 4,
4989 };
4990 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
4991 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
4992 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
4993 .primary_tile = 9,
4994 .incremental_tile = 8,
4995 .channel_tile = 4,
4996 };
4997 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
4998 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
4999 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
5000 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
5001 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
5002 .row_tile = 7,
5003 .channel_tile = 4,
5004 };
5005 }
5006 if (is_wasm_x86) {
5007 xnn_params.f32.maxpool = (struct maxpool_parameters) {
5008 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
5009 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5010 .mr = 9,
5011 .qr = 8,
5012 };
5013 } else {
5014 xnn_params.f32.maxpool = (struct maxpool_parameters) {
5015 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
5016 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5017 .mr = 9,
5018 .qr = 8,
5019 };
5020 }
5021 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
5022 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
5023 .mr = 4,
5024 };
5025 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
5026 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
5027 .mr = 9,
5028 };
5029 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
5030 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
5031 .mr = 9,
5032 .qr = 8,
5033 };
5034 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
5035 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
5036 .pixel_tile = 1,
5037 .channel_tile = 8,
5038 };
5039 xnn_params.f32.abs = (struct vunary_parameters) {
5040 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8,
5041 .init.f32_abs = xnn_init_f32_abs_wasmsimd_params,
5042 .element_tile = 16,
5043 };
5044 if (is_wasm_x86) {
5045 xnn_params.f32.clamp = (struct vunary_parameters) {
5046 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_x86_x8,
5047 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5048 .element_tile = 8,
5049 };
5050 } else {
5051 xnn_params.f32.clamp = (struct vunary_parameters) {
5052 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasmsimd_arm_x8,
5053 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5054 .element_tile = 8,
5055 };
5056 }
5057 if (is_wasm_x86) {
5058 xnn_params.f32.elu = (struct vunary_parameters) {
5059 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20,
5060 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5061 .element_tile = 20,
5062 };
5063 } else {
5064 xnn_params.f32.elu = (struct vunary_parameters) {
5065 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20,
5066 .init.f32_elu = xnn_init_f32_elu_wasmsimd_rr2_p6_params,
5067 .element_tile = 20,
5068 };
5069 }
5070 xnn_params.f32.hswish = (struct vunary_parameters) {
5071 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasmsimd_x16,
5072 .init.f32_hswish = xnn_init_f32_hswish_wasmsimd_params,
5073 .element_tile = 16,
5074 };
5075 if (is_wasm_x86) {
5076 xnn_params.f32.lrelu = (struct vunary_parameters) {
5077 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8,
5078 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5079 .element_tile = 8,
5080 };
5081 } else {
5082 xnn_params.f32.lrelu = (struct vunary_parameters) {
5083 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8,
5084 .init.f32_lrelu = xnn_init_f32_lrelu_wasmsimd_params,
5085 .element_tile = 8,
5086 };
5087 }
5088 xnn_params.f32.neg = (struct vunary_parameters) {
5089 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8,
5090 .init.f32_neg = xnn_init_f32_neg_wasmsimd_params,
5091 .element_tile = 16,
5092 };
5093 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasmsimd_x16;
5094 #if defined(XNN_WASMSIMD_VERSION) && (XNN_WASMSIMD_VERSION >= 91)
5095 xnn_params.f32.rndne = (struct vunary_parameters) {
5096 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_native_x8,
5097 .element_tile = 8,
5098 };
5099 xnn_params.f32.rndz = (struct vunary_parameters) {
5100 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_native_x8,
5101 .element_tile = 8,
5102 };
5103 xnn_params.f32.rndu = (struct vunary_parameters) {
5104 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_native_x8,
5105 .element_tile = 8,
5106 };
5107 xnn_params.f32.rndd = (struct vunary_parameters) {
5108 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_native_x8,
5109 .element_tile = 8,
5110 };
5111 #else // XNN_WASMSIMD_VERSION >= 91
5112 xnn_params.f32.rndne = (struct vunary_parameters) {
5113 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8,
5114 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5115 .element_tile = 8,
5116 };
5117 if (is_wasm_x86) {
5118 xnn_params.f32.rndz = (struct vunary_parameters) {
5119 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8,
5120 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5121 .element_tile = 8,
5122 };
5123 } else {
5124 xnn_params.f32.rndz = (struct vunary_parameters) {
5125 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8,
5126 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5127 .element_tile = 8,
5128 };
5129 }
5130 xnn_params.f32.rndu = (struct vunary_parameters) {
5131 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8,
5132 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5133 .element_tile = 8,
5134 };
5135 xnn_params.f32.rndd = (struct vunary_parameters) {
5136 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8,
5137 .init.f32_rnd = xnn_init_f32_rnd_wasmsimd_params,
5138 .element_tile = 8,
5139 };
5140 #endif // XNN_WASMSIMD_VERSION >= 91
5141 xnn_params.f32.sigmoid = (struct vunary_parameters) {
5142 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__wasmsimd_rr2_p5_div_x16,
5143 .init.f32_sigmoid = xnn_init_f32_sigmoid_wasmsimd_rr2_p5_params,
5144 .element_tile = 16,
5145 };
5146 xnn_params.f32.sqr = (struct vunary_parameters) {
5147 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8,
5148 .element_tile = 16,
5149 };
5150 xnn_params.f32.sqrt = (struct vunary_parameters) {
5151 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8,
5152 .element_tile = 8,
5153 };
5154 if (is_wasm_x86) {
5155 xnn_params.f32.prelu = (struct prelu_parameters) {
5156 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
5157 .row_tile = 2,
5158 .channel_tile = 8,
5159 };
5160 } else {
5161 xnn_params.f32.prelu = (struct prelu_parameters) {
5162 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
5163 .row_tile = 2,
5164 .channel_tile = 8,
5165 };
5166 }
5167 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
5168 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_rr2_p5_x16_acc2,
5169 .init = xnn_init_f32_expminus_wasmsimd_rr2_p5_params,
5170 .element_tile = 16,
5171 };
5172 if (is_wasm_x86) {
5173 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_x86;
5174 xnn_params.f32.vadd = (struct vbinary_parameters) {
5175 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
5176 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
5177 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
5178 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
5179 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5180 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5181 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5182 .element_tile = 16,
5183 };
5184 xnn_params.f32.vdiv = (struct vbinary_parameters) {
5185 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
5186 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
5187 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
5188 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
5189 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
5190 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
5191 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5192 .element_tile = 16,
5193 };
5194 xnn_params.f32.vmax = (struct vbinary_parameters) {
5195 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
5196 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
5197 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
5198 .element_tile = 16,
5199 };
5200 xnn_params.f32.vmin = (struct vbinary_parameters) {
5201 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
5202 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
5203 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
5204 .element_tile = 16,
5205 };
5206 xnn_params.f32.vmul = (struct vbinary_parameters) {
5207 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
5208 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
5209 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
5210 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
5211 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5212 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5213 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5214 .element_tile = 16,
5215 };
5216 xnn_params.f32.vsub = (struct vbinary_parameters) {
5217 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
5218 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
5219 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
5220 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
5221 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
5222 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
5223 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5224 .element_tile = 16,
5225 };
5226 } else {
5227 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_arm;
5228 xnn_params.f32.vadd = (struct vbinary_parameters) {
5229 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
5230 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
5231 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
5232 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
5233 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5234 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
5235 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5236 .element_tile = 16,
5237 };
5238 xnn_params.f32.vdiv = (struct vbinary_parameters) {
5239 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
5240 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
5241 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
5242 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
5243 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
5244 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
5245 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5246 .element_tile = 16,
5247 };
5248 xnn_params.f32.vmax = (struct vbinary_parameters) {
5249 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
5250 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
5251 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
5252 .element_tile = 16,
5253 };
5254 xnn_params.f32.vmin = (struct vbinary_parameters) {
5255 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
5256 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
5257 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
5258 .element_tile = 16,
5259 };
5260 xnn_params.f32.vmul = (struct vbinary_parameters) {
5261 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
5262 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
5263 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
5264 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
5265 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5266 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
5267 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5268 .element_tile = 16,
5269 };
5270 xnn_params.f32.vsub = (struct vbinary_parameters) {
5271 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
5272 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
5273 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
5274 .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
5275 .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
5276 .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
5277 .init.f32_minmax = xnn_init_f32_minmax_wasmsimd_params,
5278 .element_tile = 16,
5279 };
5280 }
5281 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
5282 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
5283 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
5284 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
5285 .element_tile = 16,
5286 };
5287 if (is_wasm_x86) {
5288 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5289 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
5290 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5291 .channel_tile = 4,
5292 .row_tile = 2,
5293 };
5294 } else {
5295 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
5296 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
5297 .init.f32 = xnn_init_f32_minmax_wasmsimd_params,
5298 .channel_tile = 4,
5299 .row_tile = 2,
5300 };
5301 }
5302 #ifndef XNN_NO_NCHW_OPERATORS
5303 init_flags |= XNN_INIT_FLAG_CHW_OPT;
5304
5305 if (is_wasm_x86) {
5306 xnn_params.f32.spmm = (struct spmm_parameters) {
5307 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
5308 .mr = 32,
5309 .nr = 1,
5310 };
5311 } else {
5312 xnn_params.f32.spmm = (struct spmm_parameters) {
5313 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
5314 .mr = 32,
5315 .nr = 1,
5316 };
5317 }
5318 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
5319 .ukernel_with_symm_padding =
5320 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
5321 .output_channel_tile = 4,
5322 .output_height_tile = 2,
5323 .output_width_tile = 2,
5324 };
5325 if (is_wasm_x86) {
5326 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5327 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
5328 .output_width_tile = 4,
5329 .output_height_tile = 2,
5330 };
5331 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
5332 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
5333 .output_width_tile = 4,
5334 .output_height_tile = 1,
5335 };
5336 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5337 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
5338 .output_width_tile = 4,
5339 .output_height_tile = 3,
5340 };
5341 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5342 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
5343 .output_width_tile = 4,
5344 .output_height_tile = 1,
5345 };
5346 } else {
5347 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
5348 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
5349 .output_width_tile = 4,
5350 .output_height_tile = 2,
5351 };
5352 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
5353 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
5354 .output_width_tile = 4,
5355 .output_height_tile = 1,
5356 };
5357 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
5358 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
5359 .output_width_tile = 4,
5360 .output_height_tile = 3,
5361 };
5362 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
5363 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
5364 .output_width_tile = 4,
5365 .output_height_tile = 1,
5366 };
5367 }
5368 if (is_wasm_x86) {
5369 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5370 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
5371 .channel_tile = 4,
5372 };
5373 } else {
5374 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
5375 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
5376 .channel_tile = 4,
5377 };
5378 }
5379 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
5380 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
5381 .channel_tile = 1,
5382 .pixel_tile = 8,
5383 };
5384 #endif // XNN_NO_NCHW_OPERATORS
5385 #endif // XNN_NO_F32_OPERATORS
5386
5387 /*************************** VCVT WAsm SIMD micro-kernels***************************/
5388 #ifndef XNN_NO_VCVT_OPERATORS
5389 init_flags |= XNN_INIT_FLAG_VCVT;
5390
5391 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
5392 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16,
5393 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params,
5394 .element_tile = 16,
5395 };
5396 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
5397 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24,
5398 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_wasmsimd_params,
5399 .element_tile = 24,
5400 };
5401 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
5402 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__wasmsimd_magic_x32,
5403 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_wasmsimd_magic_params,
5404 .element_tile = 32,
5405 };
5406 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
5407 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
5408 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_wasmsimd_magic_params,
5409 .element_tile = 32,
5410 };
5411 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
5412 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__wasmsimd_x32,
5413 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_wasmsimd_params,
5414 .element_tile = 32,
5415 };
5416 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
5417 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__wasmsimd_x32,
5418 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_wasmsimd_params,
5419 .element_tile = 32,
5420 };
5421 #endif // XNN_NO_VCVT_OPERATORS
5422
5423 /**************************** X32 WAsm SIMD micro-kernels****************************/
5424 #ifndef XNN_NO_X32_OPERATORS
5425 init_flags |= XNN_INIT_FLAG_X32;
5426
5427 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
5428 xnn_params.x32.zip = (struct zip_parameters) {
5429 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
5430 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
5431 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
5432 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
5433 };
5434 #ifndef XNN_NO_NCHW_OPERATORS
5435 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
5436 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
5437 .channel_tile = 1,
5438 .pixel_tile = 1,
5439 };
5440 #endif // XNN_NO_NCHW_OPERATORS
5441 #endif // XNN_NO_X32_OPERATORS
5442
5443 /**************************** XX WAsm SIMD micro-kernels****************************/
5444 #ifndef XNN_NO_XX_OPERATORS
5445 init_flags |= XNN_INIT_FLAG_XX;
5446
5447 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
5448 xnn_params.xx.fill = (struct fill_parameters) {
5449 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__wasmsimd_x64,
5450 .row_tile = 1,
5451 };
5452 xnn_params.xx.pad = (struct pad_parameters) {
5453 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__wasmsimd,
5454 .row_tile = 1,
5455 };
  #endif  // XNN_NO_XX_OPERATORS
5457
5458 #elif XNN_ARCH_WASM
5459
5460 /**************************** QC8 WAsm micro-kernels****************************/
5461 #ifndef XNN_NO_QC8_OPERATORS
5462 init_flags |= XNN_INIT_FLAG_QC8;
5463
5464 if (is_wasm_x86) {
5465 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5466 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5467 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5468 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5469 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5470 xnn_params.qc8.gemm.mr = 2;
5471 xnn_params.qc8.gemm.nr = 2;
5472 } else {
5473 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5474 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5475 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5476 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5477 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5478 xnn_params.qc8.gemm.mr = 4;
5479 xnn_params.qc8.gemm.nr = 4;
5480 }
5481
5482 if (is_wasm_x86) {
5483 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5484 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5485 xnn_params.qc8.dwconv[0].channel_tile = 2;
5486 xnn_params.qc8.dwconv[0].primary_tile = 9;
5487 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5488 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_imagic_params;
5489 xnn_params.qc8.dwconv[1].channel_tile = 1;
5490 xnn_params.qc8.dwconv[1].primary_tile = 25;
5491 } else {
5492 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5493 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5494 xnn_params.qc8.dwconv[0].channel_tile = 2;
5495 xnn_params.qc8.dwconv[0].primary_tile = 9;
5496 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5497 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_fmagic_params;
5498 xnn_params.qc8.dwconv[1].channel_tile = 2;
5499 xnn_params.qc8.dwconv[1].primary_tile = 25;
5500 }
5501 #endif // XNN_NO_QC8_OPERATORS
5502
5503 /**************************** QS8 WAsm micro-kernels****************************/
5504 #ifndef XNN_NO_QS8_OPERATORS
5505 init_flags |= XNN_INIT_FLAG_QS8;
5506
5507 if (is_wasm_x86) {
5508 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5509 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5510 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5511 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5512 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5513 xnn_params.qs8.gemm.mr = 2;
5514 xnn_params.qs8.gemm.nr = 2;
5515 } else {
5516 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5517 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5518 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5519 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5520 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5521 xnn_params.qs8.gemm.mr = 4;
5522 xnn_params.qs8.gemm.nr = 4;
5523 }
5524
5525 if (is_wasm_x86) {
5526 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5527 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5528 xnn_params.qs8.dwconv[0].channel_tile = 2;
5529 xnn_params.qs8.dwconv[0].primary_tile = 9;
5530 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5531 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params;
5532 xnn_params.qs8.dwconv[1].channel_tile = 1;
5533 xnn_params.qs8.dwconv[1].primary_tile = 25;
5534 } else {
5535 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5536 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5537 xnn_params.qs8.dwconv[0].channel_tile = 2;
5538 xnn_params.qs8.dwconv[0].primary_tile = 9;
5539 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5540 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params;
5541 xnn_params.qs8.dwconv[1].channel_tile = 2;
5542 xnn_params.qs8.dwconv[1].primary_tile = 25;
5543 }
5544
5545 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
5546 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5547 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
5548 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5549 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
5550 .row_tile = 7,
5551 .channel_tile = 4,
5552 };
5553
5554 xnn_params.qs8.vadd = (struct vbinary_parameters) {
5555 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
5556 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
5557 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
5558 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
5559 .element_tile = 4,
5560 };
5561 xnn_params.qs8.vmul = (struct vbinary_parameters) {
5562 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
5563 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5564 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
5565 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
5566 .element_tile = 4,
5567 };
5568 #endif // XNN_NO_QS8_OPERATORS
5569
5570 /**************************** QU8 WAsm micro-kernels****************************/
5571 #ifndef XNN_NO_QU8_OPERATORS
5572 init_flags |= XNN_INIT_FLAG_QU8;
5573
5574 if (is_wasm_x86) {
5575 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5576 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic);
5577 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5578 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic);
5579 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5580 xnn_params.qu8.gemm.mr = 2;
5581 xnn_params.qu8.gemm.nr = 2;
5582 } else {
5583 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5584 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic);
5585 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5586 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic);
5587 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5588 xnn_params.qu8.gemm.mr = 4;
5589 xnn_params.qu8.gemm.nr = 4;
5590 }
5591
5592 if (is_wasm_x86) {
5593 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_imagic;
5594 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5595 xnn_params.qu8.dwconv[0].channel_tile = 2;
5596 xnn_params.qu8.dwconv[0].primary_tile = 9;
5597 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up1x25__scalar_imagic;
5598 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params;
5599 xnn_params.qu8.dwconv[1].channel_tile = 1;
5600 xnn_params.qu8.dwconv[1].primary_tile = 25;
5601 } else {
5602 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__wasm_fmagic;
5603 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5604 xnn_params.qu8.dwconv[0].channel_tile = 2;
5605 xnn_params.qu8.dwconv[0].primary_tile = 9;
5606 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__wasm_fmagic;
5607 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params;
5608 xnn_params.qu8.dwconv[1].channel_tile = 2;
5609 xnn_params.qu8.dwconv[1].primary_tile = 25;
5610 }
5611
5612 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
5613 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
5614 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
5615 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
5616 .primary_tile = 9,
5617 .incremental_tile = 8,
5618 .channel_tile = 1,
5619 };
5620 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
5621 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c4,
5622 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c4,
5623 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5624 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
5625 .row_tile = 7,
5626 .channel_tile = 4,
5627 };
5628
5629 xnn_params.qu8.vadd = (struct vbinary_parameters) {
5630 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
5631 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
5632 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
5633 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
5634 .element_tile = 4,
5635 };
5636 xnn_params.qu8.vmul = (struct vbinary_parameters) {
5637 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
5638 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5639 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
5640 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
5641 .element_tile = 4,
5642 };
5643 #endif // XNN_NO_QU8_OPERATORS
5644
5645 /**************************** S8 WAsm micro-kernels****************************/
5646 #ifndef XNN_NO_S8_OPERATORS
5647 init_flags |= XNN_INIT_FLAG_S8;
5648
5649 xnn_params.s8.clamp = (struct vunary_parameters) {
5650 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
5651 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
5652 .element_tile = 4,
5653 };
5654 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
5655 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
5656 .pixel_tile = 1,
5657 .channel_tile = 1,
5658 };
5659 xnn_params.s8.maxpool = (struct maxpool_parameters) {
5660 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
5661 .init.s8 = xnn_init_s8_minmax_scalar_params,
5662 .mr = 9,
5663 .qr = 8,
5664 };
5665 #endif // XNN_NO_S8_OPERATORS
5666
5667 /**************************** U8 WAsm micro-kernels****************************/
5668 #ifndef XNN_NO_U8_OPERATORS
5669 init_flags |= XNN_INIT_FLAG_U8;
5670
5671 xnn_params.u8.clamp = (struct vunary_parameters) {
5672 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
5673 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
5674 .element_tile = 4,
5675 };
5676 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
5677 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
5678 .pixel_tile = 1,
5679 .channel_tile = 1,
5680 };
5681 xnn_params.u8.maxpool = (struct maxpool_parameters) {
5682 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
5683 .init.u8 = xnn_init_u8_minmax_scalar_params,
5684 .mr = 9,
5685 .qr = 8,
5686 };
5687 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
5688 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
5689 #endif // XNN_NO_U8_OPERATORS
5690
5691 /**************************** X8 WAsm micro-kernels****************************/
5692 #ifndef XNN_NO_X8_OPERATORS
5693 init_flags |= XNN_INIT_FLAG_X8;
5694
5695 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
5696 xnn_params.x8.zip = (struct zip_parameters) {
5697 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
5698 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
5699 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
5700 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
5701 };
5702 #endif // XNN_NO_X8_OPERATORS
5703
5704 /**************************** F32 WAsm micro-kernels****************************/
5705 #ifndef XNN_NO_F32_OPERATORS
5706 init_flags |= XNN_INIT_FLAG_F32;
5707
5708 if (is_wasm_x86) {
5709 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
5710 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
5711 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5712 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
5713 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
5714 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
5715 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5716 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
5717 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
5718 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
5719 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5720 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
5721 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
5722 xnn_params.f32.gemm.mr = 2;
5723 xnn_params.f32.gemm.nr = 4;
5724 } else {
5725 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
5726 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
5727 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
5728 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
5729 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
5730 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
5731 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
5732 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
5733 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm);
5734 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm);
5735 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
5736 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
5737 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
5738 xnn_params.f32.gemm.mr = 4;
5739 xnn_params.f32.gemm.nr = 4;
5740 }
5741 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
5742 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm),
5743 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__wasm);
5744 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm),
5745 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
5746 xnn_params.f32.gemm2.mr = 4;
5747 xnn_params.f32.gemm2.nr = 2;
5748
    // F32 depthwise convolution: one slot per supported tap count
    // (3, 4, 9 = 3x3, 25 = 5x5), each with minmax and linear variants.
    xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__wasm_acc2;
    xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__wasm_acc2;
    xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[0].channel_tile = 1;
    xnn_params.f32.dwconv[0].primary_tile = 3;

    xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
    xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2;
    xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[1].channel_tile = 1;
    xnn_params.f32.dwconv[1].primary_tile = 4;

    xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
    xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2;
    xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[2].channel_tile = 1;
    xnn_params.f32.dwconv[2].primary_tile = 9;

    xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
    xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2;
    xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
    xnn_params.f32.dwconv[3].channel_tile = 1;
    xnn_params.f32.dwconv[3].primary_tile = 25;
5772
    // F32 pooling family: average (plain and pixelwise-scaled), global-average,
    // max, and argmax pooling. Tile fields mirror the kernel-name suffixes.
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
      .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
      .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
      .primary_tile = 9,
      .incremental_tile = 8,
      .channel_tile = 1,
    };
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
      .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
      .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
      .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
      .row_tile = 7,
      .channel_tile = 1,
    };
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .mr = 9,
      .qr = 8,
    };
    // Argmax pooling: slots 0/1 are unipass kernels for up to 4 and 9 elements;
    // slot 2 is the 9+8 multipass kernel for larger pooling windows.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    // Bilinear resampling plus the F32 elementwise-unary kernel table.
    // Where is_wasm_x86 is checked, the scalar variant is preferred on
    // x86-flavored engines and the __wasm variant elsewhere.
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
      .pixel_tile = 1,
      .channel_tile = 2,
    };
    xnn_params.f32.abs = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.clamp = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__wasm_x4,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 4,
    };
    if (is_wasm_x86) {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
    } else {
      xnn_params.f32.hswish = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__wasm_x4,
        .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
        .element_tile = 4,
      };
    }
    // ELU: the two branches use different approximations (lut16_p3 vs p6), so
    // the init function must stay paired with its kernel.
    if (is_wasm_x86) {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
        .element_tile = 2,
      };
    } else {
      xnn_params.f32.elu = (struct vunary_parameters) {
        .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6,
        .init.f32_elu = xnn_init_f32_elu_scalar_rr2_p6_params,
        .element_tile = 6,
      };
    }
    xnn_params.f32.lrelu = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
      .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
      .element_tile = 4,
    };
    xnn_params.f32.neg = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
      .element_tile = 4,
    };
    if (is_wasm_x86) {
      xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__scalar_x8;
    } else {
      xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_vrelu_ukernel__wasm_x8;
    }
    // Rounding kernels (nearest-even, toward-zero, up, down) via libm calls.
    xnn_params.f32.rndne = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndz = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndu = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.rndd = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4,
      .element_tile = 4,
    };
    xnn_params.f32.sigmoid = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
      .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
      .element_tile = 2,
    };
    xnn_params.f32.sqr = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
      .element_tile = 4,
    };
    xnn_params.f32.sqrt = (struct vunary_parameters) {
      .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
      .element_tile = 1,
    };
    // PReLU: scalar on x86-flavored engines, __wasm variant otherwise.
    if (is_wasm_x86) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
        .row_tile = 2,
        .channel_tile = 4,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
        .row_tile = 2,
        .channel_tile = 4,
      };
    }
    // Softmax building blocks: fused exp(x - max) reduce-add-store, and the
    // running-max reduction used to compute the max.
    xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
      .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
      .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
      .element_tile = 4,
    };
    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
    // Elementwise binary ops. Commutative ops (add, mul, max, min, sqrdiff)
    // reuse the opc kernel for the reversed-operand (ropc) slot; the
    // non-commutative div/sub have dedicated vrdivc/vrsubc kernels.
    xnn_params.f32.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vdiv = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vmax = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmin = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsub = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
      .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
      .element_tile = 8,
    };
    xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .element_tile = 8,
    };
    // Fused multiply-by-constant + add-constant (used by channelwise scale/shift).
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
      .init.f32 = xnn_init_f32_minmax_scalar_params,
      .channel_tile = 1,
      .row_tile = 2,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      // Sparse matrix-dense matrix multiplication (NCHW/sparse inference path);
      // spmm/spmm2/spmm4 differ in output-channel block width (nr = 1/2/4).
      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
        .mr = 8,
        .nr = 1,
      };
      xnn_params.f32.spmm2 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
        .mr = 8,
        .nr = 2,
      };
      xnn_params.f32.spmm4 = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
        .mr = 8,
        .nr = 4,
      };
      // HWC->CHW input convolution (3x3, stride 2, 3 input channels).
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
        .output_channel_tile = 4,
        .output_height_tile = 1,
        .output_width_tile = 1,
      };
      // CHW-layout depthwise convolutions: 3x3, 3x3s2, 5x5, 5x5s2.
      xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 2,
      };
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
        .output_width_tile = 1,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
        .output_width_tile = 1,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
        .output_width_tile = 1,
        .output_height_tile = 1,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
        .channel_tile = 1,
      };
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
        .channel_tile = 1,
        .pixel_tile = 4,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
6024 #endif // XNN_NO_F32_OPERATORS
6025
6026   /*************************** VCVT WAsm micro-kernels ***************************/
6027 #ifndef XNN_NO_VCVT_OPERATORS
6028 init_flags |= XNN_INIT_FLAG_VCVT;
6029
6030 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6031 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x1,
6032 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6033 .element_tile = 1,
6034 };
6035 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6036 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4,
6037 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_bitcast_params,
6038 .element_tile = 4,
6039 };
6040 if (is_wasm_x86) {
6041 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6042 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6043 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_imagic_params,
6044 .element_tile = 1,
6045 };
6046 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6047 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_imagic_x1,
6048 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_imagic_params,
6049 .element_tile = 1,
6050 };
6051 } else {
6052 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6053 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6054 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_fmagic_params,
6055 .element_tile = 4,
6056 };
6057 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6058 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__wasm_fmagic_x4,
6059 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_fmagic_params,
6060 .element_tile = 4,
6061 };
6062 }
6063 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6064 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x1,
6065 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6066 .element_tile = 1,
6067 };
6068 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6069 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x1,
6070 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6071 .element_tile = 1,
6072 };
6073 #endif // XNN_NO_VCVT_OPERATORS
6074
6075   /**************************** X32 WAsm micro-kernels ****************************/
6076 #ifndef XNN_NO_X32_OPERATORS
6077 init_flags |= XNN_INIT_FLAG_X32;
6078
6079 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
6080 xnn_params.x32.zip = (struct zip_parameters) {
6081 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
6082 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
6083 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
6084 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
6085 };
6086 #ifndef XNN_NO_NCHW_OPERATORS
6087 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
6088 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
6089 .channel_tile = 1,
6090 .pixel_tile = 1,
6091 };
6092 #endif // XNN_NO_NCHW_OPERATORS
6093 #endif // XNN_NO_X32_OPERATORS
6094
6095   /**************************** XX WAsm micro-kernels ****************************/
6096 #ifndef XNN_NO_XX_OPERATORS
6097 init_flags |= XNN_INIT_FLAG_XX;
6098
6099 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
6100 xnn_params.xx.fill = (struct fill_parameters) {
6101 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
6102 .row_tile = 1,
6103 };
6104 xnn_params.xx.pad = (struct pad_parameters) {
6105 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
6106 .row_tile = 1,
6107 };
6108 #endif  // XNN_NO_XX_OPERATORS
6109
6110 #elif XNN_ARCH_RISCV
6111
6112 /************************** QC8 RISC-V micro-kernels **************************/
6113 #ifndef XNN_NO_QC8_OPERATORS
6114 init_flags |= XNN_INIT_FLAG_QC8;
6115
6116 xnn_params.qc8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6117 xnn_params.qc8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6118 xnn_params.qc8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qc8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6119 xnn_params.qc8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qc8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6120 xnn_params.qc8.gemm.init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6121 xnn_params.qc8.gemm.mr = 3;
6122 xnn_params.qc8.gemm.nr = 4;
6123
6124 xnn_params.qc8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6125 xnn_params.qc8.dwconv[0].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6126 xnn_params.qc8.dwconv[0].channel_tile = 2;
6127 xnn_params.qc8.dwconv[0].primary_tile = 9;
6128 xnn_params.qc8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qc8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6129 xnn_params.qc8.dwconv[1].init.qc8 = xnn_init_qs8_minmax_scalar_lrintf_params;
6130 xnn_params.qc8.dwconv[1].channel_tile = 2;
6131 xnn_params.qc8.dwconv[1].primary_tile = 25;
6132 #endif  // XNN_NO_QC8_OPERATORS
6133
6134 /************************** QS8 RISC-V micro-kernels **************************/
6135 #ifndef XNN_NO_QS8_OPERATORS
6136 init_flags |= XNN_INIT_FLAG_QS8;
6137
6138 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6139 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6140 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6141 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6142 xnn_params.qs8.gemm.init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
6143 xnn_params.qs8.gemm.mr = 3;
6144 xnn_params.qs8.gemm.nr = 4;
6145
6146 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6147 xnn_params.qs8.dwconv[0].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
6148 xnn_params.qs8.dwconv[0].channel_tile = 2;
6149 xnn_params.qs8.dwconv[0].primary_tile = 9;
6150 xnn_params.qs8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6151 xnn_params.qs8.dwconv[1].init.qs8 = xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params;
6152 xnn_params.qs8.dwconv[1].channel_tile = 2;
6153 xnn_params.qs8.dwconv[1].primary_tile = 25;
6154
6155 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
6156 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6157 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
6158 .init.qs8 = xnn_init_qs8_avgpool_minmax_fp32_scalar_imagic_params,
6159 .update.qs8 = xnn_update_qs8_avgpool_minmax_fp32_scalar_imagic_params,
6160 .row_tile = 7,
6161 .channel_tile = 1,
6162 };
6163
6164 xnn_params.qs8.vadd = (struct vbinary_parameters) {
6165 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__scalar_x4,
6166 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
6167 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__scalar_x4,
6168 .init.qs8_addsub = xnn_init_qs8_add_minmax_scalar_params,
6169 .element_tile = 4,
6170 };
6171 xnn_params.qs8.vmul = (struct vbinary_parameters) {
6172 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmul_minmax_fp32_ukernel__scalar_x4,
6173 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
6174 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vmulc_minmax_fp32_ukernel__scalar_x4,
6175 .init.qs8_mul = xnn_init_qs8_mul_minmax_fp32_scalar_params,
6176 .element_tile = 4,
6177 };
6178 #endif // XNN_NO_QS8_OPERATORS
6179
6180 /************************** QU8 RISC-V micro-kernels **************************/
6181 #ifndef XNN_NO_QU8_OPERATORS
6182 init_flags |= XNN_INIT_FLAG_QU8;
6183
6184 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6185 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf);
6186 xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6187 xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf);
6188 xnn_params.qu8.gemm.init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6189 xnn_params.qu8.gemm.mr = 3;
6190 xnn_params.qu8.gemm.nr = 4;
6191
6192 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x9__scalar_lrintf;
6193 xnn_params.qu8.dwconv[0].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6194 xnn_params.qu8.dwconv[0].channel_tile = 2;
6195 xnn_params.qu8.dwconv[0].primary_tile = 9;
6196 xnn_params.qu8.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_fp32_ukernel_up2x25__scalar_lrintf;
6197 xnn_params.qu8.dwconv[1].init.qu8 = xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params;
6198 xnn_params.qu8.dwconv[1].channel_tile = 2;
6199 xnn_params.qu8.dwconv[1].primary_tile = 25;
6200
6201 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
6202 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
6203 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
6204 .init.qu8 = xnn_init_qu8_avgpool_minmax_scalar_params,
6205 .primary_tile = 9,
6206 .incremental_tile = 8,
6207 .channel_tile = 1,
6208 };
6209 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
6210 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7x__scalar_imagic_c1,
6211 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_fp32_ukernel_7p7x__scalar_imagic_c1,
6212 .init.qu8 = xnn_init_qu8_avgpool_minmax_fp32_scalar_imagic_params,
6213 .update.qu8 = xnn_update_qu8_avgpool_minmax_fp32_scalar_imagic_params,
6214 .row_tile = 7,
6215 .channel_tile = 1,
6216 };
6217
6218 xnn_params.qu8.vadd = (struct vbinary_parameters) {
6219 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar_x4,
6220 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
6221 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vaddc_minmax_ukernel__scalar_x4,
6222 .init.qu8_addsub = xnn_init_qu8_add_minmax_scalar_params,
6223 .element_tile = 4,
6224 };
6225 xnn_params.qu8.vmul = (struct vbinary_parameters) {
6226 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmul_minmax_fp32_ukernel__scalar_x4,
6227 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
6228 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qu8_vmulc_minmax_fp32_ukernel__scalar_x4,
6229 .init.qu8_mul = xnn_init_qu8_mul_minmax_fp32_scalar_params,
6230 .element_tile = 4,
6231 };
6232 #endif // XNN_NO_QU8_OPERATORS
6233
6234 /************************** S8 RISC-V micro-kernels ***************************/
6235 #ifndef XNN_NO_S8_OPERATORS
6236 init_flags |= XNN_INIT_FLAG_S8;
6237
6238 xnn_params.s8.clamp = (struct vunary_parameters) {
6239 .ukernel = (xnn_univector_ukernel_function) xnn_s8_vclamp_ukernel__scalar_x4,
6240 .init.s8_minmax = xnn_init_s8_minmax_scalar_params,
6241 .element_tile = 4,
6242 };
6243 xnn_params.s8.ibilinear = (struct ibilinear_parameters) {
6244 .ukernel = (xnn_ibilinear_ukernel_function) xnn_s8_ibilinear_ukernel__scalar_c1,
6245 .pixel_tile = 1,
6246 .channel_tile = 1,
6247 };
6248 xnn_params.s8.maxpool = (struct maxpool_parameters) {
6249 .ukernel = (xnn_maxpool_ukernel_function) xnn_s8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6250 .init.s8 = xnn_init_s8_minmax_scalar_params,
6251 .mr = 9,
6252 .qr = 8,
6253 };
6254 #endif // XNN_NO_S8_OPERATORS
6255
6256 /************************** U8 RISC-V micro-kernels ***************************/
6257 #ifndef XNN_NO_U8_OPERATORS
6258 init_flags |= XNN_INIT_FLAG_U8;
6259
6260 xnn_params.u8.clamp = (struct vunary_parameters) {
6261 .ukernel = (xnn_univector_ukernel_function) xnn_u8_vclamp_ukernel__scalar_x4,
6262 .init.u8_minmax = xnn_init_u8_minmax_scalar_params,
6263 .element_tile = 4,
6264 };
6265 xnn_params.u8.ibilinear = (struct ibilinear_parameters) {
6266 .ukernel = (xnn_ibilinear_ukernel_function) xnn_u8_ibilinear_ukernel__scalar_c1,
6267 .pixel_tile = 1,
6268 .channel_tile = 1,
6269 };
6270 xnn_params.u8.maxpool = (struct maxpool_parameters) {
6271 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
6272 .init.u8 = xnn_init_u8_minmax_scalar_params,
6273 .mr = 9,
6274 .qr = 8,
6275 };
6276 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
6277 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
6278 #endif // XNN_NO_U8_OPERATORS
6279
6280 /************************** X8 RISC-V micro-kernels ***************************/
6281 #ifndef XNN_NO_X8_OPERATORS
6282 init_flags |= XNN_INIT_FLAG_X8;
6283
6284 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar_x4;
6285 xnn_params.x8.zip = (struct zip_parameters) {
6286 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
6287 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
6288 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
6289 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
6290 };
6291 #endif // XNN_NO_X8_OPERATORS
6292
6293 /************************** F32 RISC-V micro-kernels **************************/
6294 #ifndef XNN_NO_F32_OPERATORS
6295 init_flags |= XNN_INIT_FLAG_F32;
6296
6297 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
6298 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
6299 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
6300 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
6301 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
6302 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
6303 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
6304 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
6305 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
6306 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
6307 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
6308 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
6309 xnn_params.f32.gemm.init.f32 = xnn_init_f32_minmax_scalar_params;
6310 xnn_params.f32.gemm.mr = 4;
6311 xnn_params.f32.gemm.nr = 4;
6312
6313 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
6314 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
6315 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
6316 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
6317 xnn_params.f32.gemm2.init.f32 = xnn_init_f32_minmax_scalar_params;
6318 xnn_params.f32.gemm2.mr = 4;
6319 xnn_params.f32.gemm2.nr = 2;
6320
6321 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x3__scalar_acc2;
6322 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x3__scalar_acc2;
6323 xnn_params.f32.dwconv[0].init.f32 = xnn_init_f32_minmax_scalar_params;
6324 xnn_params.f32.dwconv[0].channel_tile = 1;
6325 xnn_params.f32.dwconv[0].primary_tile = 3;
6326
6327 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
6328 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
6329 xnn_params.f32.dwconv[1].init.f32 = xnn_init_f32_minmax_scalar_params;
6330 xnn_params.f32.dwconv[1].channel_tile = 1;
6331 xnn_params.f32.dwconv[1].primary_tile = 4;
6332
6333 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
6334 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
6335 xnn_params.f32.dwconv[2].init.f32 = xnn_init_f32_minmax_scalar_params;
6336 xnn_params.f32.dwconv[2].channel_tile = 1;
6337 xnn_params.f32.dwconv[2].primary_tile = 9;
6338
6339 xnn_params.f32.dwconv[3].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
6340 xnn_params.f32.dwconv[3].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
6341 xnn_params.f32.dwconv[3].init.f32 = xnn_init_f32_minmax_scalar_params;
6342 xnn_params.f32.dwconv[3].channel_tile = 1;
6343 xnn_params.f32.dwconv[3].primary_tile = 25;
6344
6345 xnn_params.f32.avgpool = (struct avgpool_parameters) {
6346 .unipass = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
6347 .multipass = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
6348 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6349 .primary_tile = 9,
6350 .incremental_tile = 8,
6351 .channel_tile = 1,
6352 };
6353 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
6354 .unipass = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
6355 .multipass = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
6356 .primary_tile = 9,
6357 .incremental_tile = 8,
6358 .channel_tile = 1,
6359 };
6360 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
6361 .unipass = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
6362 .multipass = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
6363 .init.f32 = xnn_init_f32_scaleminmax_scalar_params,
6364 .update.f32 = xnn_update_f32_scaleminmax_scalar_params,
6365 .row_tile = 7,
6366 .channel_tile = 1,
6367 };
6368 xnn_params.f32.maxpool = (struct maxpool_parameters) {
6369 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
6370 .init.f32 = xnn_init_f32_minmax_scalar_params,
6371 .mr = 9,
6372 .qr = 8,
6373 };
6374 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
6375 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
6376 .mr = 4,
6377 };
6378 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
6379 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
6380 .mr = 9,
6381 };
6382 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
6383 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
6384 .mr = 9,
6385 .qr = 8,
6386 };
6387 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
6388 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
6389 .pixel_tile = 1,
6390 .channel_tile = 2,
6391 };
6392 xnn_params.f32.abs = (struct vunary_parameters) {
6393 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4,
6394 .element_tile = 4,
6395 };
6396 xnn_params.f32.clamp = (struct vunary_parameters) {
6397 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vclamp_ukernel__scalar_x4,
6398 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6399 .element_tile = 4,
6400 };
6401 xnn_params.f32.elu = (struct vunary_parameters) {
6402 .ukernel = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4,
6403 .init.f32_elu = xnn_init_f32_elu_scalar_rr2_lut16_p3_params,
6404 .element_tile = 4,
6405 };
6406 xnn_params.f32.hswish = (struct vunary_parameters) {
6407 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vhswish_ukernel__scalar_x4,
6408 .init.f32_hswish = xnn_init_f32_hswish_scalar_params,
6409 .element_tile = 4,
6410 };
6411 xnn_params.f32.lrelu = (struct vunary_parameters) {
6412 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4,
6413 .init.f32_lrelu = xnn_init_f32_lrelu_scalar_params,
6414 .element_tile = 4,
6415 };
6416 xnn_params.f32.neg = (struct vunary_parameters) {
6417 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4,
6418 .element_tile = 4,
6419 };
6420 xnn_params.f32.rndne = (struct vunary_parameters) {
6421 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1,
6422 .element_tile = 1,
6423 };
6424 xnn_params.f32.rndz = (struct vunary_parameters) {
6425 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1,
6426 .element_tile = 1,
6427 };
6428 xnn_params.f32.rndu = (struct vunary_parameters) {
6429 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1,
6430 .element_tile = 1,
6431 };
6432 xnn_params.f32.rndd = (struct vunary_parameters) {
6433 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1,
6434 .element_tile = 1,
6435 };
6436 xnn_params.f32.sigmoid = (struct vunary_parameters) {
6437 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsigmoid_ukernel__scalar_rr2_lut64_p2_div_x2,
6438 .init.f32_sigmoid = xnn_init_f32_sigmoid_scalar_rr2_lut64_p2_params,
6439 .element_tile = 2,
6440 };
6441 xnn_params.f32.sqr = (struct vunary_parameters) {
6442 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4,
6443 .element_tile = 4,
6444 };
6445 xnn_params.f32.sqrt = (struct vunary_parameters) {
6446 .ukernel = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1,
6447 .element_tile = 1,
6448 };
6449 xnn_params.f32.prelu = (struct prelu_parameters) {
6450 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
6451 .row_tile = 4,
6452 .channel_tile = 4,
6453 };
6454 xnn_params.f32.raddstoreexpminusmax = (struct raddstoreexpminusmax_parameters) {
6455 .ukernel = xnn_f32_raddstoreexpminusmax_ukernel__scalar_rr2_p5_x4_acc2,
6456 .init = xnn_init_f32_expminus_scalar_rr2_p5_params,
6457 .element_tile = 4,
6458 };
6459 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
6460 xnn_params.f32.vadd = (struct vbinary_parameters) {
6461 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
6462 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
6463 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
6464 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6465 .element_tile = 8,
6466 };
6467 xnn_params.f32.vdiv = (struct vbinary_parameters) {
6468 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
6469 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
6470 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
6471 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6472 .element_tile = 2,
6473 };
6474 xnn_params.f32.vmax = (struct vbinary_parameters) {
6475 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
6476 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6477 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
6478 .element_tile = 8,
6479 };
6480 xnn_params.f32.vmin = (struct vbinary_parameters) {
6481 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
6482 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6483 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
6484 .element_tile = 8,
6485 };
6486 xnn_params.f32.vmul = (struct vbinary_parameters) {
6487 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
6488 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
6489 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
6490 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6491 .element_tile = 8,
6492 };
6493 xnn_params.f32.vsub = (struct vbinary_parameters) {
6494 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
6495 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
6496 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
6497 .init.f32_minmax = xnn_init_f32_minmax_scalar_params,
6498 .element_tile = 8,
6499 };
6500 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
6501 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
6502 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6503 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
6504 .element_tile = 8,
6505 };
6506 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
6507 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
6508 .init.f32 = xnn_init_f32_minmax_scalar_params,
6509 .channel_tile = 1,
6510 .row_tile = 2,
6511 };
6512 #ifndef XNN_NO_NCHW_OPERATORS
6513 init_flags |= XNN_INIT_FLAG_CHW_OPT;
6514
6515 xnn_params.f32.spmm = (struct spmm_parameters) {
6516 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
6517 .mr = 8,
6518 .nr = 1,
6519 };
6520 xnn_params.f32.spmm2 = (struct spmm_parameters) {
6521 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
6522 .mr = 8,
6523 .nr = 2,
6524 };
6525 xnn_params.f32.spmm4 = (struct spmm_parameters) {
6526 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
6527 .mr = 8,
6528 .nr = 4,
6529 };
6530 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
6531 .ukernel_with_symm_padding =
6532 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
6533 .output_channel_tile = 4,
6534 .output_height_tile = 1,
6535 .output_width_tile = 1,
6536 };
6537 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
6538 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
6539 .output_width_tile = 1,
6540 .output_height_tile = 2,
6541 };
6542 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
6543 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
6544 .output_width_tile = 1,
6545 .output_height_tile = 1,
6546 };
6547 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
6548 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
6549 .output_width_tile = 1,
6550 .output_height_tile = 1,
6551 };
6552 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
6553 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
6554 .output_width_tile = 1,
6555 .output_height_tile = 1,
6556 };
6557 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
6558 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
6559 .channel_tile = 1,
6560 };
6561 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
6562 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
6563 .channel_tile = 1,
6564 .pixel_tile = 4,
6565 };
6566 #endif // XNN_NO_NCHW_OPERATORS
6567 #endif // XNN_NO_F32_OPERATORS
6568
6569 /************************** VCVT RISC-V micro-kernels *************************/
6570 #ifndef XNN_NO_VCVT_OPERATORS
6571 init_flags |= XNN_INIT_FLAG_VCVT;
6572
6573 xnn_params.vcvt.f16_to_f32 = (struct vunary_parameters) {
6574 .ukernel = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_x4,
6575 .init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params,
6576 .element_tile = 4,
6577 };
6578 xnn_params.vcvt.f32_to_f16 = (struct vunary_parameters) {
6579 .ukernel = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2,
6580 .init.f32_f16_cvt = xnn_init_f32_f16_cvt_scalar_fabsf_params,
6581 .element_tile = 2,
6582 };
6583 xnn_params.vcvt.f32_to_qs8 = (struct vunary_parameters) {
6584 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qs8_vcvt_ukernel__scalar_lrintf_x4,
6585 .init.f32_qs8_cvt = xnn_init_f32_qs8_cvt_scalar_lrintf_params,
6586 .element_tile = 4,
6587 };
6588 xnn_params.vcvt.f32_to_qu8 = (struct vunary_parameters) {
6589 .ukernel = (xnn_univector_ukernel_function) xnn_f32_qu8_vcvt_ukernel__scalar_lrintf_x4,
6590 .init.f32_qu8_cvt = xnn_init_f32_qu8_cvt_scalar_lrintf_params,
6591 .element_tile = 4,
6592 };
6593 xnn_params.vcvt.qs8_to_f32 = (struct vunary_parameters) {
6594 .ukernel = (xnn_univector_ukernel_function) xnn_qs8_f32_vcvt_ukernel__scalar_x4,
6595 .init.qs8_f32_cvt = xnn_init_qs8_f32_cvt_scalar_params,
6596 .element_tile = 4,
6597 };
6598 xnn_params.vcvt.qu8_to_f32 = (struct vunary_parameters) {
6599 .ukernel = (xnn_univector_ukernel_function) xnn_qu8_f32_vcvt_ukernel__scalar_x4,
6600 .init.qu8_f32_cvt = xnn_init_qu8_f32_cvt_scalar_params,
6601 .element_tile = 4,
6602 };
6603 #endif // XNN_NO_VCVT_OPERATORS
6604
6605 /************************** X32 RISC-V micro-kernels **************************/
6606 #ifndef XNN_NO_X32_OPERATORS
6607 init_flags |= XNN_INIT_FLAG_X32;
6608
6609 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
6610 xnn_params.x32.zip = (struct zip_parameters) {
6611 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
6612 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
6613 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
6614 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
6615 };
6616 #ifndef XNN_NO_NCHW_OPERATORS
6617 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
6618 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
6619 .channel_tile = 1,
6620 .pixel_tile = 1,
6621 };
6622 #endif // XNN_NO_NCHW_OPERATORS
6623 #endif // XNN_NO_X32_OPERATORS
6624
6625 /************************** XX RISC-V micro-kernels ***************************/
6626 #ifndef XNN_NO_XX_OPERATORS
6627 init_flags |= XNN_INIT_FLAG_XX;
6628
6629 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
6630 xnn_params.xx.fill = (struct fill_parameters) {
6631 .ukernel = (xnn_fill_ukernel_function) xnn_xx_fill_ukernel__scalar_x16,
6632 .row_tile = 1,
6633 };
6634 xnn_params.xx.pad = (struct pad_parameters) {
6635 .ukernel = (xnn_pad_ukernel_function) xnn_xx_pad_ukernel__scalar,
6636 .row_tile = 1,
6637 };
6638 #endif // XNN_NO_XX_OPERATORS
6639
6640 #else
6641 #error "Unsupported architecture"
6642 #endif
6643
6644 memcpy(&xnn_params.allocator, init_allocator, sizeof(struct xnn_allocator));
6645 xnn_params.init_flags = init_flags;
6646 }
6647
6648 #if XNN_PLATFORM_WINDOWS
// One-shot initialization callback for the Windows InitOnceExecuteOnce path.
// The signature is dictated by the Win32 PINIT_ONCE_FN contract; the
// init_once/parameter/context arguments are required by that contract but unused here.
static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
  init();
  return TRUE;  // TRUE reports successful initialization to InitOnceExecuteOnce.
}
6653 #endif
6654
// Initializes the XNNPACK library for use.
//
// allocator: custom allocator for XNNPACK's internal allocations, or NULL to
//   use the built-in default allocator. The pointer is latched atomically on
//   the first call; allocators passed on subsequent calls are ignored.
//
// Returns:
//   - xnn_status_success if microkernels were initialized for this CPU,
//   - xnn_status_out_of_memory if cpuinfo initialization fails,
//   - xnn_status_unsupported_hardware if init() did not set
//     XNN_INIT_FLAG_XNNPACK (no supported microkernel configuration).
//
// Thread-safe: initialization runs exactly once via InitOnceExecuteOnce /
// pthread_once regardless of how many threads call this concurrently.
enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
  #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
    // cpuinfo drives CPU feature detection; it is not used on Web or RISC-V builds.
    if (!cpuinfo_initialize()) {
      return xnn_status_out_of_memory;
    }
  #endif  // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
  if (allocator == NULL) {
    // Fall back to the library's default allocator.
    allocator = &xnn_default_allocator;
  }
  // Atomically publish the allocator (first caller wins) BEFORE triggering
  // init(), which copies init_allocator into xnn_params.allocator. The CAS
  // only succeeds while init_allocator is still NULL.
  #ifdef _MSC_VER
    _InterlockedCompareExchangePointer((PVOID volatile*) &init_allocator, (PVOID) allocator, NULL);
  #else
    __sync_bool_compare_and_swap(&init_allocator, NULL, allocator);
  #endif
  // Run init() exactly once across all threads; later callers block until it finishes.
  #if XNN_PLATFORM_WINDOWS
    InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
  #else
    pthread_once(&init_guard, &init);
  #endif
  // init() sets XNN_INIT_FLAG_XNNPACK only when microkernels for this
  // architecture were successfully configured.
  if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
    return xnn_status_success;
  } else {
    return xnn_status_unsupported_hardware;
  }
}
6680
// Deinitializes the XNNPACK library.
//
// Releases resources held by cpuinfo on platforms where it was initialized.
// Note: the once-init guard and the latched allocator are NOT reset here, so
// a later xnn_initialize() call will not re-run init(). Always returns
// xnn_status_success.
enum xnn_status xnn_deinitialize(void) {
  #if !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
    cpuinfo_deinitialize();
  #endif  // !XNN_PLATFORM_WEB && !XNN_ARCH_RISCV
  return xnn_status_success;
}
6687