1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8
9 #include <math.h>
10 #include <stdbool.h>
11 #include <stddef.h>
12 #include <stdint.h>
13 #include <string.h>
14
15 #ifdef _WIN32
16 #include <windows.h>
17 #else
18 #include <pthread.h>
19 #endif
20
21 #ifndef __EMSCRIPTEN__
22 #include <cpuinfo.h>
23 #endif
24
25 #include <xnnpack.h>
26 #include <xnnpack/argmaxpool.h>
27 #include <xnnpack/avgpool.h>
28 #include <xnnpack/clamp.h>
29 #include <xnnpack/common.h>
30 #include <xnnpack/conv.h>
31 #include <xnnpack/dwconv.h>
32 #include <xnnpack/depthtospace.h>
33 #include <xnnpack/gavgpool.h>
34 #include <xnnpack/gemm.h>
35 #include <xnnpack/fill.h>
36 #include <xnnpack/hswish.h>
37 #include <xnnpack/ibilinear.h>
38 #include <xnnpack/igemm.h>
39 #include <xnnpack/log.h>
40 #include <xnnpack/lut.h>
41 #include <xnnpack/maxpool.h>
42 #include <xnnpack/memory.h>
43 #include <xnnpack/pad.h>
44 #include <xnnpack/params.h>
45 #include <xnnpack/pavgpool.h>
46 #include <xnnpack/prelu.h>
47 #include <xnnpack/raddstoreexpminusmax.h>
48 #include <xnnpack/rmax.h>
49 #include <xnnpack/spmm.h>
50 #include <xnnpack/unpool.h>
51 #include <xnnpack/vadd.h>
52 #include <xnnpack/vbinary.h>
53 #include <xnnpack/vmulcaddc.h>
54 #include <xnnpack/vunary.h>
55 #include <xnnpack/zip.h>
56
57 #ifndef XNN_ENABLE_ASSEMBLY
58 #define XNN_ENABLE_ASSEMBLY 1
59 #endif
60
// One-time initialization guard: ensures init() executes at most once per
// process, no matter how many threads race into XNNPACK's public entry point.
// On Windows this is consumed via the InitOnce one-time-initialization API;
// elsewhere pthread_once() provides the equivalent guarantee.
// NOTE(review): the call site that passes this guard to InitOnceExecuteOnce /
// pthread_once is outside this chunk — confirm against the rest of the file.
#ifdef _WIN32
static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
#else
static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
#endif
66
// Global dispatch table of micro-kernel function pointers and tile parameters,
// populated exactly once by init() (see init_guard above for the once-only
// mechanism). init_flags == 0 marks XNNPACK as not yet initialized; init()
// accumulates XNN_INIT_FLAG_* bits for each operator class it sets up.
struct xnn_parameters xnn_params = {
  .init_flags = 0
};
70
71 static void init(void) {
72 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
73 // Unlike most other architectures, on x86/x86-64 when floating-point instructions
74 // have no NaN arguments, but produce NaN output, the output NaN has sign bit set.
75 // We use it to distinguish x86/x86-64 from other architectures, by doing subtraction
76 // of two infinities (must produce NaN per IEEE 754 standard).
77 static const volatile float inf = INFINITY;
78 const bool is_wasm_x86 = signbit(inf - inf);
79 #endif
80 uint32_t init_flags = XNN_INIT_FLAG_XNNPACK;
81
82 #if XNN_ARCH_ARM
83 #if XNN_PLATFORM_MOBILE
84 if (!cpuinfo_has_arm_neon()) {
85 xnn_log_error("XNNPACK initialization failed: NEON is not supported");
86 return;
87 }
88 #else
89 if (!cpuinfo_has_arm_vfpv2() && !cpuinfo_has_arm_vfpv3()) {
90 xnn_log_error("XNNPACK initialization failed: VFP is not supported");
91 return;
92 }
93 #endif
94
95 /**************************** XX micro-kernels ****************************/
96 #ifndef XNN_NO_XX_OPERATORS
97 init_flags |= XNN_INIT_FLAG_XX;
98
99 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
100 #endif
101
102 if (cpuinfo_has_arm_neon()) {
103 /**************************** QS8 micro-kernels ****************************/
104 #ifndef XNN_NO_QS8_OPERATORS
105 init_flags |= XNN_INIT_FLAG_QS8;
106
107 if (cpuinfo_has_arm_neon_dot()) {
108 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x8c4__neondot);
109 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c4__neondot);
110 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot);
111 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot);
112 xnn_params.qs8.gemm.mr = 4;
113 xnn_params.qs8.gemm.nr = 8;
114 xnn_params.qs8.gemm.log2_kr = 2;
115 } else {
116 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
117 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
118 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
119 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
120 xnn_params.qs8.gemm.mr = 2;
121 xnn_params.qs8.gemm.nr = 8;
122 xnn_params.qs8.gemm.log2_kr = 1;
123 }
124
125 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16;
126 xnn_params.qs8.dwconv[0].channel_tile = 8;
127 xnn_params.qs8.dwconv[0].primary_tile = 9;
128
129 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
130 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c8_acc2,
131 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c8_acc2,
132 .mr = 7,
133 };
134
135 xnn_params.qs8.vadd = (struct vbinary_parameters) {
136 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8,
137 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
138 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
139 .element_tile = 8,
140 };
141 #endif // XNN_NO_QS8_OPERATORS
142
143 /*************************** QU8 micro-kernels ***************************/
144 #ifndef XNN_NO_QU8_OPERATORS
145 init_flags |= XNN_INIT_FLAG_QU8;
146
147 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_4x8__neon);
148 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_4x8__neon);
149 xnn_params.qu8.gemm.mr = 4;
150 xnn_params.qu8.gemm.nr = 8;
151
152 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up8x9__neon;
153 xnn_params.qu8.dwconv[0].channel_tile = 8;
154 xnn_params.qu8.dwconv[0].primary_tile = 9;
155 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
156 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
157 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
158 .mr = 9,
159 .qr = 8,
160 };
161 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
162 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8,
163 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8,
164 .mr = 7,
165 };
166 xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon;
167 #endif // XNN_NO_QU8_OPERATORS
168
169 /**************************** U8 micro-kernels ****************************/
170 #ifndef XNN_NO_U8_OPERATORS
171 init_flags |= XNN_INIT_FLAG_U8;
172
173 xnn_params.u8.maxpool = (struct maxpool_parameters) {
174 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
175 .mr = 9,
176 .qr = 8,
177 };
178 xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__neon_x64;
179 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
180 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
181 #endif // XNN_NO_U8_OPERATORS
182
183 /**************************** X8 micro-kernels ****************************/
184 #ifndef XNN_NO_X8_OPERATORS
185 init_flags |= XNN_INIT_FLAG_X8;
186
187 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
188 xnn_params.x8.zip = (struct zip_parameters) {
189 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
190 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
191 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
192 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
193 };
194 #endif // XNN_NO_X8_OPERATORS
195
196 /**************************** F32 micro-kernels ****************************/
197 #ifndef XNN_NO_F32_OPERATORS
198 init_flags |= XNN_INIT_FLAG_F32;
199
200 #if XNN_ENABLE_ASSEMBLY
201 switch (cpuinfo_get_uarch(0)->uarch) {
202 case cpuinfo_uarch_cortex_a5:
203 case cpuinfo_uarch_cortex_a7:
204 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
205 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a7);
206 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
207 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
208 xnn_params.f32.gemm.mr = 4;
209 xnn_params.f32.gemm.nr = 8;
210 break;
211
212 case cpuinfo_uarch_cortex_a53:
213 case cpuinfo_uarch_cortex_a55r0:
214 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
215 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53);
216 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
217 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
218 xnn_params.f32.gemm.mr = 4;
219 xnn_params.f32.gemm.nr = 8;
220 break;
221
222 case cpuinfo_uarch_cortex_a55:
223 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
224 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55);
225 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
226 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
227 xnn_params.f32.gemm.mr = 4;
228 xnn_params.f32.gemm.nr = 8;
229 break;
230
231 case cpuinfo_uarch_cortex_a57:
232 case cpuinfo_uarch_cortex_a72:
233 case cpuinfo_uarch_cortex_a73:
234 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_pld_cortex_a75);
235 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_pld_cortex_a75);
236 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
237 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
238 xnn_params.f32.gemm.mr = 4;
239 xnn_params.f32.gemm.nr = 8;
240 break;
241
242 case cpuinfo_uarch_krait:
243 default:
244 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
245 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a75);
246 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
247 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
248 xnn_params.f32.gemm.mr = 4;
249 xnn_params.f32.gemm.nr = 8;
250 break;
251 }
252 #if XNN_MAX_UARCH_TYPES > 1
253 {
254 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
255 const uint32_t mr = xnn_params.f32.gemm.mr;
256 const uint32_t nr = xnn_params.f32.gemm.nr;
257 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
258 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
259 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
260 if (uarch_info == NULL) {
261 /* No more microarchitectures in the system */
262 break;
263 }
264
265 switch (uarch_info->uarch) {
266 case cpuinfo_uarch_cortex_a53:
267 case cpuinfo_uarch_cortex_a55r0:
268 if (mr == 4 && nr == 8 && log2_sr == 0) {
269 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
270 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a53;
271 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
272 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
273 }
274 break;
275 case cpuinfo_uarch_cortex_a55:
276 if (mr == 4 && nr == 8 && log2_sr == 0) {
277 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
278 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch32_neon_cortex_a55;
279 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64;
280 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64;
281 }
282 break;
283 default:
284 break;
285 }
286 }
287 }
288 #endif // XNN_MAX_UARCH_TYPES > 1
289 #else // XNN_ENABLE_ASSEMBLY
290 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__neon_lane_ld128);
291 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__neon_lane_ld128);
292 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neon_lane_ld64);
293 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neon_lane_ld64);
294 xnn_params.f32.gemm.mr = 4;
295 xnn_params.f32.gemm.nr = 8;
296 #endif // XNN_ENABLE_ASSEMBLY
297 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neon_lane_ld64);
298 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neon_lane_ld64);
299 xnn_params.f32.gemm2.mr = 4;
300 xnn_params.f32.gemm2.nr = 2;
301
302 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__neon;
303 xnn_params.f32.dwconv[0].channel_tile = 4,
304 xnn_params.f32.dwconv[0].primary_tile = 4,
305
306 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__neon;
307 xnn_params.f32.dwconv[1].channel_tile = 4;
308 xnn_params.f32.dwconv[1].primary_tile = 9;
309
310 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__neon_acc2;
311 xnn_params.f32.dwconv[2].channel_tile = 4;
312 xnn_params.f32.dwconv[2].primary_tile = 25;
313
314 xnn_params.f32.avgpool = (struct avgpool_parameters) {
315 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
316 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
317 .mr = 9,
318 .qr = 8,
319 };
320 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
321 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
322 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
323 .mr = 9,
324 .qr = 8,
325 };
326 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
327 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
328 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
329 .mr = 7,
330 };
331 xnn_params.f32.maxpool = (struct maxpool_parameters) {
332 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
333 .mr = 9,
334 .qr = 8,
335 };
336 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
337 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
338 .mr = 4,
339 };
340 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
341 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
342 .mr = 9,
343 };
344 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
345 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
346 .mr = 9,
347 .qr = 8,
348 };
349 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
350 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neon_c8,
351 .pixel_tile = 1,
352 .channel_tile = 8,
353 };
354 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8;
355 xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon_x8;
356 if (cpuinfo_has_arm_neon_fma()) {
357 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_p6_x8;
358 } else {
359 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neon_rr2_lut16_p3_x8;
360 }
361 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon_x16;
362 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8;
363 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8;
364 if (cpuinfo_has_arm_neon_v8()) {
365 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
366 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8;
367 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8;
368 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8;
369 } else {
370 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neon_x8;
371 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neon_x8;
372 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neon_x8;
373 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neon_x8;
374 }
375 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neon_rr2_lut64_p2_nr2recps_x8;
376 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8;
377 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
378 xnn_params.f32.prelu = (struct prelu_parameters) {
379 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
380 .row_tile = 2,
381 .channel_tile = 8,
382 };
383 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neon_lut64_p2_x8;
384 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
385 xnn_params.f32.vadd = (struct vbinary_parameters) {
386 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
387 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
388 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
389 .element_tile = 8,
390 };
391 xnn_params.f32.vdiv = (struct vbinary_parameters) {
392 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
393 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
394 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
395 .element_tile = 2,
396 };
397 xnn_params.f32.vmax = (struct vbinary_parameters) {
398 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
399 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
400 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
401 .element_tile = 8,
402 };
403 xnn_params.f32.vmin = (struct vbinary_parameters) {
404 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
405 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
406 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
407 .element_tile = 8,
408 };
409 xnn_params.f32.vmul = (struct vbinary_parameters) {
410 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
411 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
412 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
413 .element_tile = 8,
414 };
415 xnn_params.f32.vsub = (struct vbinary_parameters) {
416 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
417 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
418 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
419 .element_tile = 8,
420 };
421 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
422 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
423 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
424 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
425 .element_tile = 8,
426 };
427 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
428 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neon_2x,
429 .channel_tile = 4,
430 .row_tile = 2,
431 };
432 #ifndef XNN_NO_NCHW_OPERATORS
433 init_flags |= XNN_INIT_FLAG_CHW_OPT;
434
435 xnn_params.f32.spmm = (struct spmm_parameters) {
436 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neon,
437 .mr = 32,
438 .nr = 1,
439 };
440 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
441 .ukernel_with_symm_padding =
442 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neon_2x2,
443 .output_channel_tile = 4,
444 .output_height_tile = 2,
445 .output_width_tile = 2,
446 };
447 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
448 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neon_2x4,
449 .output_width_tile = 4,
450 .output_height_tile = 2,
451 };
452 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
453 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neon_1x4,
454 .output_width_tile = 4,
455 .output_height_tile = 1,
456 };
457 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
458 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neon_1x4,
459 .output_width_tile = 4,
460 .output_height_tile = 1,
461 };
462 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
463 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neon_1x4,
464 .output_width_tile = 4,
465 .output_height_tile = 1,
466 };
467 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
468 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
469 .channel_tile = 4,
470 };
471 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
472 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neon_p8,
473 .channel_tile = 1,
474 .pixel_tile = 4,
475 };
476 #endif // XNN_NO_NCHW_OPERATORS
477 #endif // XNN_NO_F32_OPERATORS
478
479 /**************************** X32 micro-kernels ****************************/
480 #ifndef XNN_NO_X32_OPERATORS
481 init_flags |= XNN_INIT_FLAG_X32;
482
483 xnn_params.x32.fill = (struct fill_parameters) {
484 .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__neon,
485 .row_tile = 1,
486 };
487 xnn_params.x32.pad = (struct pad_parameters) {
488 .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__neon,
489 .row_tile = 1,
490 };
491 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
492 xnn_params.x32.zip = (struct zip_parameters) {
493 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
494 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
495 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
496 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
497 };
498 #ifndef XNN_NO_NCHW_OPERATORS
499 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
500 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
501 .channel_tile = 1,
502 .pixel_tile = 1,
503 };
504 #endif // XNN_NO_NCHW_OPERATORS
505 #endif // XNN_NO_X32_OPERATORS
506 } else if (!XNN_PLATFORM_MOBILE) {
507 /*************************** QU8 micro-kernels ***************************/
508 #ifndef XNN_NO_QU8_OPERATORS
509 init_flags |= XNN_INIT_FLAG_QU8;
510
511 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_2x2__scalar);
512 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_2x2__scalar);
513 xnn_params.qu8.gemm.mr = 2;
514 xnn_params.qu8.gemm.nr = 2;
515
516 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up1x9__scalar;
517 xnn_params.qu8.dwconv[0].channel_tile = 1;
518 xnn_params.qu8.dwconv[0].primary_tile = 9;
519
520 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
521 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
522 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
523 .mr = 9,
524 .qr = 8,
525 };
526 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
527 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
528 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
529 .mr = 7,
530 };
531 xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar;
532 #endif // XNN_NO_QU8_OPERATORS
533
534 /**************************** U8 micro-kernels ****************************/
535 #ifndef XNN_NO_U8_OPERATORS
536 init_flags |= XNN_INIT_FLAG_U8;
537
538 xnn_params.u8.maxpool = (struct maxpool_parameters) {
539 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
540 .mr = 9,
541 .qr = 8,
542 };
543 xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__scalar_x4;
544 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
545 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
546 #endif // XNN_NO_U8_OPERATORS
547
548 /**************************** X8 micro-kernels ****************************/
549 #ifndef XNN_NO_X8_OPERATORS
550 init_flags |= XNN_INIT_FLAG_X8;
551
552 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
553 xnn_params.x8.zip = (struct zip_parameters) {
554 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
555 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
556 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
557 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
558 };
559 #endif // XNN_NO_X8_OPERATORS
560
561 /**************************** F32 micro-kernels ****************************/
562 #ifndef XNN_NO_F32_OPERATORS
563 init_flags |= XNN_INIT_FLAG_F32;
564
565 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__scalar);
566 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__scalar);
567 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__scalar);
568 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__scalar);
569 xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__scalar);
570 xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__scalar);
571 xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__scalar);
572 xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__scalar);
573 xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar);
574 xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar);
575 xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar);
576 xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar);
577 xnn_params.f32.gemm.mr = 4;
578 xnn_params.f32.gemm.nr = 4;
579
580 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__scalar);
581 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__scalar),
582 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__scalar);
583 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar),
584 xnn_params.f32.gemm2.mr = 4;
585 xnn_params.f32.gemm2.nr = 2;
586
587 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__scalar_acc2;
588 xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2;
589 xnn_params.f32.dwconv[0].channel_tile = 1;
590 xnn_params.f32.dwconv[0].primary_tile = 4;
591
592 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__scalar_acc2;
593 xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2;
594 xnn_params.f32.dwconv[1].channel_tile = 1;
595 xnn_params.f32.dwconv[1].primary_tile = 9;
596
597 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__scalar_acc2;
598 xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2;
599 xnn_params.f32.dwconv[2].channel_tile = 1;
600 xnn_params.f32.dwconv[2].primary_tile = 25;
601
    // Scalar F32 pooling micro-kernels. Convention: .up = unipass kernel
    // (pooling windows up to mr elements), .mp = multipass kernel
    // (mr elements first pass, qr per subsequent pass).
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__scalar_c1,
      .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__scalar_c1,
      .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__scalar_c1,
      .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
      .mr = 7,
    };
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    // Argmax pooling variants, ordered by increasing unipass capacity;
    // [2] is the multipass fallback for larger pooling windows.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
      .pixel_tile = 1,
      .channel_tile = 2,
    };
    // Scalar element-wise (univector) micro-kernels; the _xN suffix is the
    // element tile processed per loop iteration.
    xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4;
    xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__scalar_x4;
    xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x4;
    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__scalar_x4;
    xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4;
    xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4;
    // Rounding kernels fall back to libm, one element at a time.
    xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x1;
    xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x1;
    xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x1;
    xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x1;
    xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x2;
    xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4;
    xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
    xnn_params.f32.prelu = (struct prelu_parameters) {
      .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
      .row_tile = 4,
      .channel_tile = 4,
    };
    xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
    // Binary element-wise micro-kernels: op = two-tensor, opc = tensor-with-
    // constant, ropc = reversed tensor-with-constant. For commutative ops
    // (add, mul, max, min, sqrdiff) ropc reuses the opc kernel.
    xnn_params.f32.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vdiv = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__scalar_x2,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__scalar_x2,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__scalar_x2,
      .element_tile = 2,
    };
    // MAX/MIN/SquaredDifference outputs are inherently bounded, so the
    // non-minmax kernels fill the minmax slots.
    xnn_params.f32.vmax = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmin = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmul = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vsub = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
      .element_tile = 8,
    };
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__scalar_2x,
      .channel_tile = 1,
      .row_tile = 2,
    };
  #ifndef XNN_NO_NCHW_OPERATORS
    // NCHW (CHW-layout) micro-kernels. NOTE(review): init_flags is a local
    // accumulated earlier in init() and merged into xnn_params.init_flags
    // elsewhere — not visible in this chunk.
    init_flags |= XNN_INIT_FLAG_CHW_OPT;

    // Sparse matrix-matrix multiplication variants for 1, 2, and 4 output
    // channels per column block.
    xnn_params.f32.spmm = (struct spmm_parameters) {
      .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
      .mr = 8,
      .nr = 1,
    };
    xnn_params.f32.spmm2 = (struct spmm_parameters) {
      .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
      .mr = 8,
      .nr = 2,
    };
    xnn_params.f32.spmm4 = (struct spmm_parameters) {
      .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
      .mr = 8,
      .nr = 4,
    };
    // HWC->CHW 3x3 stride-2 convolution (typically the first layer of a
    // CHW-layout model).
    xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
      .ukernel_with_symm_padding =
        (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
      .output_channel_tile = 4,
      .output_height_tile = 1,
      .output_width_tile = 1,
    };
    // CHW-layout depthwise convolutions: 3x3, 3x3s2, 5x5, 5x5s2.
    xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
      .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1,
      .output_width_tile = 1,
      .output_height_tile = 4,
    };
    xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
      .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_2x1_acc2,
      .output_width_tile = 1,
      .output_height_tile = 2,
    };
    xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
      .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_2x1_acc2,
      .output_width_tile = 1,
      .output_height_tile = 2,
    };
    xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
      .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_2x1_acc2,
      .output_width_tile = 1,
      .output_height_tile = 2,
    };
    xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
      .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
      .channel_tile = 1,
    };
    xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
      .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
      .channel_tile = 1,
      .pixel_tile = 4,
    };
  #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
765
  /**************************** X32 micro-kernels ****************************/
  #ifndef XNN_NO_X32_OPERATORS
    // Type-agnostic 32-bit micro-kernels (fill, pad, unpool, channel zip).
    init_flags |= XNN_INIT_FLAG_X32;

    xnn_params.x32.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_int,
      .row_tile = 1,
    };
    xnn_params.x32.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__scalar_int,
      .row_tile = 1,
    };
    xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
    // Channel interleaving for 2, 3, 4, and a variable (xm) number of inputs.
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
    };
  #ifndef XNN_NO_NCHW_OPERATORS
    xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
      .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
      .channel_tile = 1,
      .pixel_tile = 1,
    };
  #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_X32_OPERATORS
793 }
794
795 #elif XNN_ARCH_ARM64
796
  /**************************** XX micro-kernels ****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    // Generic memcpy-backed copy micro-kernel shared by all data types.
    xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
  #endif

  /**************************** QS8 micro-kernels ****************************/
  #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    // QS8 GEMM/IGEMM selection:
    // - With the NEON DOT extension: 4x16 tiles with c4 packing (log2_kr = 2).
    // - Without it: 2x8 MLAL tiles (c8 packing with assembly, c2 without).
    // On iOS/Mac, uarch-specific dispatch is skipped (core 0 uarch cannot be
    // queried the same way); elsewhere the kernel is tuned to core 0, assumed
    // to be the big core.
    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
      #if XNN_ENABLE_ASSEMBLY
        if (cpuinfo_has_arm_neon_dot()) {
          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
          xnn_params.qs8.gemm.mr = 4;
          xnn_params.qs8.gemm.nr = 16;
          xnn_params.qs8.gemm.log2_kr = 2;
        } else {
          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
          xnn_params.qs8.gemm.mr = 2;
          xnn_params.qs8.gemm.nr = 8;
          xnn_params.qs8.gemm.log2_kr = 3;
        }
      #else  // !XNN_ENABLE_ASSEMBLY
        if (cpuinfo_has_arm_neon_dot()) {
          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot);
          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot);
          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
          xnn_params.qs8.gemm.mr = 4;
          xnn_params.qs8.gemm.nr = 16;
          xnn_params.qs8.gemm.log2_kr = 2;
        } else {
          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
          xnn_params.qs8.gemm.mr = 2;
          xnn_params.qs8.gemm.nr = 8;
          xnn_params.qs8.gemm.log2_kr = 1;
        }
      #endif  // XNN_ENABLE_ASSEMBLY
    #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      #if XNN_ENABLE_ASSEMBLY
        if (cpuinfo_has_arm_neon_dot()) {
          // Tune the DOT-product kernel to the microarchitecture of core 0.
          switch (cpuinfo_get_core(0)->uarch) {
            case cpuinfo_uarch_cortex_a55:
              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55);
              break;
            default:
              xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
              xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_ld64);
              break;
          }
          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
          xnn_params.qs8.gemm.mr = 4;
          xnn_params.qs8.gemm.nr = 16;
          xnn_params.qs8.gemm.log2_kr = 2;
        } else {
          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c8__aarch64_neon_mlal_padal);
          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__neon_mlal_padal);
          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__neon_mlal_padal);
          xnn_params.qs8.gemm.mr = 2;
          xnn_params.qs8.gemm.nr = 8;
          xnn_params.qs8.gemm.log2_kr = 3;
        }
        #if XNN_MAX_UARCH_TYPES > 1
        {
          /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
          const uint32_t mr = xnn_params.qs8.gemm.mr;
          const uint32_t nr = xnn_params.qs8.gemm.nr;
          const uint32_t log2_kr = xnn_params.qs8.gemm.log2_kr;
          for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
            const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
            if (uarch_info == NULL) {
              /* No more microarchitectures in the system */
              break;
            }

            switch (uarch_info->uarch) {
              case cpuinfo_uarch_cortex_a55:
                // Only override when tile geometry matches the big-core choice:
                // HMP function tables require identical mr/nr/packing per slot.
                if (mr == 4 && nr == 16 && log2_kr == 2) {
                  xnn_params.qs8.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                  xnn_params.qs8.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__aarch64_neondot_cortex_a55;
                  xnn_params.qs8.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot;
                  xnn_params.qs8.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot;
                }
                break;
              default:
                break;
            }
          }
        }
        #endif  // XNN_MAX_UARCH_TYPES > 1
      #else  // !XNN_ENABLE_ASSEMBLY
        if (cpuinfo_has_arm_neon_dot()) {
          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c4__neondot);
          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c4__neondot);
          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c4__neondot);
          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c4__neondot);
          xnn_params.qs8.gemm.mr = 4;
          xnn_params.qs8.gemm.nr = 16;
          xnn_params.qs8.gemm.log2_kr = 2;
        } else {
          xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
          xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
          xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
          xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
          xnn_params.qs8.gemm.mr = 2;
          xnn_params.qs8.gemm.nr = 8;
          xnn_params.qs8.gemm.log2_kr = 1;
        }
      #endif  // XNN_ENABLE_ASSEMBLY
    #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC

    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__neon_mul16;
    xnn_params.qs8.dwconv[0].channel_tile = 8;
    xnn_params.qs8.dwconv[0].primary_tile = 9;

    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__neon_c8_acc2,
      .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__neon_c8_acc2,
      .mr = 7,
    };

    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__neon_ld64_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8,
      .element_tile = 8,
    };
  #endif  // XNN_NO_QS8_OPERATORS
939
  /**************************** QU8 micro-kernels ****************************/
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    // Legacy unsigned-quantized path: single fixed NEON 8x8 GEMM/IGEMM,
    // no uarch-specific dispatch.
    xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_8x8__neon);
    xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_8x8__neon);
    xnn_params.qu8.gemm.mr = 8;
    xnn_params.qu8.gemm.nr = 8;

    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up8x9__neon;
    xnn_params.qu8.dwconv[0].channel_tile = 8;
    xnn_params.qu8.dwconv[0].primary_tile = 9;

    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__neon_c8,
      .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__neon_c8,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__neon_c8,
      .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__neon_c8,
      .mr = 7,
    };
    xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__neon;
  #endif  // XNN_NO_QU8_OPERATORS

  /**************************** U8 micro-kernels ****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__neon_c16,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__neon_x64;
    // lut32norm has no NEON variant; scalar implementation is used.
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__neon;
  #endif  // XNN_NO_U8_OPERATORS

  /**************************** X8 micro-kernels ****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__neon,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__neon,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__neon,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__neon,
    };
  #endif  // XNN_NO_X8_OPERATORS
993
  /**************************** F16 micro-kernels ****************************/
  #ifndef XNN_NO_F16_OPERATORS
    // F16 support is gated at runtime on the ARMv8.2 FP16 arithmetic
    // extension; XNN_INIT_FLAG_F16 is only set when it is present.
    if (cpuinfo_has_arm_neon_fp16_arith()) {
      init_flags |= XNN_INIT_FLAG_F16;

      #if XNN_ENABLE_ASSEMBLY
        xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
        xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
      #else
        xnn_params.f16.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
        xnn_params.f16.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
      #endif
      // IGEMM has no assembly variant; intrinsics kernels are always used.
      xnn_params.f16.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
      xnn_params.f16.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
      xnn_params.f16.gemm.mr = 6;
      xnn_params.f16.gemm.nr = 16;

      xnn_params.f16.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith;
      xnn_params.f16.dwconv[0].channel_tile = 16;
      xnn_params.f16.dwconv[0].primary_tile = 4;

      xnn_params.f16.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith;
      xnn_params.f16.dwconv[1].channel_tile = 16;
      xnn_params.f16.dwconv[1].primary_tile = 9;

      xnn_params.f16.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2;
      xnn_params.f16.dwconv[2].channel_tile = 8;
      xnn_params.f16.dwconv[2].primary_tile = 25;

      xnn_params.f16.gavgpool = (struct gavgpool_parameters) {
        .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7x__neonfp16arith_c8,
        .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f16_gavgpool_minmax_ukernel_7p7x__neonfp16arith_c8,
        .mr = 7,
      };
      xnn_params.f16.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vadd_minmax_ukernel__neonfp16arith_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vaddc_minmax_ukernel__neonfp16arith_x16,
        .element_tile = 16,
      };
      xnn_params.f16.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmul_minmax_ukernel__neonfp16arith_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f16_vmulc_minmax_ukernel__neonfp16arith_x16,
        .element_tile = 16,
      };
      xnn_params.f16.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f16_vmulcaddc_minmax_ukernel_c8__neonfp16arith_2x,
        .channel_tile = 8,
        .row_tile = 2,
      };
      xnn_params.f16.hswish = (xnn_univector_ukernel_function) xnn_f16_hswish_ukernel__neonfp16arith_x16;
    }
  #endif  // XNN_NO_F16_OPERATORS
1048
  /**************************** F32 micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    // F32 GEMM/IGEMM selection. On iOS/Mac a fixed 6x8 kernel is used;
    // elsewhere the kernel is tuned to the microarchitecture of core 0
    // (assumed to be the big core).
    #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
      #if XNN_ENABLE_ASSEMBLY
        xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
        xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
        xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
        xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
        xnn_params.f32.gemm.mr = 6;
        xnn_params.f32.gemm.nr = 8;
      #else  // !XNN_ENABLE_ASSEMBLY
        xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
        xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
        xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
        xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
        xnn_params.f32.gemm.mr = 6;
        xnn_params.f32.gemm.nr = 8;
      #endif  // XNN_ENABLE_ASSEMBLY
    #else  // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
      #if XNN_ENABLE_ASSEMBLY
        switch (cpuinfo_get_core(0)->uarch) {
          case cpuinfo_uarch_cortex_a57:
            xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57);
            xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a57);
            xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57);
            xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57);
            xnn_params.f32.gemm.mr = 6;
            xnn_params.f32.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a72:
            xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.mr = 4;
            xnn_params.f32.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a75:
          case cpuinfo_uarch_cortex_a76:
          case cpuinfo_uarch_exynos_m3:
          case cpuinfo_uarch_exynos_m4:
            xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.mr = 6;
            xnn_params.f32.gemm.nr = 8;
            break;
          case cpuinfo_uarch_exynos_m1:
          case cpuinfo_uarch_exynos_m2:
            // Shuffled (s4) variants; note log2_sr = 2 affects weight packing.
            xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8s4__neonfma);
            xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8s4__neonfma);
            xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8s4__neonfma);
            xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8s4__neonfma);
            xnn_params.f32.gemm.mr = 6;
            xnn_params.f32.gemm.nr = 8;
            xnn_params.f32.gemm.log2_sr = 2;
            break;
          case cpuinfo_uarch_cortex_a53:
          case cpuinfo_uarch_cortex_a55r0:
            xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
            xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53);
            xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
            xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
            xnn_params.f32.gemm.mr = 6;
            xnn_params.f32.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a55:
            // NOTE(review): 1x8 kernels reuse the cortex_a53 variants here —
            // presumably no a55-specific 1x8 exists; confirm.
            xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
            xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55);
            xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
            xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53);
            xnn_params.f32.gemm.mr = 6;
            xnn_params.f32.gemm.nr = 8;
            break;
          case cpuinfo_uarch_cortex_a73:
            xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
            xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a73);
            xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a75);
            xnn_params.f32.gemm.mr = 6;
            xnn_params.f32.gemm.nr = 8;
            break;
          // The case labels after 'default:' are deliberate documentation of
          // which known uarches take the default 4x8 kernels; control flow is
          // identical with or without them.
          default:
          case cpuinfo_uarch_cortex_a77:
          case cpuinfo_uarch_exynos_m5:
          case cpuinfo_uarch_kryo:
            xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57);
            xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a57);
            xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57);
            xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a57);
            xnn_params.f32.gemm.mr = 4;
            xnn_params.f32.gemm.nr = 8;
            break;
        }
1146 #if XNN_MAX_UARCH_TYPES > 1
1147 {
1148 /* Choose micro-kernels for little cores according to micro-kernel specification for the big core */
1149 const uint32_t mr = xnn_params.f32.gemm.mr;
1150 const uint32_t nr = xnn_params.f32.gemm.nr;
1151 const uint32_t log2_sr = xnn_params.f32.gemm.log2_sr;
1152 for (size_t i = 1; i < XNN_MAX_UARCH_TYPES; i++) {
1153 const struct cpuinfo_uarch_info* uarch_info = cpuinfo_get_uarch(i);
1154 if (uarch_info == NULL) {
1155 /* No more microarchitectures in the system */
1156 break;
1157 }
1158
1159 switch (uarch_info->uarch) {
1160 case cpuinfo_uarch_cortex_a53:
1161 case cpuinfo_uarch_cortex_a55r0:
1162 if (mr == 6 && nr == 8 && log2_sr == 0) {
1163 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
1164 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a53;
1165 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1166 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1167 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
1168 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
1169 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a53;
1170 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1171 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1172 }
1173 break;
1174 case cpuinfo_uarch_cortex_a55:
1175 if (mr == 6 && nr == 8 && log2_sr == 0) {
1176 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
1177 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__aarch64_neonfma_cortex_a55;
1178 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1179 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1180 } else if (mr == 4 && nr == 8 && log2_sr == 0) {
1181 xnn_params.f32.gemm.minmax.gemm.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
1182 xnn_params.f32.gemm.minmax.igemm.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__aarch64_neonfma_cortex_a55;
1183 xnn_params.f32.gemm.minmax.gemm1.function[i] = (xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1184 xnn_params.f32.gemm.minmax.igemm1.function[i] = (xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__aarch64_neonfma_cortex_a53;
1185 }
1186 break;
1187 default:
1188 break;
1189 }
1190 }
1191 }
1192 #endif // XNN_MAX_UARCH_TYPES > 1
1193 #else // !XNN_ENABLE_ASSEMBLY
1194 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_6x8__neonfma_lane_ld64);
1195 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_6x8__neonfma_lane_ld64);
1196 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__neonfma_lane_ld64);
1197 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__neonfma_lane_ld64);
1198 xnn_params.f32.gemm.mr = 6;
1199 xnn_params.f32.gemm.nr = 8;
1200 #endif // XNN_ENABLE_ASSEMBLY
1201 #endif // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1202 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__neonfma_lane_ld64);
1203 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__neonfma_lane_ld64);
1204 xnn_params.f32.gemm2.mr = 4;
1205 xnn_params.f32.gemm2.nr = 2;
1206
1207 xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__neonfma;
1208 xnn_params.f32.dwconv[0].channel_tile = 8;
1209 xnn_params.f32.dwconv[0].primary_tile = 4;
1210
1211 #if XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1212 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
1213 xnn_params.f32.dwconv[1].channel_tile = 8;
1214 xnn_params.f32.dwconv[1].primary_tile = 9;
1215 #else // !XNN_PLATFORM_IOS && !XNN_PLATFORM_MAC
1216 switch (cpuinfo_get_core(0)->uarch) {
1217 case cpuinfo_uarch_kryo:
1218 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__neonfma;
1219 xnn_params.f32.dwconv[1].channel_tile = 4;
1220 xnn_params.f32.dwconv[1].primary_tile = 9;
1221 break;
1222 #if XNN_ENABLE_ASSEMBLY
1223 case cpuinfo_uarch_cortex_a53:
1224 case cpuinfo_uarch_cortex_a55r0:
1225 case cpuinfo_uarch_cortex_a55:
1226 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__aarch64_neonfma_cortex_a55;
1227 xnn_params.f32.dwconv[1].channel_tile = 4;
1228 xnn_params.f32.dwconv[1].primary_tile = 9;
1229 break;
1230 #endif // XNN_ENABLE_ASSEMBLY
1231 default:
1232 xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__neonfma;
1233 xnn_params.f32.dwconv[1].channel_tile = 8;
1234 xnn_params.f32.dwconv[1].primary_tile = 9;
1235 break;
1236 }
 1237 #endif  // XNN_PLATFORM_IOS || XNN_PLATFORM_MAC
1238
1239 xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__neonfma_acc2;
1240 xnn_params.f32.dwconv[2].channel_tile = 4;
1241 xnn_params.f32.dwconv[2].primary_tile = 25;
1242
1243 xnn_params.f32.avgpool = (struct avgpool_parameters) {
1244 .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__neon_c4,
1245 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__neon_c4,
1246 .mr = 9,
1247 .qr = 8,
1248 };
1249 xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
1250 .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__neon_c4,
1251 .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__neon_c4,
1252 .mr = 9,
1253 .qr = 8,
1254 };
1255 xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
1256 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__neon_c4,
1257 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
1258 .mr = 7,
1259 };
1260 xnn_params.f32.maxpool = (struct maxpool_parameters) {
1261 .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__neon_c4,
1262 .mr = 9,
1263 .qr = 8,
1264 };
1265 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
1266 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__neon_c4,
1267 .mr = 4,
1268 };
1269 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
1270 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__neon_c4,
1271 .mr = 9,
1272 };
1273 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
1274 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__neon_c4,
1275 .mr = 9,
1276 .qr = 8,
1277 };
1278 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
1279 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__neonfma_c8,
1280 .pixel_tile = 1,
1281 .channel_tile = 8,
1282 };
1283 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8;
1284 xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon_x8;
1285 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__neonfma_rr1_lut16_p3_x16;
1286 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon_x16;
1287 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8;
1288 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8;
1289 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
1290 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__neonv8_x8;
1291 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__neonv8_x8;
1292 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__neonv8_x8;
1293 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__neonfma_rr1_lut64_p2_nr2recps_x16;
1294 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__neon_x8;
1295 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__neon_sqrt_x4;
1296 xnn_params.f32.prelu = (struct prelu_parameters) {
1297 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__neon_2x8,
1298 .row_tile = 2,
1299 .channel_tile = 8,
1300 };
1301 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__neonfma_lut64_p2_x16;
1302 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__neon;
1303 xnn_params.f32.vadd = (struct vbinary_parameters) {
1304 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__neon_x8,
1305 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1306 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__neon_x8,
1307 .element_tile = 8,
1308 };
1309 xnn_params.f32.vdiv = (struct vbinary_parameters) {
1310 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__neon_x8,
1311 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__neon_x8,
1312 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__neon_x8,
1313 .element_tile = 8,
1314 };
1315 xnn_params.f32.vmax = (struct vbinary_parameters) {
1316 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
1317 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1318 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
1319 .element_tile = 8,
1320 };
1321 xnn_params.f32.vmin = (struct vbinary_parameters) {
1322 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__neon_x8,
1323 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1324 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__neon_x8,
1325 .element_tile = 8,
1326 };
1327 xnn_params.f32.vmul = (struct vbinary_parameters) {
1328 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__neon_x8,
1329 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1330 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__neon_x8,
1331 .element_tile = 8,
1332 };
1333 xnn_params.f32.vsub = (struct vbinary_parameters) {
1334 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__neon_x8,
1335 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__neon_x8,
1336 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__neon_x8,
1337 .element_tile = 8,
1338 };
1339 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
1340 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__neon_x8,
1341 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1342 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__neon_x8,
1343 .element_tile = 8,
1344 };
1345 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
1346 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__neonfma_2x,
1347 .channel_tile = 4,
1348 .row_tile = 2,
1349 };
1350 #ifndef XNN_NO_NCHW_OPERATORS
1351 init_flags |= XNN_INIT_FLAG_CHW_OPT;
1352
1353 xnn_params.f32.spmm = (struct spmm_parameters) {
1354 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__neonfma_pipelined,
1355 .mr = 32,
1356 .nr = 1,
1357 };
1358 xnn_params.f32.spmm2 = (struct spmm_parameters) {
1359 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x2__neonfma,
1360 .mr = 32,
1361 .nr = 2,
1362 };
1363 xnn_params.f32.spmm4 = (struct spmm_parameters) {
1364 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x4__neonfma,
1365 .mr = 32,
1366 .nr = 4,
1367 };
1368 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
1369 .ukernel_with_symm_padding =
1370 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__neonfma_2x2,
1371 .output_channel_tile = 4,
1372 .output_height_tile = 2,
1373 .output_width_tile = 2,
1374 };
1375 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
1376 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
1377 .output_width_tile = 4,
1378 .output_height_tile = 3,
1379 };
1380 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
1381 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_2x4_acc2,
1382 .output_width_tile = 4,
1383 .output_height_tile = 2,
1384 };
1385 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
1386 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_4x4,
1387 .output_width_tile = 4,
1388 .output_height_tile = 4,
1389 };
1390 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
1391 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
1392 .output_width_tile = 4,
1393 .output_height_tile = 1,
1394 };
1395 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
1396 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__neon_x4,
1397 .channel_tile = 4,
1398 };
1399 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
1400 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__neonfma_p8,
1401 .channel_tile = 1,
1402 .pixel_tile = 4,
1403 };
1404 #endif // XNN_NO_NCHW_OPERATORS
1405 #endif // XNN_NO_F32_OPERATORS
1406
1407 /**************************** X32 micro-kernels ****************************/
1408 #ifndef XNN_NO_X32_OPERATORS
1409 init_flags |= XNN_INIT_FLAG_X32;
1410
1411 xnn_params.x32.fill = (struct fill_parameters) {
1412 .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__neon,
1413 .row_tile = 1,
1414 };
1415 xnn_params.x32.pad = (struct pad_parameters) {
1416 .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__neon,
1417 .row_tile = 1,
1418 };
1419 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__neon;
1420 xnn_params.x32.zip = (struct zip_parameters) {
1421 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__neon,
1422 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__neon,
1423 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__neon,
1424 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__neon,
1425 };
1426 #ifndef XNN_NO_NCHW_OPERATORS
1427 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
1428 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
1429 .channel_tile = 1,
1430 .pixel_tile = 1,
1431 };
1432 #endif // XNN_NO_NCHW_OPERATORS
1433 #endif // XNN_NO_X32_OPERATORS
1434
1435 #elif XNN_ARCH_X86 || XNN_ARCH_X86_64
1436 if (!cpuinfo_has_x86_sse2()) {
1437 xnn_log_error("XNNPACK initialization failed: SSE2 is not supported");
1438 return;
1439 }
1440
1441 /**************************** XX micro-kernels ****************************/
1442 #ifndef XNN_NO_XX_OPERATORS
1443 init_flags |= XNN_INIT_FLAG_XX;
1444
1445 xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
1446 #endif
1447
1448 /**************************** QS8 micro-kernels ****************************/
1449 #ifndef XNN_NO_QS8_OPERATORS
1450 init_flags |= XNN_INIT_FLAG_QS8;
1451
1452 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
1453 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_4x16c8__avx512skx);
1454 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_4x16c8__avx512skx);
1455 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x16c8__avx512skx);
1456 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x16c8__avx512skx);
1457 xnn_params.qs8.gemm.mr = 4;
1458 xnn_params.qs8.gemm.nr = 16;
1459 xnn_params.qs8.gemm.log2_kr = 3;
1460 } else if (cpuinfo_has_x86_xop()) {
1461 // XOP should be checked before AVX2: AMD Excavator supports both, but performs better with XOP microkernels
1462 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld64);
1463 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld64);
1464 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld64);
1465 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld64);
1466 xnn_params.qs8.gemm.mr = 2;
1467 xnn_params.qs8.gemm.nr = 4;
1468 xnn_params.qs8.gemm.log2_kr = 3;
1469 } else if (cpuinfo_has_x86_avx2()) {
1470 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2);
1471 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2);
1472 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2);
1473 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2);
1474 xnn_params.qs8.gemm.mr = 3;
1475 xnn_params.qs8.gemm.nr = 8;
1476 xnn_params.qs8.gemm.log2_kr = 3;
1477 } else if (cpuinfo_has_x86_sse4_1()) {
1478 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__sse41_ld64);
1479 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__sse41_ld64);
1480 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__sse41_ld64);
1481 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__sse41_ld64);
1482 xnn_params.qs8.gemm.mr = 3;
1483 xnn_params.qs8.gemm.nr = 4;
1484 xnn_params.qs8.gemm.log2_kr = 3;
1485 } else if (cpuinfo_has_x86_ssse3()) {
1486 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__ssse3_ld64);
1487 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__ssse3_ld64);
1488 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__ssse3_ld64);
1489 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__ssse3_ld64);
1490 xnn_params.qs8.gemm.mr = 3;
1491 xnn_params.qs8.gemm.nr = 4;
1492 xnn_params.qs8.gemm.log2_kr = 3;
1493 } else {
1494 xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__sse2_ld64);
1495 xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__sse2_ld64);
1496 xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__sse2_ld64);
1497 xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__sse2_ld64);
1498 xnn_params.qs8.gemm.mr = 3;
1499 xnn_params.qs8.gemm.nr = 4;
1500 xnn_params.qs8.gemm.log2_kr = 3;
1501 }
1502
1503 if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
1504 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up32x9__avx512skx_mul32;
1505 xnn_params.qs8.dwconv[0].channel_tile = 32;
1506 } else if (cpuinfo_has_x86_avx2()) {
1507 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up16x9__avx2_mul32;
1508 xnn_params.qs8.dwconv[0].channel_tile = 16;
1509 } else if (cpuinfo_has_x86_sse4_1()) {
1510 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__sse41_mul16;
1511 xnn_params.qs8.dwconv[0].channel_tile = 8;
1512 } else if (cpuinfo_has_x86_ssse3()) {
1513 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__ssse3_mul16;
1514 xnn_params.qs8.dwconv[0].channel_tile = 8;
1515 } else if (cpuinfo_has_x86_sse2()) {
1516 xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__sse2_mul16;
1517 xnn_params.qs8.dwconv[0].channel_tile = 8;
1518 }
1519 xnn_params.qs8.dwconv[0].primary_tile = 9;
1520
1521 if (cpuinfo_has_x86_sse4_1()) {
1522 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1523 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__sse41_c8_acc2,
1524 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse41_c8_acc2,
1525 .mr = 7,
1526 };
1527 } else if (cpuinfo_has_x86_ssse3()) {
1528 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1529 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__ssse3_c8_acc2,
1530 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__ssse3_c8_acc2,
1531 .mr = 7,
1532 };
1533 } else if (cpuinfo_has_x86_sse2()) {
1534 xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
1535 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__sse2_c8_acc2,
1536 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__sse2_c8_acc2,
1537 .mr = 7,
1538 };
1539 }
1540
1541 if (cpuinfo_has_x86_xop()) {
1542 xnn_params.qs8.vadd = (struct vbinary_parameters) {
1543 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__xop_mul32_ld32_x8,
1544 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
1545 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__xop_mul32_ld32_x8,
1546 .element_tile = 8,
1547 };
1548 } else if (cpuinfo_has_x86_sse4_1()) {
1549 xnn_params.qs8.vadd = (struct vbinary_parameters) {
1550 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8,
1551 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
1552 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse41_mul16_ld64_x8,
1553 .element_tile = 8,
1554 };
1555 } else {
1556 xnn_params.qs8.vadd = (struct vbinary_parameters) {
1557 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8,
1558 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
1559 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8,
1560 .element_tile = 8,
1561 };
1562 }
1563 #endif // XNN_NO_QS8_OPERATORS
1564
1565 /**************************** QU8 micro-kernels ****************************/
1566 #ifndef XNN_NO_QU8_OPERATORS
1567 init_flags |= XNN_INIT_FLAG_QU8;
1568
1569 xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_4x4c2__sse2);
1570 xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_4x4c2__sse2);
1571 xnn_params.qu8.gemm.mr = 4;
1572 xnn_params.qu8.gemm.nr = 4;
1573 xnn_params.qu8.gemm.log2_kr = 1;
1574
1575 xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up8x9__sse2;
1576 xnn_params.qu8.dwconv[0].channel_tile = 8;
1577 xnn_params.qu8.dwconv[0].primary_tile = 9;
1578
1579 xnn_params.qu8.avgpool = (struct avgpool_parameters) {
1580 .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__sse2_c8,
1581 .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8,
1582 .mr = 9,
1583 .qr = 8,
1584 };
1585 xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
1586 .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__sse2_c8,
1587 .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__sse2_c8,
1588 .mr = 7,
1589 };
1590 xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__sse2;
1591 #endif // XNN_NO_QU8_OPERATORS
1592
1593 /**************************** U8 micro-kernels ****************************/
1594 #ifndef XNN_NO_U8_OPERATORS
1595 init_flags |= XNN_INIT_FLAG_U8;
1596
1597 xnn_params.u8.maxpool = (struct maxpool_parameters) {
1598 .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__sse2_c16,
1599 .mr = 9,
1600 .qr = 8,
1601 };
1602 xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__sse2_x64;
1603 xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
1604 xnn_params.u8.rmax = xnn_u8_rmax_ukernel__sse2;
1605 #endif // XNN_NO_U8_OPERATORS
1606
1607 /**************************** X8 micro-kernels ****************************/
1608 #ifndef XNN_NO_X8_OPERATORS
1609 init_flags |= XNN_INIT_FLAG_X8;
1610
1611 xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
1612 xnn_params.x8.zip = (struct zip_parameters) {
1613 .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__sse2,
1614 .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__sse2,
1615 .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__sse2,
1616 .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__sse2,
1617 };
1618 #endif // XNN_NO_X8_OPERATORS
1619
1620 /**************************** F32 micro-kernels ****************************/
1621 #ifndef XNN_NO_F32_OPERATORS
1622 init_flags |= XNN_INIT_FLAG_F32;
1623
1624 if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
1625 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_7x16__avx512f_broadcast);
1626 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_7x16__avx512f_broadcast);
1627 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx512f_broadcast);
1628 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx512f_broadcast);
1629 xnn_params.f32.gemm.mr = 7;
1630 xnn_params.f32.gemm.nr = 16;
1631 } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
1632 switch (cpuinfo_get_core(0)->uarch) {
1633 case cpuinfo_uarch_zen:
1634 case cpuinfo_uarch_dhyana:
1635 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x16s4__fma3_broadcast);
1636 xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x16s4__fma3_broadcast);
1637 xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16s4__fma3_broadcast);
1638 xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16s4__fma3_broadcast);
1639 xnn_params.f32.gemm.mr = 4;
1640 xnn_params.f32.gemm.nr = 16;
1641 xnn_params.f32.gemm.log2_sr = 2;
1642 break;
1643 default:
1644 xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__fma3_broadcast);
          // Tail of the FMA3 code path (selection construct opened above this
          // view): 5x16 main igemm plus the single-row 1x16 gemm/igemm
          // variants, then the tile sizes matching those kernels.
          xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__fma3_broadcast);
          xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__fma3_broadcast);
          xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__fma3_broadcast);
          xnn_params.f32.gemm.mr = 5;
          xnn_params.f32.gemm.nr = 16;
          break;
      }
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      // AVX (without FMA3): 5x16 broadcast kernels.
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast);
      xnn_params.f32.gemm.mr = 5;
      xnn_params.f32.gemm.nr = 16;
    } else {
      // Baseline SSE fallback: 4x8 kernels.
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__sse_load1);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__sse_load1);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__sse_load1);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__sse_load1);
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 8;
    }
    // Secondary GEMM (gemm2): always the SSE 4x2c4 kernels; log2_kr = 2
    // corresponds to the c4 packing of these kernels.
    xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__sse);
    xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__sse);
    xnn_params.f32.gemm2.mr = 4;
    xnn_params.f32.gemm2.nr = 2;
    xnn_params.f32.gemm2.log2_kr = 2;
    // F32 depthwise convolution: slots [0], [1], [2] hold kernels for
    // 4-, 9-, and 25-tap filters respectively (primary_tile).  The widest
    // available ISA is selected; AVX-class kernels are gated to non-mobile
    // platforms via !XNN_PLATFORM_MOBILE.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx512f;
      xnn_params.f32.dwconv[0].channel_tile = 16;
      xnn_params.f32.dwconv[0].primary_tile = 4;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx512f;
      xnn_params.f32.dwconv[1].channel_tile = 16;
      xnn_params.f32.dwconv[1].primary_tile = 9;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x25__avx512f;
      xnn_params.f32.dwconv[2].channel_tile = 16;
      xnn_params.f32.dwconv[2].primary_tile = 25;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
      // FMA3: 16-channel tiles for 4/9-tap, 8-channel tile for 25-tap.
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__fma3;
      xnn_params.f32.dwconv[0].channel_tile = 16;
      xnn_params.f32.dwconv[0].primary_tile = 4;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__fma3;
      xnn_params.f32.dwconv[1].channel_tile = 16;
      xnn_params.f32.dwconv[1].primary_tile = 9;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__fma3;
      xnn_params.f32.dwconv[2].channel_tile = 8;
      xnn_params.f32.dwconv[2].primary_tile = 25;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x4__avx;
      xnn_params.f32.dwconv[0].channel_tile = 16;
      xnn_params.f32.dwconv[0].primary_tile = 4;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up16x9__avx;
      xnn_params.f32.dwconv[1].channel_tile = 16;
      xnn_params.f32.dwconv[1].primary_tile = 9;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__avx;
      xnn_params.f32.dwconv[2].channel_tile = 8;
      xnn_params.f32.dwconv[2].primary_tile = 25;
    } else {
      // Baseline SSE fallback: 8-channel tiles across all filter sizes.
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__sse;
      xnn_params.f32.dwconv[0].channel_tile = 8;
      xnn_params.f32.dwconv[0].primary_tile = 4;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__sse;
      xnn_params.f32.dwconv[1].channel_tile = 8;
      xnn_params.f32.dwconv[1].primary_tile = 9;

      xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x25__sse;
      xnn_params.f32.dwconv[2].channel_tile = 8;
      xnn_params.f32.dwconv[2].primary_tile = 25;
    }
    // F32 pooling and bilinear-interpolation tables.  These use SSE/SSE2
    // kernels unconditionally (no ISA dispatch).  ".up" is the unipass
    // kernel, ".mp" the multipass kernel; "mr"/"qr" are the corresponding
    // primary/incremental tile sizes declared by those kernels' names
    // (e.g. 9p8x -> mr = 9, qr = 8).
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__sse_c4,
      .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__sse_c4,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__sse_c4,
      .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__sse_c4,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__sse_c4,
      .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
      .mr = 7,
    };
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__sse_c4,
      .mr = 9,
      .qr = 8,
    };
    // Argmax pooling: slot [0] and [1] are unipass kernels for up to 4 and
    // 9 pooling elements; slot [2] is the multipass kernel for larger windows.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__sse2_c4,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__sse2_c4,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__sse2_c4,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__sse_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // F32 element-wise (univector) kernels.  Each operator independently
    // dispatches on the widest ISA available; AVX-class tiers are gated to
    // non-mobile platforms, while the SSE4.1 tiers (lrelu, rnd*, sigmoid,
    // prelu below) are checked without that gate.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx512f_x16;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__avx_x16;
    } else {
      xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__sse_x8;
    }
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx512f_x16;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__avx_x16;
    } else {
      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__sse_x8;
    }
    // ELU has an extra AVX2 tier between AVX512F and AVX.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx512f_rr1_lut16_p3_perm_x64;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
      xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx2_rr1_lut4_p4_perm_x56;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32;
    } else {
      xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__sse2_rr2_lut16_p3_x12;
    }
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__avx512f_x16;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_fma3()) {
      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__fma3_x16;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__avx_x16;
    } else {
      xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__sse_x8;
    }
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx512f_x16;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__avx_x16;
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse41_x8;
    } else {
      xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__sse_x8;
    }
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx512f_x16;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__avx_x16;
    } else {
      xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__sse_x8;
    }
    // The four rounding modes (nearest-even, toward-zero, up, down) are
    // always selected together from the same ISA tier.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx512f_x16;
      xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx512f_x16;
      xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx512f_x16;
      xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx512f_x16;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__avx_x16;
      xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__avx_x16;
      xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__avx_x16;
      xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__avx_x16;
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse41_x8;
      xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse41_x8;
      xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse41_x8;
      xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse41_x8;
    } else {
      xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__sse2_x8;
      xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__sse2_x8;
      xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__sse2_x8;
      xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__sse2_x8;
    }
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__avx512f_rr2_lut32_p2_perm2_scalef_div_x64;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx2()) {
      xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__avx2_rr1_p5_div_x40;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__avx_rr2_p5_nr2_x40;
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__sse41_lut64_p2_div_x8;
    } else {
      xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__sse2_lut64_p2_div_x8;
    }
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx512f_x16;
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__avx_x16;
    } else {
      xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__sse_x8;
    }
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__avx_sqrt_x8;
    } else {
      xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__sse_sqrt_x4;
    }
    // F32 PReLU: tiered on AVX512F / AVX / SSE4.1 / SSE2.  The wider ISAs
    // use a 16-channel tile; the SSE tiers use 8 channels.  All variants
    // process 2 rows per call (row_tile).
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx512f_2x16,
        .row_tile = 2,
        .channel_tile = 16,
      };
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__avx_2x16,
        .row_tile = 2,
        .channel_tile = 16,
      };
    } else if (cpuinfo_has_x86_sse4_1()) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse41_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__sse2_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    }
    // Softmax building block and row-max reduction: fixed SSE/SSE2 kernels.
    xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__sse2_p5_x20_acc2;
    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__sse;
    // F32 element-wise binary operators (add/div/max/min/mul/sub/sqrdiff),
    // selected as a group from one ISA tier: AVX512F (32-element tile),
    // AVX (16), or SSE (8).  For each operator:
    //   op_ukernel   - tensor (op) tensor
    //   opc_ukernel  - tensor (op) broadcast-scalar
    //   ropc_ukernel - broadcast-scalar (op) tensor; for commutative ops
    //                  (add/max/min/mul/sqrdiff) this reuses the opc kernel,
    //                  while div/sub need the dedicated "r...c" reversed kernel.
    if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx512f()) {
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx512f_x32,
        .element_tile = 32,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx512f_x32,
        .element_tile = 32,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx512f_x32,
        .element_tile = 32,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx512f_x32,
        .element_tile = 32,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx512f_x32,
        .element_tile = 32,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx512f_x32,
        .element_tile = 32,
      };
      xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx512f_x32,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx512f_x32,
        .element_tile = 32,
      };
    } else if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_avx()) {
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__avx_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__avx_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__avx_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__avx_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__avx_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__avx_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__avx_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__avx_x16,
        .element_tile = 16,
      };
    } else {
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__sse_x8,
        .element_tile = 8,
      };
      xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__sse_x8,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__sse_x8,
        .element_tile = 8,
      };
    }
    xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__sse_2x,
      .channel_tile = 4,
      .row_tile = 2,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      // Sparse microkernels on x86 currently target only SSE, and on processors
      // with AVX ISA dense inference is expected to be faster than sparse.
      if (!cpuinfo_has_x86_avx()) {
        init_flags |= XNN_INIT_FLAG_CHW_OPT;
      }

      xnn_params.f32.spmm = (struct spmm_parameters) {
        .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__sse,
        .mr = 32,
        .nr = 1,
      };
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__sse_2x2,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // CHW 3x3 depthwise convolution: SSSE3 variant when available,
      // plain SSE otherwise.  Both use the same 2x4 output tiling.
      if (!XNN_PLATFORM_MOBILE && cpuinfo_has_x86_ssse3()) {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__ssse3_2x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
      } else {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_2x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
      }
      xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
        .output_width_tile = 4,
        .output_height_tile = 1,
      };
      xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__sse_4x4,
        .output_width_tile = 4,
        .output_height_tile = 4,
      };
      xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
        .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__sse_2x4,
        .output_width_tile = 4,
        .output_height_tile = 2,
      };
      xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
        .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__sse_x4,
        .channel_tile = 4,
      };
      // CHW bilinear interpolation has no SSE kernel here; scalar fallback.
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
        .channel_tile = 1,
        .pixel_tile = 4,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
2074
  /**************************** X32 micro-kernels ****************************/
  // Type-agnostic 32-bit-element operations (fill, pad, unpool, zip) for the
  // x86 path; SSE/SSE2 kernels with a scalar depth-to-space fallback.
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;

    xnn_params.x32.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__sse,
      .row_tile = 1,
    };
    xnn_params.x32.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__sse,
      .row_tile = 1,
    };
    xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__sse2;
    // Channel-interleaving (zip) kernels for 2, 3, 4, and m streams.
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__sse2,
      .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__sse2,
      .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__sse2,
      .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__sse2,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
        .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
        .channel_tile = 1,
        .pixel_tile = 1,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_X32_OPERATORS
2102
#elif XNN_ARCH_WASMSIMD
  // WebAssembly SIMD build: fixed kernel tables (no runtime CPU detection
  // beyond the is_wasm_x86 flag used in the F32 section below).

  /**************************** XX micro-kernels ****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
  #endif

  /**************************** QS8 micro-kernels ****************************/
  // Signed 8-bit quantized operators: wasmsimd kernels throughout.
  #ifndef XNN_NO_QS8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QS8;

    // 3x4c8 GEMM/IGEMM plus 1x4c8 single-row variants; log2_kr = 3
    // corresponds to the c8 packing.
    xnn_params.qs8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_3x4c8__wasmsimd_ld64);
    xnn_params.qs8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_3x4c8__wasmsimd_ld64);
    xnn_params.qs8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qs8_gemm_minmax_ukernel_1x4c8__wasmsimd_ld64);
    xnn_params.qs8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qs8_igemm_minmax_ukernel_1x4c8__wasmsimd_ld64);
    xnn_params.qs8.gemm.mr = 3;
    xnn_params.qs8.gemm.nr = 4;
    xnn_params.qs8.gemm.log2_kr = 3;

    // Only the 9-tap depthwise kernel is provided in this build.
    xnn_params.qs8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qs8_dwconv_minmax_ukernel_up8x9__wasmsimd_mul16;
    xnn_params.qs8.dwconv[0].channel_tile = 8;
    xnn_params.qs8.dwconv[0].primary_tile = 9;

    xnn_params.qs8.gavgpool = (struct gavgpool_parameters) {
      .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7x__wasmsimd_c8_acc2,
      .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qs8_gavgpool_minmax_ukernel_7p7x__wasmsimd_c8_acc2,
      .mr = 7,
    };

    // vadd is commutative, so ropc reuses the vaddc kernel.
    xnn_params.qs8.vadd = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vadd_minmax_ukernel__wasmsimd_x8,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_qs8_vaddc_minmax_ukernel__wasmsimd_x8,
      .element_tile = 8,
    };
  #endif  // XNN_NO_QS8_OPERATORS

  /**************************** QU8 micro-kernels ****************************/
  // Unsigned 8-bit quantized operators: scalar kernels only in this build.
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_2x2__scalar);
    xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_2x2__scalar);
    xnn_params.qu8.gemm.mr = 2;
    xnn_params.qu8.gemm.nr = 2;

    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up1x9__scalar;
    xnn_params.qu8.dwconv[0].channel_tile = 1;
    xnn_params.qu8.dwconv[0].primary_tile = 9;

    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
      .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
      .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
      .mr = 7,
    };
    // Note: qu8.vadd is a bare function pointer here, unlike the
    // struct-based qs8.vadd above.
    xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar;
  #endif  // XNN_NO_QU8_OPERATORS

  /**************************** U8 micro-kernels ****************************/
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__scalar_x4;
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
  #endif  // XNN_NO_U8_OPERATORS

  /**************************** X8 micro-kernels ****************************/
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
    };
  #endif  // XNN_NO_X8_OPERATORS
2195
  /**************************** F32 micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    // GEMM kernel selection branches on is_wasm_x86 (set above this view):
    // x86-hosted wasm engines get the 4x8 "x86" kernel variants, other
    // (ARM-hosted) engines get the 5x8 "arm" variants.  Each branch fills
    // minmax, relu, and linear (no-activation) kernel families plus the
    // single-row (1x8) companions.
    if (is_wasm_x86) {
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x8__wasmsimd_x86_splat);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_x86_splat);
      xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 8;

      xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_x86);
      xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_x86);
      xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
      xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
      xnn_params.f32.gemm2.mr = 4;
      xnn_params.f32.gemm2.nr = 2;
      xnn_params.f32.gemm2.log2_kr = 2;
    } else {
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_5x8__wasmsimd_arm_splat);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x8__wasmsimd_arm_splat);
      xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_5x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_5x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_5x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_5x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x8__wasmsimd_splat);
      xnn_params.f32.gemm.mr = 5;
      xnn_params.f32.gemm.nr = 8;

      xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2c4__wasmsimd_arm);
      xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2c4__wasmsimd_arm);
      xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2c4__wasmsimd);
2242 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2c4__wasmsimd);
2243 xnn_params.f32.gemm2.mr = 4;
2244 xnn_params.f32.gemm2.nr = 2;
2245 xnn_params.f32.gemm2.log2_kr = 2;
2246 }
2247
    // Depthwise convolution kernels, indexed by kernel size: [0]=2x2 (4 taps),
    // [1]=3x3 (9 taps), [2]=5x5 (25 taps). channel_tile is the per-iteration
    // channel count, primary_tile the number of filter taps.
    if (is_wasm_x86) {
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x4__wasmsimd_x86;
      xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x4__wasmsimd;
      xnn_params.f32.dwconv[0].channel_tile = 8;
      xnn_params.f32.dwconv[0].primary_tile = 4;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up8x9__wasmsimd_x86;
      xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up8x9__wasmsimd;
      xnn_params.f32.dwconv[1].channel_tile = 8;
      xnn_params.f32.dwconv[1].primary_tile = 9;
    } else {
      // ARM profile uses a narrower channel tile (4 vs 8) for the 4/9-tap kernels.
      xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x4__wasmsimd_arm;
      xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x4__wasmsimd;
      xnn_params.f32.dwconv[0].channel_tile = 4;
      xnn_params.f32.dwconv[0].primary_tile = 4;

      xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x9__wasmsimd_arm;
      xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x9__wasmsimd;
      xnn_params.f32.dwconv[1].channel_tile = 4;
      xnn_params.f32.dwconv[1].primary_tile = 9;
    }

    // 25-tap (5x5) dwconv: the ARM-tuned minmax variant is used on both profiles.
    xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up4x25__wasmsimd_arm;
    xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up4x25__wasmsimd;
    xnn_params.f32.dwconv[2].channel_tile = 4;
    xnn_params.f32.dwconv[2].primary_tile = 25;
2274
    // Pooling kernels. mr is the unipass window capacity; qr the extra elements
    // consumed per multipass iteration (so windows > mr use the 9p8x multipass path).
    if (is_wasm_x86) {
      xnn_params.f32.avgpool = (struct avgpool_parameters) {
        .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
        .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
        .mr = 9,
        .qr = 8,
      };
      xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
        .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_x86_c4,
        .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
        .mr = 9,
        .qr = 8,
      };
      xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
        .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_x86_c4,
        .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
        .mr = 7,
      };
    } else {
      // Same tile shapes as the x86 profile; only the min/max clamping strategy differs.
      xnn_params.f32.avgpool = (struct avgpool_parameters) {
        .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
        .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
        .mr = 9,
        .qr = 8,
      };
      xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
        .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasmsimd_arm_c4,
        .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
        .mr = 9,
        .qr = 8,
      };
      xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
        .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasmsimd_arm_c4,
        .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
        .mr = 7,
      };
    }
    if (is_wasm_x86) {
      xnn_params.f32.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_x86_c4,
        .mr = 9,
        .qr = 8,
      };
    } else {
      xnn_params.f32.maxpool = (struct maxpool_parameters) {
        .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasmsimd_arm_c4,
        .mr = 9,
        .qr = 8,
      };
    }
    // argmaxpool variants selected by window size: [0] up to 4, [1] up to 9,
    // [2] multipass for larger windows. No per-profile tuning here.
    xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__wasmsimd_c4,
      .mr = 4,
    };
    xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
      .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__wasmsimd_c4,
      .mr = 9,
    };
    xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
      .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__wasmsimd_c4,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
      .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__wasmsimd_c8,
      .pixel_tile = 1,
      .channel_tile = 8,
    };
    // Elementwise (univector) kernels. Profile-dependent picks handle the x86
    // vs ARM difference in WASM SIMD min/max NaN behavior (see file header note).
    xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8;
    if (is_wasm_x86) {
      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_x86_x8;
    } else {
      xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_arm_x8;
    }
    if (is_wasm_x86) {
      xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_x86_rr2_p6_x20;
    } else {
      xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasmsimd_arm_rr2_p6_x20;
    }
    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_x16;
    if (is_wasm_x86) {
      // x86 profile: minmax-based leaky ReLU; ARM profile: bitselect-based.
      xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8;
    } else {
      xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8;
    }
    xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8;
    xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_relu_ukernel__wasmsimd_x16;
    xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_addsub_x8;
    if (is_wasm_x86) {
      xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_addsub_x8;
    } else {
      xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__wasmsimd_cvt_x8;
    }
    xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__wasmsimd_addsub_x8;
    xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__wasmsimd_addsub_x8;
    xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__wasmsimd_p5_div_x16;
    xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__wasmsimd_x8;
    xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__wasmsimd_sqrt_x8;
    if (is_wasm_x86) {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_minmax_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    } else {
      xnn_params.f32.prelu = (struct prelu_parameters) {
        .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasmsimd_bitselect_2x8,
        .row_tile = 2,
        .channel_tile = 8,
      };
    }
    xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__wasmsimd_p5_x16_acc2;
    // NOTE(review): rmax uses the arm variant on both profiles, unlike most
    // selections above — confirm whether an x86-specific pick was intended.
    xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_arm;
    // Binary elementwise kernels, x86 profile. Each entry provides the two-tensor
    // op, the tensor-scalar op (opc), and the reversed tensor-scalar op (ropc);
    // for commutative ops (add/mul/max/min) ropc reuses the opc kernel.
    if (is_wasm_x86) {
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
        .element_tile = 16,
      };
2413 xnn_params.f32.vmin = (struct vbinary_parameters) {
2414 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
2415 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
2416 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
2417
2418 .element_tile = 16,
2419 };
      // x86 profile continued: vmul and vsub. For the non-commutative vsub,
      // ropc uses the dedicated reversed kernel (vrsubc).
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
        .element_tile = 16,
      };
    } else {
      // ARM profile: identical structure; only the minmax kernel variants differ
      // (the linear kernels are shared across profiles).
      xnn_params.f32.vadd = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vdiv = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasmsimd_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmax = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmin = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vmul = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
        .element_tile = 16,
      };
      xnn_params.f32.vsub = (struct vbinary_parameters) {
        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
        .element_tile = 16,
      };
    }
    // Squared difference: output is always non-negative, so no clamped variant
    // exists; the plain kernels fill the minmax slots. Commutative, so ropc == opc.
    xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
      .element_tile = 16,
    };
    if (is_wasm_x86) {
      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_x86_2x,
        .channel_tile = 4,
        .row_tile = 2,
      };
    } else {
      xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
        .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c4__wasmsimd_arm_2x,
        .channel_tile = 4,
        .row_tile = 2,
      };
    }
    // NCHW (channels-first) operator kernels; advertised via the CHW_OPT flag.
    #ifndef XNN_NO_NCHW_OPERATORS
      init_flags |= XNN_INIT_FLAG_CHW_OPT;

      if (is_wasm_x86) {
        xnn_params.f32.spmm = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_x86,
          .mr = 32,
          .nr = 1,
        };
      } else {
        xnn_params.f32.spmm = (struct spmm_parameters) {
          .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_32x1__wasmsimd_arm,
          .mr = 32,
          .nr = 1,
        };
      }
      // HWC->CHW 3x3 stride-2 input convolution (first layer of typical CNNs).
      xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
        .ukernel_with_symm_padding =
          (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__wasmsimd_2x2,
        .output_channel_tile = 4,
        .output_height_tile = 2,
        .output_width_tile = 2,
      };
      // CHW depthwise 3x3/5x5 (stride 1 and 2) kernels; the two profiles use
      // different accumulator/splat strategies but the same output tiling.
      if (is_wasm_x86) {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_x86_loadsplat_2x4,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
        xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_x86_splat_1x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
        xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_x86_splat_3x4,
          .output_width_tile = 4,
          .output_height_tile = 3,
        };
        xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_x86_splat_1x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
      } else {
        xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__wasmsimd_arm_loadsplat_2x4,
          .output_width_tile = 4,
          .output_height_tile = 2,
        };
        xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__wasmsimd_arm_splat_1x4_acc4,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
        xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__wasmsimd_arm_splat_3x4,
          .output_width_tile = 4,
          .output_height_tile = 3,
        };
        xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
          .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__wasmsimd_arm_splat_1x4_acc2,
          .output_width_tile = 4,
          .output_height_tile = 1,
        };
      }
      if (is_wasm_x86) {
        xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
          .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_x86_x4,
          .channel_tile = 4,
        };
      } else {
        xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
          .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__wasmsimd_arm_x4,
          .channel_tile = 4,
        };
      }
      xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
        .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__wasmsimd_p8,
        .channel_tile = 1,
        .pixel_tile = 4,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_F32_OPERATORS
2591
  /**************************** X32 micro-kernels ****************************/
  // Type-agnostic 32-bit kernels (fill, pad, unpool, channel zip).
  #ifndef XNN_NO_X32_OPERATORS
    init_flags |= XNN_INIT_FLAG_X32;

    xnn_params.x32.fill = (struct fill_parameters) {
      .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__wasmsimd,
      .row_tile = 1,
    };
    xnn_params.x32.pad = (struct pad_parameters) {
      .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__wasmsimd,
      .row_tile = 1,
    };
    xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__wasmsimd;
    xnn_params.x32.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__wasmsimd,
      .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__wasmsimd,
      .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__wasmsimd,
      .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__wasmsimd,
    };
    #ifndef XNN_NO_NCHW_OPERATORS
      // Scalar kernel even in the SIMD build — presumably no WASMSIMD
      // depth-to-space variant exists yet; TODO confirm.
      xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
        .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
        .channel_tile = 1,
        .pixel_tile = 1,
      };
    #endif  // XNN_NO_NCHW_OPERATORS
  #endif  // XNN_NO_X32_OPERATORS
2619
#elif XNN_ARCH_WASM
  // Plain WebAssembly (no SIMD) build: scalar and wasm-lowered kernels only.

  /**************************** XX micro-kernels ****************************/
  #ifndef XNN_NO_XX_OPERATORS
    init_flags |= XNN_INIT_FLAG_XX;

    // Raw byte copy delegates to memcpy.
    xnn_params.xx.copy = (xnn_univector_ukernel_function) xnn_xx_copy_ukernel__memcpy;
  #endif
2628
  /**************************** QU8 micro-kernels ****************************/
  // Quantized (unsigned 8-bit) kernels — all scalar on plain WASM.
  #ifndef XNN_NO_QU8_OPERATORS
    init_flags |= XNN_INIT_FLAG_QU8;

    xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_qu8_gemm_minmax_ukernel_2x2__scalar);
    xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_qu8_igemm_minmax_ukernel_2x2__scalar);
    xnn_params.qu8.gemm.mr = 2;
    xnn_params.qu8.gemm.nr = 2;

    // Only the 9-tap (3x3) depthwise kernel is provided for QU8.
    xnn_params.qu8.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_qu8_dwconv_minmax_ukernel_up1x9__scalar;
    xnn_params.qu8.dwconv[0].channel_tile = 1;
    xnn_params.qu8.dwconv[0].primary_tile = 9;

    xnn_params.qu8.avgpool = (struct avgpool_parameters) {
      .up = (xnn_avgpool_unipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9x__scalar_c1,
      .mp = (xnn_avgpool_multipass_ukernel_function) xnn_qu8_avgpool_minmax_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.qu8.gavgpool = (struct gavgpool_parameters) {
      .up = (xnn_gavgpool_unipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7x__scalar_c1,
      .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_qu8_gavgpool_minmax_ukernel_7p7x__scalar_c1,
      .mr = 7,
    };
    xnn_params.qu8.vadd = (xnn_vadd_ukernel_function) xnn_qu8_vadd_minmax_ukernel__scalar;
  #endif  // XNN_NO_QU8_OPERATORS
2655
  /**************************** U8 micro-kernels ****************************/
  // Unsigned 8-bit utility kernels (maxpool, clamp, LUT normalization, row max).
  #ifndef XNN_NO_U8_OPERATORS
    init_flags |= XNN_INIT_FLAG_U8;

    xnn_params.u8.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_u8_maxpool_minmax_ukernel_9p8x__scalar_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.u8.clamp = (xnn_univector_ukernel_function) xnn_u8_clamp_ukernel__scalar_x4;
    xnn_params.u8.lut32norm = xnn_u8_lut32norm_ukernel__scalar;
    xnn_params.u8.rmax = xnn_u8_rmax_ukernel__scalar;
  #endif  // XNN_NO_U8_OPERATORS
2669
  /**************************** X8 micro-kernels ****************************/
  // Type-agnostic 8-bit kernels (lookup table, channel zip).
  #ifndef XNN_NO_X8_OPERATORS
    init_flags |= XNN_INIT_FLAG_X8;

    xnn_params.x8.lut = xnn_x8_lut_ukernel__scalar;
    xnn_params.x8.zip = (struct zip_parameters) {
      .x2 = (xnn_zipc_ukernel_function) xnn_x8_zip_x2_ukernel__scalar,
      .x3 = (xnn_zipc_ukernel_function) xnn_x8_zip_x3_ukernel__scalar,
      .x4 = (xnn_zipc_ukernel_function) xnn_x8_zip_x4_ukernel__scalar,
      .xm = (xnn_zipv_ukernel_function) xnn_x8_zip_xm_ukernel__scalar,
    };
  #endif  // XNN_NO_X8_OPERATORS
2682
  /**************************** F32 micro-kernels ****************************/
  #ifndef XNN_NO_F32_OPERATORS
    init_flags |= XNN_INIT_FLAG_F32;

    // GEMM selection on plain WASM: a smaller 2x4 tile on the x86 profile,
    // 4x4 on the ARM profile. The 1xN row kernels are shared (__wasm variants).
    if (is_wasm_x86) {
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_2x4__scalar);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_2x4__scalar);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_2x4__scalar);
      xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_2x4__scalar);
      xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar);
      xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar);
      xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
      xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
      xnn_params.f32.gemm.mr = 2;
      xnn_params.f32.gemm.nr = 4;
    } else {
      xnn_params.f32.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x4__wasm);
      xnn_params.f32.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x4__wasm);
      xnn_params.f32.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_4x4__wasm);
      xnn_params.f32.gemm.relu.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_4x4__wasm);
      xnn_params.f32.gemm.relu.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.relu.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_relu_ukernel_1x4__wasm);
      xnn_params.f32.gemm.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm);
      xnn_params.f32.gemm.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm);
      xnn_params.f32.gemm.linear.gemm1 = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm);
      xnn_params.f32.gemm.linear.igemm1 = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm);
      xnn_params.f32.gemm.mr = 4;
      xnn_params.f32.gemm.nr = 4;
    }
2718 xnn_params.f32.gemm2.minmax.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_minmax_ukernel_4x2__wasm);
2719 xnn_params.f32.gemm2.minmax.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_minmax_ukernel_4x2__wasm),
2720 xnn_params.f32.gemm2.linear.gemm = xnn_init_hmp_gemm_ukernel((xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x2__wasm);
2721 xnn_params.f32.gemm2.linear.igemm = xnn_init_hmp_igemm_ukernel((xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm),
2722 xnn_params.f32.gemm2.mr = 4;
2723 xnn_params.f32.gemm2.nr = 2;
2724
    // Depthwise convolution, scalar/wasm kernels: [0]=4-tap, [1]=9-tap, [2]=25-tap,
    // all with channel_tile 1 and two accumulators (acc2) to break dependency chains.
    xnn_params.f32.dwconv[0].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x4__wasm_acc2;
    xnn_params.f32.dwconv[0].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2;
    xnn_params.f32.dwconv[0].channel_tile = 1;
    xnn_params.f32.dwconv[0].primary_tile = 4;

    xnn_params.f32.dwconv[1].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x9__wasm_acc2;
    xnn_params.f32.dwconv[1].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2;
    xnn_params.f32.dwconv[1].channel_tile = 1;
    xnn_params.f32.dwconv[1].primary_tile = 9;

    xnn_params.f32.dwconv[2].minmax.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_minmax_ukernel_up1x25__wasm_acc2;
    xnn_params.f32.dwconv[2].linear.unipass = (xnn_dwconv_unipass_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2;
    xnn_params.f32.dwconv[2].channel_tile = 1;
    xnn_params.f32.dwconv[2].primary_tile = 25;

    // Pooling, scalar/wasm: unipass handles windows up to mr; multipass consumes
    // qr additional elements per pass for larger windows.
    xnn_params.f32.avgpool = (struct avgpool_parameters) {
      .up = (xnn_avgpool_unipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9x__wasm_c1,
      .mp = (xnn_avgpool_multipass_ukernel_function) xnn_f32_avgpool_minmax_ukernel_9p8x__wasm_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
      .up = (xnn_pavgpool_unipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9x__wasm_c1,
      .mp = (xnn_pavgpool_multipass_ukernel_function) xnn_f32_pavgpool_minmax_ukernel_9p8x__wasm_c1,
      .mr = 9,
      .qr = 8,
    };
    xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
      .up = (xnn_gavgpool_unipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7x__wasm_c1,
      .mp = (xnn_gavgpool_multipass_ukernel_function) xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
      .mr = 7,
    };
    xnn_params.f32.maxpool = (struct maxpool_parameters) {
      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_minmax_ukernel_9p8x__wasm_c1,
      .mr = 9,
      .qr = 8,
    };
2762 xnn_params.f32.argmaxpool[0] = (struct argmaxpool_parameters) {
2763 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_4x__scalar_c1,
2764 .mr = 4,
2765 };
2766 xnn_params.f32.argmaxpool[1] = (struct argmaxpool_parameters) {
2767 .up = (xnn_argmaxpool_unipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9x__scalar_c1,
2768 .mr = 9,
2769 };
2770 xnn_params.f32.argmaxpool[2] = (struct argmaxpool_parameters) {
2771 .mp = (xnn_argmaxpool_multipass_ukernel_function) xnn_f32_argmaxpool_ukernel_9p8x__scalar_c1,
2772 .mr = 9,
2773 .qr = 8,
2774 };
2775 xnn_params.f32.ibilinear = (struct ibilinear_parameters) {
2776 .ukernel = (xnn_ibilinear_ukernel_function) xnn_f32_ibilinear_ukernel__scalar_c2,
2777 .pixel_tile = 1,
2778 .channel_tile = 2,
2779 };
2780 xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__scalar_x4;
2781 xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasm_x4;
2782 if (is_wasm_x86) {
2783 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__scalar_x4;
2784 } else {
2785 xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasm_x4;
2786 }
2787 if (is_wasm_x86) {
2788 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__scalar_rr2_lut16_p3_x2;
2789 } else {
2790 xnn_params.f32.elu = (xnn_univector_ukernel_function) xnn_f32_velu_ukernel__wasm_rr2_p6_x6;
2791 }
2792 xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__scalar_x4;
2793 xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__scalar_x4;
2794 if (is_wasm_x86) {
2795 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_relu_ukernel__scalar_x8;
2796 } else {
2797 xnn_params.f32.relu = (xnn_univector_ukernel_function) xnn_f32_relu_ukernel__wasm_x8;
2798 }
2799 xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__scalar_libm_x4;
2800 xnn_params.f32.rndz = (xnn_univector_ukernel_function) xnn_f32_vrndz_ukernel__scalar_libm_x4;
2801 xnn_params.f32.rndu = (xnn_univector_ukernel_function) xnn_f32_vrndu_ukernel__scalar_libm_x4;
2802 xnn_params.f32.rndd = (xnn_univector_ukernel_function) xnn_f32_vrndd_ukernel__scalar_libm_x4;
2803 xnn_params.f32.sigmoid = (xnn_univector_ukernel_function) xnn_f32_sigmoid_ukernel__scalar_lut64_p2_div_x2;
2804 xnn_params.f32.sqr = (xnn_univector_ukernel_function) xnn_f32_vsqr_ukernel__scalar_x4;
2805 xnn_params.f32.sqrt = (xnn_univector_ukernel_function) xnn_f32_vsqrt_ukernel__scalar_sqrt_x1;
2806 if (is_wasm_x86) {
2807 xnn_params.f32.prelu = (struct prelu_parameters) {
2808 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
2809 .row_tile = 2,
2810 .channel_tile = 4,
2811 };
2812 } else {
2813 xnn_params.f32.prelu = (struct prelu_parameters) {
2814 .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
2815 .row_tile = 2,
2816 .channel_tile = 4,
2817 };
2818 }
2819 xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
2820 xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
2821 xnn_params.f32.vadd = (struct vbinary_parameters) {
2822 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
2823 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
2824 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
2825 .element_tile = 8,
2826 };
2827 xnn_params.f32.vdiv = (struct vbinary_parameters) {
2828 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasm_x8,
2829 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_minmax_ukernel__wasm_x8,
2830 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_minmax_ukernel__wasm_x8,
2831 .element_tile = 8,
2832 };
2833 xnn_params.f32.vmax = (struct vbinary_parameters) {
2834 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
2835 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
2836 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
2837 .element_tile = 8,
2838 };
2839 xnn_params.f32.vmin = (struct vbinary_parameters) {
2840 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
2841 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
2842 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
2843 .element_tile = 8,
2844 };
2845 xnn_params.f32.vmul = (struct vbinary_parameters) {
2846 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
2847 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
2848 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
2849 .element_tile = 8,
2850 };
2851 xnn_params.f32.vsub = (struct vbinary_parameters) {
2852 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
2853 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
2854 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
2855 .element_tile = 8,
2856 };
2857 xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
2858 .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
2859 .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
2860 .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
2861 .element_tile = 8,
2862 };
2863 xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
2864 .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_minmax_ukernel_c1__wasm_2x,
2865 .channel_tile = 1,
2866 .row_tile = 2,
2867 };
2868 #ifndef XNN_NO_NCHW_OPERATORS
2869 init_flags |= XNN_INIT_FLAG_CHW_OPT;
2870
2871 xnn_params.f32.spmm = (struct spmm_parameters) {
2872 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x1__scalar,
2873 .mr = 8,
2874 .nr = 1,
2875 };
2876 xnn_params.f32.spmm2 = (struct spmm_parameters) {
2877 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x2__scalar,
2878 .mr = 8,
2879 .nr = 2,
2880 };
2881 xnn_params.f32.spmm4 = (struct spmm_parameters) {
2882 .ukernel = (xnn_spmm_ukernel_function) xnn_f32_spmm_minmax_ukernel_8x4__scalar,
2883 .mr = 8,
2884 .nr = 4,
2885 };
2886 xnn_params.f32.conv_hwc2chw_3x3c3s2 = (struct conv_hwc2chw_parameters) {
2887 .ukernel_with_symm_padding =
2888 (xnn_conv_hwc2chw_ukernel_function) xnn_f32_conv_hwc2chw_ukernel_3x3s2p1c3x4__scalar_1x1,
2889 .output_channel_tile = 4,
2890 .output_height_tile = 1,
2891 .output_width_tile = 1,
2892 };
2893 xnn_params.f32.dwconv2d_chw_3x3 = (struct dwconv2d_chw_parameters) {
2894 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_2x1_acc2,
2895 .output_width_tile = 1,
2896 .output_height_tile = 2,
2897 };
2898 xnn_params.f32.dwconv2d_chw_3x3s2 = (struct dwconv2d_chw_parameters) {
2899 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc2,
2900 .output_width_tile = 1,
2901 .output_height_tile = 1,
2902 };
2903 xnn_params.f32.dwconv2d_chw_5x5 = (struct dwconv2d_chw_parameters) {
2904 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5,
2905 .output_width_tile = 1,
2906 .output_height_tile = 1,
2907 };
2908 xnn_params.f32.dwconv2d_chw_5x5s2 = (struct dwconv2d_chw_parameters) {
2909 .ukernel = (xnn_dwconv2d_chw_ukernel_function) xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5,
2910 .output_width_tile = 1,
2911 .output_height_tile = 1,
2912 };
2913 xnn_params.f32.gavgpool_cw = (struct gavgpool_cw_parameters) {
2914 .ukernel = (xnn_gavgpool_cw_ukernel_function) xnn_f32_gavgpool_cw_ukernel__scalar_x1,
2915 .channel_tile = 1,
2916 };
2917 xnn_params.f32.ibilinear_chw = (struct ibilinear_chw_parameters) {
2918 .ukernel = (xnn_ibilinear_chw_ukernel_function) xnn_f32_ibilinear_chw_ukernel__scalar_p4,
2919 .channel_tile = 1,
2920 .pixel_tile = 4,
2921 };
2922 #endif // XNN_NO_NCHW_OPERATORS
2923 #endif // XNN_NO_F32_OPERATORS
2924
2925 /**************************** X32 micro-kernels ****************************/
2926 #ifndef XNN_NO_X32_OPERATORS
2927 init_flags |= XNN_INIT_FLAG_X32;
2928
2929 xnn_params.x32.fill = (struct fill_parameters) {
2930 .ukernel = (xnn_fill_ukernel_function) xnn_x32_fill_ukernel__scalar_float,
2931 .row_tile = 1,
2932 };
2933 xnn_params.x32.pad = (struct pad_parameters) {
2934 .ukernel = (xnn_pad_ukernel_function) xnn_x32_pad_ukernel__scalar_float,
2935 .row_tile = 1,
2936 };
2937 xnn_params.x32.unpool = (xnn_unpool_ukernel_function) xnn_x32_unpool_ukernel__scalar;
2938 xnn_params.x32.zip = (struct zip_parameters) {
2939 .x2 = (xnn_zipc_ukernel_function) xnn_x32_zip_x2_ukernel__scalar,
2940 .x3 = (xnn_zipc_ukernel_function) xnn_x32_zip_x3_ukernel__scalar,
2941 .x4 = (xnn_zipc_ukernel_function) xnn_x32_zip_x4_ukernel__scalar,
2942 .xm = (xnn_zipv_ukernel_function) xnn_x32_zip_xm_ukernel__scalar,
2943 };
2944 #ifndef XNN_NO_NCHW_OPERATORS
2945 xnn_params.x32.depthtospace2d_chw2hwc = (struct depthtospace2d_chw2hwc_parameters) {
2946 .ukernel = (xnn_depthtospace2d_chw2hwc_ukernel_function) xnn_x32_depthtospace2d_chw2hwc_ukernel__scalar,
2947 .channel_tile = 1,
2948 .pixel_tile = 1,
2949 };
2950 #endif // XNN_NO_NCHW_OPERATORS
2951 #endif // XNN_NO_X32_OPERATORS
2952
2953 #else
2954 #error "Unsupported architecture"
2955 #endif
2956 xnn_params.init_flags = init_flags;
2957 }
2958
2959 #ifdef _WIN32
init_windows(PINIT_ONCE init_once,PVOID parameter,PVOID * context)2960 static BOOL CALLBACK init_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
2961 init();
2962 return TRUE;
2963 }
2964 #endif
2965
xnn_initialize(const struct xnn_allocator * allocator)2966 enum xnn_status xnn_initialize(const struct xnn_allocator* allocator) {
2967 #ifndef __EMSCRIPTEN__
2968 if (!cpuinfo_initialize()) {
2969 return xnn_status_out_of_memory;
2970 }
2971 #endif
2972 #ifdef _WIN32
2973 InitOnceExecuteOnce(&init_guard, &init_windows, NULL, NULL);
2974 #else
2975 pthread_once(&init_guard, &init);
2976 #endif
2977 if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) != 0) {
2978 if (allocator != NULL) {
2979 memcpy(&xnn_params.allocator, allocator, sizeof(struct xnn_allocator));
2980 } else {
2981 xnn_params.allocator.allocate = &xnn_allocate;
2982 xnn_params.allocator.reallocate = &xnn_reallocate;
2983 xnn_params.allocator.deallocate = &xnn_deallocate;
2984 xnn_params.allocator.aligned_allocate = &xnn_aligned_allocate;
2985 xnn_params.allocator.aligned_deallocate = &xnn_aligned_deallocate;
2986 }
2987 return xnn_status_success;
2988 } else {
2989 return xnn_status_unsupported_hardware;
2990 }
2991 }
2992
// Deinitializes the XNNPACK library: releases the cpuinfo state acquired by
// the cpuinfo_initialize() call in xnn_initialize. Note that the microkernel
// tables set up by init() are intentionally left in place (one-time init
// cannot be undone), so this always reports success.
enum xnn_status xnn_deinitialize(void) {
#ifndef __EMSCRIPTEN__
  // Emscripten builds never initialize cpuinfo, so there is nothing to release.
  cpuinfo_deinitialize();
#endif
  return xnn_status_success;
}
2999