• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cmath>
8 #include <functional>
9 #include <random>
10 #include <vector>
11 
12 #include <xnnpack.h>
13 
14 #include <benchmark/benchmark.h>
15 
16 #include "bench/end2end.h"
17 #include "bench/utils.h"
18 #include "models/models.h"
19 #include <xnnpack/gemm.h>
20 #include <xnnpack/igemm.h>
21 #include <xnnpack/params.h>
22 #include <xnnpack/params-init.h>
23 
24 
GEMMEnd2EndBenchmark(benchmark::State & state,models::ExecutionPlanFactory model_factory,xnn_qu8_gemm_minmax_ukernel_function gemm,xnn_qu8_igemm_minmax_ukernel_function igemm,xnn_qu8_gemm_minmax_ukernel_function gemm1,xnn_qu8_igemm_minmax_ukernel_function igemm1,xnn_init_qu8_conv_minmax_params_fn init_params,uint8_t mr,uint8_t nr,uint8_t log2_kr=0,uint8_t log2_sr=0,benchmark::utils::IsaCheckFunction isa_check=nullptr)25 static void GEMMEnd2EndBenchmark(
26   benchmark::State& state,
27   models::ExecutionPlanFactory model_factory,
28   xnn_qu8_gemm_minmax_ukernel_function gemm,
29   xnn_qu8_igemm_minmax_ukernel_function igemm,
30   xnn_qu8_gemm_minmax_ukernel_function gemm1,
31   xnn_qu8_igemm_minmax_ukernel_function igemm1,
32   xnn_init_qu8_conv_minmax_params_fn init_params,
33   uint8_t mr, uint8_t nr, uint8_t log2_kr = 0, uint8_t log2_sr = 0,
34   benchmark::utils::IsaCheckFunction isa_check = nullptr)
35 {
36   if (isa_check && !isa_check(state)) {
37     return;
38   }
39   if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
40     state.SkipWithError("failed to initialize XNNPACK");
41     return;
42   }
43 
44   // Override microkernels chosen in xnn_initialize
45   // Note: do not directly assign to xnn_params.qu8.gemm because it breaks older gcc.
46   xnn_params.qu8.gemm.minmax.gemm = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm));
47   xnn_params.qu8.gemm.minmax.igemm = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm));
48   xnn_params.qu8.gemm.minmax.gemm1 = xnn_init_hmp_gemm_ukernel(xnn_gemm_ukernel_function(gemm1));
49   xnn_params.qu8.gemm.minmax.igemm1 = xnn_init_hmp_igemm_ukernel(xnn_igemm_ukernel_function(igemm1));
50   xnn_params.qu8.gemm.init.qu8 = init_params;
51   xnn_params.qu8.gemm.mr = mr;
52   xnn_params.qu8.gemm.nr = nr;
53   xnn_params.qu8.gemm.log2_kr = log2_kr;
54   xnn_params.qu8.gemm.log2_sr = log2_sr;
55 
56   auto execution_plan = model_factory(nullptr);
57   if (execution_plan.empty()) {
58     state.SkipWithError("failed to create a model");
59     return;
60   }
61 
62   for (auto _ : state) {
63     for (const std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)>& op : execution_plan) {
64       xnn_status status = xnn_run_operator(op.get(), nullptr);
65       if (status != xnn_status_success) {
66         state.SkipWithError("failed to run a model");
67         return;
68       }
69     }
70   }
71 
72   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
73   if (cpu_frequency != 0) {
74     state.counters["cpufreq"] = cpu_frequency;
75   }
76 }
77 
78 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)79   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
80     GEMMEnd2EndBenchmark(state, model,
81       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
82       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
83       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
84       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
85       xnn_init_qu8_conv_minmax_rndnu_neon_params,
86       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
87       benchmark::utils::CheckNEON);
88   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)89   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
90     GEMMEnd2EndBenchmark(state, model,
91       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
92       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
93       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
94       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
95       xnn_init_qu8_conv_minmax_rndnu_neon_params,
96       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
97       benchmark::utils::CheckNEON);
98   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)99   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
100     GEMMEnd2EndBenchmark(state, model,
101       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
102       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
103       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
104       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
105       xnn_init_qu8_conv_minmax_rndnu_neon_params,
106       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
107       benchmark::utils::CheckNEON);
108   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,models::ExecutionPlanFactory model)109   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, models::ExecutionPlanFactory model) {
110     GEMMEnd2EndBenchmark(state, model,
111       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
112       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
113       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
114       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
115       xnn_init_qu8_conv_minmax_rndnu_neon_params,
116       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
117       benchmark::utils::CheckNEON);
118   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)119   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
120     GEMMEnd2EndBenchmark(state, model,
121       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
122       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
123       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
124       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
125       xnn_init_qu8_conv_minmax_rndnu_neon_params,
126       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
127       benchmark::utils::CheckNEON);
128   }
qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)129   static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
130     GEMMEnd2EndBenchmark(state, model,
131       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
132       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
133       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
134       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
135       xnn_init_qu8_conv_minmax_rndnu_neon_params,
136       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
137       benchmark::utils::CheckNEON);
138   }
139   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)140   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
141   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
142   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
143   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
144   BENCHMARK_QU8_END2END(qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
145 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
146 
147 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
148   static void qu8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
149     GEMMEnd2EndBenchmark(state, model,
150       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
151       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
152       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
153       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
154       xnn_init_qu8_conv_minmax_rndnu_neon_params,
155       4 /* mr */, 16  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
156       benchmark::utils::CheckNEONDOT);
157   }
qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)158   static void qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
159     GEMMEnd2EndBenchmark(state, model,
160       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
161       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
162       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
163       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
164       xnn_init_qu8_conv_minmax_rndnu_neon_params,
165       4 /* mr */, 16  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
166       benchmark::utils::CheckNEONDOT);
167   }
qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State & state,models::ExecutionPlanFactory model)168   static void qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
169     GEMMEnd2EndBenchmark(state, model,
170       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
171       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
172       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
173       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
174       xnn_init_qu8_conv_minmax_rndnu_neon_params,
175       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
176       benchmark::utils::CheckNEONDOT);
177   }
qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State & state,models::ExecutionPlanFactory model)178   static void qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State& state, models::ExecutionPlanFactory model) {
179     GEMMEnd2EndBenchmark(state, model,
180       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
181       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
182       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
183       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
184       xnn_init_qu8_conv_minmax_rndnu_neon_params,
185       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
186       benchmark::utils::CheckNEONDOT);
187   }
188 
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)189   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
190     GEMMEnd2EndBenchmark(state, model,
191       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
192       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
193       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
194       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
195       xnn_init_qu8_conv_minmax_rndnu_neon_params,
196       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
197       benchmark::utils::CheckNEON);
198   }
199 
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State & state,models::ExecutionPlanFactory model)200   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, models::ExecutionPlanFactory model) {
201     GEMMEnd2EndBenchmark(state, model,
202       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
203       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
204       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
205       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
206       xnn_init_qu8_conv_minmax_rndnu_neon_params,
207       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
208       benchmark::utils::CheckNEON);
209   }
210 
qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)211   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
212     GEMMEnd2EndBenchmark(state, model,
213       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
214       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
215       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
216       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
217       xnn_init_qu8_conv_minmax_rndnu_neon_params,
218       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
219       benchmark::utils::CheckNEON);
220   }
221 
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,models::ExecutionPlanFactory model)222   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
223     GEMMEnd2EndBenchmark(state, model,
224       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
225       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
226       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
227       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
228       xnn_init_qu8_conv_minmax_rndnu_neon_params,
229       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
230       benchmark::utils::CheckNEON);
231   }
qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,models::ExecutionPlanFactory model)232   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
233     GEMMEnd2EndBenchmark(state, model,
234       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
235       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
236       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
237       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
238       xnn_init_qu8_conv_minmax_rndnu_neon_params,
239       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
240       benchmark::utils::CheckNEON);
241   }
242 
qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,models::ExecutionPlanFactory model)243   static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
244     GEMMEnd2EndBenchmark(state, model,
245       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
246       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
247       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
248       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
249       xnn_init_qu8_conv_minmax_rndnu_neon_params,
250       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
251       benchmark::utils::CheckNEON);
252   }
253   BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__aarch64_neondot_cortex_a55);
254   BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55);
255   BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__aarch64_neondot_ld128);
256   BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__aarch64_neondot_ld128);
257   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75);
258   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75);
259   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53);
260   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53);
261   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64);
262   BENCHMARK_QU8_END2END(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64);
263 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
264 
265 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
qu8_gemm_2x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)266   static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
267     GEMMEnd2EndBenchmark(state, model,
268       xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
269       xnn_qu8_igemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
270       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
271       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
272       xnn_init_qu8_conv_minmax_rndnu_neon_params,
273       2 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
274       benchmark::utils::CheckNEON);
275   }
276 
qu8_gemm_3x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)277   static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
278     GEMMEnd2EndBenchmark(state, model,
279       xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
280       xnn_qu8_igemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
281       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
282       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
283       xnn_init_qu8_conv_minmax_rndnu_neon_params,
284       3 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
285       benchmark::utils::CheckNEON);
286   }
287 
qu8_gemm_4x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)288   static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
289     GEMMEnd2EndBenchmark(state, model,
290       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
291       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
292       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
293       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
294       xnn_init_qu8_conv_minmax_rndnu_neon_params,
295       4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
296       benchmark::utils::CheckNEON);
297   }
298 
qu8_gemm_6x8__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)299   static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
300     GEMMEnd2EndBenchmark(state, model,
301       xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
302       xnn_qu8_igemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
303       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
304       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
305       xnn_init_qu8_conv_minmax_rndnu_neon_params,
306       6 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
307       benchmark::utils::CheckNEON);
308   }
309 
qu8_gemm_2x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)310   static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
311     GEMMEnd2EndBenchmark(state, model,
312       xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
313       xnn_qu8_igemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
314       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
315       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
316       xnn_init_qu8_conv_minmax_rndnu_neon_params,
317       2 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
318       benchmark::utils::CheckNEON);
319   }
320 
qu8_gemm_3x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)321   static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
322     GEMMEnd2EndBenchmark(state, model,
323       xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
324       xnn_qu8_igemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
325       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
326       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
327       xnn_init_qu8_conv_minmax_rndnu_neon_params,
328       3 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
329       benchmark::utils::CheckNEON);
330   }
331 
qu8_gemm_4x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)332   static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
333     GEMMEnd2EndBenchmark(state, model,
334       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
335       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
336       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
337       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
338       xnn_init_qu8_conv_minmax_rndnu_neon_params,
339       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
340       benchmark::utils::CheckNEON);
341   }
342 
qu8_gemm_6x16__neon_mlal_lane(benchmark::State & state,models::ExecutionPlanFactory model)343   static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, models::ExecutionPlanFactory model) {
344     GEMMEnd2EndBenchmark(state, model,
345       xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
346       xnn_qu8_igemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
347       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
348       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
349       xnn_init_qu8_conv_minmax_rndnu_neon_params,
350       6 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
351       benchmark::utils::CheckNEON);
352   }
353 
qu8_gemm_1x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)354   static void qu8_gemm_1x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
355     GEMMEnd2EndBenchmark(state, model,
356       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
357       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
358       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
359       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
360       xnn_init_qu8_conv_minmax_rndnu_neon_params,
361       1 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
362       benchmark::utils::CheckNEONDOT);
363   }
qu8_gemm_2x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)364   static void qu8_gemm_2x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
365     GEMMEnd2EndBenchmark(state, model,
366       xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
367       xnn_qu8_igemm_minmax_rndnu_ukernel_2x8c4__neondot,
368       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
369       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
370       xnn_init_qu8_conv_minmax_rndnu_neon_params,
371       2 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
372       benchmark::utils::CheckNEONDOT);
373   }
qu8_gemm_3x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)374   static void qu8_gemm_3x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
375     GEMMEnd2EndBenchmark(state, model,
376       xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
377       xnn_qu8_igemm_minmax_rndnu_ukernel_3x8c4__neondot,
378       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
379       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
380       xnn_init_qu8_conv_minmax_rndnu_neon_params,
381       3 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
382       benchmark::utils::CheckNEONDOT);
383   }
qu8_gemm_4x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)384   static void qu8_gemm_4x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
385     GEMMEnd2EndBenchmark(state, model,
386       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
387       xnn_qu8_igemm_minmax_rndnu_ukernel_4x8c4__neondot,
388       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
389       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
390       xnn_init_qu8_conv_minmax_rndnu_neon_params,
391       4 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
392       benchmark::utils::CheckNEONDOT);
393   }
qu8_gemm_5x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)394   static void qu8_gemm_5x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
395     GEMMEnd2EndBenchmark(state, model,
396       xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
397       xnn_qu8_igemm_minmax_rndnu_ukernel_5x8c4__neondot,
398       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
399       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
400       xnn_init_qu8_conv_minmax_rndnu_neon_params,
401       5 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
402       benchmark::utils::CheckNEONDOT);
403   }
qu8_gemm_6x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)404   static void qu8_gemm_6x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
405     GEMMEnd2EndBenchmark(state, model,
406       xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
407       xnn_qu8_igemm_minmax_rndnu_ukernel_6x8c4__neondot,
408       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
409       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
410       xnn_init_qu8_conv_minmax_rndnu_neon_params,
411       6 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
412       benchmark::utils::CheckNEONDOT);
413   }
qu8_gemm_8x8c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)414   static void qu8_gemm_8x8c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
415     GEMMEnd2EndBenchmark(state, model,
416       xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
417       xnn_qu8_igemm_minmax_rndnu_ukernel_8x8c4__neondot,
418       xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
419       xnn_qu8_igemm_minmax_rndnu_ukernel_1x8c4__neondot,
420       xnn_init_qu8_conv_minmax_rndnu_neon_params,
421       8 /* mr */, 8  /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
422       benchmark::utils::CheckNEONDOT);
423   }
qu8_gemm_1x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)424   static void qu8_gemm_1x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
425     GEMMEnd2EndBenchmark(state, model,
426       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
427       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
428       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
429       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
430       xnn_init_qu8_conv_minmax_rndnu_neon_params,
431       1 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
432       benchmark::utils::CheckNEONDOT);
433   }
qu8_gemm_2x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)434   static void qu8_gemm_2x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
435     GEMMEnd2EndBenchmark(state, model,
436       xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
437       xnn_qu8_igemm_minmax_rndnu_ukernel_2x16c4__neondot,
438       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
439       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
440       xnn_init_qu8_conv_minmax_rndnu_neon_params,
441       2 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
442       benchmark::utils::CheckNEONDOT);
443   }
qu8_gemm_3x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)444   static void qu8_gemm_3x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
445     GEMMEnd2EndBenchmark(state, model,
446       xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
447       xnn_qu8_igemm_minmax_rndnu_ukernel_3x16c4__neondot,
448       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
449       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
450       xnn_init_qu8_conv_minmax_rndnu_neon_params,
451       3 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
452       benchmark::utils::CheckNEONDOT);
453   }
qu8_gemm_4x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)454   static void qu8_gemm_4x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
455     GEMMEnd2EndBenchmark(state, model,
456       xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
457       xnn_qu8_igemm_minmax_rndnu_ukernel_4x16c4__neondot,
458       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
459       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
460       xnn_init_qu8_conv_minmax_rndnu_neon_params,
461       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
462       benchmark::utils::CheckNEONDOT);
463   }
qu8_gemm_5x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)464   static void qu8_gemm_5x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
465     GEMMEnd2EndBenchmark(state, model,
466       xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
467       xnn_qu8_igemm_minmax_rndnu_ukernel_5x16c4__neondot,
468       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
469       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
470       xnn_init_qu8_conv_minmax_rndnu_neon_params,
471       5 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
472       benchmark::utils::CheckNEONDOT);
473   }
qu8_gemm_6x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)474   static void qu8_gemm_6x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
475     GEMMEnd2EndBenchmark(state, model,
476       xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
477       xnn_qu8_igemm_minmax_rndnu_ukernel_6x16c4__neondot,
478       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
479       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
480       xnn_init_qu8_conv_minmax_rndnu_neon_params,
481       6 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
482       benchmark::utils::CheckNEONDOT);
483   }
qu8_gemm_8x16c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)484   static void qu8_gemm_8x16c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
485     GEMMEnd2EndBenchmark(state, model,
486       xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
487       xnn_qu8_igemm_minmax_rndnu_ukernel_8x16c4__neondot,
488       xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
489       xnn_qu8_igemm_minmax_rndnu_ukernel_1x16c4__neondot,
490       xnn_init_qu8_conv_minmax_rndnu_neon_params,
491       8 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
492       benchmark::utils::CheckNEONDOT);
493   }
qu8_gemm_2x32c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)494   static void qu8_gemm_2x32c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
495     GEMMEnd2EndBenchmark(state, model,
496       xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot,
497       xnn_qu8_igemm_minmax_rndnu_ukernel_2x32c4__neondot,
498       xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
499       xnn_qu8_igemm_minmax_rndnu_ukernel_1x32c4__neondot,
500       xnn_init_qu8_conv_minmax_rndnu_neon_params,
501       2 /* mr */, 32 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
502       benchmark::utils::CheckNEONDOT);
503   }
qu8_gemm_3x32c4__neondot(benchmark::State & state,models::ExecutionPlanFactory model)504   static void qu8_gemm_3x32c4__neondot(benchmark::State& state, models::ExecutionPlanFactory model) {
505     GEMMEnd2EndBenchmark(state, model,
506       xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
507       xnn_qu8_igemm_minmax_rndnu_ukernel_3x32c4__neondot,
508       xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
509       xnn_qu8_igemm_minmax_rndnu_ukernel_1x32c4__neondot,
510       xnn_init_qu8_conv_minmax_rndnu_neon_params,
511       3 /* mr */, 32 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
512       benchmark::utils::CheckNEONDOT);
513   }
// QU8 end-to-end benchmark instantiations for the `neondot` wrappers.
// NOTE(review): BENCHMARK_QU8_END2END is defined outside this part of the file; confirm its expansion there.
514   BENCHMARK_QU8_END2END(qu8_gemm_1x8c4__neondot);
515   BENCHMARK_QU8_END2END(qu8_gemm_2x8c4__neondot);
516   BENCHMARK_QU8_END2END(qu8_gemm_3x8c4__neondot);
517   BENCHMARK_QU8_END2END(qu8_gemm_4x8c4__neondot);
518   BENCHMARK_QU8_END2END(qu8_gemm_5x8c4__neondot);
519   BENCHMARK_QU8_END2END(qu8_gemm_6x8c4__neondot);
520   BENCHMARK_QU8_END2END(qu8_gemm_8x8c4__neondot);
521   BENCHMARK_QU8_END2END(qu8_gemm_1x16c4__neondot);
522   BENCHMARK_QU8_END2END(qu8_gemm_2x16c4__neondot);
523   BENCHMARK_QU8_END2END(qu8_gemm_3x16c4__neondot);
524   BENCHMARK_QU8_END2END(qu8_gemm_4x16c4__neondot);
525   BENCHMARK_QU8_END2END(qu8_gemm_5x16c4__neondot);
526   BENCHMARK_QU8_END2END(qu8_gemm_6x16c4__neondot);
527   BENCHMARK_QU8_END2END(qu8_gemm_8x16c4__neondot);
528   BENCHMARK_QU8_END2END(qu8_gemm_2x32c4__neondot);
529   BENCHMARK_QU8_END2END(qu8_gemm_3x32c4__neondot);
530 
// QU8 end-to-end benchmark instantiations for the `neon_mlal_lane` wrappers.
// NOTE(review): BENCHMARK_QU8_END2END is defined outside this part of the file; confirm its expansion there.
531   BENCHMARK_QU8_END2END(qu8_gemm_2x8__neon_mlal_lane);
532   BENCHMARK_QU8_END2END(qu8_gemm_3x8__neon_mlal_lane);
533   BENCHMARK_QU8_END2END(qu8_gemm_4x8__neon_mlal_lane);
534   BENCHMARK_QU8_END2END(qu8_gemm_6x8__neon_mlal_lane);
535   BENCHMARK_QU8_END2END(qu8_gemm_2x16__neon_mlal_lane);
536   BENCHMARK_QU8_END2END(qu8_gemm_3x16__neon_mlal_lane);
537   BENCHMARK_QU8_END2END(qu8_gemm_4x16__neon_mlal_lane);
538   BENCHMARK_QU8_END2END(qu8_gemm_6x16__neon_mlal_lane);
539 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
540 
541 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
qu8_gemm_2x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)542   static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
543     GEMMEnd2EndBenchmark(state, model,
544       xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
545       xnn_qu8_igemm_minmax_fp32_ukernel_2x16c8__avx512skx,
546       xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
547       xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
548       xnn_init_qu8_conv_minmax_fp32_avx512_params,
549       2 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
550       benchmark::utils::CheckAVX512F);
551   }
552 
qu8_gemm_3x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)553   static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
554     GEMMEnd2EndBenchmark(state, model,
555       xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
556       xnn_qu8_igemm_minmax_fp32_ukernel_3x16c8__avx512skx,
557       xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
558       xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
559       xnn_init_qu8_conv_minmax_fp32_avx512_params,
560       3 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
561       benchmark::utils::CheckAVX512F);
562   }
563 
qu8_gemm_4x16c8__avx512skx(benchmark::State & state,models::ExecutionPlanFactory model)564   static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, models::ExecutionPlanFactory model) {
565     GEMMEnd2EndBenchmark(state, model,
566       xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
567       xnn_qu8_igemm_minmax_fp32_ukernel_4x16c8__avx512skx,
568       xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
569       xnn_qu8_igemm_minmax_fp32_ukernel_1x16c8__avx512skx,
570       xnn_init_qu8_conv_minmax_fp32_avx512_params,
571       4 /* mr */, 16 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
572       benchmark::utils::CheckAVX512F);
573   }
574 
qu8_gemm_2x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)575   static void qu8_gemm_2x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
576     GEMMEnd2EndBenchmark(state, model,
577       xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
578       xnn_qu8_igemm_minmax_fp32_ukernel_2x8c8__avx2,
579       xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
580       xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
581       xnn_init_qu8_conv_minmax_fp32_avx2_params,
582       2 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
583       benchmark::utils::CheckAVX2);
584   }
qu8_gemm_3x8c8__avx2(benchmark::State & state,models::ExecutionPlanFactory model)585   static void qu8_gemm_3x8c8__avx2(benchmark::State& state, models::ExecutionPlanFactory model) {
586     GEMMEnd2EndBenchmark(state, model,
587       xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
588       xnn_qu8_igemm_minmax_fp32_ukernel_3x8c8__avx2,
589       xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
590       xnn_qu8_igemm_minmax_fp32_ukernel_1x8c8__avx2,
591       xnn_init_qu8_conv_minmax_fp32_avx2_params,
592       3 /* mr */, 8 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
593       benchmark::utils::CheckAVX2);
594   }
595 
qu8_gemm_2x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)596   static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
597     GEMMEnd2EndBenchmark(state, model,
598       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
599       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
600       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
601       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
602       xnn_init_qu8_conv_minmax_fp32_sse2_params,
603       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
604       benchmark::utils::CheckXOP);
605   }
qu8_gemm_2x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)606   static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
607     GEMMEnd2EndBenchmark(state, model,
608       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
609       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
610       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
611       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
612       xnn_init_qu8_conv_minmax_fp32_sse2_params,
613       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
614       benchmark::utils::CheckXOP);
615   }
qu8_gemm_3x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)616   static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
617     GEMMEnd2EndBenchmark(state, model,
618       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
619       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
620       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
621       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
622       xnn_init_qu8_conv_minmax_fp32_sse2_params,
623       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
624       benchmark::utils::CheckXOP);
625   }
qu8_gemm_3x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)626   static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
627     GEMMEnd2EndBenchmark(state, model,
628       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
629       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
630       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
631       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
632       xnn_init_qu8_conv_minmax_fp32_sse2_params,
633       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
634       benchmark::utils::CheckXOP);
635   }
qu8_gemm_4x4c2__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)636   static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
637     GEMMEnd2EndBenchmark(state, model,
638       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
639       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
640       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
641       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
642       xnn_init_qu8_conv_minmax_fp32_sse2_params,
643       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
644       benchmark::utils::CheckXOP);
645   }
qu8_gemm_4x4c2__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)646   static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
647     GEMMEnd2EndBenchmark(state, model,
648       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
649       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
650       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
651       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
652       xnn_init_qu8_conv_minmax_fp32_sse2_params,
653       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
654       benchmark::utils::CheckXOP);
655   }
656 
qu8_gemm_2x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)657   static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
658     GEMMEnd2EndBenchmark(state, model,
659       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
660       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
661       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
662       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
663       xnn_init_qu8_conv_minmax_fp32_sse2_params,
664       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
665       benchmark::utils::CheckXOP);
666   }
qu8_gemm_3x4c8__xop_ld64(benchmark::State & state,models::ExecutionPlanFactory model)667   static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
668     GEMMEnd2EndBenchmark(state, model,
669       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
670       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
671       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
672       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
673       xnn_init_qu8_conv_minmax_fp32_sse2_params,
674       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
675       benchmark::utils::CheckXOP);
676   }
677 
qu8_gemm_2x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)678   static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
679     GEMMEnd2EndBenchmark(state, model,
680       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
681       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
682       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
683       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
684       xnn_init_qu8_conv_minmax_fp32_sse2_params,
685       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
686       benchmark::utils::CheckXOP);
687   }
qu8_gemm_3x4c8__xop_ld128(benchmark::State & state,models::ExecutionPlanFactory model)688   static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
689     GEMMEnd2EndBenchmark(state, model,
690       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
691       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
692       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
693       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
694       xnn_init_qu8_conv_minmax_fp32_sse2_params,
695       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
696       benchmark::utils::CheckXOP);
697   }
698 
qu8_gemm_2x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)699   static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
700     GEMMEnd2EndBenchmark(state, model,
701       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
702       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
703       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
704       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
705       xnn_init_qu8_conv_minmax_fp32_sse2_params,
706       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
707       benchmark::utils::CheckAVX);
708   }
qu8_gemm_2x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)709   static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
710     GEMMEnd2EndBenchmark(state, model,
711       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
712       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
713       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
714       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
715       xnn_init_qu8_conv_minmax_fp32_sse2_params,
716       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
717       benchmark::utils::CheckAVX);
718   }
qu8_gemm_3x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)719   static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
720     GEMMEnd2EndBenchmark(state, model,
721       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
722       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
723       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
724       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
725       xnn_init_qu8_conv_minmax_fp32_sse2_params,
726       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
727       benchmark::utils::CheckAVX);
728   }
qu8_gemm_3x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)729   static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
730     GEMMEnd2EndBenchmark(state, model,
731       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
732       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
733       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
734       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
735       xnn_init_qu8_conv_minmax_fp32_sse2_params,
736       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
737       benchmark::utils::CheckAVX);
738   }
qu8_gemm_4x4c2__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)739   static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
740     GEMMEnd2EndBenchmark(state, model,
741       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
742       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
743       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
744       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
745       xnn_init_qu8_conv_minmax_fp32_sse2_params,
746       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
747       benchmark::utils::CheckAVX);
748   }
qu8_gemm_4x4c2__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)749   static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
750     GEMMEnd2EndBenchmark(state, model,
751       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
752       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
753       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
754       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
755       xnn_init_qu8_conv_minmax_fp32_sse2_params,
756       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
757       benchmark::utils::CheckAVX);
758   }
759 
760 
qu8_gemm_2x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)761   static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
762     GEMMEnd2EndBenchmark(state, model,
763       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
764       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
765       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
766       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
767       xnn_init_qu8_conv_minmax_fp32_sse2_params,
768       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
769       benchmark::utils::CheckAVX);
770   }
qu8_gemm_2x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)771   static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
772     GEMMEnd2EndBenchmark(state, model,
773       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
774       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
775       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
776       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
777       xnn_init_qu8_conv_minmax_fp32_sse2_params,
778       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
779       benchmark::utils::CheckAVX);
780   }
qu8_gemm_3x4c8__avx_ld64(benchmark::State & state,models::ExecutionPlanFactory model)781   static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
782     GEMMEnd2EndBenchmark(state, model,
783       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
784       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
785       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
786       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
787       xnn_init_qu8_conv_minmax_fp32_sse2_params,
788       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
789       benchmark::utils::CheckAVX);
790   }
qu8_gemm_3x4c8__avx_ld128(benchmark::State & state,models::ExecutionPlanFactory model)791   static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
792     GEMMEnd2EndBenchmark(state, model,
793       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
794       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
795       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
796       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
797       xnn_init_qu8_conv_minmax_fp32_sse2_params,
798       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
799       benchmark::utils::CheckAVX);
800   }
801 
qu8_gemm_2x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)802   static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
803     GEMMEnd2EndBenchmark(state, model,
804       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
805       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
806       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
807       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
808       xnn_init_qu8_conv_minmax_fp32_sse2_params,
809       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
810       benchmark::utils::CheckSSE41);
811   }
qu8_gemm_2x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)812   static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
813     GEMMEnd2EndBenchmark(state, model,
814       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
815       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
816       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
817       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
818       xnn_init_qu8_conv_minmax_fp32_sse2_params,
819       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
820       benchmark::utils::CheckSSE41);
821   }
qu8_gemm_3x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)822   static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
823     GEMMEnd2EndBenchmark(state, model,
824       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
825       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
826       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
827       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
828       xnn_init_qu8_conv_minmax_fp32_sse2_params,
829       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
830       benchmark::utils::CheckSSE41);
831   }
qu8_gemm_3x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)832   static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
833     GEMMEnd2EndBenchmark(state, model,
834       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
835       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
836       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
837       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
838       xnn_init_qu8_conv_minmax_fp32_sse2_params,
839       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
840       benchmark::utils::CheckSSE41);
841   }
qu8_gemm_4x4c2__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)842   static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
843     GEMMEnd2EndBenchmark(state, model,
844       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
845       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
846       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
847       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
848       xnn_init_qu8_conv_minmax_fp32_sse2_params,
849       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
850       benchmark::utils::CheckSSE41);
851   }
qu8_gemm_4x4c2__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)852   static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
853     GEMMEnd2EndBenchmark(state, model,
854       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
855       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
856       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
857       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
858       xnn_init_qu8_conv_minmax_fp32_sse2_params,
859       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */,
860       benchmark::utils::CheckSSE41);
861   }
862 
qu8_gemm_2x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)863   static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
864     GEMMEnd2EndBenchmark(state, model,
865       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
866       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
867       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
868       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
869       xnn_init_qu8_conv_minmax_fp32_sse2_params,
870       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
871       benchmark::utils::CheckSSE41);
872   }
qu8_gemm_2x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)873   static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
874     GEMMEnd2EndBenchmark(state, model,
875       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
876       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
877       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
878       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
879       xnn_init_qu8_conv_minmax_fp32_sse2_params,
880       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
881       benchmark::utils::CheckSSE41);
882   }
qu8_gemm_3x4c8__sse41_ld64(benchmark::State & state,models::ExecutionPlanFactory model)883   static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
884     GEMMEnd2EndBenchmark(state, model,
885       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
886       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
887       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
888       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
889       xnn_init_qu8_conv_minmax_fp32_sse2_params,
890       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
891       benchmark::utils::CheckSSE41);
892   }
qu8_gemm_3x4c8__sse41_ld128(benchmark::State & state,models::ExecutionPlanFactory model)893   static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
894     GEMMEnd2EndBenchmark(state, model,
895       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
896       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
897       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
898       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
899       xnn_init_qu8_conv_minmax_fp32_sse2_params,
900       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */,
901       benchmark::utils::CheckSSE41);
902   }
903 
qu8_gemm_2x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)904   static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
905     GEMMEnd2EndBenchmark(state, model,
906       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
907       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
908       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
909       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
910       xnn_init_qu8_conv_minmax_fp32_sse2_params,
911       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
912   }
qu8_gemm_2x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)913   static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
914     GEMMEnd2EndBenchmark(state, model,
915       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
916       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
917       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
918       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
919       xnn_init_qu8_conv_minmax_fp32_sse2_params,
920       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
921   }
qu8_gemm_3x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)922   static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
923     GEMMEnd2EndBenchmark(state, model,
924       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
925       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
926       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
927       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
928       xnn_init_qu8_conv_minmax_fp32_sse2_params,
929       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
930   }
qu8_gemm_3x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)931   static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
932     GEMMEnd2EndBenchmark(state, model,
933       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
934       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
935       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
936       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
937       xnn_init_qu8_conv_minmax_fp32_sse2_params,
938       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
939   }
qu8_gemm_4x4c2__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)940   static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
941     GEMMEnd2EndBenchmark(state, model,
942       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
943       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
944       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
945       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
946       xnn_init_qu8_conv_minmax_fp32_sse2_params,
947       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
948   }
qu8_gemm_4x4c2__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)949   static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
950     GEMMEnd2EndBenchmark(state, model,
951       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
952       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
953       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
954       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
955       xnn_init_qu8_conv_minmax_fp32_sse2_params,
956       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 0 /* log2_sr */);
957   }
958 
qu8_gemm_2x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)959   static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
960     GEMMEnd2EndBenchmark(state, model,
961       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
962       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
963       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
964       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
965       xnn_init_qu8_conv_minmax_fp32_sse2_params,
966       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
967   }
qu8_gemm_2x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)968   static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
969     GEMMEnd2EndBenchmark(state, model,
970       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
971       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
972       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
973       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
974       xnn_init_qu8_conv_minmax_fp32_sse2_params,
975       2 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
976   }
qu8_gemm_3x4c8__sse2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)977   static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
978     GEMMEnd2EndBenchmark(state, model,
979       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
980       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
981       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
982       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
983       xnn_init_qu8_conv_minmax_fp32_sse2_params,
984       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
985   }
qu8_gemm_3x4c8__sse2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)986   static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
987     GEMMEnd2EndBenchmark(state, model,
988       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
989       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
990       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
991       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
992       xnn_init_qu8_conv_minmax_fp32_sse2_params,
993       3 /* mr */, 4 /* nr */, 3 /* log2_kr */, 0 /* log2_sr */);
994   }
995 
996 
// QU8 end-to-end benchmark instantiations for the `avx512skx` wrappers.
// NOTE(review): BENCHMARK_QU8_END2END is defined outside this part of the file; confirm its expansion there.
997   BENCHMARK_QU8_END2END(qu8_gemm_2x16c8__avx512skx);
998   BENCHMARK_QU8_END2END(qu8_gemm_3x16c8__avx512skx);
999   BENCHMARK_QU8_END2END(qu8_gemm_4x16c8__avx512skx);
1000 
// QU8 end-to-end benchmark instantiations for the `avx2` wrappers.
// NOTE(review): BENCHMARK_QU8_END2END is defined outside this part of the file; confirm its expansion there.
1001   BENCHMARK_QU8_END2END(qu8_gemm_2x8c8__avx2);
1002   BENCHMARK_QU8_END2END(qu8_gemm_3x8c8__avx2);
1003 
// QU8 end-to-end benchmark instantiations for the c2 `xop` wrappers.
// NOTE(review): BENCHMARK_QU8_END2END is defined outside this part of the file; confirm its expansion there.
1004   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__xop_ld64);
1005   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__xop_ld128);
1006   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__xop_ld64);
1007   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__xop_ld128);
1008   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__xop_ld64);
1009   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__xop_ld128);
1010 
// QU8 end-to-end benchmark instantiations for the c8 `xop` wrappers.
// NOTE(review): BENCHMARK_QU8_END2END is defined outside this part of the file; confirm its expansion there.
1011   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__xop_ld64);
1012   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__xop_ld128);
1013   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__xop_ld64);
1014   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__xop_ld128);
1015 
// QU8 end-to-end benchmark instantiations for the c2 `avx` wrappers.
// NOTE(review): BENCHMARK_QU8_END2END is defined outside this part of the file; confirm its expansion there.
1016   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld64);
1017   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__avx_ld128);
1018   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld64);
1019   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__avx_ld128);
1020   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld64);
1021   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__avx_ld128);
1022 
1023   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld64);
1024   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__avx_ld128);
1025   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld64);
1026   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__avx_ld128);
1027 
1028   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld64);
1029   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse41_ld128);
1030   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld64);
1031   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse41_ld128);
1032   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld64);
1033   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse41_ld128);
1034 
1035   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld64);
1036   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse41_ld128);
1037   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld64);
1038   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse41_ld128);
1039 
1040   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld64);
1041   BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__sse2_ld128);
1042   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld64);
1043   BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__sse2_ld128);
1044   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld64);
1045   BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__sse2_ld128);
1046 
1047   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld64);
1048   BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__sse2_ld128);
1049   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld64);
1050   BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__sse2_ld128);
1051 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1052 
1053 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1054   static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1055     GEMMEnd2EndBenchmark(state, model,
1056       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1057       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1058       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1059       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1060       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1061       2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1062   }
qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1063   static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1064     GEMMEnd2EndBenchmark(state, model,
1065       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1066       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1067       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1068       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1069       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1070       2 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1071   }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1072   static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1073     GEMMEnd2EndBenchmark(state, model,
1074       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1075       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1076       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1077       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1078       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1079       3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1080   }
qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1081   static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1082     GEMMEnd2EndBenchmark(state, model,
1083       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1084       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1085       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1086       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1087       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1088       3 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1089   }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1090   static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1091     GEMMEnd2EndBenchmark(state, model,
1092       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1093       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1094       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1095       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1096       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1097       4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1098   }
qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1099   static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1100     GEMMEnd2EndBenchmark(state, model,
1101       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1102       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1103       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1104       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1105       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1106       4 /* mr */, 4 /* nr */, 1 /* log2_kr */);
1107   }
1108 
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1109   static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1110     GEMMEnd2EndBenchmark(state, model,
1111       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1112       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1113       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1114       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1115       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1116       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1117   }
qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1118   static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1119     GEMMEnd2EndBenchmark(state, model,
1120       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1121       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1122       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1123       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1124       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1125       2 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1126   }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1127   static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1128     GEMMEnd2EndBenchmark(state, model,
1129       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1130       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1131       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1132       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1133       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1134       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1135   }
qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1136   static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1137     GEMMEnd2EndBenchmark(state, model,
1138       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1139       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1140       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1141       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1142       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1143       3 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1144   }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1145   static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1146     GEMMEnd2EndBenchmark(state, model,
1147       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1148       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1149       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1150       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1151       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1152       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1153   }
qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1154   static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1155     GEMMEnd2EndBenchmark(state, model,
1156       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1157       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1158       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1159       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1160       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1161       4 /* mr */, 4 /* nr */, 1 /* log2_kr */, 2 /* log2_sr */);
1162   }
1163 
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1164   static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1165     GEMMEnd2EndBenchmark(state, model,
1166       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1167       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1168       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1169       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1170       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1171       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1172   }
qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1173   static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1174     GEMMEnd2EndBenchmark(state, model,
1175       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1176       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1177       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1178       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1179       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1180       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1181   }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1182   static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1183     GEMMEnd2EndBenchmark(state, model,
1184       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1185       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1186       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1187       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1188       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1189       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1190   }
qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1191   static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1192     GEMMEnd2EndBenchmark(state, model,
1193       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1194       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1195       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1196       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1197       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1198       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1199   }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1200   static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1201     GEMMEnd2EndBenchmark(state, model,
1202       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1203       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1204       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1205       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1206       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1207       4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1208   }
qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1209   static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1210     GEMMEnd2EndBenchmark(state, model,
1211       xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1212       xnn_qu8_igemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1213       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1214       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1215       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1216       4 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1217   }
1218 
qu8_gemm_2x4c8__wasmsimd_mul32_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1219   static void qu8_gemm_2x4c8__wasmsimd_mul32_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1220     GEMMEnd2EndBenchmark(state, model,
1221       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64,
1222       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64,
1223       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
1224       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
1225       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1226       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1227   }
qu8_gemm_2x4c8__wasmsimd_mul32_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1228   static void qu8_gemm_2x4c8__wasmsimd_mul32_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1229     GEMMEnd2EndBenchmark(state, model,
1230       xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128,
1231       xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128,
1232       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
1233       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
1234       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1235       2 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1236   }
qu8_gemm_3x4c8__wasmsimd_mul32_ld64(benchmark::State & state,models::ExecutionPlanFactory model)1237   static void qu8_gemm_3x4c8__wasmsimd_mul32_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
1238     GEMMEnd2EndBenchmark(state, model,
1239       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64,
1240       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64,
1241       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
1242       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
1243       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1244       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1245   }
qu8_gemm_3x4c8__wasmsimd_mul32_ld128(benchmark::State & state,models::ExecutionPlanFactory model)1246   static void qu8_gemm_3x4c8__wasmsimd_mul32_ld128(benchmark::State& state, models::ExecutionPlanFactory model) {
1247     GEMMEnd2EndBenchmark(state, model,
1248       xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128,
1249       xnn_qu8_igemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128,
1250       xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
1251       xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
1252       xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1253       3 /* mr */, 4 /* nr */, 3 /* log2_kr */);
1254   }
1255 
  // Hands each wasmsimd wrapper defined above to BENCHMARK_QU8_END2END.
  // NOTE(review): the macro is defined outside this view; presumably it registers
  // the wrapper with Google Benchmark for the QU8 models - confirm in bench/end2end.h.

  // c2 dot16x2 kernels
  BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128)

  // c2s4 dot16x2 kernels
  BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)

  // c8 dot16x2 kernels
  BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128)

  // c8 mul32 kernels
  BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_mul32_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_2x4c8__wasmsimd_mul32_ld128)
  BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_mul32_ld64)
  BENCHMARK_QU8_END2END(qu8_gemm_3x4c8__wasmsimd_mul32_ld128)
1281 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1282 
1283 
1284 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1285   static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1286     GEMMEnd2EndBenchmark(state, model,
1287       xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1288       xnn_qu8_igemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1289       xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1290       xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1291       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1292       2 /* mr */, 2 /* nr */);
1293   }
qu8_gemm_3x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1294   static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1295     GEMMEnd2EndBenchmark(state, model,
1296       xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1297       xnn_qu8_igemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1298       xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1299       xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1300       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1301       3 /* mr */, 2 /* nr */);
1302   }
qu8_gemm_4x2__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1303   static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1304     GEMMEnd2EndBenchmark(state, model,
1305       xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1306       xnn_qu8_igemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1307       xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1308       xnn_qu8_igemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1309       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1310       4 /* mr */, 2 /* nr */);
1311   }
qu8_gemm_2x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1312   static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1313     GEMMEnd2EndBenchmark(state, model,
1314       xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1315       xnn_qu8_igemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1316       xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1317       xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1318       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1319       2 /* mr */, 4 /* nr */);
1320   }
qu8_gemm_3x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1321   static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1322     GEMMEnd2EndBenchmark(state, model,
1323       xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1324       xnn_qu8_igemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1325       xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1326       xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1327       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1328       3 /* mr */, 4 /* nr */);
1329   }
qu8_gemm_4x4__wasm_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1330   static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1331     GEMMEnd2EndBenchmark(state, model,
1332       xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1333       xnn_qu8_igemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1334       xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1335       xnn_qu8_igemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1336       xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1337       4 /* mr */, 4 /* nr */);
1338   }
1339 
  // Hands each wasm_fmagic wrapper defined above to BENCHMARK_QU8_END2END.
  // NOTE(review): the macro is defined outside this view; presumably it registers
  // the wrapper with Google Benchmark for the QU8 models - confirm in bench/end2end.h.
  BENCHMARK_QU8_END2END(qu8_gemm_2x2__wasm_fmagic)
  BENCHMARK_QU8_END2END(qu8_gemm_3x2__wasm_fmagic)
  BENCHMARK_QU8_END2END(qu8_gemm_4x2__wasm_fmagic)
  BENCHMARK_QU8_END2END(qu8_gemm_2x4__wasm_fmagic)
  BENCHMARK_QU8_END2END(qu8_gemm_3x4__wasm_fmagic)
  BENCHMARK_QU8_END2END(qu8_gemm_4x4__wasm_fmagic)
1346 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1347 
1348 
1349 static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1350   GEMMEnd2EndBenchmark(state, model,
1351     xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1352     xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1353     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1354     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1355     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1356     2 /* mr */, 2 /* nr */);
1357 }
qu8_gemm_3x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1358 static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1359   GEMMEnd2EndBenchmark(state, model,
1360     xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1361     xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1362     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1363     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1364     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1365     3 /* mr */, 2 /* nr */);
1366 }
qu8_gemm_4x2__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1367 static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1368   GEMMEnd2EndBenchmark(state, model,
1369     xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1370     xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1371     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1372     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1373     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1374     4 /* mr */, 2 /* nr */);
1375 }
qu8_gemm_2x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1376 static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1377   GEMMEnd2EndBenchmark(state, model,
1378     xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1379     xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1380     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1381     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1382     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1383     2 /* mr */, 4 /* nr */);
1384 }
qu8_gemm_3x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1385 static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1386   GEMMEnd2EndBenchmark(state, model,
1387     xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1388     xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1389     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1390     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1391     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1392     3 /* mr */, 4 /* nr */);
1393 }
qu8_gemm_4x4__scalar_fmagic(benchmark::State & state,models::ExecutionPlanFactory model)1394 static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1395   GEMMEnd2EndBenchmark(state, model,
1396     xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1397     xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1398     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1399     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1400     xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1401     4 /* mr */, 4 /* nr */);
1402 }
1403 
qu8_gemm_2x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1404 static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1405   GEMMEnd2EndBenchmark(state, model,
1406     xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1407     xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1408     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1409     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1410     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1411     2 /* mr */, 2 /* nr */);
1412 }
qu8_gemm_3x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1413 static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1414   GEMMEnd2EndBenchmark(state, model,
1415     xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1416     xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1417     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1418     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1419     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1420     3 /* mr */, 2 /* nr */);
1421 }
qu8_gemm_4x2__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1422 static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1423   GEMMEnd2EndBenchmark(state, model,
1424     xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1425     xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1426     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1427     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1428     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1429     4 /* mr */, 2 /* nr */);
1430 }
qu8_gemm_2x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1431 static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1432   GEMMEnd2EndBenchmark(state, model,
1433     xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1434     xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1435     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1436     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1437     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1438     2 /* mr */, 4 /* nr */);
1439 }
qu8_gemm_3x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1440 static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1441   GEMMEnd2EndBenchmark(state, model,
1442     xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1443     xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1444     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1445     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1446     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1447     3 /* mr */, 4 /* nr */);
1448 }
qu8_gemm_4x4__scalar_imagic(benchmark::State & state,models::ExecutionPlanFactory model)1449 static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, models::ExecutionPlanFactory model) {
1450   GEMMEnd2EndBenchmark(state, model,
1451     xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1452     xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1453     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1454     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1455     xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1456     4 /* mr */, 4 /* nr */);
1457 }
1458 
qu8_gemm_2x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1459 static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1460   GEMMEnd2EndBenchmark(state, model,
1461     xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1462     xnn_qu8_igemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1463     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1464     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1465     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1466     2 /* mr */, 2 /* nr */);
1467 }
qu8_gemm_3x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1468 static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1469   GEMMEnd2EndBenchmark(state, model,
1470     xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1471     xnn_qu8_igemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1472     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1473     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1474     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1475     3 /* mr */, 2 /* nr */);
1476 }
qu8_gemm_4x2__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1477 static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1478   GEMMEnd2EndBenchmark(state, model,
1479     xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1480     xnn_qu8_igemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1481     xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1482     xnn_qu8_igemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1483     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1484     4 /* mr */, 2 /* nr */);
1485 }
qu8_gemm_2x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1486 static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1487   GEMMEnd2EndBenchmark(state, model,
1488     xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1489     xnn_qu8_igemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1490     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1491     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1492     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1493     2 /* mr */, 4 /* nr */);
1494 }
qu8_gemm_3x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1495 static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1496   GEMMEnd2EndBenchmark(state, model,
1497     xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1498     xnn_qu8_igemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1499     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1500     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1501     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1502     3 /* mr */, 4 /* nr */);
1503 }
qu8_gemm_4x4__scalar_lrintf(benchmark::State & state,models::ExecutionPlanFactory model)1504 static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, models::ExecutionPlanFactory model) {
1505   GEMMEnd2EndBenchmark(state, model,
1506     xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1507     xnn_qu8_igemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1508     xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1509     xnn_qu8_igemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1510     xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1511     4 /* mr */, 4 /* nr */);
1512 }
1513 
// Pass each scalar_fmagic QU8 GEMM benchmark function (2x2, 3x2, 4x2, 2x4,
// 3x4, 4x4) to BENCHMARK_QU8_END2END.
// NOTE(review): the macro is defined outside this chunk (presumably in
// bench/end2end.h) and the functions named here are presumably defined
// earlier in this file — confirm.
1514 BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_fmagic)
1515 BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_fmagic)
1516 BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_fmagic)
1517 BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_fmagic)
1518 BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_fmagic)
1519 BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_fmagic)
1520 
// Pass each scalar_imagic QU8 GEMM benchmark function (2x2, 3x2, 4x2, 2x4,
// 3x4, 4x4) to BENCHMARK_QU8_END2END.
// NOTE(review): the macro is defined outside this chunk (presumably in
// bench/end2end.h) and the functions named here are presumably defined
// earlier in this file — confirm.
1521 BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_imagic)
1522 BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_imagic)
1523 BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_imagic)
1524 BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_imagic)
1525 BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_imagic)
1526 BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_imagic)
1527 
// Pass each scalar_lrintf QU8 GEMM benchmark function (2x2, 3x2, 4x2, 2x4,
// 3x4, 4x4) to BENCHMARK_QU8_END2END. The functions are the static wrappers
// defined immediately above in this file.
// NOTE(review): the macro is defined outside this chunk (presumably in
// bench/end2end.h) — confirm what it expands to.
1528 BENCHMARK_QU8_END2END(qu8_gemm_2x2__scalar_lrintf)
1529 BENCHMARK_QU8_END2END(qu8_gemm_3x2__scalar_lrintf)
1530 BENCHMARK_QU8_END2END(qu8_gemm_4x2__scalar_lrintf)
1531 BENCHMARK_QU8_END2END(qu8_gemm_2x4__scalar_lrintf)
1532 BENCHMARK_QU8_END2END(qu8_gemm_3x4__scalar_lrintf)
1533 BENCHMARK_QU8_END2END(qu8_gemm_4x4__scalar_lrintf)
1534 
// Invoke BENCHMARK_MAIN() (from <benchmark/benchmark.h>) unless the build
// defines XNNPACK_BENCHMARK_NO_MAIN.
// NOTE(review): presumably the opt-out exists so this file can be linked into
// a binary that supplies its own main() — confirm against the build files.
1535 #ifndef XNNPACK_BENCHMARK_NO_MAIN
1536 BENCHMARK_MAIN();
1537 #endif
1538