• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <algorithm>
7 #include <cfloat>
8 #include <chrono>
9 #include <cmath>
10 #include <functional>
11 #include <limits>
12 #include <mutex>
13 #include <random>
14 #include <vector>
15 
16 #include <cpuinfo.h>
17 
18 #include <benchmark/benchmark.h>
19 #ifdef BENCHMARK_RUY
20 #include "ruy/ruy.h"
21 #endif  // BENCHMARK_RUY
22 #include "bench/gemm.h"
23 #include "bench/utils.h"
24 #include <xnnpack/AlignedAllocator.h>
25 #include <xnnpack/common.h>
26 #include <xnnpack/gemm.h>
27 #include <xnnpack/pack.h>
28 #include <xnnpack/params-init.h>
29 #include <xnnpack/params.h>
30 
GEMMBenchmark(benchmark::State & state,xnn_qs8_gemm_minmax_ukernel_function gemm,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_qs8_conv_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr,bool extended_weights=false)31 static void GEMMBenchmark(benchmark::State& state,
32   xnn_qs8_gemm_minmax_ukernel_function gemm,
33   size_t mr, size_t nr, size_t kr, size_t sr,
34   xnn_init_qs8_conv_minmax_params_fn init_params,
35   benchmark::utils::IsaCheckFunction isa_check = nullptr,
36   bool extended_weights = false)
37 {
38   if (!cpuinfo_initialize()) {
39     state.SkipWithError("cpuinfo initialization failed");
40     return;
41   }
42   if (isa_check && !isa_check(state)) {
43     return;
44   }
45 
46   const size_t mc = state.range(0);
47   const size_t nc = state.range(1);
48   const size_t kc = state.range(2);
49 
50   const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
51   const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);
52 
53   std::random_device random_device;
54   auto rng = std::mt19937(random_device());
55   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
56   auto i8rng = std::bind(
57     std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()), std::ref(rng));
58 
59   std::vector<int8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(int8_t));
60   std::generate(a.begin(), a.end(), std::ref(i8rng));
61   std::vector<int8_t> k(nc * kc);
62   std::generate(k.begin(), k.end(), std::ref(i8rng));
63   std::vector<int32_t> b(nc);
64   std::generate(b.begin(), b.end(), std::ref(i32rng));
65 
66   const size_t w_element_size = extended_weights ? sizeof(int16_t) : sizeof(int8_t);
67   const size_t w_size = nc_stride * sizeof(int32_t) + kc_stride * nc_stride * w_element_size;
68   const size_t c_elements = mc * nc;
69   const size_t num_buffers = 1 +
70     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), w_size + c_elements * sizeof(int8_t));
71 
72   std::vector<char, AlignedAllocator<char, 64>> w(w_size * num_buffers);
73   std::fill(w.begin(), w.end(), 0);
74   const xnn_qs8_packing_params packing_params = { 127 };
75   if (extended_weights) {
76     xnn_pack_qs8_gemm_xw_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
77   } else {
78     xnn_pack_qs8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
79   }
80   std::vector<int8_t> c(c_elements * num_buffers);
81   std::fill(c.begin(), c.end(), 0xA5);
82 
83   union xnn_qs8_conv_minmax_params quantization_params;
84   init_params(&quantization_params, 0.75f, 127, -127, 126);
85 
86   size_t buffer_index = 0;
87   for (auto _ : state) {
88     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
89     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
90     // - W is not in cache (for any cache level)
91     // - C is not in cache (for any cache level)
92     state.PauseTiming();
93     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
94     buffer_index = (buffer_index + 1) % num_buffers;
95     state.ResumeTiming();
96 
97     for (uint32_t m = 0; m < mc; m += mr) {
98       const uint32_t mb = min(mc - m, mr);
99       for (uint32_t n = 0; n < nc; n += nr) {
100         const uint32_t nb = min(nc - n, nr);
101         gemm(
102           mb, nb, kc * sizeof(int8_t),
103           a.data() + m * kc, kc * sizeof(int8_t),
104           w.data() + w_size * buffer_index + n * (kc_stride * w_element_size + sizeof(int32_t)),
105           c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(int8_t), nr * sizeof(int8_t),
106           &quantization_params);
107       }
108     }
109   }
110 
111   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
112   if (cpu_frequency != 0) {
113     state.counters["cpufreq"] = cpu_frequency;
114   }
115 
116   state.counters["OPS"] = benchmark::Counter(
117     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
118 }
119 
120 #ifdef BENCHMARK_RUY
RuyBenchmark(benchmark::State & state,size_t threads)121 static void RuyBenchmark(benchmark::State& state, size_t threads)
122 {
123   const size_t mc = state.range(0);
124   const size_t nc = state.range(1);
125   const size_t kc = state.range(2);
126 
127   std::random_device random_device;
128   auto rng = std::mt19937(random_device());
129   auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
130   auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
131 
132   const size_t num_buffers = 1 +
133     benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
134       nc * (sizeof(int8_t) * (mc + kc) + sizeof(int32_t)));
135 
136   std::vector<int8_t> a(mc * kc);
137   std::generate(a.begin(), a.end(), std::ref(u8rng));
138   std::vector<int8_t> k(num_buffers * nc * kc);
139   std::generate(k.begin(), k.end(), std::ref(u8rng));
140   std::vector<int32_t> b(num_buffers * nc);
141   std::generate(b.begin(), b.end(), std::ref(i32rng));
142   std::vector<int8_t> c(num_buffers * nc * mc);
143   std::fill(c.begin(), c.end(), std::nanf(""));
144 
145   // Note: context must be static to avoid the cost of re-creating it for each benchmark.
146   static ruy::Context context;
147   context.set_max_num_threads(threads);
148 
149   ruy::Matrix<int8_t> ruy_a;
150   ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
151   ruy_a.set_zero_point(127);
152   ruy::Matrix<int8_t> ruy_b;
153   ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
154   ruy_b.set_data(a.data());
155   ruy_b.set_zero_point(127);
156   ruy::Matrix<int8_t> ruy_c;
157   ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
158   ruy_c.set_zero_point(127);
159 
160   ruy::MulParams<int32_t, int8_t> mul_params;
161   mul_params.set_multiplier_fixedpoint(0x40000000);
162 
163   // ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
164   // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
165   // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
166   // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
167   static std::once_flag warmup;
168   std::call_once(warmup, [&](){
169     auto start = std::chrono::steady_clock::now();
170     do {
171       ruy_a.set_data(k.data());
172       ruy_c.set_data(c.data());
173       mul_params.set_bias(b.data());
174 
175       ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
176     } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
177   });
178 
179   size_t buffer_index = 0;
180   for (auto _ : state) {
181     // Use circular buffers (exceeding cache size) and prefetch to control cache state:
182     // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
183     // - K is not in cache (for any cache level)
184     // - B is not in cache (for any cache level)
185     // - C is not in cache (for any cache level)
186     state.PauseTiming();
187     benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(int8_t));
188     buffer_index = (buffer_index + 1) % num_buffers;
189     state.ResumeTiming();
190 
191     ruy_a.set_data(k.data() + buffer_index * nc * kc);
192     ruy_c.set_data(c.data() + buffer_index * mc * nc);
193     mul_params.set_bias(b.data() + buffer_index * nc);
194 
195     ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
196   }
197 
198   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
199   if (cpu_frequency != 0) {
200     state.counters["cpufreq"] = cpu_frequency;
201   }
202 
203   state.counters["OPS"] = benchmark::Counter(
204     uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
205 }
206 
ruy_st(benchmark::State & state,const char * net)207 static void ruy_st(benchmark::State& state, const char* net)
208 {
209   RuyBenchmark(state, 1);
210 }
211 #endif  // BENCHMARK_RUY
212 
213 #if XNN_ARCH_ARM && XNN_PLATFORM_JIT && XNN_ENABLE_JIT
GEMMBenchmark(benchmark::State & state,xnn_jit_gemm_code_generator_function generator,size_t mr,size_t nr,size_t kr,size_t sr,xnn_init_qs8_conv_minmax_params_fn init_params,benchmark::utils::IsaCheckFunction isa_check=nullptr)214   static void GEMMBenchmark(benchmark::State& state,
215     xnn_jit_gemm_code_generator_function generator,
216     size_t mr, size_t nr, size_t kr, size_t sr,
217     xnn_init_qs8_conv_minmax_params_fn  init_params,
218     benchmark::utils::IsaCheckFunction isa_check = nullptr)
219   {
220     xnn_code_buffer code_buffer;
221     xnn_allocate_code_memory(&code_buffer, XNN_DEFAULT_CODE_BUFFER_SIZE);
222     const size_t nc = state.range(1);
223     const size_t kc = state.range(2);
224     generator(&code_buffer, nc, kc, nullptr);
225     GEMMBenchmark(
226         state,
227         reinterpret_cast<xnn_qs8_gemm_minmax_ukernel_function>(code_buffer.code),
228         mr, nr, kr, sr, init_params, isa_check);
229     xnn_release_code_memory(&code_buffer);
230   }
231 
jit_qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State & state,const char * net)232   static void jit_qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, const char* net) {
233     GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, 4, 8, 4, 1,
234       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
235   }
jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,const char * net)236   static void jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
237     GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 4, 8, 1, 1,
238       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
239   }
jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)240   static void jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
241     GEMMBenchmark(state, xnn_generate_qs8_gemm_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
242       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
243   }
244   BENCHMARK_GEMM(jit_qs8_gemm_4x8c4__aarch32_neondot_ld64)
BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)245   BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
246   BENCHMARK_GEMM(jit_qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
247 #endif  // XNN_ARCH_ARM && XNN_PLATFORM_JIT && XNN_ENABLE_JIT
248 
249 #if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
250   static void qs8_gemm_4x8c4__aarch32_neondot_ld64(benchmark::State& state, const char* net) {
251     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_ld64, 4, 8, 4, 1,
252       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
253   }
qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State & state,const char * net)254   static void qs8_gemm_4x8c4__aarch32_neondot_cortex_a55(benchmark::State& state, const char* net) {
255     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__aarch32_neondot_cortex_a55, 4, 8, 4, 1,
256       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
257   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)258   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
259     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53, 4, 8, 1, 1,
260       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
261   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)262   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
263     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53, 4, 8, 1, 1,
264       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
265   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State & state,const char * net)266   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
267     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7, 4, 8, 1, 1,
268       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
269   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State & state,const char * net)270   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
271     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7, 4, 8, 1, 1,
272       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
273   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State & state,const char * net)274   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
275     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64, 4, 8, 1, 1,
276       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
277   }
qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)278   static void qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
279     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
280       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
281   }
282   BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_ld64)
BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)283   BENCHMARK_GEMM(qs8_gemm_4x8c4__aarch32_neondot_cortex_a55)
284   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
285   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
286   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
287   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
288   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
289   BENCHMARK_GEMM(qs8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
290 #endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
291 
292 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
293   static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
294     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, 4, 16, 4, 1,
295       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
296   }
qs8_gemm_1x16c4__aarch64_neondot_ld32(benchmark::State & state,const char * net)297   static void qs8_gemm_1x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
298     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32, 1, 16, 4, 1,
299       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
300   }
qs8_gemm_1x16c4__aarch64_neondot_ld64(benchmark::State & state,const char * net)301   static void qs8_gemm_1x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
302     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld64, 1, 16, 4, 1,
303       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
304   }
qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State & state,const char * net)305   static void qs8_gemm_4x16c4__aarch64_neondot_ld32(benchmark::State& state, const char* net) {
306     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld32, 4, 16, 4, 1,
307       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
308   }
qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State & state,const char * net)309   static void qs8_gemm_4x16c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
310     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld64, 4, 16, 4, 1,
311       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
312   }
qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State & state,const char * net)313   static void qs8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
314     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, 4, 16, 4, 1,
315       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
316   }
qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State & state,const char * net)317   static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
318     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, 4, 8, 1, 1,
319       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
320   }
qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)321   static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
322     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
323       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
324   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State & state,const char * net)325   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
326     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, 4, 16, 1, 1,
327       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
328   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State & state,const char * net)329   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
330     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, 4, 16, 1, 1,
331       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
332   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State & state,const char * net)333   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
334     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, 4, 16, 1, 1,
335       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
336   }
qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State & state,const char * net)337   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
338     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, 4, 16, 1, 1,
339       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
340   }
qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,const char * net)341   static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
342     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, 1, 8, 8, 1,
343       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
344   }
qs8_gemm_1x8c8__aarch64_neon_mlal(benchmark::State & state,const char * net)345   static void qs8_gemm_1x8c8__aarch64_neon_mlal(benchmark::State& state, const char* net) {
346     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal, 1, 8, 8, 1,
347       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
348   }
qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,const char * net)349   static void qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, const char* net) {
350     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_cortex_a53, 1, 8, 8, 1,
351       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
352   }
qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,const char * net)353   static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, const char* net) {
354     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53, 1, 8, 8, 1,
355       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
356   }
qs8_gemm_2x8c8__aarch64_neon_mull(benchmark::State & state,const char * net)357   static void qs8_gemm_2x8c8__aarch64_neon_mull(benchmark::State& state, const char* net) {
358     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mull, 2, 8, 8, 1,
359       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
360   }
qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State & state,const char * net)361   static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, const char* net) {
362     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal, 2, 8, 8, 1,
363       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
364   }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State & state,const char * net)365   static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
366     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm, 2, 8, 8, 1,
367       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
368   }
qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State & state,const char * net)369   static void qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53(benchmark::State& state, const char* net) {
370     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_cortex_a53, 2, 8, 8, 1,
371       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
372   }
qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State & state,const char * net)373   static void qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53(benchmark::State& state, const char* net) {
374     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53, 2, 8, 8, 1,
375       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
376   }
qs8_gemm_2x8c16__aarch64_neon_mlal(benchmark::State & state,const char * net)377   static void qs8_gemm_2x8c16__aarch64_neon_mlal(benchmark::State& state, const char* net) {
378     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__aarch64_neon_mlal, 2, 8, 16, 1,
379       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
380   }
381 
382   BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld32)
BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld64)383   BENCHMARK_GEMM(qs8_gemm_1x16c4__aarch64_neondot_ld64)
384   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld32)
385   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld64)
386   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld128)
387   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
388   BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
389   BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
390   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
391   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
392   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
393   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
394   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm)
395   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal)
396   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53)
397   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_cortex_a53)
398   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mull)
399   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal)
400   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
401   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
402   BENCHMARK_GEMM(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
403   BENCHMARK_GEMM(qs8_gemm_2x8c16__aarch64_neon_mlal)
404 #endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
405 
406 
407 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
408   static void qs8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
409     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane, 1, 8, 1, 1,
410       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
411   }
qs8_gemm_2x8__neon_mlal_lane(benchmark::State & state,const char * net)412   static void qs8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
413     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane, 2, 8, 1, 1,
414       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
415   }
qs8_gemm_3x8__neon_mlal_lane(benchmark::State & state,const char * net)416   static void qs8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
417     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane, 3, 8, 1, 1,
418       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
419   }
qs8_gemm_4x8__neon_mlal_lane(benchmark::State & state,const char * net)420   static void qs8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
421     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane, 4, 8, 1, 1,
422       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
423   }
qs8_gemm_6x8__neon_mlal_lane(benchmark::State & state,const char * net)424   static void qs8_gemm_6x8__neon_mlal_lane(benchmark::State& state, const char* net) {
425     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane, 6, 8, 1, 1,
426       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
427   }
qs8_gemm_1x16__neon_mlal_lane(benchmark::State & state,const char * net)428   static void qs8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
429     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane, 1, 16, 1, 1,
430       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
431   }
qs8_gemm_2x16__neon_mlal_lane(benchmark::State & state,const char * net)432   static void qs8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
433     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane, 2, 16, 1, 1,
434       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
435   }
qs8_gemm_3x16__neon_mlal_lane(benchmark::State & state,const char * net)436   static void qs8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
437     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane, 3, 16, 1, 1,
438       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
439   }
qs8_gemm_4x16__neon_mlal_lane(benchmark::State & state,const char * net)440   static void qs8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
441     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane, 4, 16, 1, 1,
442       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
443   }
qs8_gemm_6x16__neon_mlal_lane(benchmark::State & state,const char * net)444   static void qs8_gemm_6x16__neon_mlal_lane(benchmark::State& state, const char* net) {
445     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane, 6, 16, 1, 1,
446       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
447   }
qs8_gemm_1x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)448   static void qs8_gemm_1x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
449     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane_prfm, 1, 8, 1, 1,
450       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
451   }
qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)452   static void qs8_gemm_2x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
453     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane_prfm, 2, 8, 1, 1,
454       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
455   }
qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)456   static void qs8_gemm_3x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
457     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane_prfm, 3, 8, 1, 1,
458       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
459   }
qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)460   static void qs8_gemm_4x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
461     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane_prfm, 4, 8, 1, 1,
462       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
463   }
qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State & state,const char * net)464   static void qs8_gemm_6x8__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
465     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane_prfm, 6, 8, 1, 1,
466       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
467   }
qs8_gemm_1x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)468   static void qs8_gemm_1x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
469     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane_prfm, 1, 16, 1, 1,
470       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
471   }
qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)472   static void qs8_gemm_2x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
473     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane_prfm, 2, 16, 1, 1,
474       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
475   }
qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)476   static void qs8_gemm_3x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
477     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane_prfm, 3, 16, 1, 1,
478       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
479   }
qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)480   static void qs8_gemm_4x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
481     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane_prfm, 4, 16, 1, 1,
482       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
483   }
qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State & state,const char * net)484   static void qs8_gemm_6x16__neon_mlal_lane_prfm(benchmark::State& state, const char* net) {
485     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane_prfm, 6, 16, 1, 1,
486       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
487   }
qs8_gemm_1x8c2__neon_mull_dup(benchmark::State & state,const char * net)488   static void qs8_gemm_1x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
489     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_dup, 1, 8, 2, 1,
490       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
491   }
qs8_gemm_2x8c2__neon_mull_dup(benchmark::State & state,const char * net)492   static void qs8_gemm_2x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
493     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_dup, 2, 8, 2, 1,
494       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
495   }
qs8_gemm_3x8c2__neon_mull_dup(benchmark::State & state,const char * net)496   static void qs8_gemm_3x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
497     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_dup, 3, 8, 2, 1,
498       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
499   }
qs8_gemm_4x8c2__neon_mull_dup(benchmark::State & state,const char * net)500   static void qs8_gemm_4x8c2__neon_mull_dup(benchmark::State& state, const char* net) {
501     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_dup, 4, 8, 2, 1,
502       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
503   }
qs8_gemm_1x16c2__neon_mull_dup(benchmark::State & state,const char * net)504   static void qs8_gemm_1x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
505     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_dup, 1, 16, 2, 1,
506       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
507   }
qs8_gemm_2x16c2__neon_mull_dup(benchmark::State & state,const char * net)508   static void qs8_gemm_2x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
509     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_dup, 2, 16, 2, 1,
510       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
511   }
qs8_gemm_3x16c2__neon_mull_dup(benchmark::State & state,const char * net)512   static void qs8_gemm_3x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
513     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_dup, 3, 16, 2, 1,
514       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
515   }
qs8_gemm_4x16c2__neon_mull_dup(benchmark::State & state,const char * net)516   static void qs8_gemm_4x16c2__neon_mull_dup(benchmark::State& state, const char* net) {
517     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_dup, 4, 16, 2, 1,
518       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
519   }
qs8_gemm_1x8c2__neon_mlal_dup(benchmark::State & state,const char * net)520   static void qs8_gemm_1x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
521     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_dup, 1, 8, 2, 1,
522       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
523   }
qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State & state,const char * net)524   static void qs8_gemm_2x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
525     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_dup, 2, 8, 2, 1,
526       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
527   }
qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State & state,const char * net)528   static void qs8_gemm_3x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
529     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_dup, 3, 8, 2, 1,
530       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
531   }
qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State & state,const char * net)532   static void qs8_gemm_4x8c2__neon_mlal_dup(benchmark::State& state, const char* net) {
533     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_dup, 4, 8, 2, 1,
534       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
535   }
qs8_gemm_1x16c2__neon_mlal_dup(benchmark::State & state,const char * net)536   static void qs8_gemm_1x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
537     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_dup, 1, 16, 2, 1,
538       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
539   }
qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State & state,const char * net)540   static void qs8_gemm_2x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
541     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_dup, 2, 16, 2, 1,
542       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
543   }
qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State & state,const char * net)544   static void qs8_gemm_3x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
545     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_dup, 3, 16, 2, 1,
546       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
547   }
qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State & state,const char * net)548   static void qs8_gemm_4x16c2__neon_mlal_dup(benchmark::State& state, const char* net) {
549     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_dup, 4, 16, 2, 1,
550       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
551   }
qs8_gemm_1x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)552   static void qs8_gemm_1x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
553     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r, 1, 8, 2, 1,
554       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
555   }
qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)556   static void qs8_gemm_2x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
557     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld1r, 2, 8, 2, 1,
558       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
559   }
qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)560   static void qs8_gemm_3x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
561     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld1r, 3, 8, 2, 1,
562       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
563   }
qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State & state,const char * net)564   static void qs8_gemm_4x8c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
565     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld1r, 4, 8, 2, 1,
566       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
567   }
qs8_gemm_1x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)568   static void qs8_gemm_1x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
569     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld1r, 1, 16, 2, 1,
570       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
571   }
qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)572   static void qs8_gemm_2x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
573     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld1r, 2, 16, 2, 1,
574       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
575   }
qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)576   static void qs8_gemm_3x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
577     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld1r, 3, 16, 2, 1,
578       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
579   }
qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State & state,const char * net)580   static void qs8_gemm_4x16c2__neon_mull_ld1r(benchmark::State& state, const char* net) {
581     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld1r, 4, 16, 2, 1,
582       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
583   }
qs8_gemm_1x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)584   static void qs8_gemm_1x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
585     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld1r, 1, 8, 2, 1,
586       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
587   }
qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)588   static void qs8_gemm_2x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
589     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld1r, 2, 8, 2, 1,
590       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
591   }
qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)592   static void qs8_gemm_3x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
593     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld1r, 3, 8, 2, 1,
594       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
595   }
qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State & state,const char * net)596   static void qs8_gemm_4x8c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
597     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld1r, 4, 8, 2, 1,
598       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
599   }
qs8_gemm_1x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)600   static void qs8_gemm_1x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
601     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld1r, 1, 16, 2, 1,
602       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
603   }
qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)604   static void qs8_gemm_2x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
605     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld1r, 2, 16, 2, 1,
606       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
607   }
qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)608   static void qs8_gemm_3x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
609     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld1r, 3, 16, 2, 1,
610       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
611   }
qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State & state,const char * net)612   static void qs8_gemm_4x16c2__neon_mlal_ld1r(benchmark::State& state, const char* net) {
613     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld1r, 4, 16, 2, 1,
614       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
615   }
qs8_gemm_1x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)616   static void qs8_gemm_1x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
617     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld2r, 1, 8, 2, 1,
618       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
619   }
qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)620   static void qs8_gemm_2x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
621     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld2r, 2, 8, 2, 1,
622       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
623   }
qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)624   static void qs8_gemm_3x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
625     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld2r, 3, 8, 2, 1,
626       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
627   }
qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State & state,const char * net)628   static void qs8_gemm_4x8c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
629     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld2r, 4, 8, 2, 1,
630       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
631   }
qs8_gemm_1x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)632   static void qs8_gemm_1x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
633     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld2r, 1, 16, 2, 1,
634       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
635   }
qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)636   static void qs8_gemm_2x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
637     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld2r, 2, 16, 2, 1,
638       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
639   }
qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)640   static void qs8_gemm_3x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
641     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld2r, 3, 16, 2, 1,
642       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
643   }
qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State & state,const char * net)644   static void qs8_gemm_4x16c2__neon_mull_ld2r(benchmark::State& state, const char* net) {
645     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld2r, 4, 16, 2, 1,
646       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
647   }
qs8_gemm_1x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)648   static void qs8_gemm_1x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
649     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld2r, 1, 8, 2, 1,
650       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
651   }
qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)652   static void qs8_gemm_2x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
653     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld2r, 2, 8, 2, 1,
654       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
655   }
qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)656   static void qs8_gemm_3x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
657     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld2r, 3, 8, 2, 1,
658       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
659   }
qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State & state,const char * net)660   static void qs8_gemm_4x8c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
661     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld2r, 4, 8, 2, 1,
662       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
663   }
qs8_gemm_1x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)664   static void qs8_gemm_1x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
665     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld2r, 1, 16, 2, 1,
666       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
667   }
qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)668   static void qs8_gemm_2x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
669     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld2r, 2, 16, 2, 1,
670       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
671   }
qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)672   static void qs8_gemm_3x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
673     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld2r, 3, 16, 2, 1,
674       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
675   }
qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State & state,const char * net)676   static void qs8_gemm_4x16c2__neon_mlal_ld2r(benchmark::State& state, const char* net) {
677     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld2r, 4, 16, 2, 1,
678       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
679   }
qs8_gemm_1x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)680   static void qs8_gemm_1x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
681     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld4r, 1, 8, 2, 1,
682       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
683   }
qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)684   static void qs8_gemm_2x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
685     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mull_ld4r, 2, 8, 2, 1,
686       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
687   }
qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)688   static void qs8_gemm_3x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
689     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mull_ld4r, 3, 8, 2, 1,
690       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
691   }
qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State & state,const char * net)692   static void qs8_gemm_4x8c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
693     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mull_ld4r, 4, 8, 2, 1,
694       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
695   }
qs8_gemm_1x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)696   static void qs8_gemm_1x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
697     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mull_ld4r, 1, 16, 2, 1,
698       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
699   }
qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)700   static void qs8_gemm_2x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
701     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mull_ld4r, 2, 16, 2, 1,
702       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
703   }
qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)704   static void qs8_gemm_3x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
705     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mull_ld4r, 3, 16, 2, 1,
706       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
707   }
qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State & state,const char * net)708   static void qs8_gemm_4x16c2__neon_mull_ld4r(benchmark::State& state, const char* net) {
709     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mull_ld4r, 4, 16, 2, 1,
710       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
711   }
qs8_gemm_1x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)712   static void qs8_gemm_1x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
713     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mlal_ld4r, 1, 8, 2, 1,
714       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
715   }
qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)716   static void qs8_gemm_2x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
717     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2__neon_mlal_ld4r, 2, 8, 2, 1,
718       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
719   }
qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)720   static void qs8_gemm_3x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
721     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2__neon_mlal_ld4r, 3, 8, 2, 1,
722       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
723   }
qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State & state,const char * net)724   static void qs8_gemm_4x8c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
725     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2__neon_mlal_ld4r, 4, 8, 2, 1,
726       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
727   }
qs8_gemm_1x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)728   static void qs8_gemm_1x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
729     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2__neon_mlal_ld4r, 1, 16, 2, 1,
730       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
731   }
qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)732   static void qs8_gemm_2x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
733     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2__neon_mlal_ld4r, 2, 16, 2, 1,
734       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
735   }
qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)736   static void qs8_gemm_3x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
737     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2__neon_mlal_ld4r, 3, 16, 2, 1,
738       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
739   }
qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State & state,const char * net)740   static void qs8_gemm_4x16c2__neon_mlal_ld4r(benchmark::State& state, const char* net) {
741     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2__neon_mlal_ld4r, 4, 16, 2, 1,
742       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
743   }
qs8_gemm_1x8c2s4__neon_mull(benchmark::State & state,const char * net)744   static void qs8_gemm_1x8c2s4__neon_mull(benchmark::State& state, const char* net) {
745     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mull, 1, 8, 2, 4,
746       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
747   }
qs8_gemm_2x8c2s4__neon_mull(benchmark::State & state,const char * net)748   static void qs8_gemm_2x8c2s4__neon_mull(benchmark::State& state, const char* net) {
749     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mull, 2, 8, 2, 4,
750       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
751   }
qs8_gemm_3x8c2s4__neon_mull(benchmark::State & state,const char * net)752   static void qs8_gemm_3x8c2s4__neon_mull(benchmark::State& state, const char* net) {
753     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mull, 3, 8, 2, 4,
754       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
755   }
qs8_gemm_4x8c2s4__neon_mull(benchmark::State & state,const char * net)756   static void qs8_gemm_4x8c2s4__neon_mull(benchmark::State& state, const char* net) {
757     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mull, 4, 8, 2, 4,
758       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
759   }
qs8_gemm_1x16c2s4__neon_mull(benchmark::State & state,const char * net)760   static void qs8_gemm_1x16c2s4__neon_mull(benchmark::State& state, const char* net) {
761     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mull, 1, 16, 2, 4,
762       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
763   }
qs8_gemm_2x16c2s4__neon_mull(benchmark::State & state,const char * net)764   static void qs8_gemm_2x16c2s4__neon_mull(benchmark::State& state, const char* net) {
765     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mull, 2, 16, 2, 4,
766       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
767   }
qs8_gemm_3x16c2s4__neon_mull(benchmark::State & state,const char * net)768   static void qs8_gemm_3x16c2s4__neon_mull(benchmark::State& state, const char* net) {
769     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mull, 3, 16, 2, 4,
770       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
771   }
qs8_gemm_4x16c2s4__neon_mull(benchmark::State & state,const char * net)772   static void qs8_gemm_4x16c2s4__neon_mull(benchmark::State& state, const char* net) {
773     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mull, 4, 16, 2, 4,
774       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
775   }
qs8_gemm_1x8c2s4__neon_mlal(benchmark::State & state,const char * net)776   static void qs8_gemm_1x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
777     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2s4__neon_mlal, 1, 8, 2, 4,
778       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
779   }
qs8_gemm_2x8c2s4__neon_mlal(benchmark::State & state,const char * net)780   static void qs8_gemm_2x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
781     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c2s4__neon_mlal, 2, 8, 2, 4,
782       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
783   }
qs8_gemm_3x8c2s4__neon_mlal(benchmark::State & state,const char * net)784   static void qs8_gemm_3x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
785     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c2s4__neon_mlal, 3, 8, 2, 4,
786       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
787   }
qs8_gemm_4x8c2s4__neon_mlal(benchmark::State & state,const char * net)788   static void qs8_gemm_4x8c2s4__neon_mlal(benchmark::State& state, const char* net) {
789     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c2s4__neon_mlal, 4, 8, 2, 4,
790       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
791   }
qs8_gemm_1x16c2s4__neon_mlal(benchmark::State & state,const char * net)792   static void qs8_gemm_1x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
793     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c2s4__neon_mlal, 1, 16, 2, 4,
794       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
795   }
qs8_gemm_2x16c2s4__neon_mlal(benchmark::State & state,const char * net)796   static void qs8_gemm_2x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
797     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c2s4__neon_mlal, 2, 16, 2, 4,
798       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
799   }
qs8_gemm_3x16c2s4__neon_mlal(benchmark::State & state,const char * net)800   static void qs8_gemm_3x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
801     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c2s4__neon_mlal, 3, 16, 2, 4,
802       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
803   }
qs8_gemm_4x16c2s4__neon_mlal(benchmark::State & state,const char * net)804   static void qs8_gemm_4x16c2s4__neon_mlal(benchmark::State& state, const char* net) {
805     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c2s4__neon_mlal, 4, 16, 2, 4,
806       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
807   }
qs8_gemm_1x8c4__neon_mull_dup(benchmark::State & state,const char * net)808   static void qs8_gemm_1x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
809     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_dup, 1, 8, 4, 1,
810       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
811   }
qs8_gemm_2x8c4__neon_mull_dup(benchmark::State & state,const char * net)812   static void qs8_gemm_2x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
813     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_dup, 2, 8, 4, 1,
814       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
815   }
qs8_gemm_3x8c4__neon_mull_dup(benchmark::State & state,const char * net)816   static void qs8_gemm_3x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
817     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_dup, 3, 8, 4, 1,
818       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
819   }
qs8_gemm_4x8c4__neon_mull_dup(benchmark::State & state,const char * net)820   static void qs8_gemm_4x8c4__neon_mull_dup(benchmark::State& state, const char* net) {
821     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_dup, 4, 8, 4, 1,
822       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
823   }
qs8_gemm_1x16c4__neon_mull_dup(benchmark::State & state,const char * net)824   static void qs8_gemm_1x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
825     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_dup, 1, 16, 4, 1,
826       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
827   }
qs8_gemm_2x16c4__neon_mull_dup(benchmark::State & state,const char * net)828   static void qs8_gemm_2x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
829     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_dup, 2, 16, 4, 1,
830       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
831   }
qs8_gemm_3x16c4__neon_mull_dup(benchmark::State & state,const char * net)832   static void qs8_gemm_3x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
833     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_dup, 3, 16, 4, 1,
834       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
835   }
qs8_gemm_4x16c4__neon_mull_dup(benchmark::State & state,const char * net)836   static void qs8_gemm_4x16c4__neon_mull_dup(benchmark::State& state, const char* net) {
837     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_dup, 4, 16, 4, 1,
838       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
839   }
qs8_gemm_1x8c4__neon_mlal_dup(benchmark::State & state,const char * net)840   static void qs8_gemm_1x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
841     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_dup, 1, 8, 4, 1,
842       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
843   }
qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State & state,const char * net)844   static void qs8_gemm_2x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
845     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_dup, 2, 8, 4, 1,
846       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
847   }
qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State & state,const char * net)848   static void qs8_gemm_3x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
849     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_dup, 3, 8, 4, 1,
850       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
851   }
qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State & state,const char * net)852   static void qs8_gemm_4x8c4__neon_mlal_dup(benchmark::State& state, const char* net) {
853     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_dup, 4, 8, 4, 1,
854       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
855   }
qs8_gemm_1x16c4__neon_mlal_dup(benchmark::State & state,const char * net)856   static void qs8_gemm_1x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
857     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_dup, 1, 16, 4, 1,
858       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
859   }
qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State & state,const char * net)860   static void qs8_gemm_2x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
861     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_dup, 2, 16, 4, 1,
862       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
863   }
qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State & state,const char * net)864   static void qs8_gemm_3x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
865     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_dup, 3, 16, 4, 1,
866       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
867   }
qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State & state,const char * net)868   static void qs8_gemm_4x16c4__neon_mlal_dup(benchmark::State& state, const char* net) {
869     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_dup, 4, 16, 4, 1,
870       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
871   }
qs8_gemm_1x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)872   static void qs8_gemm_1x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
873     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld1r, 1, 8, 4, 1,
874       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
875   }
qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)876   static void qs8_gemm_2x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
877     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld1r, 2, 8, 4, 1,
878       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
879   }
qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)880   static void qs8_gemm_3x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
881     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld1r, 3, 8, 4, 1,
882       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
883   }
qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State & state,const char * net)884   static void qs8_gemm_4x8c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
885     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld1r, 4, 8, 4, 1,
886       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
887   }
qs8_gemm_1x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)888   static void qs8_gemm_1x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
889     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld1r, 1, 16, 4, 1,
890       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
891   }
qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)892   static void qs8_gemm_2x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
893     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld1r, 2, 16, 4, 1,
894       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
895   }
qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)896   static void qs8_gemm_3x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
897     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld1r, 3, 16, 4, 1,
898       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
899   }
qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State & state,const char * net)900   static void qs8_gemm_4x16c4__neon_mull_ld1r(benchmark::State& state, const char* net) {
901     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld1r, 4, 16, 4, 1,
902       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
903   }
qs8_gemm_1x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)904   static void qs8_gemm_1x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
905     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld1r, 1, 8, 4, 1,
906       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
907   }
qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)908   static void qs8_gemm_2x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
909     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld1r, 2, 8, 4, 1,
910       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
911   }
qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)912   static void qs8_gemm_3x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
913     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld1r, 3, 8, 4, 1,
914       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
915   }
qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State & state,const char * net)916   static void qs8_gemm_4x8c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
917     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld1r, 4, 8, 4, 1,
918       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
919   }
qs8_gemm_1x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)920   static void qs8_gemm_1x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
921     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld1r, 1, 16, 4, 1,
922       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
923   }
qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)924   static void qs8_gemm_2x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
925     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld1r, 2, 16, 4, 1,
926       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
927   }
qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)928   static void qs8_gemm_3x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
929     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld1r, 3, 16, 4, 1,
930       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
931   }
qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State & state,const char * net)932   static void qs8_gemm_4x16c4__neon_mlal_ld1r(benchmark::State& state, const char* net) {
933     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld1r, 4, 16, 4, 1,
934       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
935   }
qs8_gemm_1x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)936   static void qs8_gemm_1x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
937     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mull_ld2r, 1, 8, 4, 1,
938       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
939   }
qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)940   static void qs8_gemm_2x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
941     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mull_ld2r, 2, 8, 4, 1,
942       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
943   }
qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)944   static void qs8_gemm_3x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
945     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mull_ld2r, 3, 8, 4, 1,
946       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
947   }
qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State & state,const char * net)948   static void qs8_gemm_4x8c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
949     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mull_ld2r, 4, 8, 4, 1,
950       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
951   }
qs8_gemm_1x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)952   static void qs8_gemm_1x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
953     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mull_ld2r, 1, 16, 4, 1,
954       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
955   }
qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)956   static void qs8_gemm_2x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
957     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mull_ld2r, 2, 16, 4, 1,
958       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
959   }
qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)960   static void qs8_gemm_3x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
961     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mull_ld2r, 3, 16, 4, 1,
962       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
963   }
qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State & state,const char * net)964   static void qs8_gemm_4x16c4__neon_mull_ld2r(benchmark::State& state, const char* net) {
965     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mull_ld2r, 4, 16, 4, 1,
966       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
967   }
qs8_gemm_1x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)968   static void qs8_gemm_1x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
969     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neon_mlal_ld2r, 1, 8, 4, 1,
970       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
971   }
qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)972   static void qs8_gemm_2x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
973     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c4__neon_mlal_ld2r, 2, 8, 4, 1,
974       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
975   }
qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)976   static void qs8_gemm_3x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
977     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c4__neon_mlal_ld2r, 3, 8, 4, 1,
978       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
979   }
qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State & state,const char * net)980   static void qs8_gemm_4x8c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
981     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neon_mlal_ld2r, 4, 8, 4, 1,
982       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
983   }
qs8_gemm_1x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)984   static void qs8_gemm_1x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
985     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neon_mlal_ld2r, 1, 16, 4, 1,
986       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
987   }
qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)988   static void qs8_gemm_2x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
989     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c4__neon_mlal_ld2r, 2, 16, 4, 1,
990       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
991   }
qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)992   static void qs8_gemm_3x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
993     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c4__neon_mlal_ld2r, 3, 16, 4, 1,
994       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
995   }
qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State & state,const char * net)996   static void qs8_gemm_4x16c4__neon_mlal_ld2r(benchmark::State& state, const char* net) {
997     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neon_mlal_ld2r, 4, 16, 4, 1,
998       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
999   }
qs8_gemm_1x8c8__neon_mull(benchmark::State & state,const char * net)1000   static void qs8_gemm_1x8c8__neon_mull(benchmark::State& state, const char* net) {
1001     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mull, 1, 8, 8, 1,
1002       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1003   }
qs8_gemm_2x8c8__neon_mull(benchmark::State & state,const char * net)1004   static void qs8_gemm_2x8c8__neon_mull(benchmark::State& state, const char* net) {
1005     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mull, 2, 8, 8, 1,
1006       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1007   }
qs8_gemm_3x8c8__neon_mull(benchmark::State & state,const char * net)1008   static void qs8_gemm_3x8c8__neon_mull(benchmark::State& state, const char* net) {
1009     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mull, 3, 8, 8, 1,
1010       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1011   }
qs8_gemm_4x8c8__neon_mull(benchmark::State & state,const char * net)1012   static void qs8_gemm_4x8c8__neon_mull(benchmark::State& state, const char* net) {
1013     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mull, 4, 8, 8, 1,
1014       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1015   }
qs8_gemm_1x16c8__neon_mull(benchmark::State & state,const char * net)1016   static void qs8_gemm_1x16c8__neon_mull(benchmark::State& state, const char* net) {
1017     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mull, 1, 16, 8, 1,
1018       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1019   }
qs8_gemm_2x16c8__neon_mull(benchmark::State & state,const char * net)1020   static void qs8_gemm_2x16c8__neon_mull(benchmark::State& state, const char* net) {
1021     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mull, 2, 16, 8, 1,
1022       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1023   }
qs8_gemm_3x16c8__neon_mull(benchmark::State & state,const char * net)1024   static void qs8_gemm_3x16c8__neon_mull(benchmark::State& state, const char* net) {
1025     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mull, 3, 16, 8, 1,
1026       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1027   }
qs8_gemm_4x16c8__neon_mull(benchmark::State & state,const char * net)1028   static void qs8_gemm_4x16c8__neon_mull(benchmark::State& state, const char* net) {
1029     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mull, 4, 16, 8, 1,
1030       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1031   }
qs8_gemm_1x8c8__neon_mlal(benchmark::State & state,const char * net)1032   static void qs8_gemm_1x8c8__neon_mlal(benchmark::State& state, const char* net) {
1033     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__neon_mlal, 1, 8, 8, 1,
1034       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1035   }
qs8_gemm_2x8c8__neon_mlal(benchmark::State & state,const char * net)1036   static void qs8_gemm_2x8c8__neon_mlal(benchmark::State& state, const char* net) {
1037     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__neon_mlal, 2, 8, 8, 1,
1038       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1039   }
qs8_gemm_3x8c8__neon_mlal(benchmark::State & state,const char * net)1040   static void qs8_gemm_3x8c8__neon_mlal(benchmark::State& state, const char* net) {
1041     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c8__neon_mlal, 3, 8, 8, 1,
1042       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1043   }
qs8_gemm_4x8c8__neon_mlal(benchmark::State & state,const char * net)1044   static void qs8_gemm_4x8c8__neon_mlal(benchmark::State& state, const char* net) {
1045     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c8__neon_mlal, 4, 8, 8, 1,
1046       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1047   }
qs8_gemm_1x16c8__neon_mlal(benchmark::State & state,const char * net)1048   static void qs8_gemm_1x16c8__neon_mlal(benchmark::State& state, const char* net) {
1049     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c8__neon_mlal, 1, 16, 8, 1,
1050       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1051   }
qs8_gemm_2x16c8__neon_mlal(benchmark::State & state,const char * net)1052   static void qs8_gemm_2x16c8__neon_mlal(benchmark::State& state, const char* net) {
1053     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c8__neon_mlal, 2, 16, 8, 1,
1054       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1055   }
qs8_gemm_3x16c8__neon_mlal(benchmark::State & state,const char * net)1056   static void qs8_gemm_3x16c8__neon_mlal(benchmark::State& state, const char* net) {
1057     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c8__neon_mlal, 3, 16, 8, 1,
1058       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1059   }
qs8_gemm_4x16c8__neon_mlal(benchmark::State & state,const char * net)1060   static void qs8_gemm_4x16c8__neon_mlal(benchmark::State& state, const char* net) {
1061     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c8__neon_mlal, 4, 16, 8, 1,
1062       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1063   }
qs8_gemm_1x8c16__neon_mlal(benchmark::State & state,const char * net)1064   static void qs8_gemm_1x8c16__neon_mlal(benchmark::State& state, const char* net) {
1065     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c16__neon_mlal, 1, 8, 16, 1,
1066       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1067   }
qs8_gemm_2x8c16__neon_mlal(benchmark::State & state,const char * net)1068   static void qs8_gemm_2x8c16__neon_mlal(benchmark::State& state, const char* net) {
1069     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c16__neon_mlal, 2, 8, 16, 1,
1070       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1071   }
qs8_gemm_3x8c16__neon_mlal(benchmark::State & state,const char * net)1072   static void qs8_gemm_3x8c16__neon_mlal(benchmark::State& state, const char* net) {
1073     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x8c16__neon_mlal, 3, 8, 16, 1,
1074       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1075   }
qs8_gemm_4x8c16__neon_mlal(benchmark::State & state,const char * net)1076   static void qs8_gemm_4x8c16__neon_mlal(benchmark::State& state, const char* net) {
1077     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c16__neon_mlal, 4, 8, 16, 1,
1078       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1079   }
qs8_gemm_1x16c16__neon_mlal(benchmark::State & state,const char * net)1080   static void qs8_gemm_1x16c16__neon_mlal(benchmark::State& state, const char* net) {
1081     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c16__neon_mlal, 1, 16, 16, 1,
1082       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1083   }
qs8_gemm_2x16c16__neon_mlal(benchmark::State & state,const char * net)1084   static void qs8_gemm_2x16c16__neon_mlal(benchmark::State& state, const char* net) {
1085     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_2x16c16__neon_mlal, 2, 16, 16, 1,
1086       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1087   }
qs8_gemm_3x16c16__neon_mlal(benchmark::State & state,const char * net)1088   static void qs8_gemm_3x16c16__neon_mlal(benchmark::State& state, const char* net) {
1089     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_3x16c16__neon_mlal, 3, 16, 16, 1,
1090       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1091   }
qs8_gemm_4x16c16__neon_mlal(benchmark::State & state,const char * net)1092   static void qs8_gemm_4x16c16__neon_mlal(benchmark::State& state, const char* net) {
1093     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c16__neon_mlal, 4, 16, 16, 1,
1094       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
1095   }
qs8_gemm_1x8c4__neondot(benchmark::State & state,const char * net)1096   static void qs8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
1097     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c4__neondot, 1, 8, 4, 1,
1098       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1099   }
qs8_gemm_4x8c4__neondot(benchmark::State & state,const char * net)1100   static void qs8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
1101     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8c4__neondot, 4, 8, 4, 1,
1102       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1103   }
qs8_gemm_6x8c4__neondot(benchmark::State & state,const char * net)1104   static void qs8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
1105     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x8c4__neondot, 6, 8, 4, 1,
1106       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1107   }
qs8_gemm_8x8c4__neondot(benchmark::State & state,const char * net)1108   static void qs8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
1109     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x8c4__neondot, 8, 8, 4, 1,
1110       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1111   }
qs8_gemm_1x16c4__neondot(benchmark::State & state,const char * net)1112   static void qs8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
1113     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__neondot, 1, 16, 4, 1,
1114       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1115   }
qs8_gemm_4x16c4__neondot(benchmark::State & state,const char * net)1116   static void qs8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
1117     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__neondot, 4, 16, 4, 1,
1118       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1119   }
qs8_gemm_6x16c4__neondot(benchmark::State & state,const char * net)1120   static void qs8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
1121     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_6x16c4__neondot, 6, 16, 4, 1,
1122       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1123   }
qs8_gemm_8x16c4__neondot(benchmark::State & state,const char * net)1124   static void qs8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
1125     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_8x16c4__neondot, 8, 16, 4, 1,
1126       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
1127   }
// Benchmark registrations: one BENCHMARK_GEMM invocation per wrapper function.
// NOTE(review): BENCHMARK_GEMM is defined outside this view (presumably in
// bench/gemm.h) — confirm which arguments it registers for each wrapper.
// Several wrappers named below (e.g. the c2, c2s4 and mlal_lane families) are
// also defined outside this view.

// c4 kernels: dup, ld1r and ld2r variants (mull and mlal).
1128   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_dup)
BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_dup)1129   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_dup)
1130   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_dup)
1131   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_dup)
1132   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_dup)
1133   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_dup)
1134   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_dup)
1135   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_dup)
1136   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_dup)
1137   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_dup)
1138   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_dup)
1139   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_dup)
1140   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_dup)
1141   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_dup)
1142   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_dup)
1143   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_dup)
1144   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_ld1r)
1145   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_ld1r)
1146   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_ld1r)
1147   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_ld1r)
1148   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_ld1r)
1149   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_ld1r)
1150   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_ld1r)
1151   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_ld1r)
1152   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_ld1r)
1153   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_ld1r)
1154   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_ld1r)
1155   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_ld1r)
1156   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_ld1r)
1157   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_ld1r)
1158   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_ld1r)
1159   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_ld1r)
1160   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mull_ld2r)
1161   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mull_ld2r)
1162   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mull_ld2r)
1163   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mull_ld2r)
1164   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mull_ld2r)
1165   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mull_ld2r)
1166   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mull_ld2r)
1167   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mull_ld2r)
1168   BENCHMARK_GEMM(qs8_gemm_1x8c4__neon_mlal_ld2r)
1169   BENCHMARK_GEMM(qs8_gemm_2x8c4__neon_mlal_ld2r)
1170   BENCHMARK_GEMM(qs8_gemm_3x8c4__neon_mlal_ld2r)
1171   BENCHMARK_GEMM(qs8_gemm_4x8c4__neon_mlal_ld2r)
1172   BENCHMARK_GEMM(qs8_gemm_1x16c4__neon_mlal_ld2r)
1173   BENCHMARK_GEMM(qs8_gemm_2x16c4__neon_mlal_ld2r)
1174   BENCHMARK_GEMM(qs8_gemm_3x16c4__neon_mlal_ld2r)
1175   BENCHMARK_GEMM(qs8_gemm_4x16c4__neon_mlal_ld2r)
// c2 kernels: dup, ld1r, ld2r and ld4r variants (mull and mlal).
1176   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_dup)
1177   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_dup)
1178   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_dup)
1179   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_dup)
1180   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_dup)
1181   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_dup)
1182   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_dup)
1183   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_dup)
1184   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_dup)
1185   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_dup)
1186   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_dup)
1187   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_dup)
1188   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_dup)
1189   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_dup)
1190   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_dup)
1191   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_dup)
1192   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld1r)
1193   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld1r)
1194   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld1r)
1195   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld1r)
1196   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld1r)
1197   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld1r)
1198   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld1r)
1199   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld1r)
1200   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld1r)
1201   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld1r)
1202   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld1r)
1203   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld1r)
1204   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld1r)
1205   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld1r)
1206   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld1r)
1207   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld1r)
1208   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld2r)
1209   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld2r)
1210   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld2r)
1211   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld2r)
1212   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld2r)
1213   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld2r)
1214   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld2r)
1215   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld2r)
1216   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld2r)
1217   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld2r)
1218   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld2r)
1219   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld2r)
1220   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld2r)
1221   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld2r)
1222   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld2r)
1223   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld2r)
1224   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mull_ld4r)
1225   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mull_ld4r)
1226   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mull_ld4r)
1227   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mull_ld4r)
1228   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mull_ld4r)
1229   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mull_ld4r)
1230   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mull_ld4r)
1231   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mull_ld4r)
1232   BENCHMARK_GEMM(qs8_gemm_1x8c2__neon_mlal_ld4r)
1233   BENCHMARK_GEMM(qs8_gemm_2x8c2__neon_mlal_ld4r)
1234   BENCHMARK_GEMM(qs8_gemm_3x8c2__neon_mlal_ld4r)
1235   BENCHMARK_GEMM(qs8_gemm_4x8c2__neon_mlal_ld4r)
1236   BENCHMARK_GEMM(qs8_gemm_1x16c2__neon_mlal_ld4r)
1237   BENCHMARK_GEMM(qs8_gemm_2x16c2__neon_mlal_ld4r)
1238   BENCHMARK_GEMM(qs8_gemm_3x16c2__neon_mlal_ld4r)
1239   BENCHMARK_GEMM(qs8_gemm_4x16c2__neon_mlal_ld4r)
// c2s4 kernels (mull and mlal).
1240   BENCHMARK_GEMM(qs8_gemm_1x8c2s4__neon_mull)
1241   BENCHMARK_GEMM(qs8_gemm_2x8c2s4__neon_mull)
1242   BENCHMARK_GEMM(qs8_gemm_3x8c2s4__neon_mull)
1243   BENCHMARK_GEMM(qs8_gemm_4x8c2s4__neon_mull)
1244   BENCHMARK_GEMM(qs8_gemm_1x16c2s4__neon_mull)
1245   BENCHMARK_GEMM(qs8_gemm_2x16c2s4__neon_mull)
1246   BENCHMARK_GEMM(qs8_gemm_3x16c2s4__neon_mull)
1247   BENCHMARK_GEMM(qs8_gemm_4x16c2s4__neon_mull)
1248   BENCHMARK_GEMM(qs8_gemm_1x8c2s4__neon_mlal)
1249   BENCHMARK_GEMM(qs8_gemm_2x8c2s4__neon_mlal)
1250   BENCHMARK_GEMM(qs8_gemm_3x8c2s4__neon_mlal)
1251   BENCHMARK_GEMM(qs8_gemm_4x8c2s4__neon_mlal)
1252   BENCHMARK_GEMM(qs8_gemm_1x16c2s4__neon_mlal)
1253   BENCHMARK_GEMM(qs8_gemm_2x16c2s4__neon_mlal)
1254   BENCHMARK_GEMM(qs8_gemm_3x16c2s4__neon_mlal)
1255   BENCHMARK_GEMM(qs8_gemm_4x16c2s4__neon_mlal)
// mlal_lane kernels, without and with the _prfm suffix.
1256   BENCHMARK_GEMM(qs8_gemm_1x8__neon_mlal_lane)
1257   BENCHMARK_GEMM(qs8_gemm_2x8__neon_mlal_lane)
1258   BENCHMARK_GEMM(qs8_gemm_3x8__neon_mlal_lane)
1259   BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane)
1260   BENCHMARK_GEMM(qs8_gemm_6x8__neon_mlal_lane)
1261   BENCHMARK_GEMM(qs8_gemm_1x16__neon_mlal_lane)
1262   BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane)
1263   BENCHMARK_GEMM(qs8_gemm_3x16__neon_mlal_lane)
1264   BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane)
1265   BENCHMARK_GEMM(qs8_gemm_6x16__neon_mlal_lane)
1266   BENCHMARK_GEMM(qs8_gemm_1x8__neon_mlal_lane_prfm)
1267   BENCHMARK_GEMM(qs8_gemm_2x8__neon_mlal_lane_prfm)
1268   BENCHMARK_GEMM(qs8_gemm_3x8__neon_mlal_lane_prfm)
1269   BENCHMARK_GEMM(qs8_gemm_4x8__neon_mlal_lane_prfm)
1270   BENCHMARK_GEMM(qs8_gemm_6x8__neon_mlal_lane_prfm)
1271   BENCHMARK_GEMM(qs8_gemm_1x16__neon_mlal_lane_prfm)
1272   BENCHMARK_GEMM(qs8_gemm_2x16__neon_mlal_lane_prfm)
1273   BENCHMARK_GEMM(qs8_gemm_3x16__neon_mlal_lane_prfm)
1274   BENCHMARK_GEMM(qs8_gemm_4x16__neon_mlal_lane_prfm)
1275   BENCHMARK_GEMM(qs8_gemm_6x16__neon_mlal_lane_prfm)
// c8 kernels (mull, then mlal).
1276   BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mull)
1277   BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mull)
1278   BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mull)
1279   BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mull)
1280   BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mull)
1281   BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mull)
1282   BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mull)
1283   BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mull)
1284   BENCHMARK_GEMM(qs8_gemm_1x8c8__neon_mlal)
1285   BENCHMARK_GEMM(qs8_gemm_2x8c8__neon_mlal)
1286   BENCHMARK_GEMM(qs8_gemm_3x8c8__neon_mlal)
1287   BENCHMARK_GEMM(qs8_gemm_4x8c8__neon_mlal)
1288   BENCHMARK_GEMM(qs8_gemm_1x16c8__neon_mlal)
1289   BENCHMARK_GEMM(qs8_gemm_2x16c8__neon_mlal)
1290   BENCHMARK_GEMM(qs8_gemm_3x16c8__neon_mlal)
1291   BENCHMARK_GEMM(qs8_gemm_4x16c8__neon_mlal)
1292   BENCHMARK_GEMM(qs8_gemm_1x8c16__neon_mlal)
1293   BENCHMARK_GEMM(qs8_gemm_2x8c16__neon_mlal)
1294   BENCHMARK_GEMM(qs8_gemm_3x8c16__neon_mlal)
1295   BENCHMARK_GEMM(qs8_gemm_4x8c16__neon_mlal)
1296   BENCHMARK_GEMM(qs8_gemm_1x16c16__neon_mlal)
1297   BENCHMARK_GEMM(qs8_gemm_2x16c16__neon_mlal)
1298   BENCHMARK_GEMM(qs8_gemm_3x16c16__neon_mlal)
1299   BENCHMARK_GEMM(qs8_gemm_4x16c16__neon_mlal)
1300 
1301   BENCHMARK_GEMM(qs8_gemm_1x8c4__neondot)
1302   BENCHMARK_GEMM(qs8_gemm_4x8c4__neondot)
1303   BENCHMARK_GEMM(qs8_gemm_6x8c4__neondot)
1304   BENCHMARK_GEMM(qs8_gemm_8x8c4__neondot)
1305   BENCHMARK_GEMM(qs8_gemm_1x16c4__neondot)
1306   BENCHMARK_GEMM(qs8_gemm_4x16c4__neondot)
1307   BENCHMARK_GEMM(qs8_gemm_6x16c4__neondot)
1308   BENCHMARK_GEMM(qs8_gemm_8x16c4__neondot)
1309 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1310 
1311 
1312 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
1313   static void qs8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
1314     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx, 2, 16, 8, 1,
1315       xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1316   }
qs8_gemm_3x16c8__avx512skx(benchmark::State & state,const char * net)1317   static void qs8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
1318     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx, 3, 16, 8, 1,
1319       xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1320   }
qs8_gemm_4x16c8__avx512skx(benchmark::State & state,const char * net)1321   static void qs8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
1322     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx, 4, 16, 8, 1,
1323       xnn_init_qs8_conv_minmax_fp32_avx512_params, benchmark::utils::CheckAVX512SKX);
1324   }
1325 
qs8_gemm_2x8c8__avx2(benchmark::State & state,const char * net)1326   static void qs8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
1327     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
1328       xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
1329   }
qs8_gemm_3x8c8__avx2(benchmark::State & state,const char * net)1330   static void qs8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
1331     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
1332       xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
1333   }
1334 
qs8_gemm_xw_2x8c8__avx2(benchmark::State & state,const char * net)1335   static void qs8_gemm_xw_2x8c8__avx2(benchmark::State& state, const char* net) {
1336     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
1337       xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
1338   }
qs8_gemm_xw_3x8c8__avx2(benchmark::State & state,const char * net)1339   static void qs8_gemm_xw_3x8c8__avx2(benchmark::State& state, const char* net) {
1340     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
1341       xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
1342   }
1343 
qs8_gemm_2x4c2__xop_ld64(benchmark::State & state,const char * net)1344   static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
1345     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64, 2, 4, 2, 1,
1346       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1347   }
qs8_gemm_3x4c2__xop_ld64(benchmark::State & state,const char * net)1348   static void qs8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
1349     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64, 3, 4, 2, 1,
1350       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1351   }
qs8_gemm_4x4c2__xop_ld64(benchmark::State & state,const char * net)1352   static void qs8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
1353     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64, 4, 4, 2, 1,
1354       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1355   }
1356 
qs8_gemm_2x4c2__xop_ld128(benchmark::State & state,const char * net)1357   static void qs8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
1358     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128, 2, 4, 2, 1,
1359       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1360   }
qs8_gemm_3x4c2__xop_ld128(benchmark::State & state,const char * net)1361   static void qs8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
1362     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128, 3, 4, 2, 1,
1363       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1364   }
qs8_gemm_4x4c2__xop_ld128(benchmark::State & state,const char * net)1365   static void qs8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
1366     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128, 4, 4, 2, 1,
1367       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1368   }
1369 
qs8_gemm_xw_2x4c2__xop(benchmark::State & state,const char * net)1370   static void qs8_gemm_xw_2x4c2__xop(benchmark::State& state, const char* net) {
1371     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, 2, 4, 2, 1,
1372       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1373   }
qs8_gemm_xw_3x4c2__xop(benchmark::State & state,const char * net)1374   static void qs8_gemm_xw_3x4c2__xop(benchmark::State& state, const char* net) {
1375     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, 3, 4, 2, 1,
1376       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1377   }
qs8_gemm_xw_4x4c2__xop(benchmark::State & state,const char * net)1378   static void qs8_gemm_xw_4x4c2__xop(benchmark::State& state, const char* net) {
1379     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, 4, 4, 2, 1,
1380       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1381   }
1382 
qs8_gemm_2x4c8__xop_ld64(benchmark::State & state,const char * net)1383   static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
1384     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64, 2, 4, 8, 1,
1385       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1386   }
qs8_gemm_3x4c8__xop_ld64(benchmark::State & state,const char * net)1387   static void qs8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
1388     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64, 3, 4, 8, 1,
1389       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1390   }
1391 
qs8_gemm_2x4c8__xop_ld128(benchmark::State & state,const char * net)1392   static void qs8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
1393     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128, 2, 4, 8, 1,
1394       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1395   }
qs8_gemm_3x4c8__xop_ld128(benchmark::State & state,const char * net)1396   static void qs8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
1397     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128, 3, 4, 8, 1,
1398       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
1399   }
1400 
qs8_gemm_xw_2x4c8__xop(benchmark::State & state,const char * net)1401   static void qs8_gemm_xw_2x4c8__xop(benchmark::State& state, const char* net) {
1402     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, 2, 4, 8, 1,
1403       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1404   }
qs8_gemm_xw_3x4c8__xop(benchmark::State & state,const char * net)1405   static void qs8_gemm_xw_3x4c8__xop(benchmark::State& state, const char* net) {
1406     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, 3, 4, 8, 1,
1407       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
1408   }
1409 
qs8_gemm_2x4c2__avx_ld64(benchmark::State & state,const char * net)1410   static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
1411     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64, 2, 4, 2, 1,
1412       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1413   }
qs8_gemm_3x4c2__avx_ld64(benchmark::State & state,const char * net)1414   static void qs8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
1415     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64, 3, 4, 2, 1,
1416       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1417   }
qs8_gemm_4x4c2__avx_ld64(benchmark::State & state,const char * net)1418   static void qs8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
1419     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64, 4, 4, 2, 1,
1420       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1421   }
1422 
qs8_gemm_2x4c2__avx_ld128(benchmark::State & state,const char * net)1423   static void qs8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
1424     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128, 2, 4, 2, 1,
1425       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1426   }
qs8_gemm_3x4c2__avx_ld128(benchmark::State & state,const char * net)1427   static void qs8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
1428     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128, 3, 4, 2, 1,
1429       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1430   }
qs8_gemm_4x4c2__avx_ld128(benchmark::State & state,const char * net)1431   static void qs8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
1432     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128, 4, 4, 2, 1,
1433       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1434   }
1435 
qs8_gemm_xw_2x4c2__avx(benchmark::State & state,const char * net)1436   static void qs8_gemm_xw_2x4c2__avx(benchmark::State& state, const char* net) {
1437     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, 2, 4, 2, 1,
1438       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1439   }
qs8_gemm_xw_3x4c2__avx(benchmark::State & state,const char * net)1440   static void qs8_gemm_xw_3x4c2__avx(benchmark::State& state, const char* net) {
1441     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, 3, 4, 2, 1,
1442       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1443   }
qs8_gemm_xw_4x4c2__avx(benchmark::State & state,const char * net)1444   static void qs8_gemm_xw_4x4c2__avx(benchmark::State& state, const char* net) {
1445     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, 4, 4, 2, 1,
1446       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1447   }
1448 
qs8_gemm_2x4c8__avx_ld64(benchmark::State & state,const char * net)1449   static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
1450     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64, 2, 4, 8, 1,
1451       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1452   }
qs8_gemm_3x4c8__avx_ld64(benchmark::State & state,const char * net)1453   static void qs8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
1454     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64, 3, 4, 8, 1,
1455       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1456   }
1457 
qs8_gemm_2x4c8__avx_ld128(benchmark::State & state,const char * net)1458   static void qs8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
1459     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128, 2, 4, 8, 1,
1460       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1461   }
qs8_gemm_3x4c8__avx_ld128(benchmark::State & state,const char * net)1462   static void qs8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
1463     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128, 3, 4, 8, 1,
1464       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
1465   }
1466 
qs8_gemm_xw_2x4c8__avx(benchmark::State & state,const char * net)1467   static void qs8_gemm_xw_2x4c8__avx(benchmark::State& state, const char* net) {
1468     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, 2, 4, 8, 1,
1469       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1470   }
qs8_gemm_xw_3x4c8__avx(benchmark::State & state,const char * net)1471   static void qs8_gemm_xw_3x4c8__avx(benchmark::State& state, const char* net) {
1472     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, 3, 4, 8, 1,
1473       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
1474   }
1475 
qs8_gemm_2x4c2__sse41_ld64(benchmark::State & state,const char * net)1476   static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1477     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64, 2, 4, 2, 1,
1478       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1479   }
qs8_gemm_3x4c2__sse41_ld64(benchmark::State & state,const char * net)1480   static void qs8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1481     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64, 3, 4, 2, 1,
1482       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1483   }
qs8_gemm_4x4c2__sse41_ld64(benchmark::State & state,const char * net)1484   static void qs8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
1485     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64, 4, 4, 2, 1,
1486       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1487   }
1488 
qs8_gemm_2x4c2__sse41_ld128(benchmark::State & state,const char * net)1489   static void qs8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
1490     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128, 2, 4, 2, 1,
1491       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1492   }
qs8_gemm_3x4c2__sse41_ld128(benchmark::State & state,const char * net)1493   static void qs8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
1494     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128, 3, 4, 2, 1,
1495       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1496   }
qs8_gemm_4x4c2__sse41_ld128(benchmark::State & state,const char * net)1497   static void qs8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
1498     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128, 4, 4, 2, 1,
1499       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1500   }
1501 
qs8_gemm_xw_2x4c2__sse41(benchmark::State & state,const char * net)1502   static void qs8_gemm_xw_2x4c2__sse41(benchmark::State& state, const char* net) {
1503     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, 2, 4, 2, 1,
1504       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1505   }
qs8_gemm_xw_3x4c2__sse41(benchmark::State & state,const char * net)1506   static void qs8_gemm_xw_3x4c2__sse41(benchmark::State& state, const char* net) {
1507     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, 3, 4, 2, 1,
1508       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1509   }
qs8_gemm_xw_4x4c2__sse41(benchmark::State & state,const char * net)1510   static void qs8_gemm_xw_4x4c2__sse41(benchmark::State& state, const char* net) {
1511     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, 4, 4, 2, 1,
1512       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1513   }
1514 
qs8_gemm_2x4c8__sse41_ld64(benchmark::State & state,const char * net)1515   static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
1516     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64, 2, 4, 8, 1,
1517       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1518   }
qs8_gemm_3x4c8__sse41_ld64(benchmark::State & state,const char * net)1519   static void qs8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
1520     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64, 3, 4, 8, 1,
1521       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1522   }
1523 
qs8_gemm_2x4c8__sse41_ld128(benchmark::State & state,const char * net)1524   static void qs8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
1525     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128, 2, 4, 8, 1,
1526       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1527   }
qs8_gemm_3x4c8__sse41_ld128(benchmark::State & state,const char * net)1528   static void qs8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
1529     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128, 3, 4, 8, 1,
1530       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
1531   }
1532 
qs8_gemm_xw_2x4c8__sse41(benchmark::State & state,const char * net)1533   static void qs8_gemm_xw_2x4c8__sse41(benchmark::State& state, const char* net) {
1534     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, 2, 4, 8, 1,
1535       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1536   }
qs8_gemm_xw_3x4c8__sse41(benchmark::State & state,const char * net)1537   static void qs8_gemm_xw_3x4c8__sse41(benchmark::State& state, const char* net) {
1538     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, 3, 4, 8, 1,
1539       xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
1540   }
1541 
qs8_gemm_2x4c8__ssse3_ld64(benchmark::State & state,const char * net)1542   static void qs8_gemm_2x4c8__ssse3_ld64(benchmark::State& state, const char* net) {
1543     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld64, 2, 4, 8, 1,
1544       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1545   }
qs8_gemm_3x4c8__ssse3_ld64(benchmark::State & state,const char * net)1546   static void qs8_gemm_3x4c8__ssse3_ld64(benchmark::State& state, const char* net) {
1547     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld64, 3, 4, 8, 1,
1548       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1549   }
1550 
qs8_gemm_2x4c8__ssse3_ld128(benchmark::State & state,const char * net)1551   static void qs8_gemm_2x4c8__ssse3_ld128(benchmark::State& state, const char* net) {
1552     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__ssse3_ld128, 2, 4, 8, 1,
1553       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1554   }
qs8_gemm_3x4c8__ssse3_ld128(benchmark::State & state,const char * net)1555   static void qs8_gemm_3x4c8__ssse3_ld128(benchmark::State& state, const char* net) {
1556     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__ssse3_ld128, 3, 4, 8, 1,
1557       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
1558   }
1559 
qs8_gemm_xw_2x4c8__ssse3(benchmark::State & state,const char * net)1560   static void qs8_gemm_xw_2x4c8__ssse3(benchmark::State& state, const char* net) {
1561     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, 2, 4, 8, 1,
1562       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
1563   }
qs8_gemm_xw_3x4c8__ssse3(benchmark::State & state,const char * net)1564   static void qs8_gemm_xw_3x4c8__ssse3(benchmark::State& state, const char* net) {
1565     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, 3, 4, 8, 1,
1566       xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
1567   }
1568 
qs8_gemm_2x4c2__sse2_ld64(benchmark::State & state,const char * net)1569   static void qs8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1570     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64, 2, 4, 2, 1,
1571       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1572   }
qs8_gemm_3x4c2__sse2_ld64(benchmark::State & state,const char * net)1573   static void qs8_gemm_3x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1574     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64, 3, 4, 2, 1,
1575       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1576   }
qs8_gemm_4x4c2__sse2_ld64(benchmark::State & state,const char * net)1577   static void qs8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
1578     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64, 4, 4, 2, 1,
1579       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1580   }
1581 
qs8_gemm_2x4c2__sse2_ld128(benchmark::State & state,const char * net)1582   static void qs8_gemm_2x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1583     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128, 2, 4, 2, 1,
1584       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1585   }
qs8_gemm_3x4c2__sse2_ld128(benchmark::State & state,const char * net)1586   static void qs8_gemm_3x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1587     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128, 3, 4, 2, 1,
1588       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1589   }
qs8_gemm_4x4c2__sse2_ld128(benchmark::State & state,const char * net)1590   static void qs8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1591     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128, 4, 4, 2, 1,
1592       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1593   }
1594 
qs8_gemm_xw_2x4c2__sse2(benchmark::State & state,const char * net)1595   static void qs8_gemm_xw_2x4c2__sse2(benchmark::State& state, const char* net) {
1596     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, 2, 4, 2, 1,
1597       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1598   }
qs8_gemm_xw_3x4c2__sse2(benchmark::State & state,const char * net)1599   static void qs8_gemm_xw_3x4c2__sse2(benchmark::State& state, const char* net) {
1600     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, 3, 4, 2, 1,
1601       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1602   }
qs8_gemm_xw_4x4c2__sse2(benchmark::State & state,const char * net)1603   static void qs8_gemm_xw_4x4c2__sse2(benchmark::State& state, const char* net) {
1604     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, 4, 4, 2, 1,
1605       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1606   }
1607 
qs8_gemm_2x4c8__sse2_ld64(benchmark::State & state,const char * net)1608   static void qs8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1609     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64, 2, 4, 8, 1,
1610       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1611   }
qs8_gemm_3x4c8__sse2_ld64(benchmark::State & state,const char * net)1612   static void qs8_gemm_3x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1613     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64, 3, 4, 8, 1,
1614       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1615   }
1616 
qs8_gemm_2x4c8__sse2_ld128(benchmark::State & state,const char * net)1617   static void qs8_gemm_2x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1618     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128, 2, 4, 8, 1,
1619       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1620   }
qs8_gemm_3x4c8__sse2_ld128(benchmark::State & state,const char * net)1621   static void qs8_gemm_3x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1622     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128, 3, 4, 8, 1,
1623       xnn_init_qs8_conv_minmax_fp32_sse2_params);
1624   }
1625 
qs8_gemm_xw_2x4c8__sse2(benchmark::State & state,const char * net)1626   static void qs8_gemm_xw_2x4c8__sse2(benchmark::State& state, const char* net) {
1627     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, 2, 4, 8, 1,
1628       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1629   }
qs8_gemm_xw_3x4c8__sse2(benchmark::State & state,const char * net)1630   static void qs8_gemm_xw_3x4c8__sse2(benchmark::State& state, const char* net) {
1631     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, 3, 4, 8, 1,
1632       xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
1633   }
1634 
1635   BENCHMARK_GEMM(qs8_gemm_2x16c8__avx512skx)
BENCHMARK_GEMM(qs8_gemm_3x16c8__avx512skx)1636   BENCHMARK_GEMM(qs8_gemm_3x16c8__avx512skx)
1637   BENCHMARK_GEMM(qs8_gemm_4x16c8__avx512skx)
1638 
1639   BENCHMARK_GEMM(qs8_gemm_2x8c8__avx2)
1640   BENCHMARK_GEMM(qs8_gemm_3x8c8__avx2)
1641   BENCHMARK_GEMM(qs8_gemm_xw_2x8c8__avx2)
1642   BENCHMARK_GEMM(qs8_gemm_xw_3x8c8__avx2)
1643 
1644   BENCHMARK_GEMM(qs8_gemm_2x4c2__xop_ld64)
1645   BENCHMARK_GEMM(qs8_gemm_3x4c2__xop_ld64)
1646   BENCHMARK_GEMM(qs8_gemm_4x4c2__xop_ld64)
1647   BENCHMARK_GEMM(qs8_gemm_2x4c2__xop_ld128)
1648   BENCHMARK_GEMM(qs8_gemm_3x4c2__xop_ld128)
1649   BENCHMARK_GEMM(qs8_gemm_4x4c2__xop_ld128)
1650   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__xop)
1651   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__xop)
1652   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__xop)
1653   BENCHMARK_GEMM(qs8_gemm_2x4c8__xop_ld64)
1654   BENCHMARK_GEMM(qs8_gemm_3x4c8__xop_ld64)
1655   BENCHMARK_GEMM(qs8_gemm_2x4c8__xop_ld128)
1656   BENCHMARK_GEMM(qs8_gemm_3x4c8__xop_ld128)
1657   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__xop)
1658   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__xop)
1659 
1660   BENCHMARK_GEMM(qs8_gemm_2x4c2__avx_ld64)
1661   BENCHMARK_GEMM(qs8_gemm_3x4c2__avx_ld64)
1662   BENCHMARK_GEMM(qs8_gemm_4x4c2__avx_ld64)
1663   BENCHMARK_GEMM(qs8_gemm_2x4c2__avx_ld128)
1664   BENCHMARK_GEMM(qs8_gemm_3x4c2__avx_ld128)
1665   BENCHMARK_GEMM(qs8_gemm_4x4c2__avx_ld128)
1666   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__avx)
1667   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__avx)
1668   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__avx)
1669   BENCHMARK_GEMM(qs8_gemm_2x4c8__avx_ld64)
1670   BENCHMARK_GEMM(qs8_gemm_3x4c8__avx_ld64)
1671   BENCHMARK_GEMM(qs8_gemm_2x4c8__avx_ld128)
1672   BENCHMARK_GEMM(qs8_gemm_3x4c8__avx_ld128)
1673   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__avx)
1674   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__avx)
1675 
1676   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse41_ld64)
1677   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse41_ld64)
1678   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse41_ld64)
1679   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse41_ld128)
1680   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse41_ld128)
1681   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse41_ld128)
1682   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__sse41)
1683   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__sse41)
1684   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__sse41)
1685   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld64)
1686   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse41_ld64)
1687   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld128)
1688   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse41_ld128)
1689   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__sse41)
1690   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__sse41)
1691 
1692   BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld64)
1693   BENCHMARK_GEMM(qs8_gemm_3x4c8__ssse3_ld64)
1694   BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld128)
1695   BENCHMARK_GEMM(qs8_gemm_3x4c8__ssse3_ld128)
1696   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__ssse3)
1697   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__ssse3)
1698 
1699   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse2_ld64)
1700   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse2_ld64)
1701   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse2_ld64)
1702   BENCHMARK_GEMM(qs8_gemm_2x4c2__sse2_ld128)
1703   BENCHMARK_GEMM(qs8_gemm_3x4c2__sse2_ld128)
1704   BENCHMARK_GEMM(qs8_gemm_4x4c2__sse2_ld128)
1705   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__sse2)
1706   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__sse2)
1707   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__sse2)
1708   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld64)
1709   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse2_ld64)
1710   BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld128)
1711   BENCHMARK_GEMM(qs8_gemm_3x4c8__sse2_ld128)
1712   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__sse2)
1713   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__sse2)
1714 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1715 
1716 
1717 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1718   static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1719     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64, 2, 4, 2, 1,
1720       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1721   }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1722   static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1723     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64, 3, 4, 2, 1,
1724       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1725   }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1726   static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1727     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64, 4, 4, 2, 1,
1728       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1729   }
1730 
qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1731   static void qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1732     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128, 2, 4, 2, 1,
1733       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1734   }
qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1735   static void qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1736     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128, 3, 4, 2, 1,
1737       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1738   }
qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1739   static void qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1740     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128, 4, 4, 2, 1,
1741       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1742   }
1743 
qs8_gemm_xw_2x4c2__wasmsimd_dot16x2(benchmark::State & state,const char * net)1744   static void qs8_gemm_xw_2x4c2__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
1745     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2, 2, 4, 2, 1,
1746       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1747   }
qs8_gemm_xw_3x4c2__wasmsimd_dot16x2(benchmark::State & state,const char * net)1748   static void qs8_gemm_xw_3x4c2__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
1749     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2, 3, 4, 2, 1,
1750       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1751   }
qs8_gemm_xw_4x4c2__wasmsimd_dot16x2(benchmark::State & state,const char * net)1752   static void qs8_gemm_xw_4x4c2__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
1753     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2, 4, 4, 2, 1,
1754       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1755   }
1756 
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1757   static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1758     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64, 2, 4, 2, 4,
1759       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1760   }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1761   static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1762     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64, 3, 4, 2, 4,
1763       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1764   }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1765   static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1766     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64, 4, 4, 2, 4,
1767       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1768   }
1769 
qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1770   static void qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1771     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128, 2, 4, 2, 4,
1772       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1773   }
qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1774   static void qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1775     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128, 3, 4, 2, 4,
1776       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1777   }
qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1778   static void qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1779     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128, 4, 4, 2, 4,
1780       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1781   }
1782 
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1783   static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1784     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64, 2, 4, 8, 1,
1785       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1786   }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1787   static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1788     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64, 3, 4, 8, 1,
1789       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1790   }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State & state,const char * net)1791   static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1792     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64, 4, 4, 8, 1,
1793       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1794   }
1795 
qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1796   static void qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1797     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128, 2, 4, 8, 1,
1798       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1799   }
qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1800   static void qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1801     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128, 3, 4, 8, 1,
1802       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1803   }
qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State & state,const char * net)1804   static void qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1805     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128, 4, 4, 8, 1,
1806       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1807   }
1808 
qs8_gemm_xw_2x4c8__wasmsimd_dot16x2(benchmark::State & state,const char * net)1809   static void qs8_gemm_xw_2x4c8__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
1810     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2, 2, 4, 8, 1,
1811       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1812   }
qs8_gemm_xw_3x4c8__wasmsimd_dot16x2(benchmark::State & state,const char * net)1813   static void qs8_gemm_xw_3x4c8__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
1814     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2, 3, 4, 8, 1,
1815       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1816   }
qs8_gemm_xw_4x4c8__wasmsimd_dot16x2(benchmark::State & state,const char * net)1817   static void qs8_gemm_xw_4x4c8__wasmsimd_dot16x2(benchmark::State& state, const char* net) {
1818     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2, 4, 4, 8, 1,
1819       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1820   }
1821 
qs8_gemm_2x4c8__wasmsimd_mul16_ld64(benchmark::State & state,const char * net)1822   static void qs8_gemm_2x4c8__wasmsimd_mul16_ld64(benchmark::State& state, const char* net) {
1823     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld64, 2, 4, 8, 1,
1824       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1825   }
qs8_gemm_3x4c8__wasmsimd_mul16_ld64(benchmark::State & state,const char * net)1826   static void qs8_gemm_3x4c8__wasmsimd_mul16_ld64(benchmark::State& state, const char* net) {
1827     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld64, 3, 4, 8, 1,
1828       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1829   }
1830 
qs8_gemm_2x4c8__wasmsimd_mul16_ld128(benchmark::State & state,const char * net)1831   static void qs8_gemm_2x4c8__wasmsimd_mul16_ld128(benchmark::State& state, const char* net) {
1832     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16_ld128, 2, 4, 8, 1,
1833       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1834   }
qs8_gemm_3x4c8__wasmsimd_mul16_ld128(benchmark::State & state,const char * net)1835   static void qs8_gemm_3x4c8__wasmsimd_mul16_ld128(benchmark::State& state, const char* net) {
1836     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16_ld128, 3, 4, 8, 1,
1837       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params);
1838   }
1839 
qs8_gemm_xw_2x4c8__wasmsimd_mul16(benchmark::State & state,const char * net)1840   static void qs8_gemm_xw_2x4c8__wasmsimd_mul16(benchmark::State& state, const char* net) {
1841     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__wasmsimd_mul16, 2, 4, 8, 1,
1842       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1843   }
qs8_gemm_xw_3x4c8__wasmsimd_mul16(benchmark::State & state,const char * net)1844   static void qs8_gemm_xw_3x4c8__wasmsimd_mul16(benchmark::State& state, const char* net) {
1845     GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd_mul16, 3, 4, 8, 1,
1846       xnn_init_qs8_conv_minmax_fp32_wasmsimd_params, nullptr, true);
1847   }
1848 
  // BENCHMARK_GEMM invocations for the WAsm SIMD dot16x2 c2 wrappers
  // (ld64, ld128, then the "xw" variants). The macro is defined outside this
  // part of the file (presumably bench/gemm.h -- confirm).
1849   BENCHMARK_GEMM(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64)1850   BENCHMARK_GEMM(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
1851   BENCHMARK_GEMM(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
1852   BENCHMARK_GEMM(qs8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
1853   BENCHMARK_GEMM(qs8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
1854   BENCHMARK_GEMM(qs8_gemm_4x4c2__wasmsimd_dot16x2_ld128)
1855   BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__wasmsimd_dot16x2)
1856   BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__wasmsimd_dot16x2)
1857   BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__wasmsimd_dot16x2)
1858 
  // BENCHMARK_GEMM invocations for the WAsm SIMD dot16x2 c2s4 wrappers
  // (the ones that pass sr=4 to GEMMBenchmark): ld64 first, then ld128.
1859   BENCHMARK_GEMM(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
1860   BENCHMARK_GEMM(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
1861   BENCHMARK_GEMM(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
1862   BENCHMARK_GEMM(qs8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
1863   BENCHMARK_GEMM(qs8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
1864   BENCHMARK_GEMM(qs8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)
1865 
  // BENCHMARK_GEMM invocations for the WAsm SIMD dot16x2 c8 wrappers
  // (the ones that pass kr=8 to GEMMBenchmark): ld64, ld128, then "xw".
1866   BENCHMARK_GEMM(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
1867   BENCHMARK_GEMM(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
1868   BENCHMARK_GEMM(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
1869   BENCHMARK_GEMM(qs8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
1870   BENCHMARK_GEMM(qs8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
1871   BENCHMARK_GEMM(qs8_gemm_4x4c8__wasmsimd_dot16x2_ld128)
1872   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__wasmsimd_dot16x2)
1873   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__wasmsimd_dot16x2)
1874   BENCHMARK_GEMM(qs8_gemm_xw_4x4c8__wasmsimd_dot16x2)
1875 
  // BENCHMARK_GEMM invocations for the WAsm SIMD mul16 c8 wrappers:
  // ld64, ld128, then the "xw" variants. Only 2x4 and 3x4 tiles are defined
  // for mul16 in this file section.
1876   BENCHMARK_GEMM(qs8_gemm_2x4c8__wasmsimd_mul16_ld64)
1877   BENCHMARK_GEMM(qs8_gemm_3x4c8__wasmsimd_mul16_ld64)
1878   BENCHMARK_GEMM(qs8_gemm_2x4c8__wasmsimd_mul16_ld128)
1879   BENCHMARK_GEMM(qs8_gemm_3x4c8__wasmsimd_mul16_ld128)
1880   BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__wasmsimd_mul16)
1881   BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__wasmsimd_mul16)
1882 #endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1883 
1884 
1885 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1886   static void qs8_gemm_2x2__wasm_fmagic(benchmark::State& state, const char* net) {
1887     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic, 2, 2, 1, 1,
1888       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1889   }
qs8_gemm_3x2__wasm_fmagic(benchmark::State & state,const char * net)1890   static void qs8_gemm_3x2__wasm_fmagic(benchmark::State& state, const char* net) {
1891     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic, 3, 2, 1, 1,
1892       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1893   }
qs8_gemm_4x2__wasm_fmagic(benchmark::State & state,const char * net)1894   static void qs8_gemm_4x2__wasm_fmagic(benchmark::State& state, const char* net) {
1895     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic, 4, 2, 1, 1,
1896       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1897   }
qs8_gemm_2x4__wasm_fmagic(benchmark::State & state,const char * net)1898   static void qs8_gemm_2x4__wasm_fmagic(benchmark::State& state, const char* net) {
1899     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic, 2, 4, 1, 1,
1900       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1901   }
qs8_gemm_3x4__wasm_fmagic(benchmark::State & state,const char * net)1902   static void qs8_gemm_3x4__wasm_fmagic(benchmark::State& state, const char* net) {
1903     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic, 3, 4, 1, 1,
1904       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1905   }
qs8_gemm_4x4__wasm_fmagic(benchmark::State & state,const char * net)1906   static void qs8_gemm_4x4__wasm_fmagic(benchmark::State& state, const char* net) {
1907     GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic, 4, 4, 1, 1,
1908       xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1909   }
1910 
  // BENCHMARK_GEMM invocations for the six wasm_fmagic wrappers defined just
  // above, in the same order as their definitions.
1911   BENCHMARK_GEMM(qs8_gemm_2x2__wasm_fmagic)
BENCHMARK_GEMM(qs8_gemm_3x2__wasm_fmagic)1912   BENCHMARK_GEMM(qs8_gemm_3x2__wasm_fmagic)
1913   BENCHMARK_GEMM(qs8_gemm_4x2__wasm_fmagic)
1914   BENCHMARK_GEMM(qs8_gemm_2x4__wasm_fmagic)
1915   BENCHMARK_GEMM(qs8_gemm_3x4__wasm_fmagic)
1916   BENCHMARK_GEMM(qs8_gemm_4x4__wasm_fmagic)
1917 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1918 
1919 
1920 static void qs8_gemm_2x2__scalar_fmagic(benchmark::State& state, const char* net) {
1921   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic, 2, 2, 1, 1,
1922     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1923 }
qs8_gemm_3x2__scalar_fmagic(benchmark::State & state,const char * net)1924 static void qs8_gemm_3x2__scalar_fmagic(benchmark::State& state, const char* net) {
1925   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic, 3, 2, 1, 1,
1926     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1927 }
qs8_gemm_4x2__scalar_fmagic(benchmark::State & state,const char * net)1928 static void qs8_gemm_4x2__scalar_fmagic(benchmark::State& state, const char* net) {
1929   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic, 4, 2, 1, 1,
1930     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1931 }
qs8_gemm_2x4__scalar_fmagic(benchmark::State & state,const char * net)1932 static void qs8_gemm_2x4__scalar_fmagic(benchmark::State& state, const char* net) {
1933   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic, 2, 4, 1, 1,
1934     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1935 }
qs8_gemm_3x4__scalar_fmagic(benchmark::State & state,const char * net)1936 static void qs8_gemm_3x4__scalar_fmagic(benchmark::State& state, const char* net) {
1937   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic, 3, 4, 1, 1,
1938     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1939 }
qs8_gemm_4x4__scalar_fmagic(benchmark::State & state,const char * net)1940 static void qs8_gemm_4x4__scalar_fmagic(benchmark::State& state, const char* net) {
1941   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic, 4, 4, 1, 1,
1942     xnn_init_qs8_conv_minmax_fp32_scalar_fmagic_params);
1943 }
1944 
qs8_gemm_2x2__scalar_imagic(benchmark::State & state,const char * net)1945 static void qs8_gemm_2x2__scalar_imagic(benchmark::State& state, const char* net) {
1946   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic, 2, 2, 1, 1,
1947     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
1948 }
qs8_gemm_3x2__scalar_imagic(benchmark::State & state,const char * net)1949 static void qs8_gemm_3x2__scalar_imagic(benchmark::State& state, const char* net) {
1950   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic, 3, 2, 1, 1,
1951     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
1952 }
qs8_gemm_4x2__scalar_imagic(benchmark::State & state,const char * net)1953 static void qs8_gemm_4x2__scalar_imagic(benchmark::State& state, const char* net) {
1954   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic, 4, 2, 1, 1,
1955     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
1956 }
qs8_gemm_2x4__scalar_imagic(benchmark::State & state,const char * net)1957 static void qs8_gemm_2x4__scalar_imagic(benchmark::State& state, const char* net) {
1958   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic, 2, 4, 1, 1,
1959     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
1960 }
qs8_gemm_3x4__scalar_imagic(benchmark::State & state,const char * net)1961 static void qs8_gemm_3x4__scalar_imagic(benchmark::State& state, const char* net) {
1962   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic, 3, 4, 1, 1,
1963     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
1964 }
qs8_gemm_4x4__scalar_imagic(benchmark::State & state,const char * net)1965 static void qs8_gemm_4x4__scalar_imagic(benchmark::State& state, const char* net) {
1966   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic, 4, 4, 1, 1,
1967     xnn_init_qs8_conv_minmax_fp32_scalar_imagic_params);
1968 }
1969 
qs8_gemm_2x2__scalar_lrintf(benchmark::State & state,const char * net)1970 static void qs8_gemm_2x2__scalar_lrintf(benchmark::State& state, const char* net) {
1971   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf, 2, 2, 1, 1,
1972     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
1973 }
qs8_gemm_3x2__scalar_lrintf(benchmark::State & state,const char * net)1974 static void qs8_gemm_3x2__scalar_lrintf(benchmark::State& state, const char* net) {
1975   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf, 3, 2, 1, 1,
1976     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
1977 }
qs8_gemm_4x2__scalar_lrintf(benchmark::State & state,const char * net)1978 static void qs8_gemm_4x2__scalar_lrintf(benchmark::State& state, const char* net) {
1979   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf, 4, 2, 1, 1,
1980     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
1981 }
qs8_gemm_2x4__scalar_lrintf(benchmark::State & state,const char * net)1982 static void qs8_gemm_2x4__scalar_lrintf(benchmark::State& state, const char* net) {
1983   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf, 2, 4, 1, 1,
1984     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
1985 }
qs8_gemm_3x4__scalar_lrintf(benchmark::State & state,const char * net)1986 static void qs8_gemm_3x4__scalar_lrintf(benchmark::State& state, const char* net) {
1987   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf, 3, 4, 1, 1,
1988     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
1989 }
qs8_gemm_4x4__scalar_lrintf(benchmark::State & state,const char * net)1990 static void qs8_gemm_4x4__scalar_lrintf(benchmark::State& state, const char* net) {
1991   GEMMBenchmark(state, xnn_qs8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf, 4, 4, 1, 1,
1992     xnn_init_qs8_conv_minmax_fp32_scalar_lrintf_params);
1993 }
1994 
// BENCHMARK_GEMM invocations for the six scalar_fmagic wrappers defined above.
// These sit outside any architecture #if, unlike the WAsm groups before them.
1995 BENCHMARK_GEMM(qs8_gemm_2x2__scalar_fmagic)
1996 BENCHMARK_GEMM(qs8_gemm_3x2__scalar_fmagic)
1997 BENCHMARK_GEMM(qs8_gemm_4x2__scalar_fmagic)
1998 BENCHMARK_GEMM(qs8_gemm_2x4__scalar_fmagic)
1999 BENCHMARK_GEMM(qs8_gemm_3x4__scalar_fmagic)
2000 BENCHMARK_GEMM(qs8_gemm_4x4__scalar_fmagic)
2001 
// BENCHMARK_GEMM invocations for the six scalar_imagic wrappers defined above.
2002 BENCHMARK_GEMM(qs8_gemm_2x2__scalar_imagic)
2003 BENCHMARK_GEMM(qs8_gemm_3x2__scalar_imagic)
2004 BENCHMARK_GEMM(qs8_gemm_4x2__scalar_imagic)
2005 BENCHMARK_GEMM(qs8_gemm_2x4__scalar_imagic)
2006 BENCHMARK_GEMM(qs8_gemm_3x4__scalar_imagic)
2007 BENCHMARK_GEMM(qs8_gemm_4x4__scalar_imagic)
2008 
// BENCHMARK_GEMM invocations for the six scalar_lrintf wrappers defined above.
2009 BENCHMARK_GEMM(qs8_gemm_2x2__scalar_lrintf)
2010 BENCHMARK_GEMM(qs8_gemm_3x2__scalar_lrintf)
2011 BENCHMARK_GEMM(qs8_gemm_4x2__scalar_lrintf)
2012 BENCHMARK_GEMM(qs8_gemm_2x4__scalar_lrintf)
2013 BENCHMARK_GEMM(qs8_gemm_3x4__scalar_lrintf)
2014 BENCHMARK_GEMM(qs8_gemm_4x4__scalar_lrintf)
2015 
2016 
// Only when BENCHMARK_RUY is defined (the same macro guards the ruy include at
// the top of the file): apply BENCHMARK_GEMM to ruy_st, which is defined
// outside this part of the file.
2017 #ifdef BENCHMARK_RUY
2018 BENCHMARK_GEMM(ruy_st)
2019 #endif  // BENCHMARK_RUY
2020 
// Emit Google Benchmark's main() via BENCHMARK_MAIN() unless the build defines
// XNNPACK_BENCHMARK_NO_MAIN (presumably so another translation unit can
// supply the entry point -- confirm against the build files).
2021 #ifndef XNNPACK_BENCHMARK_NO_MAIN
2022 BENCHMARK_MAIN();
2023 #endif
2024