// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_GEMMLOWP
#include "gemmlowp/public/gemmlowp.h"
#endif  // BENCHMARK_GEMMLOWP
#ifdef BENCHMARK_RUY
#include "ruy/ruy.h"
#endif  // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>


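// Benchmarks a single QU8 GEMM microkernel: generates random quantized inputs,
// packs weights and biases with xnn_pack_qu8_gemm_goi_w, then tiles an
// MC x NC x KC matrix multiplication into MR x NR blocks handled by the microkernel.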
static void GEMMBenchmark(benchmark::State& state,
  xnn_qu8_gemm_minmax_ukernel_function gemm,
  xnn_init_qu8_conv_minmax_params_fn init_params,
  size_t mr, size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  std::vector<uint8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(a.begin(), a.end(), std::ref(u8rng));
  std::vector<uint8_t> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(nc);
  std::generate(b.begin(), b.end(), std::ref(i32rng));

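  // The packed weight buffer holds the KC x NC weight panel (padded to kr*sr and nr)
  // plus one int32 bias per output channel, counted here in uint8 elements.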
  const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint8_t) * (w_elements + c_elements));

  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0);
  const xnn_qu8_packing_params packing_params = { 127, 127 };
  xnn_pack_qu8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
  std::vector<uint8_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), 0xA5);

  union xnn_qu8_conv_minmax_params quantization_params;
  init_params(&quantization_params, 127, 0.75f, 127, 1, 254);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);
      for (uint32_t n = 0; n < nc; n += nr) {
        const uint32_t nb = min(nc - n, nr);
        gemm(
          mb, nb, kc * sizeof(uint8_t),
          a.data() + m * kc, kc * sizeof(uint8_t),
          w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
          c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
          &quantization_params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

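  // Each multiply-accumulate counts as two operations (2 * MC * NC * KC per iteration),
  // reported as a rate.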
  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_GEMMLOWP
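// Output pipeline for gemmlowp: bias addition, fixed-point requantization,
// clamping to the activation range, and a saturating cast to uint8.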
struct GemmlowpOutputPipeline {
  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
  typedef std::tuple<
      gemmlowp::OutputStageBiasAddition<ColVectorMap>,
      gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
      gemmlowp::OutputStageClamp,
      gemmlowp::OutputStageSaturatingCastToUint8>
      Pipeline;

  static Pipeline Make(
      const int32_t* bias_data,
      int output_rows,
      int32_t output_offset,
      int32_t output_multiplier,
      int output_shift,
      int32_t output_activation_min,
      int32_t output_activation_max)
  {
    ColVectorMap bias_vector(bias_data, output_rows);
    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
    bias_addition_stage.bias_vector = bias_vector;
    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
    quantize_down_stage.result_offset_after_shift = output_offset;
    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
    quantize_down_stage.result_shift = output_shift;
    gemmlowp::OutputStageClamp clamp_stage;
    clamp_stage.min = output_activation_min;
    clamp_stage.max = output_activation_max;
    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
    return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
  }
};

static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
{
  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  std::vector<uint8_t> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(u8rng));

  const size_t kElements = nc * kc;
  const size_t bElements = nc;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));

  std::vector<uint8_t> k(kElements * num_buffers);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(bElements * num_buffers);
  std::generate(b.begin(), b.end(), std::ref(i32rng));
  std::vector<uint8_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), 0xA5);

  gemmlowp::MultiThreadGemmContext threadingContext;
  threadingContext.set_max_num_threads(threads);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
    gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
    const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
    gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
        &threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void gemmlowp_st(benchmark::State& state, const char* net)
{
  GemmlowpBenchmark(state, 1);
}
#endif  // BENCHMARK_GEMMLOWP


#ifdef BENCHMARK_RUY
static void RuyBenchmark(benchmark::State& state, size_t threads)
{
  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));

  std::vector<uint8_t> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(u8rng));
  std::vector<uint8_t> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(i32rng));
  std::vector<uint8_t> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), std::nanf(""));

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.set_max_num_threads(threads);

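  // Ruy treats the weight matrix (k) as the LHS and the activations (a) as the RHS,
  // so the product ruy_c has shape NC x MC.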
  ruy::Matrix<uint8_t> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
  ruy_a.set_zero_point(127);
  ruy::Matrix<uint8_t> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
  ruy_b.set_data(a.data());
  ruy_b.set_zero_point(127);
  ruy::Matrix<uint8_t> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
  ruy_c.set_zero_point(127);

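  // The requantization multiplier is a Q31 fixed-point value; 0x40000000 corresponds to 0.5.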
  ruy::MulParams<int32_t, uint8_t> mul_params;
  mul_params.set_multiplier_fixedpoint(0x40000000);

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.set_data(k.data());
      ruy_c.set_data(c.data());
      mul_params.set_bias(b.data());

      ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.set_data(k.data() + buffer_index * nc * kc);
    ruy_c.set_data(c.data() + buffer_index * mc * nc);
    mul_params.set_bias(b.data() + buffer_index * nc);

    ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif  // BENCHMARK_RUY


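// Microkernel benchmark names encode the tile size (MRxNR, with an optional cKR channel-block
// suffix) followed by the target ISA or CPU tuning variant; BENCHMARK_GEMM (see bench/gemm.h)
// registers each benchmark function with Google Benchmark.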
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
#endif  // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void qu8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1,
      benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_cortex_a55)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55)
  BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void qu8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_5x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      5, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      8, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_5x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      5, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      8, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_1x32c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 32, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x32c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 32, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x32c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 32, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  BENCHMARK_GEMM(qu8_gemm_1x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_4x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_5x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_6x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_8x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_5x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_6x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_8x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x32c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x32c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x32c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_2x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_3x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_4x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_6x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_1x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_2x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_3x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_4x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_6x16__neon_mlal_lane)
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void qu8_gemm_1x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      1, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      2, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      3, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      4, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_1x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      1, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      2, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      3, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_1x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1);
  }
  static void qu8_gemm_1x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1);
  }
  static void qu8_gemm_1x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1);
  }
  static void qu8_gemm_1x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1);
  }

  BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_2x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_3x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_4x16c8__avx512skx)

  BENCHMARK_GEMM(qu8_gemm_1x8c8__avx2)
  BENCHMARK_GEMM(qu8_gemm_2x8c8__avx2)
  BENCHMARK_GEMM(qu8_gemm_3x8c8__avx2)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld128)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
1125 
1126 
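// WAsm SIMD micro-kernels. The dot16x2 and mul32 suffixes name the multiplication
// strategy used by the kernel (paired 16-bit dot products vs. widening 32-bit
// multiplies), and each flavor comes in ld64 and ld128 load-width variants.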
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      4, 4, 2, 1);
  }

  static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      4, 4, 2, 1);
  }

  static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 2, 4);
  }
  static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 2, 4);
  }
  static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 2, 4);
  }
  static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      4, 4, 2, 4);
  }

  static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 2, 4);
  }
  static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 2, 4);
  }
  static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 2, 4);
  }
  static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      4, 4, 2, 4);
  }

  static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 8, 1);
  }
  static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      4, 4, 8, 1);
  }

  static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 8, 1);
  }
  static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      4, 4, 8, 1);
  }

  static void qu8_gemm_1x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 8, 1);
  }
  static void qu8_gemm_1x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 8, 1);
  }

  BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_mul32_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_mul32_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_mul32_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_mul32_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_mul32_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_mul32_ld128)
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


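// Scalar-style kernels compiled for WebAssembly; see the scalar section below for
// what the fmagic requantization suffix denotes.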
#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  static void qu8_gemm_1x2__wasm_fmagic(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      1, 2, 1, 1);
  }
  static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      2, 2, 1, 1);
  }
  static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      3, 2, 1, 1);
  }
  static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      4, 2, 1, 1);
  }
  static void qu8_gemm_1x4__wasm_fmagic(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      1, 4, 1, 1);
  }
  static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      2, 4, 1, 1);
  }
  static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      3, 4, 1, 1);
  }
  static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
      xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
      4, 4, 1, 1);
  }

  BENCHMARK_GEMM(qu8_gemm_1x2__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_2x2__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_3x2__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_4x2__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_1x4__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_2x4__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_3x4__wasm_fmagic)
  BENCHMARK_GEMM(qu8_gemm_4x4__wasm_fmagic)
#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


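// Portable scalar kernels. The suffix names the fp32 requantization rounding
// strategy: fmagic and imagic rely on float- and integer-"magic number" rounding
// tricks, while lrintf rounds via the standard lrintf() call.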
static void qu8_gemm_1x2__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    1, 2, 1, 1);
}
static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    2, 2, 1, 1);
}
static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    3, 2, 1, 1);
}
static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    4, 2, 1, 1);
}
static void qu8_gemm_1x4__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    1, 4, 1, 1);
}
static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    2, 4, 1, 1);
}
static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    3, 4, 1, 1);
}
static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    4, 4, 1, 1);
}

static void qu8_gemm_1x2__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    1, 2, 1, 1);
}
static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    2, 2, 1, 1);
}
static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    3, 2, 1, 1);
}
static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    4, 2, 1, 1);
}
static void qu8_gemm_1x4__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    1, 4, 1, 1);
}
static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    2, 4, 1, 1);
}
static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    3, 4, 1, 1);
}
static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    4, 4, 1, 1);
}

static void qu8_gemm_1x2__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    1, 2, 1, 1);
}
static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    2, 2, 1, 1);
}
static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    3, 2, 1, 1);
}
static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    4, 2, 1, 1);
}
static void qu8_gemm_1x4__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    1, 4, 1, 1);
}
static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    2, 4, 1, 1);
}
static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    3, 4, 1, 1);
}
static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    4, 4, 1, 1);
}

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_fmagic)

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_imagic)

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_lrintf)


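// Baseline comparisons against the Ruy and gemmlowp GEMM libraries, registered only
// when the corresponding benchmarks are compiled in.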
#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY
#ifdef BENCHMARK_GEMMLOWP
BENCHMARK_GEMM(gemmlowp_st)
#endif  // BENCHMARK_GEMMLOWP

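// A subset of these benchmarks can be selected with the standard Google Benchmark
// filter flag, e.g. (binary name is illustrative):
//   ./qu8-gemm-bench --benchmark_filter=qu8_gemm_4x4__scalar_lrintf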
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif