// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_GEMMLOWP
#include "gemmlowp/public/gemmlowp.h"
#endif // BENCHMARK_GEMMLOWP
#ifdef BENCHMARK_RUY
#include "ruy/ruy.h"
#endif // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>

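// Benchmarks one XNNPACK QU8 GEMM microkernel: packs the weights with
// xnn_pack_qu8_gemm_goi_w, then tiles an Mc x Nc x Kc problem into MR x NR
// microkernel calls. Weight and output buffers are rotated so that their
// combined footprint exceeds the last-level cache (see the comment inside the
// benchmark loop), and throughput is reported as OPS = 2*Mc*Nc*Kc per
// iteration (one multiply plus one add per multiply-accumulate).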
static void GEMMBenchmark(benchmark::State& state,
  xnn_qu8_gemm_minmax_ukernel_function gemm,
  xnn_init_qu8_conv_minmax_params_fn init_params,
  size_t mr, size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  std::vector<uint8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(a.begin(), a.end(), std::ref(u8rng));
  std::vector<uint8_t> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(nc);
  std::generate(b.begin(), b.end(), std::ref(i32rng));

  const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
  const size_t c_elements = mc * nc;
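  // Size the rotation so that cycling through num_buffers copies of the packed
  // weights and the output always evicts the previously used copy from the
  // last-level cache.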
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint8_t) * (w_elements + c_elements));

  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0);
  const xnn_qu8_packing_params packing_params = { 127, 127 };
  xnn_pack_qu8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
  std::vector<uint8_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), 0xA5);

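  // Quantization parameters: zero points of 127, a requantization scale of
  // 0.75, and outputs clamped to [1, 254].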
  union xnn_qu8_conv_minmax_params quantization_params;
  init_params(&quantization_params, 127, 0.75f, 127, 1, 254);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      for (uint32_t n = 0; n < nc; n += nr) {
        const uint32_t nb = std::min(nc - n, nr);
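        // In the packed buffer each column contributes kc_stride weight bytes
        // plus a 4-byte bias, hence the (kc_stride + sizeof(int32_t)) stride.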
        gemm(
          mb, nb, kc * sizeof(uint8_t),
          a.data() + m * kc, kc * sizeof(uint8_t),
          w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
          c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
          &quantization_params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_GEMMLOWP
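// Output pipeline for the gemmlowp reference benchmark: bias addition,
// fixed-point requantization, clamping, and a saturating cast to uint8.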
struct GemmlowpOutputPipeline {
  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
  typedef std::tuple<
    gemmlowp::OutputStageBiasAddition<ColVectorMap>,
    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
    gemmlowp::OutputStageClamp,
    gemmlowp::OutputStageSaturatingCastToUint8>
    Pipeline;

  static Pipeline Make(
    const int32_t* bias_data,
    int output_rows,
    int32_t output_offset,
    int32_t output_multiplier,
    int output_shift,
    int32_t output_activation_min,
    int32_t output_activation_max)
  {
    ColVectorMap bias_vector(bias_data, output_rows);
    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
    bias_addition_stage.bias_vector = bias_vector;
    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
    quantize_down_stage.result_offset_after_shift = output_offset;
    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
    quantize_down_stage.result_shift = output_shift;
    gemmlowp::OutputStageClamp clamp_stage;
    clamp_stage.min = output_activation_min;
    clamp_stage.max = output_activation_max;
    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
    return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
  }
};

static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
{
  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  std::vector<uint8_t> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(u8rng));

  const size_t kElements = nc * kc;
  const size_t bElements = nc;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));

  std::vector<uint8_t> k(kElements * num_buffers);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(bElements * num_buffers);
  std::generate(b.begin(), b.end(), std::ref(i32rng));
  std::vector<uint8_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), 0xA5);

  gemmlowp::MultiThreadGemmContext threadingContext;
  threadingContext.set_max_num_threads(threads);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
    gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
    const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
    gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      &threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void gemmlowp_st(benchmark::State& state, const char* net)
{
  GemmlowpBenchmark(state, 1);
}
#endif // BENCHMARK_GEMMLOWP


#ifdef BENCHMARK_RUY
static void RuyBenchmark(benchmark::State& state, size_t threads)
{
  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));

  std::vector<uint8_t> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(u8rng));
  std::vector<uint8_t> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(i32rng));
  std::vector<uint8_t> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), 0xA5);

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.set_max_num_threads(threads);

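  // ruy is given the weight matrix as the left-hand side: it computes the
  // nc x mc (column-major) product K (nc x kc) * A^T (kc x mc), i.e. the
  // transpose of the mc x nc output produced by the XNNPACK microkernels.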
  ruy::Matrix<uint8_t> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
  ruy_a.set_zero_point(127);
  ruy::Matrix<uint8_t> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
  ruy_b.set_data(a.data());
  ruy_b.set_zero_point(127);
  ruy::Matrix<uint8_t> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
  ruy_c.set_zero_point(127);

  ruy::MulParams<int32_t, uint8_t> mul_params;
  mul_params.set_multiplier_fixedpoint(0x40000000);

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.set_data(k.data());
      ruy_c.set_data(c.data());
      mul_params.set_bias(b.data());

      ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.set_data(k.data() + buffer_index * nc * kc);
    ruy_c.set_data(c.data() + buffer_index * mc * nc);
    mul_params.set_bias(b.data() + buffer_index * nc);

    ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif // BENCHMARK_RUY

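// The benchmarks below are named MRxNR[cKR][sSR]__<arch>_<variant>; the
// mr/nr/kr/sr arguments passed to GEMMBenchmark must match the values encoded
// in the microkernel name so that weights are packed as the kernel expects.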
#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void qu8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1,
      benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_cortex_a55)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55)
  BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75)
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void qu8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_5x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      5, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      8, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_5x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      5, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      8, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_1x32c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 32, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x32c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 32, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x32c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 32, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  BENCHMARK_GEMM(qu8_gemm_1x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_4x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_5x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_6x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_8x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_5x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_6x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_8x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x32c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x32c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x32c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_2x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_3x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_4x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_6x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_1x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_2x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_3x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_4x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_6x16__neon_mlal_lane)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void qu8_gemm_1x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      1, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      2, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      3, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      4, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_1x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      1, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      2, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      3, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_1x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1);
  }
  static void qu8_gemm_1x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1);
  }
  static void qu8_gemm_1x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1);
  }
  static void qu8_gemm_1x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1);
  }

  BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_2x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_3x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_4x16c8__avx512skx)

  BENCHMARK_GEMM(qu8_gemm_1x8c8__avx2)
  BENCHMARK_GEMM(qu8_gemm_2x8c8__avx2)
  BENCHMARK_GEMM(qu8_gemm_3x8c8__avx2)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld128)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64

1127 #if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    1, 4, 2, 1);
}
static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    2, 4, 2, 1);
}
static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    3, 4, 2, 1);
}
static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    4, 4, 2, 1);
}

static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    1, 4, 2, 1);
}
static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    2, 4, 2, 1);
}
static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    3, 4, 2, 1);
}
static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    4, 4, 2, 1);
}

static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    1, 4, 2, 4);
}
static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    2, 4, 2, 4);
}
static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    3, 4, 2, 4);
}
static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    4, 4, 2, 4);
}

static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    1, 4, 2, 4);
}
static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    2, 4, 2, 4);
}
static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    3, 4, 2, 4);
}
static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    4, 4, 2, 4);
}

static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    1, 4, 8, 1);
}
static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    2, 4, 8, 1);
}
static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    3, 4, 8, 1);
}
static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    4, 4, 8, 1);
}

static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    1, 4, 8, 1);
}
static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    2, 4, 8, 1);
}
static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    3, 4, 8, 1);
}
static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    4, 4, 8, 1);
}

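// The "mul32" variants below are set up identically to the "dot16x2" variants above; judging by
// the kernel names, they accumulate via 32-bit multiplies instead of the 16-bit dot product.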
static void qu8_gemm_1x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    1, 4, 8, 1);
}
static void qu8_gemm_2x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    2, 4, 8, 1);
}
static void qu8_gemm_3x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    3, 4, 8, 1);
}
static void qu8_gemm_1x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    1, 4, 8, 1);
}
static void qu8_gemm_2x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    2, 4, 8, 1);
}
static void qu8_gemm_3x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128,
    xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
    3, 4, 8, 1);
}

BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_mul32_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_mul32_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_mul32_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_mul32_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_mul32_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_mul32_ld128)
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
static void qu8_gemm_1x2__wasm_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    1, 2, 1, 1);
}
static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    2, 2, 1, 1);
}
static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    3, 2, 1, 1);
}
static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    4, 2, 1, 1);
}
static void qu8_gemm_1x4__wasm_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    1, 4, 1, 1);
}
static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    2, 4, 1, 1);
}
static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    3, 4, 1, 1);
}
static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    4, 4, 1, 1);
}

BENCHMARK_GEMM(qu8_gemm_1x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_1x4__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x4__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x4__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x4__wasm_fmagic)
#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


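// The scalar kernels below come in three requantization flavors, distinguished (by name) in how
// the fp32 accumulator is rounded back to uint8: "fmagic" uses float magic-bias addition,
// "imagic" manipulates the magic-biased value as an integer, and "lrintf" rounds via lrintf.
// All three are driven through the same GEMMBenchmark harness with identical tile parameters.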
static void qu8_gemm_1x2__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    1, 2, 1, 1);
}
static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    2, 2, 1, 1);
}
static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    3, 2, 1, 1);
}
static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    4, 2, 1, 1);
}
static void qu8_gemm_1x4__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    1, 4, 1, 1);
}
static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    2, 4, 1, 1);
}
static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    3, 4, 1, 1);
}
static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
    4, 4, 1, 1);
}

static void qu8_gemm_1x2__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    1, 2, 1, 1);
}
static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    2, 2, 1, 1);
}
static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    3, 2, 1, 1);
}
static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    4, 2, 1, 1);
}
static void qu8_gemm_1x4__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    1, 4, 1, 1);
}
static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    2, 4, 1, 1);
}
static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    3, 4, 1, 1);
}
static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
    xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
    4, 4, 1, 1);
}

static void qu8_gemm_1x2__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    1, 2, 1, 1);
}
static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    2, 2, 1, 1);
}
static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    3, 2, 1, 1);
}
static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    4, 2, 1, 1);
}
static void qu8_gemm_1x4__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    1, 4, 1, 1);
}
static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    2, 4, 1, 1);
}
static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    3, 4, 1, 1);
}
static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
    4, 4, 1, 1);
}

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_fmagic)

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_imagic)

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_lrintf)


#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif // BENCHMARK_RUY
#ifdef BENCHMARK_GEMMLOWP
BENCHMARK_GEMM(gemmlowp_st)
#endif // BENCHMARK_GEMMLOWP

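// Unless XNNPACK_BENCHMARK_NO_MAIN is defined (e.g. when this file is linked into a combined
// benchmark binary), BENCHMARK_MAIN() below supplies the entry point. A subset of the kernels
// registered above can then be selected with Google Benchmark's standard --benchmark_filter
// flag, for example (binary name illustrative):
//   ./qu8-gemm-bench --benchmark_filter='qu8_gemm_.*__scalar_lrintf'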
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif